# **Content-Based Filtering for Netflix Title Recommendations**


> **Group 8**


1.   2702305576 - Grace Esther D. S.
2.   2702269620 - Adhi Swasono Aryaning Bawono
3. 2702274015 - Axel Dimas Anugrah
---

This section imports all required libraries for preprocessing, vectorization, similarity computation, and model building.
It also suppresses warnings and ensures all required NLTK resources are downloaded.

In [28]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from scipy.sparse import hstack
from collections import defaultdict
import pickle
import nltk
import re
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import warnings
warnings.filterwarnings('ignore')

In [29]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

We define a function to normalize, tokenize, lemmatize, and remove stopwords from text columns such as description, genres, and cast.
This prepares the textual data for vectorization.

In [30]:
filepath = '/content/netflix_titles.csv'

In [31]:
# Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [32]:
def get_wordnet_pos(tag):
    """Map POS tag to WordNet POS tag for lemmatization."""
    tag = tag[0].upper() if tag else ''
    tag_dict = {"J": wordnet.ADJ, "V": wordnet.VERB, "N": wordnet.NOUN, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [33]:
def clean_text(text):
    """Normalize, tokenize, lemmatize, and remove stopwords."""
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Keep alphanumeric + whitespace
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    lemmatized = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in tagged
        if word not in stop_words and len(word) > 2
    ]
    return ' '.join(lemmatized)

We load the `netflix_titles.csv` dataset, rename the `listed_in` column to `genres`, and fill missing values in key columns.
Then, we clean each text column using the previously defined cleaning pipeline.

In [34]:
# Data Loading & Cleaning
def load_and_preprocess_data(filepath):
    """Load data and handle missing values."""
    df = pd.read_csv(filepath)
    df.rename(columns={'listed_in': 'genres'}, inplace=True)

    # Fill missing values
    text_cols = ['description', 'director', 'cast', 'country', 'genres']
    df[text_cols] = df[text_cols].fillna('unknown')
    df['rating'] = df['rating'].fillna('unrated')

    # Clean text fields
    for col in text_cols:
        df[col] = df[col].apply(clean_text)

    return df

In [35]:
df = load_and_preprocess_data('netflix_titles.csv')

We convert cleaned text into numerical vectors using TF-IDF on the following fields:
- Description (highest weight)
- Genres
- Director
- Cast
- Country

Each feature is given a specific weight based on its assumed importance in user decision-making.

In [36]:
desc_vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
gen_vectorizer  = TfidfVectorizer(max_features=500)
dir_vectorizer  = TfidfVectorizer(max_features=300)
cast_vectorizer = TfidfVectorizer(max_features=500)
ctry_vectorizer = TfidfVectorizer(max_features=200)

In [37]:
desc_vec = desc_vectorizer.fit_transform(df['description'])
gen_vec  = gen_vectorizer.fit_transform(df['genres'])
dir_vec  = dir_vectorizer.fit_transform(df['director'])
cast_vec = cast_vectorizer.fit_transform(df['cast'])
ctry_vec = ctry_vectorizer.fit_transform(df['country'])

In [38]:
# Weighted feature combination
combined_features = hstack([
    1.0 * desc_vec,  # Highest weight for descriptions
    0.8 * gen_vec,   # Genres are very important
    0.5 * dir_vec,   # Director matters
    0.4 * cast_vec,  # Cast matters less
    0.2 * ctry_vec   # Country matters least
])

To reduce computation cost and handle the sparsity of high-dimensional vectors, we use **TruncatedSVD** to compress the feature matrix.
The result is then normalized for use in cosine similarity or nearest neighbor search.

In [39]:
# Dimensionality reduction
svd = TruncatedSVD(n_components=200, random_state=42)
reduced_features = svd.fit_transform(combined_features)
norm_features = normalize(reduced_features)

We use `NearestNeighbors` from scikit-learn to perform efficient similarity search using cosine distance.
If unavailable, we fall back to pairwise cosine similarity computation.

In [40]:
# Approximate Nearest Neighbors (for scalability)
try:
    from sklearn.neighbors import NearestNeighbors
    nn = NearestNeighbors(n_neighbors=20, metric='cosine', algorithm='brute')
    nn.fit(norm_features)
    USE_ANN = True
except ImportError:
    USE_ANN = False
    cosine_sim = cosine_similarity(norm_features)

In [41]:
# Index Mapping
title_to_indices = defaultdict(list)
for idx, title in enumerate(df['title']):
    title_to_indices[title.lower()].append(idx)

The `recommend()` function:
1. Accepts a movie title and returns the top-N most similar titles.
2. If the title is not found (cold-start), it recommends items similar to popular recent titles based on genre similarity.

In [42]:
# Recommendation Engine
def recommend(title, top_n=5, fallback_to_popular=True):
    """Get recommendations with robust cold-start fallback."""
    title = title.lower()

    # Case 1: Title exists in dataset
    if title in title_to_indices:
        if USE_ANN:
            distances, indices = nn.kneighbors(
                norm_features[title_to_indices[title][0]].reshape(1, -1),
                n_neighbors=top_n+1
            )
            indices = indices[0][1:]  # Skip self-match
            similarity_scores = 1 - distances[0][1:]
        else:
            idx = title_to_indices[title][0]
            sim_scores = list(enumerate(cosine_sim[idx]))
            sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
            indices = [i[0] for i in sim_scores]
            similarity_scores = [round(i[1], 6) for i in sim_scores]

    # Case 2: Cold-start (title not found)
    else:
        if not fallback_to_popular:
            return pd.DataFrame({'Error': [f"Title '{title}' not found."]})

        print(f"Title '{title}' not found. Recommending popular items by genre...")

        # Fallback: Use genre similarity to popular items
        popular = df.sort_values('release_year', ascending=False).head(100)
        if len(popular) == 0:
            return pd.DataFrame({'Error': ["No popular items found for fallback."]})

        # Re-vectorize popular items' genres to ensure feature alignment
        genre_vectorizer = TfidfVectorizer(max_features=500).fit(df['genres'])
        gen_vec_popular = genre_vectorizer.transform(popular['genres'])

        # Get random popular item's genre vector as query
        query_idx = np.random.choice(len(popular))
        query_genre_vec = genre_vectorizer.transform([popular.iloc[query_idx]['genres']])

        # Compute similarities
        sim_scores = cosine_similarity(query_genre_vec, gen_vec_popular)[0]
        top_indices = np.argsort(sim_scores)[-top_n:][::-1]
        similarity_scores = sim_scores[top_indices]
        indices = popular.iloc[top_indices].index.tolist()

    # Prepare results
    recommendations = df.iloc[indices].copy()
    recommendations['similarity'] = similarity_scores

    return recommendations[[
        'title', 'director', 'cast', 'genres', 'country',
        'release_year', 'rating', 'description', 'similarity'
    ]]

We test the system on:
- A known movie (`Stranger Things`)
- A non-existent title to trigger the fallback mechanism

In [43]:
recommend("Stranger Things")

Unnamed: 0,title,director,cast,genres,country,release_year,rating,description,similarity
3187,Nightflyers,unknown,eoin macken david ajala jodie turnersmith angu...,horror mystery scifi fantasy,united state,2018,TV-MA,humankind future stake group scientist powerfu...,0.837727
6953,Helix,unknown,billy campbell hiroyuki sanada kyra zagorsky m...,horror mystery scifi fantasy,united state canada,2015,TV-MA,investigate possible outbreak arctic research ...,0.791089
1473,Chilling Adventures of Sabrina,unknown,kiernan shipka ross lynch miranda otto lucy da...,horror mystery scifi fantasy,united state,2020,TV-14,magic mischief collide halfhuman halfwitch sab...,0.77348
5287,The Vampire Diaries,unknown,nina dobrev paul wesley ian somerhalder steven...,drama mystery scifi fantasy,united state,2016,TV-14,trap adolescent body feud vampire brother stef...,0.737511
2303,Warrior Nun,unknown,alba baptista toya turner lorena andrea kristi...,action adventure mystery scifi fantasy,united state,2020,TV-MA,wake morgue orphan teen discovers possess supe...,0.721433


In [44]:
recommend("Non-Existent Movie")

Title 'non-existent movie' not found. Recommending popular items by genre...


Unnamed: 0,title,director,cast,genres,country,release_year,rating,description,similarity
110,Money Heist: From Tokyo to Berlin,luis alfaro javier gmez santander,unknown,docuseries international show spanishlanguage ...,unknown,2021,TV-MA,filmmaker actor behind money heist character l...,1.0
1392,Daughter From Another Mother,unknown,ludwika paleta paulina goto martn altomaro liz...,international show spanishlanguage show comedy,mexico,2021,TV-MA,realize baby exchange birth two woman develop ...,0.835556
1460,Monarca,unknown,irene azuela juan manuel bernal osvaldo benavi...,international show spanishlanguage show drama,mexico,2021,TV-MA,year ana mara return mexico vies control famil...,0.816591
109,La casa de papel,unknown,rsula corber itziar ituo lvaro morte paco tous...,crime show international show spanishlanguage ...,spain,2021,TV-MA,eight thief take hostage lock royal mint spain...,0.789807
19,Jaguar,unknown,blanca surez ivn marcos scar casas adrin lastr...,international show spanishlanguage show action...,unknown,2021,TV-MA,1960s holocaust survivor join group selftraine...,0.747369


## Save all necessary components

In [45]:
with open("desc_vectorizer.pkl", "wb") as f:
    pickle.dump(desc_vectorizer, f)

In [46]:
with open("gen_vectorizer.pkl", "wb") as f:
    pickle.dump(gen_vectorizer, f)

In [47]:
with open("dir_vectorizer.pkl", "wb") as f:
    pickle.dump(dir_vectorizer, f)

In [48]:
with open("cast_vectorizer.pkl", "wb") as f:
    pickle.dump(cast_vectorizer, f)

In [49]:
with open("ctry_vectorizer.pkl", "wb") as f:
    pickle.dump(ctry_vectorizer, f)

In [50]:
with open("svd_model.pkl", "wb") as f:
    pickle.dump(svd, f)

In [51]:
with open("nn_model.pkl", "wb") as f:
    pickle.dump(nn, f)

In [52]:
with open("norm_features.pkl", "wb") as f:
    pickle.dump(norm_features, f)

In [53]:
with open("title_to_indices.pkl", "wb") as f:
    pickle.dump(title_to_indices, f)

In [54]:
with open("df_metadata.pkl", "wb") as f:
    pickle.dump(df[[
        'title', 'director', 'cast', 'genres', 'country',
        'release_year', 'rating', 'description'
    ]], f)

save only metadata needed for output (not full text vectors)