# Netflix recommendation system

In [1]:
import pandas as pd
import numpy as np
from typing import List

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

import string, re
from sklearn.metrics import pairwise_distances
from datasketch import  MinHash, MinHashLSH



# Normalization of the Titles and Descriptions

- Titles are lowercased, punctuation is removed, and parantheses stripped.
- Descriptions are cleaned by lowercasing, removing punctuation, and collapsing whitespace.


In [2]:
#Makes sure the titles and descriptions are lowercased, removes punctuation.
def normalize_title(title: str) -> str:
    if pd.isna(title):
        return ""
    return re.sub(r"\(.*?\)", "", str(title)).lower().strip()

def clean_text(text: str) -> str:
    text = str(text).lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


# Tokenization

We extract lists from:
- **Genres**
- **Director names**
- **Cast names**
- **Country**


Each field becomes a list of normalized tokens, which we combine into a single `combined_features` list

In [3]:
## Tokenizes director/cast/country, builds genre_list, and joins them in combined_features
## Tokens are basically used so we can hash words to compare titles based on their hashing
## Loads and prepocesses the data

def tokenize_people(value: str) -> List[str]:
      return [p.strip().lower() for p in str(value).split(",") if p and p.strip()]

def tokenize_single(value: str) -> List[str]:
    v = str(value).strip().lower()
    return [v] if v else []




# Data Loading

Here, we load the Netflix dataset after applying some preprocessing steps (normalizing titles and descriptions, tokenizing) and removing duplicate normalized titles

In [4]:
def load_and_preprocess(path: str) -> pd.DataFrame:
    df = pd.read_csv(path, encoding="latin1", sep=",", quotechar='"', engine="python")

    df["title_normalized"] = df["title"].fillna("").apply(normalize_title)
    df["description_clean"] = df["description"].fillna("").apply(clean_text)

    df = df.drop_duplicates(subset="title_normalized").reset_index(drop=True)

    df["genre_list"] = df["listed_in"].apply(
        lambda x: [g.strip().lower() for g in str(x).split(",") if g.strip()]
    )
    df["director_tokens"] = df["director"].fillna("").apply(tokenize_people)
    df["cast_tokens"] = df["cast"].fillna("").apply(tokenize_people)
    df["country_tokens"] = df["country"].fillna("").apply(tokenize_single)

    df["combined_features"] = (
        df["genre_list"] + df["director_tokens"] + df["cast_tokens"] + df["country_tokens"]
    )

    return df

df = load_and_preprocess("netflix_titles.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,title_normalized,description_clean,genre_list,director_tokens,cast_tokens,country_tokens,combined_features
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",dick johnson is dead,as her father nears the end of his life filmma...,[documentaries],[kirsten johnson],[],[united states],"[documentaries, kirsten johnson, united states]"
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",blood & water,after crossing paths at a party a cape town te...,"[international tv shows, tv dramas, tv mysteries]",[],"[ama qamata, khosi ngema, gail mabalane, thaba...",[south africa],"[international tv shows, tv dramas, tv mysteri..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,ganglands,to protect his family from a powerful drug lor...,"[crime tv shows, international tv shows, tv ac...",[julien leclercq],"[sami bouajila, tracy gotoas, samuel jouy, nab...",[],"[crime tv shows, international tv shows, tv ac..."
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",jailbirds new orleans,feuds flirtations and toilet talk go down amon...,"[docuseries, reality tv]",[],[],[],"[docuseries, reality tv]"
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,kota factory,in a city of coaching centers known to train i...,"[international tv shows, romantic tv shows, tv...",[],"[mayur more, jitendra kumar, ranjan raj, alam ...",[india],"[international tv shows, romantic tv shows, tv..."


# Feature Matrix Construction

We build two matrices that capture different information about each movie/TV series

- **MLB** - We use MLB to one-hot encode the combined metadata:
    
    - Genres
    - Directors
    - Cast
    - Country

- **TF-IDF Matrix** - We use TF-IDF to identify the most characteristic words in each description, which are used later for MinHash and LSH.

In [5]:
##This creates a multi metadata matrix using MLB
##And fits a TfIdf vectorizer
def build_feature_matrices(df: pd.DataFrame):
    mlb = MultiLabelBinarizer(sparse_output=True)
    combined_features_matrix = mlb.fit_transform(df["combined_features"])

    vectorizer = TfidfVectorizer(
        ngram_range=(1, 1),
        min_df=3,
        max_features=20000,
        stop_words="english"
    )
    tfidf_matrix = vectorizer.fit_transform(df["description_clean"])

    return combined_features_matrix, tfidf_matrix, mlb, vectorizer

combined_features_matrix, tfidf_matrix, mlb, vectorizer = build_feature_matrices(df)

# Sentence Emebdding Matrix

We use sentence embedding matrix to capture semantic similarity, we encode each movie/TV series description using a SentenceTransformer model.

These embeddings are later combined with metadata and MinHash similarities to create a stringer recommendation system.

In [6]:
##Biggest change we use embedding matrix for description
##The reson is that system embeddings allows synonyms which TfIdf doesnt do so we get a better description comparison
## Also we keep both TfIdf and system embeddings because TfIdf will use the tokens for Minhash and LSH candidates
## But system embeddings will be useful for similar words so we create a stronger recommendation system
def build_embedding_matrix(df):
    # default model we can change also this later
    model = SentenceTransformer("all-MiniLM-L6-v2")

    descriptions = df["description_clean"].fillna("").tolist()
    
    emb_matrix = model.encode(descriptions, convert_to_numpy=True, show_progress_bar=True)

    return emb_matrix

embedding_matrix = build_embedding_matrix(df)


Batches:   0%|          | 0/275 [00:00<?, ?it/s]

# Extracting Top TF-IDF Tokens
For each item, we extract the tokens with the highest TF-IDF weight. Tokens represent the most important word from each description.

In [7]:
### This caches TfIdf vocabulary and extracts the top weighted tokens per row
### Then hashes them with the active metadata label into Minhash signatures
### Which then is used to built LSH index
feature_names = vectorizer.get_feature_names_out()

def top_tfidf_tokens(row_idx: int, top_k: int = 40) -> List[str]:
    start, end = tfidf_matrix.indptr[row_idx], tfidf_matrix.indptr[row_idx + 1]
    indices = tfidf_matrix.indices[start:end]
    data = tfidf_matrix.data[start:end]
    if len(indices) == 0:
        return []
    order = np.argsort(data)
    top = order[-top_k:]
    return [feature_names[indices[i]] for i in top]


# MinHash Signature Construction

Each item is converted to a MinHash signature using:
1. Top TF-IDF tokens
2. Active metadata labels

In [8]:

def create_minhash(row_idx: int, num_perm: int = 128):
    m = MinHash(num_perm=num_perm)
    for token in top_tfidf_tokens(row_idx):
        m.update(token.encode("utf8"))
    active_meta = combined_features_matrix[row_idx].nonzero()[0]
    for col in active_meta:
        label = mlb.classes_[col]
        m.update(f"meta:{label}".encode("utf8"))
    return m


# Building the LSH index

We build an LSH index using MinHash signatures:

- `num_perm = 128`
- `threshold = 0.35`


In [9]:
def build_minhash_lsh(num_perm=128, threshold=0.35):
    minhashes = {}
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for idx in range(len(df)):
        m = create_minhash(idx, num_perm=num_perm)
        minhashes[idx] = m
        lsh.insert(str(idx), m)
    return minhashes, lsh

minhashes, lsh = build_minhash_lsh()

# Candidate collection

To find similar items for a target index:
1. Query the LSH index for approximate neigbors
2. If too few neighbors are found, we fill them by using:
    - Top cosine-similar sentence embeddings

In [10]:
### Queries LSH for neighbors
### If theres too few neighbors it increases the number of neighbors with system embedding neighbors so we have a good candidate pool
def collect_candidates(idx: int, fallback_k: int = 200):
    neighbors = [int(n) for n in lsh.query(minhashes[idx]) if int(n) != idx]
    if len(neighbors) < fallback_k:
        top_idx = np.argsort(
            embedding_matrix @ embedding_matrix[idx]
        )[-fallback_k:]
        neighbors.extend(top_idx.tolist())
    return np.unique(neighbors)

# Hybrid Distance Matrix 

In [11]:
##This builds a distance matrix using cosine similarities and jaccard values
##This matrix is used for clustering
def build_distance_matrix(indices, w_desc=0.6, w_meta=0.4):
    emb = embedding_matrix[indices]
    meta = combined_features_matrix[indices]

    desc_dist = pairwise_distances(emb, metric="cosine") 
    meta_dist = pairwise_distances(meta, metric="cosine")

    
    jac_sim = np.zeros((len(indices), len(indices)))
    for i, idx_i in enumerate(indices):
        for j, idx_j in enumerate(indices[i + 1:], start=i + 1):
            sim = minhashes[idx_i].jaccard(minhashes[idx_j])
            jac_sim[i, j] = jac_sim[j, i] = sim
    jac_dist = 1 - jac_sim

    hybrid_dist = 0.5 * (w_desc * desc_dist + w_meta * meta_dist) + 0.5 * jac_dist
    return hybrid_dist

sample_idx = np.random.choice(len(df), size=500, replace=False)
distance_matrix = build_distance_matrix(sample_idx)

# Similarity calculation

We compute three types of similarities for a target:

1. **Description similarity** using sentence embedding cosine similarity
2. **Metadata similarity** using metadata cosine similarity
3. **Jaccard similarity** from MinHash signatures

In [12]:
## This computes the hybrid similarity list for a random target
##  It uses system embedding cosine, metadata cosine and jaccard
def compute_full_similarity(idx, top_k=100, w_desc=0.6, w_meta=0.4, w_jaccard=0.2):
    
    candidate_ids = [int(n) for n in lsh.query(minhashes[idx]) if int(n) != idx]

    if len(candidate_ids) < top_k:
        extra = np.argsort(
            embedding_matrix @ embedding_matrix[idx]
        )[-(top_k + 1):]
        candidate_ids.extend(extra.tolist())

    candidates = np.unique([c for c in candidate_ids if c != idx])
    desc_sim = cosine_similarity(
        embedding_matrix[idx].reshape(1, -1),
        embedding_matrix[candidates]
    ).flatten()
    meta_sim = cosine_similarity(
        combined_features_matrix[idx].reshape(1, -1),
        combined_features_matrix[candidates]
    ).flatten()
    jac_sim = np.array([minhashes[idx].jaccard(minhashes[j]) for j in candidates])

    hybrid = (1 - w_jaccard) * (w_desc * desc_sim + w_meta * meta_sim) + w_jaccard * jac_sim
    ranked = candidates[np.argsort(hybrid)[::-1]]
    scores = np.sort(hybrid)[::-1]
    return list(zip(ranked, scores))

# Recommendation Output

In [13]:
## Finally this is the output
## Same as before we define a target and compute the top 10 most similar (this might change depending on how you guys want)
## Also computes a cross type so if a Tv series recommends a few movies (this can also change if u guys want)
def print_recommendations(idx, recs=None, top_n=10):
    if recs is None:
        recs = compute_full_similarity(idx, top_k=top_n * 5)

    target_title = df.loc[idx, "title"]
    target_type = df.loc[idx, "type"] if "type" in df.columns else "Unknown"
    target_genres = ", ".join(df.loc[idx, "genre_list"])

    print(f'\nTarget           : "{target_title}"')
    print(f'Target type      : "{target_type}"')
    print(f'Target genre(s)  : "{target_genres}"\n')

    same_type, other_type = [], []
    for rec_idx, sim in recs:
        entry = (
            df.loc[rec_idx, "title"],
            ", ".join(df.loc[rec_idx, "genre_list"]),
            sim,
        )
        item_type = df.loc[rec_idx, "type"] if "type" in df.columns else "Unknown"
        (same_type if item_type == target_type else other_type).append(entry)
    
    def truncate(text, max_len=40):
        text = str(text)
        return text if len(text) <= max_len else text[: max_len - 3] + "..."

    def print_table(title, rows):
        print(f"{title}:")
        print("Name".ljust(45), "Genres".ljust(40), "Similarity")
        print("-" * 100)
        for name, genres, score in rows[:top_n]:
            print(
                f'"{truncate(name, 42)}"'.ljust(45),
                truncate(genres, 40).ljust(40),
                f"{score:.3f}",
            )
        print()

    print_table(f"Similar {target_type}s", same_type)
    print_table("Cross-type suggestions", other_type)

# example usage
idx = np.random.randint(0, len(df)) ## this gives us a random target
recommendations = {idx: compute_full_similarity(idx)}
print_recommendations(idx, recommendations[idx])


Target           : "Real Rob"
Target type      : "TV Show"
Target genre(s)  : "tv comedies"

Similar TV Shows:
Name                                          Genres                                   Similarity
----------------------------------------------------------------------------------------------------
"Norm Macdonald Has a Show"                   stand-up comedy & talk shows, tv come... 0.403
"Stand Up and Away! with Brian Regan"         stand-up comedy & talk shows, tv come... 0.356
"Conan Without Borders"                       docuseries, tv comedies                  0.341
"Chappelle's Show"                            tv comedies                              0.335
"Middleditch & Schwartz"                      tv comedies                              0.333
"Daniel Sloss: Live Shows"                    stand-up comedy & talk shows, tv come... 0.330
"The Upshaws"                                 tv comedies                              0.324
"The Joel McHale Show with Joel McHale