In [44]:
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from collections import defaultdict
import string, re

from datasketch import MinHash, MinHashLSH


In [45]:
def normalize_title(title: str) -> str:
    if pd.isna(title):
        return ""
    return re.sub(r"\(.*?\)", "", str(title)).lower().strip()

def clean_text(text: str) -> str:
    text = str(text).lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [46]:
def load_and_preprocess(path: str) -> pd.DataFrame:
    df = pd.read_csv(path, encoding="latin1", sep=",", quotechar='"', engine="python")

    df["title_normalized"] = df["title"].fillna("").apply(normalize_title)
    df["description_clean"] = df["description"].fillna("").apply(clean_text)

    # Remove duplicates
    df = df.drop_duplicates(subset="title_normalized").reset_index(drop=True)
    df = df.drop_duplicates(subset="description_clean").reset_index(drop=True)

    # Genre + country lists
    df["genre_list"] = df["listed_in"].apply(
        lambda x: [g.strip() for g in str(x).split(",")] if pd.notnull(x) else []
    )
    df["combined_features"] = df["genre_list"] + df["country"].fillna("").apply(lambda x: [x])

    return df

df = load_and_preprocess("netflix_titles.csv")
df.head()


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,title_normalized,description_clean,genre_list,combined_features
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",dick johnson is dead,as her father nears the end of his life filmma...,[Documentaries],"[Documentaries, United States]"
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",blood & water,after crossing paths at a party a cape town te...,"[International TV Shows, TV Dramas, TV Mysteries]","[International TV Shows, TV Dramas, TV Mysteri..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,ganglands,to protect his family from a powerful drug lor...,"[Crime TV Shows, International TV Shows, TV Ac...","[Crime TV Shows, International TV Shows, TV Ac..."
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",jailbirds new orleans,feuds flirtations and toilet talk go down amon...,"[Docuseries, Reality TV]","[Docuseries, Reality TV, ]"
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,kota factory,in a city of coaching centers known to train i...,"[International TV Shows, Romantic TV Shows, TV...","[International TV Shows, Romantic TV Shows, TV..."


In [47]:
def build_feature_matrices(df: pd.DataFrame):
    # Genre + country one-hot encoding
    mlb = MultiLabelBinarizer()
    genre_country_matrix = mlb.fit_transform(df["combined_features"])

    # TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df["description_clean"])
    feature_names = vectorizer.get_feature_names_out()

    # Extract top TF-IDF keywords
    rows, cols = tfidf_matrix.nonzero()
    tfidf_words = defaultdict(list)
    for r, c in zip(rows, cols):
        tfidf_words[r].append((feature_names[c], float(tfidf_matrix[r, c])))

    def top_words(doc_idx: int, n=20):
        words_scores = tfidf_words[doc_idx]
        words_scores.sort(key=lambda x: x[1], reverse=True)
        return " ".join([word for word, _ in words_scores[:n]])

    df["description_tfidf"] = [top_words(i) for i in range(len(df))]

    return df, genre_country_matrix, tfidf_matrix


df, genre_country_matrix, tfidf_matrix = build_feature_matrices(df)
df.head()


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,title_normalized,description_clean,genre_list,combined_features,description_tfidf
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",dick johnson is dead,as her father nears the end of his life filmma...,[Documentaries],"[Documentaries, United States]",kirsten inevitable johnson comical inventive n...
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",blood & water,after crossing paths at a party a cape town te...,"[International TV Shows, TV Dramas, TV Mysteries]","[International TV Shows, TV Dramas, TV Mysteri...",cape swimming crossing abducted at whether bir...
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,ganglands,to protect his family from a powerful drug lor...,"[Crime TV Shows, International TV Shows, TV Ac...","[Crime TV Shows, International TV Shows, TV Ac...",mehdi turf pulled robbers skilled thief expert...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",jailbirds new orleans,feuds flirtations and toilet talk go down amon...,"[Docuseries, Reality TV]","[Docuseries, Reality TV, ]",orleans feuds flirtations toilet incarcerated ...
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,kota factory,in a city of coaching centers known to train i...,"[International TV Shows, Romantic TV Shows, TV...","[International TV Shows, Romantic TV Shows, TV...",collegiate unexceptional finest earnest coachi...


In [48]:
def create_minhash(text_tokens: str, genre_vec: np.ndarray, num_perm: int = 128):
    m = MinHash(num_perm=num_perm)
    # Add weighted TF-IDF words
    for word in set(text_tokens.split()):
        m.update(word.encode("utf8"))
    # Add genre+country flags
    for i, val in enumerate(genre_vec):
        if val == 1:
            m.update(f"genre_{i}".encode("utf8"))
    return m


def build_minhash_lsh(df, genre_country_matrix, num_perm=128, threshold=0.35):
    minhashes = {}
    for idx, row in df.iterrows():
        minhashes[idx] = create_minhash(row["description_tfidf"], genre_country_matrix[idx], num_perm)

    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for idx, m in minhashes.items():
        lsh.insert(str(idx), m)

    return minhashes, lsh


minhashes, lsh = build_minhash_lsh(df, genre_country_matrix)


In [49]:
def jaccard_datasketch(m1, m2):
    return m1.jaccard(m2)

def collect_lsh_similarities(df, minhashes, lsh, threshold=0.35):
    pairs = []
    for i in range(len(df)):
        neighbors = lsh.query(minhashes[i])
        neighbors = [int(x) for x in neighbors if int(x) != i]

        for j in neighbors:
            if i < j:
                sim = jaccard_datasketch(minhashes[i], minhashes[j])
                if sim >= threshold:
                    pairs.append((i, j, float(sim)))

    pairs.sort(key=lambda x: x[2], reverse=True)
    return pairs

lsh_pairs = collect_lsh_similarities(df, minhashes, lsh)
len(lsh_pairs)


27

In [50]:
def compute_full_similarity(df, idx, tfidf_matrix, genre_country_matrix, w_desc=0.7, w_meta=0.3):
    desc_sim = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
    meta_sim = cosine_similarity(genre_country_matrix[idx].reshape(1, -1), genre_country_matrix).flatten()
    final_sim = w_desc * desc_sim + w_meta * meta_sim

    # Jaccard via MinHash only if exists
    jaccard_scores = []
    for j in range(len(df)):
        if j == idx:
            jaccard_scores.append(0)
        else:
            try:
                jaccard_scores.append(minhashes[idx].jaccard(minhashes[j]))
            except:
                jaccard_scores.append(0)

    # Combine everything
    hybrid_sim = 0.5 * final_sim + 0.5 * np.array(jaccard_scores)
    return hybrid_sim


In [51]:
def print_recommendations_for_index_full(df, idx, tfidf_matrix, genre_country_matrix):
    target_title = df.loc[idx, "title"]
    target_type = df.loc[idx, "type"] if "type" in df.columns else "Unknown"
    target_genres = ", ".join(df.loc[idx, "genre_list"])

    print(f"Target           : \"{target_title}\"")
    print(f"Target type      : \"{target_type}\"")
    print(f"Target genres    : \"{target_genres}\"\n")

    # Compute similarity to ALL titles
    sims = compute_full_similarity(df, idx, tfidf_matrix, genre_country_matrix)

    same_type = []
    other_type = []

    for j in range(len(df)):
        if j == idx:
            continue
        sim = sims[j]
        item_type = df.loc[j, "type"] if "type" in df.columns else "Unknown"
        entry = (df.loc[j, "title"], ", ".join(df.loc[j, "genre_list"]), sim)

        if item_type == target_type:
            same_type.append(entry)
        else:
            other_type.append(entry)

    same_type.sort(key=lambda x: x[2], reverse=True)
    other_type.sort(key=lambda x: x[2], reverse=True)

    print(f"Similar {target_type}s (ALL):")
    print("Name".ljust(45), "Genres".ljust(35), "Similarity")
    print("-" * 90)
    for title, genres, sim in same_type:
        print(f"\"{title}\"".ljust(45), genres.ljust(35), f"{sim:.3f}")

    print("\nOther similar items (ALL):")
    print("Name".ljust(45), "Genres".ljust(35), "Similarity")
    print("-" * 90)
    for title, genres, sim in other_type:
        print(f"\"{title}\"".ljust(45), genres.ljust(35), f"{sim:.3f}")


In [52]:
idx = np.random.randint(0, len(df))
print_recommendations_for_index_full(df, idx, tfidf_matrix, genre_country_matrix)


Target           : "Next"
Target type      : "Movie"
Target genres    : "Action & Adventure, Sci-Fi & Fantasy"

Similar Movies (ALL):
Name                                          Genres                              Similarity
------------------------------------------------------------------------------------------
"The Peacemaker"                              Action & Adventure                  0.251
"How It Ends"                                 Action & Adventure, Sci-Fi & Fantasy 0.241
"Code Name: The Cleaner"                      Action & Adventure, Comedies        0.240
"Sleepless"                                   Action & Adventure                  0.235
"Ocean's Eleven"                              Action & Adventure, Classic Movies, Comedies 0.224
"The Matrix Revolutions"                      Action & Adventure, Sci-Fi & Fantasy 0.224
"The 2nd"                                     Action & Adventure                  0.221
"Skyline"                                     Action & 