Loading dataset

In [19]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from datasketch import MinHash, MinHashLSH
import re
import mmh3
from itertools import combinations
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string, re
from scipy.sparse import lil_matrix
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster

df = pd.read_csv("netflix_titles.csv", encoding="latin1",sep=",",quotechar='"',engine="python")
### TODO: ADD PREPROCESSING

Cleaning the data

In [20]:
#%% Text cleaning
def normalize_title(title):
    if pd.isna(title):
        return ''
    return re.sub(r'\(.*?\)', '', title).lower().strip()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(f"[{string.punctuation}]", " ", text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

df['title_normalized'] = df['title'].fillna('').apply(normalize_title)
df['title_clean'] = df['title'].fillna('').apply(clean_text)
df['description_clean'] = df['description'].fillna('').apply(clean_text)

# Drop duplicates
df = df.drop_duplicates(subset='title_normalized').reset_index(drop=True)
df = df.drop_duplicates(subset='description_clean').reset_index(drop=True)
print(f"Data loaded: {len(df)} unique titles.")

Data loaded: 8760 unique titles.


Genre based features

In [21]:
# Process genres and countries
df['genre_list'] = df['listed_in'].apply(lambda x: [g.strip() for g in x.split(',')] if pd.notnull(x) else [])
df['combined_features'] = df['genre_list'] + df['country'].fillna('').apply(lambda x: [x])

# One-hot encode genres + countries
mlb = MultiLabelBinarizer()
genre_country_matrix = mlb.fit_transform(df['combined_features'])


TF-IDF

In [22]:
#%% TF-IDF vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['description_clean'])
feature_names = vectorizer.get_feature_names_out()

rows, cols = tfidf_matrix.nonzero()
tfidf_words = defaultdict(list)
for r, c in zip(rows, cols):
    tfidf_words[r].append((feature_names[c], tfidf_matrix[r, c]))

top_n = 20
def top_words(doc_idx, n=top_n):
    words_scores = tfidf_words[doc_idx]
    words_scores.sort(key=lambda x: x[1], reverse=True)
    words = [w for w, _ in words_scores[:n]]
    return ' '.join(words)

df['description_tfidf'] = [top_words(i) for i in range(len(df))]
print("\nExample top words for first description:")
print(df.loc[0, 'description_tfidf'])


Example top words for first description:
kirsten inevitable johnson comical inventive nears stages filmmaker ways end both face death his help father them life as the


Shingles

In [23]:
#%% Shingling
def shingle(q, text):
    words = text.split()
    return [words[i:i+q] for i in range(len(words)-q+1)]

q = 1
shingle_vector = [shingle(q, text) for text in df['description_tfidf']]
print("\nExample shingles for first description:")
print(shingle_vector[0][:10])


Example shingles for first description:
[['kirsten'], ['inevitable'], ['johnson'], ['comical'], ['inventive'], ['nears'], ['stages'], ['filmmaker'], ['ways'], ['end']]


MinHash

In [24]:
from datasketch import MinHash, MinHashLSH

def create_minhash(text, genre_vec, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for word in set(text.split()):
        m.update(word.encode('utf8'))
    for i, val in enumerate(genre_vec):
        if val == 1:
            m.update(f'genre_{i}'.encode('utf8'))
    return m


minhashes = {}
for idx, row in df.iterrows():
    m = create_minhash(row['description_tfidf'], genre_country_matrix[idx])
    minhashes[idx] = m


lsh = MinHashLSH(threshold=0.35, num_perm=128)
for idx, m in minhashes.items():
    lsh.insert(str(idx), m)


Jaccard

In [25]:
#%% Compute Jaccard similarity for LSH pairs using datasketch MinHash

def jaccard_datasketch(idx1, idx2):
    m1 = minhashes[idx1]
    m2 = minhashes[idx2]
    return m1.jaccard(m2)

similarities = []
threshold = 0.35

# Get LSH candidates for every item
for i in range(len(df)):
    neighbors = lsh.query(minhashes[i])
    neighbors = [int(n) for n in neighbors if int(n) != i]

    for j in neighbors:
        # Avoid duplicates (i,j) and (j,i)
        if i < j:
            sim = jaccard_datasketch(i, j)
            if sim >= threshold:
                similarities.append((i, j, sim))

# Sort by similarity
similarities.sort(key=lambda x: x[2], reverse=True)

print(f"\nTop 5 similar pairs (Jaccard ≥ {threshold}):")
for i, j, sim in similarities[:5]:
    print(f"- {df.loc[i, 'title']} ↔ {df.loc[j, 'title']} | similarity: {sim:.2f}")


Top 5 similar pairs (Jaccard ≥ 0.35):
- InuYasha the Movie 4: Fire on the Mystic Island ↔ Inuyasha the Movie - L'isola del fuoco scarlatto | similarity: 1.00
- Seven Souls in the Skull Castle: Season Bird ↔ Seven Souls in the Skull Castle: Season Wind | similarity: 0.95
- Seven Souls in the Skull Castle: Season Moon Jogen ↔ Seven Souls in the Skull Castle: Season Moon Kagen | similarity: 0.92
- Seven Souls in the Skull Castle: Season Moon Kagen ↔ Seven Souls in the Skull Castle: Season Wind | similarity: 0.91
- Seven Souls in the Skull Castle: Season Moon Jogen ↔ Seven Souls in the Skull Castle: Season Wind | similarity: 0.90


Build recommendation

In [26]:
#%% Build recommendations and ensure all movies have top-N
recommendations = defaultdict(list)

# Fill from MinHash similarities first
for i, j, sim in similarities:
    if df.loc[i, 'title_normalized'] == df.loc[j, 'title_normalized']:
        continue
    recommendations[i].append((j, sim))
    recommendations[j].append((i, sim))

# Calculate cosine similarity for descriptions
desc_similarity = cosine_similarity(tfidf_matrix)

# Calculate cosine similarity for genre + country
genre_similarity = cosine_similarity(genre_country_matrix)

# Combine both: You can adjust weights (e.g., 0.7 for descriptions, 0.3 for genres)
cosine_sim = 0.7 * desc_similarity + 0.3 * genre_similarity

top_n = 5
for i in range(len(df)):
    if len(recommendations[i]) < top_n:
        sims = cosine_sim[i]
        best_idx = np.argsort(sims)[::-1]
        added = 0
        for j in best_idx:
            if i == j:
                continue
            if any(r[0] == j for r in recommendations[i]):
                continue
            recommendations[i].append((j, float(sims[j])))
            added += 1
            if added >= (top_n - len(recommendations[i])):
                break

# Truncate to top-N total
for k, recs in recommendations.items():
    recommendations[k] = sorted(recs, key=lambda x: x[1], reverse=True)[:top_n]

example_idx = np.random.randint(0, len(df))
print(f"\nFinal recommendations for '{df.loc[5, 'title']}':")
for rec_idx, sim in recommendations[example_idx]:
    print(f"- {df.loc[rec_idx, 'title']} (similarity: {sim:.2f})")


Final recommendations for 'Midnight Mass':
- The Dirty Picture (similarity: 0.40)
- Matichya Chuli (similarity: 0.36)
- Thottappan (similarity: 0.35)


In [27]:
# Change the index below to test a different title
idx_query = 0

query_minhash = create_minhash(df.loc[idx_query, 'description_clean'], genre_country_matrix[idx_query])
results = lsh.query(query_minhash)

print(f"Recommendations for: {df.loc[idx_query, 'title']}, {df.loc[idx_query, 'combined_features']}")
for i in results:
    i = int(i)
    if i != idx_query:
        print(" -", df.loc[i, 'title'],df.loc[i,'combined_features'])


Recommendations for: Dick Johnson Is Dead, ['Documentaries', 'United States']
 - Command and Control ['Documentaries', 'United States']
 - What the Health ['Documentaries', 'United States']
 - The Hurt Business ['Documentaries', 'Sports Movies', 'United States']
 - 13TH ['Documentaries', 'United States']
 - Fire in Paradise ['Documentaries', 'United States']
 - Paris Is Burning ['Classic Movies', 'Cult Movies', 'Documentaries', 'United States']
 - Quincy ['Documentaries', 'Music & Musicals', 'United States']
 - The Last Days ['Documentaries', 'United States']
 - Maddman: The Steve Madden Story ['Documentaries', 'United States']
 - Let It Fall: Los Angeles 1982-1992 ['Documentaries', 'United States']
 - Ariana grande: excuse me, i love you ['Documentaries', 'Music & Musicals', 'United States']
 - Take Your Pills ['Documentaries', 'United States']
 - De Palma ['Documentaries', 'United States']
 - Nature: Raising the Dinosaur Giant ['Documentaries', 'United States']
 - The Force ['Documen

Build Matrix

In [28]:
similarity_matrix = cosine_sim

distance_matrix = 1 - similarity_matrix ## this should be used for the clustering
distance_matrix


array([[0.        , 0.99113316, 0.97229944, ..., 0.86713563, 0.86172156,
        0.96087197],
       [0.99113316, 0.        , 0.92269044, ..., 0.99766908, 1.        ,
        1.        ],
       [0.97229944, 0.92269044, 0.        , ..., 0.99004171, 0.98954891,
        0.93452146],
       ...,
       [0.86713563, 0.99766908, 0.99004171, ..., 0.        , 0.78204466,
        0.99437644],
       [0.86172156, 1.        , 0.98954891, ..., 0.78204466, 0.        ,
        0.99498107],
       [0.96087197, 1.        , 0.93452146, ..., 0.99437644, 0.99498107,
        0.        ]])

Output for a target

In [29]:
target_index = 5 ## Changing the index changes the target 

target_title = df.loc[target_index, 'title']
target_genres= ', '.join(df.loc[target_index, 'genre_list'])
target_type = df.loc[target_index, 'type'] if 'type' in df.columns else "Unknown"

print(f'\nTarget           : "{target_title}"')
print(f'Target type      : "{target_type}"')
print(f'Target genre(s)  : "{target_genres}"\n')

same_type = []
other_type = []

for rec_idx, sim in recommendations[target_index]:
    item_type = df.loc[rec_idx, 'type'] if 'type' in df.columns else "Unknown"
    entry = (
        df.loc[rec_idx, 'title'],
        ", ".join(df.loc[rec_idx, 'genre_list']),
        sim
    )
    if item_type == target_type:
        same_type.append(entry)
    else:
        other_type.append(entry)


print(f'Similar {target_type}s:')
print("Name".ljust(45), "Genres".ljust(35), "Similarity")
print("-" * 90)

for title, genres, sim in same_type:
    print(f'"{title}"'.ljust(45), f'{genres}'.ljust(35), f"{sim:.3f}")


Target           : "Midnight Mass"
Target type      : "TV Show"
Target genre(s)  : "TV Dramas, TV Horror, TV Mysteries"

Similar TV Shows:
Name                                          Genres                              Similarity
------------------------------------------------------------------------------------------
"Goodnight DJ 1"                              International TV Shows, TV Dramas, TV Horror 0.242
"Brand New Cherry Flavor"                     TV Dramas, TV Horror, TV Mysteries  0.240
"The Originals"                               TV Dramas, TV Horror, TV Mysteries  0.239
