Loading dataset

In [42]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from datasketch import MinHash, MinHashLSH
import re
import mmh3
from itertools import combinations
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string, re
from scipy.sparse import lil_matrix
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster

df = pd.read_csv("netflix_titles.csv", encoding="latin1",sep=",",quotechar='"',engine="python")
### TODO: ADD PREPROCESSING

Cleaning the data

In [43]:
#%% Text cleaning
def normalize_title(title):
    if pd.isna(title):
        return ''
    return re.sub(r'\(.*?\)', '', title).lower().strip()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(f"[{string.punctuation}]", " ", text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

df['title_normalized'] = df['title'].fillna('').apply(normalize_title)
df['title_clean'] = df['title'].fillna('').apply(clean_text)
df['description_clean'] = df['description'].fillna('').apply(clean_text)

# Drop duplicates
df = df.drop_duplicates(subset='title_normalized').reset_index(drop=True)
df = df.drop_duplicates(subset='description_clean').reset_index(drop=True)
print(f"Data loaded: {len(df)} unique titles.")

Data loaded: 8760 unique titles.


Genre based features

In [44]:
# Process genres and countries
df['genre_list'] = df['listed_in'].apply(lambda x: [g.strip() for g in x.split(',')] if pd.notnull(x) else [])
df['combined_features'] = df['genre_list'] + df['country'].fillna('').apply(lambda x: [x])

# One-hot encode genres + countries
mlb = MultiLabelBinarizer()
genre_country_matrix = mlb.fit_transform(df['combined_features'])


TF-IDF

In [45]:
#%% TF-IDF vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['description_clean'])
feature_names = vectorizer.get_feature_names_out()

rows, cols = tfidf_matrix.nonzero()
tfidf_words = defaultdict(list)
for r, c in zip(rows, cols):
    tfidf_words[r].append((feature_names[c], tfidf_matrix[r, c]))

top_n = 20
def top_words(doc_idx, n=top_n):
    words_scores = tfidf_words[doc_idx]
    words_scores.sort(key=lambda x: x[1], reverse=True)
    words = [w for w, _ in words_scores[:n]]
    return ' '.join(words)

df['description_tfidf'] = [top_words(i) for i in range(len(df))]
print("\nExample top words for first description:")
print(df.loc[0, 'description_tfidf'])


Example top words for first description:
kirsten inevitable johnson comical inventive nears stages filmmaker ways end both face death his help father them life as the


Shingles

In [46]:
#%% Shingling
def shingle(q, text):
    words = text.split()
    return [words[i:i+q] for i in range(len(words)-q+1)]

q = 1
shingle_vector = [shingle(q, text) for text in df['description_tfidf']]
print("\nExample shingles for first description:")
print(shingle_vector[0][:10])


Example shingles for first description:
[['kirsten'], ['inevitable'], ['johnson'], ['comical'], ['inventive'], ['nears'], ['stages'], ['filmmaker'], ['ways'], ['end']]


MinHash

In [47]:
def listhash(l, seed):
    val = 0
    for e in l:
        val ^= mmh3.hash(' '.join(e), seed)
    return val

def minhash_k(shingles, k):
    return [min([listhash(shingle, seed) for shingle in shingles]) for seed in range(1, k+1)]

k = 50
minhash_signatures = np.array([minhash_k(shingles, k) for shingles in shingle_vector])
print("\nExample MinHash signature for first doc:")
print(minhash_signatures[0])


Example MinHash signature for first doc:
[-1735738829 -1832300011 -2118588981 -1818711025 -2098281750 -1614472792
 -2084924011 -1984257982 -1372695722 -1559323272 -2089208976 -1960808834
 -2027474104 -1615755659 -1690528979 -2087023085 -2048304548 -1995669687
 -1772009878 -2041343340 -1541090568 -2012499769 -2102064348 -2114282030
 -1659137596 -1589348634 -2130005453 -1825880913 -1994940666 -2073509781
 -2066368374 -1925363874 -2029035528 -1883797657 -2067436519 -2110145076
 -1839633183 -1980926784 -1958820688 -1907894037 -2082984586 -2077980288
 -2141595500 -2082472082 -1219200554 -1805234070 -1855078378 -2055577738
 -2081648617 -1919529610]


LSH 

In [48]:
def lsh_candidates(signatures, bands, rows):
    assert bands * rows == signatures.shape[1], "bands * rows must equal signature length"
    candidates = set()
    n = signatures.shape[0]
    
    for b in range(bands):
        buckets = defaultdict(list)
        for i in range(n):
            band_sig = tuple(signatures[i, b*rows:(b+1)*rows])
            buckets[band_sig].append(i)
        for bucket_docs in buckets.values():
            if len(bucket_docs) > 1:
                for i_idx in range(len(bucket_docs)):
                    for j_idx in range(i_idx+1, len(bucket_docs)):
                        candidates.add(tuple(sorted((bucket_docs[i_idx], bucket_docs[j_idx]))))
    return candidates

bands = 10
rows = 5
candidates = lsh_candidates(minhash_signatures, bands, rows)
print(f"\nNumber of candidate pairs: {len(candidates)}")


Number of candidate pairs: 190


Jaccard

In [49]:
#%% Jaccard similarity for candidate pairs
def jaccard_list(doc1_idx, doc2_idx, signatures):
    sig1 = signatures[doc1_idx]
    sig2 = signatures[doc2_idx]
    matches = np.sum(sig1 == sig2)
    return matches / len(sig1)

threshold = 0.35
similarities = []
for i, j in candidates:
    sim = jaccard_list(i, j, minhash_signatures)
    if sim >= threshold:
        similarities.append((i, j, sim))

similarities.sort(key=lambda x: x[2], reverse=True)
print(f"\nTop 5 similar pairs (threshold={threshold}):")
for i, j, sim in similarities[:5]:
    print(f"- {df.loc[i, 'title']} ↔ {df.loc[j, 'title']} | similarity: {sim:.2f}")


Top 5 similar pairs (threshold=0.35):
- InuYasha the Movie 4: Fire on the Mystic Island ↔ Inuyasha the Movie - L'isola del fuoco scarlatto | similarity: 1.00
- Seven Souls in the Skull Castle: Season Bird ↔ Seven Souls in the Skull Castle: Season Wind | similarity: 0.96
- Seven Souls in the Skull Castle: Season Bird ↔ Seven Souls in the Skull Castle: Season Flower | similarity: 0.94
- Seven Souls in the Skull Castle: Season Moon Jogen ↔ Seven Souls in the Skull Castle: Season Bird | similarity: 0.94
- Seven Souls in the Skull Castle: Season Moon Jogen ↔ Seven Souls in the Skull Castle: Season Moon Kagen | similarity: 0.94


Build recommendation

In [50]:
#%% Build recommendations and ensure all movies have top-N
recommendations = defaultdict(list)

# Fill from MinHash similarities first
for i, j, sim in similarities:
    if df.loc[i, 'title_normalized'] == df.loc[j, 'title_normalized']:
        continue
    recommendations[i].append((j, sim))
    recommendations[j].append((i, sim))

# Calculate cosine similarity for descriptions
desc_similarity = cosine_similarity(tfidf_matrix)

# Calculate cosine similarity for genre + country
genre_similarity = cosine_similarity(genre_country_matrix)

# Combine both: You can adjust weights (e.g., 0.7 for descriptions, 0.3 for genres)
cosine_sim = 0.7 * desc_similarity + 0.3 * genre_similarity

top_n = 5
for i in range(len(df)):
    if len(recommendations[i]) < top_n:
        sims = cosine_sim[i]
        best_idx = np.argsort(sims)[::-1]
        added = 0
        for j in best_idx:
            if i == j:
                continue
            if any(r[0] == j for r in recommendations[i]):
                continue
            recommendations[i].append((j, float(sims[j])))
            added += 1
            if added >= (top_n - len(recommendations[i])):
                break

# Truncate to top-N total
for k, recs in recommendations.items():
    recommendations[k] = sorted(recs, key=lambda x: x[1], reverse=True)[:top_n]

example_idx = np.random.randint(0, len(df))
print(f"\nFinal recommendations for '{df.loc[5, 'title']}':")
for rec_idx, sim in recommendations[example_idx]:
    print(f"- {df.loc[rec_idx, 'title']} (similarity: {sim:.2f})")


Final recommendations for 'Midnight Mass':
- Trial 4 (similarity: 0.41)
- Making a Murderer (similarity: 0.36)
- The Devil Next Door (similarity: 0.35)


Build Matrix

In [53]:
similarity_matrix = cosine_sim

distance_matrix = 1 - similarity_matrix ## this should be used for the clustering
distance_matrix


array([[0.        , 0.99113316, 0.97229944, ..., 0.86713563, 0.86172156,
        0.96087197],
       [0.99113316, 0.        , 0.92269044, ..., 0.99766908, 1.        ,
        1.        ],
       [0.97229944, 0.92269044, 0.        , ..., 0.99004171, 0.98954891,
        0.93452146],
       ...,
       [0.86713563, 0.99766908, 0.99004171, ..., 0.        , 0.78204466,
        0.99437644],
       [0.86172156, 1.        , 0.98954891, ..., 0.78204466, 0.        ,
        0.99498107],
       [0.96087197, 1.        , 0.93452146, ..., 0.99437644, 0.99498107,
        0.        ]])

Export the recommendation

In [52]:
#%% Export recommendations
# rec_rows = []
# for movie_idx, recs in recommendations.items():
#     for rec_idx, sim in recs:
#         rec_rows.append({
#             'movie_title': df.loc[movie_idx, 'title'],
#             'recommended_title': df.loc[rec_idx, 'title'],
#             'similarity': sim
#         })

# recommendations_df = pd.DataFrame(rec_rows)
# recommendations_df.to_csv("movie_recommendations.csv", index=False)
# print("\nSaved top-N movie recommendations to 'movie_recommendations.csv'.")
