# Clustering & Recommendations on Netflix  
**Pipeline:** Cleaning → TF‑IDF → Shingling → MinHash → LSH → Top‑N Recommendations → Distance Matrix for Clustering → Max’s sections

> This notebook was originally made by Ronja and Eris. I continued working on it from around part 11 onward.


## 1) (Ronja & Eris) Setup & Imports

In [None]:

import pandas as pd 
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from datasketch import MinHash, MinHashLSH
import re
import mmh3
from itertools import combinations
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string, re
from scipy.sparse import lil_matrix
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster

# Read the raw catalogue.
# The CSV is expected next to the notebook; change the path below otherwise.
# NOTE(review): encoding is fixed to latin1 here - confirm it matches the file's real encoding.
df = pd.read_csv(
    "netflix_titles.csv",
    sep=",",
    quotechar='"',
    encoding="latin1",
    engine="python",
)
print(f"Raw rows loaded: {len(df)}")





## 2) (Ronja & Eris) Text Cleaning & Normalization
- Entfernt Klammern/Anhänge aus Titeln, vereinheitlicht Text (lowercase, Satzzeichen raus).
- Dedupliziert anhand von normalisiertem Titel **und** bereinigter Beschreibung.




In [None]:
def normalize_title(title):
    """Return a lowercase title with every parenthesised part removed.

    Missing values (anything ``pd.isna`` accepts) map to the empty string.
    Only leading/trailing whitespace is stripped; inner spacing is untouched.
    """
    if not pd.isna(title):
        without_parens = re.sub(r'\(.*?\)', '', title)
        return without_parens.lower().strip()
    return ''

def clean_text(text):
    """Lowercase *text*, turn ASCII punctuation into spaces and collapse whitespace.

    The input is passed through ``str()`` first, so non-string values are
    cleaned via their string representation. Returns the stripped result.
    """
    text = str(text).lower()
    # The punctuation set must be escaped before it goes into a character
    # class: un-escaped, its "\]" pair is read as an escaped "]", so the
    # backslash itself would never be matched.
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Derive the normalized / cleaned text columns (missing values become '').
titles = df['title'].fillna('')
df['title_normalized'] = titles.apply(normalize_title)
df['title_clean'] = titles.apply(clean_text)
df['description_clean'] = df['description'].fillna('').apply(clean_text)

# Keep the first row per normalized title, then the first row per cleaned
# description, and renumber the index 0..n-1.
df = (
    df.drop_duplicates(subset='title_normalized')
      .drop_duplicates(subset='description_clean')
      .reset_index(drop=True)
)
print(f"Data loaded: {len(df)} unique titles after dedup.")





## 3) Genres & Countries -> Multi‑Hot Features
- `listed_in` (Genres, kommagetrennt) -> Liste
- Kombiniert mit `country` -> MultiLabelBinarizer




In [None]:

def _split_csv_field(value):
    """Split a comma-separated cell into stripped, non-empty tokens ([] if missing)."""
    if pd.isna(value):
        return []
    return [token.strip() for token in str(value).split(',') if token.strip()]


# Process genres and countries.
# Both columns are split on commas so that a multi-country entry contributes
# one label per country, and a missing country contributes no label at all
# (instead of a shared '' label that would make such titles look alike).
df['genre_list'] = df['listed_in'].apply(_split_csv_field)
df['combined_features'] = df['genre_list'] + df['country'].apply(_split_csv_field)

# Multi-hot encode genres + countries (one column per distinct label)
mlb = MultiLabelBinarizer()
genre_country_matrix = mlb.fit_transform(df['combined_features'])
print("Genre+Country feature matrix shape:", genre_country_matrix.shape)




## 4) TF‑IDF auf Beschreibungen & Top‑Wörter je Titel
> Note: Das hier zählt nicht als einzelne Methode, die man hernehmen könnte. 



In [None]:
# Fit TF-IDF on the cleaned descriptions.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['description_clean'])
feature_names = vectorizer.get_feature_names_out()

# Collect (word, score) pairs per document row.
# The stored entries are read once as COO triplets instead of indexing the
# sparse matrix element by element inside the loop, which is far slower.
coo = tfidf_matrix.tocoo()
stored = coo.data != 0  # same entries that .nonzero() would report
rows, cols, scores = coo.row[stored], coo.col[stored], coo.data[stored]
tfidf_words = defaultdict(list)
for r, c, score in zip(rows, cols, scores):
    tfidf_words[r].append((feature_names[c], score))

top_n = 20


def top_words(doc_idx, n=top_n):
    """Return the *n* highest-scoring words of document *doc_idx*, space-joined.

    Reads the module-level ``tfidf_words`` mapping and sorts the stored list
    for that document in place (descending by score).
    """
    scored = tfidf_words[doc_idx]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return ' '.join(word for word, _score in scored[:n])

# One string of top TF-IDF words per row, in row order.
df['description_tfidf'] = list(map(top_words, range(len(df))))
print("\nExample top words for first description:")
print(df.loc[0, 'description_tfidf'])





## 5) Shingling (q‑grams)
> By default q=1 (unigrams). Set q=2 (bigrams) to make matches depend on word pairs rather than single words. (This was covered in lecture 3 or 4.)




In [None]:
def shingle(q, text):
    """Return all runs of *q* consecutive whitespace-separated words in *text*.

    Each shingle is a list of words; texts with fewer than *q* words yield [].
    """
    tokens = text.split()
    result = []
    for start in range(len(tokens) - q + 1):
        result.append(tokens[start:start + q])
    return result

q = 1  # shingle width in words
# One list of shingles per document, in row order.
shingle_vector = [shingle(q, description) for description in df['description_tfidf'].tolist()]
print("\nExample shingles for first description:")
first_doc_shingles = shingle_vector[0]
print(first_doc_shingles[:10])





## 6) MinHash Signatures
> Erzeugt pro Dokument eine MinHash‑Signatur der Länge `k`. Der Anteil gleicher Positionen zwischen zwei Signaturen approximiert die **Jaccard‑Ähnlichkeit**.




In [None]:

def listhash(l, seed):
    """XOR together the seeded mmh3 hashes of the elements of *l*.

    Each element is passed through ``' '.join`` before hashing, so a ``str``
    element is hashed with spaces between its characters. Because XOR is
    commutative, the result does not depend on element order, and an empty
    *l* hashes to 0.
    NOTE(review): for q >= 2 this makes word order inside a shingle
    irrelevant - confirm that is intended.
    """
    digest = 0
    for element in l:
        spaced = ' '.join(element)
        digest = digest ^ mmh3.hash(spaced, seed)
    return digest

def minhash_k(shingles, k):
    """Return a MinHash signature of length *k* for a collection of shingles.

    Position ``s`` (seeds 1..k) holds the minimum ``listhash`` value over all
    shingles for that seed. A document without shingles gets a signature made
    of a single marker value instead of raising ``ValueError`` from ``min``.
    """
    # One past the largest signed 32-bit value. mmh3.hash is documented to
    # return signed 32-bit integers by default, so the marker cannot equal a
    # real minimum: empty documents only ever match other empty documents.
    empty_marker = 2 ** 31
    return [
        min((listhash(shingle, seed) for shingle in shingles), default=empty_marker)
        for seed in range(1, k + 1)
    ]

k = 50  # signature length
# Stack one signature row per document into a 2-D array.
signature_rows = [minhash_k(doc_shingles, k) for doc_shingles in shingle_vector]
minhash_signatures = np.array(signature_rows)
print("\nExample MinHash signature for first doc:")
print(minhash_signatures[0])





## 7) LSH (Bands × Rows) -> Kandidatenpaare
> Teilt die Signaturen in `bands × rows` (hier 10 × 5) und sammelt Paare, die in mindestens einem Band identisch sind. (Das müsste VL 3 gewesen sein, wenn ich mich nicht irre.)




In [None]:

def lsh_candidates(signatures, bands, rows):
    """Return the set of document pairs that share at least one identical band.

    *signatures* is a 2-D array with one signature per row; its width must
    equal ``bands * rows``. Each pair is an ``(i, j)`` tuple of row indices
    with ``i < j``.
    """
    assert bands * rows == signatures.shape[1], "bands * rows must equal signature length"
    n_docs = signatures.shape[0]
    pairs = set()

    for band in range(bands):
        lo, hi = band * rows, (band + 1) * rows
        buckets = defaultdict(list)
        for doc in range(n_docs):
            buckets[tuple(signatures[doc, lo:hi])].append(doc)
        for members in buckets.values():
            # Members were appended in ascending order, so every combination
            # is already an ordered (smaller, larger) pair.
            pairs.update(combinations(members, 2))
    return pairs

# 10 bands of 5 rows each; the product must match the signature length.
bands = 10
rows = 5
candidates = lsh_candidates(minhash_signatures, bands=bands, rows=rows)
n_candidates = len(candidates)
print(f"\nNumber of candidate pairs: {n_candidates}")





## 8) MinHash‑basierte Jaccard‑Schätzung & Filter
> Schätzt die Jaccard‑Ähnlichkeit als Anteil übereinstimmender Signaturpositionen und filtert Paare mit `threshold`.




In [None]:
def jaccard_list(doc1_idx, doc2_idx, signatures):
    """Estimate Jaccard similarity as the share of equal signature positions.

    Compares the signatures stored at the two given indices of *signatures*
    and divides the number of matching positions by the signature length.
    """
    first, second = signatures[doc1_idx], signatures[doc2_idx]
    agreeing = np.sum(first == second)
    return agreeing / len(first)

threshold = 0.35
# Score every candidate pair lazily, then keep those at or above the threshold.
scored_pairs = (
    (i, j, jaccard_list(i, j, minhash_signatures)) for i, j in candidates
)
similarities = [triple for triple in scored_pairs if triple[2] >= threshold]

# Most similar pairs first
similarities.sort(key=lambda x: x[2], reverse=True)
print(f"\nTop 5 similar pairs (threshold={threshold}):")
for i, j, sim in similarities[:5]:
    print(f"- {df.loc[i, 'title']} ↔ {df.loc[j, 'title']} | similarity: {sim:.2f}")





## 9) Recommendations: Hybrid of MinHash & Cosine (TF-IDF + Genres/Country)
- Start with the MinHash matches.
- Fill up with weighted cosine similarities (0.7 content, 0.3 metadata).
- Ensure that each title ends up with Top-N recommendations.




In [None]:
# Maps a row index to a list of (recommended_row_index, score) tuples.
recommendations = defaultdict(list)

# Seed with the MinHash matches first (symmetric: i -> j and j -> i).
for i, j, sim in similarities:
    if df.loc[i, 'title_normalized'] == df.loc[j, 'title_normalized']:
        continue
    recommendations[i].append((j, sim))
    recommendations[j].append((i, sim))

# Cosine similarity between descriptions (TF-IDF) ...
desc_similarity = cosine_similarity(tfidf_matrix)

# ... and between genre + country multi-hot vectors.
genre_similarity = cosine_similarity(genre_country_matrix)

# Blend both; the weights can be adjusted (0.7 description, 0.3 genre/country).
cosine_sim = 0.7 * desc_similarity + 0.3 * genre_similarity

# Top up every title to top_n entries with its best cosine neighbours.
# NOTE(review): MinHash estimates and blended cosine scores end up in the same
# list and are sorted together below although they are different measures.
top_n = 5
for i in range(len(df)):
    recs = recommendations[i]
    # Computed once per title: the list grows while we fill it, so comparing
    # against its live length would stop the fill too early.
    missing = top_n - len(recs)
    if missing <= 0:
        continue
    sims = cosine_sim[i]
    taken = {rec_idx for rec_idx, _ in recs}
    taken.add(i)  # never recommend a title to itself
    for j in np.argsort(sims)[::-1]:
        j = int(j)
        if j in taken:
            continue
        recs.append((j, float(sims[j])))
        missing -= 1
        if missing == 0:
            break

# Truncate to top-N total, best score first
for k_idx, recs in recommendations.items():
    recommendations[k_idx] = sorted(recs, key=lambda x: x[1], reverse=True)[:top_n]

example_idx = np.random.randint(0, len(df))
print(f"\nFinal recommendations for '{df.loc[example_idx, 'title']}':")
for rec_idx, sim in recommendations[example_idx]:
    print(f"- {df.loc[rec_idx, 'title']} (similarity: {sim:.2f})")





## 10) (Ronja & Eris) Similarity -> Distance for Clustering
> Later use this distance matrix for hierarchical clustering/DBSCAN.




In [None]:

similarity_matrix = cosine_sim
distance_matrix = 1 - similarity_matrix  # this should be used for the clustering
# Floating-point rounding can push a similarity marginally above 1, which
# would leave tiny negative distances; consumers of a precomputed distance
# matrix may reject those. Clip in place to avoid another n x n copy.
np.clip(distance_matrix, 0, None, out=distance_matrix)
# A point's distance to itself is 0 by definition; the blended
# self-similarity is not guaranteed to be exactly 1.
np.fill_diagonal(distance_matrix, 0)
distance_matrix





## 11) (Max) Hierarchical Clustering (Subsample)
>Full pairwise distances are O(n²). For large data, take a subsample or compute only the Top-k neighbors.



In [None]:

# Optional demo on a leading subset so the O(n^2) cost stays small.
subset = min(400, distance_matrix.shape[0])  # adjust as needed
if subset < 3:  # fewer than 3 objects: skip the demo
    print("Not enough items for clustering demo.")
else:
    # linkage expects the condensed (upper-triangle) form of the square matrix
    condensed = squareform(distance_matrix[:subset, :subset], checks=False)
    Z = linkage(condensed, method='average')
    labels = fcluster(Z, t=0.7, criterion='distance')
    print("Cluster labels (first few):", labels[:10])




## 12) (Max) Make clusters visible
> After clustering, this section shows which titles ended up in each cluster. It is a simple follow-up step.



In [None]:
# ------------------------------------------------
# Voraussetzung:
# - distance_matrix = 1 - cosine_sim   (cosine_sim = 0.7*desc + 0.3*genre/country)
# - df['listed_in'], df['country'], df['description_tfidf'] existieren

import numpy as np
from collections import Counter
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster

# --- Parameters ---
np.random.seed(42)  # reproducibility (optional)
linkage_method = 'average'
distance_threshold = 0.7
subset_size = min(400, distance_matrix.shape[0])  # adjust to cluster more or fewer titles
# Use the FIRST 'subset_size' entries; alternatively draw random indices
idx = np.arange(subset_size)  # or: np.random.choice(df.index, size=subset_size, replace=False)

# --- Extract the sub-matrix and cluster it ---
D_sub = distance_matrix[np.ix_(idx, idx)]
condensed = squareform(D_sub, checks=False)
Z = linkage(condensed, method=linkage_method)
labels = fcluster(Z, t=distance_threshold, criterion='distance')

# --- Write the labels into df (subset rows only; all other rows stay missing) ---
cluster_column = pd.Series(np.nan, index=df.index)
cluster_column.loc[idx] = labels
df['cluster'] = cluster_column.astype('Int64')  # nullable integer dtype

print(f"Subset: {subset_size} Titel | Cluster-Methode: {linkage_method} | Distanz-Schwelle: {distance_threshold}")
print(f"Anzahl Cluster im Subset: {df.loc[idx, 'cluster'].nunique()}")

# --- Hilfsfunktionen für Cluster-Zusammenfassung ---
def top_genres(series_listed_in, k=5):
    """Return the *k* most common genres as ``(genre, count)`` pairs.

    *series_listed_in* is a Series of strings like "Genre1, Genre2, ...";
    missing entries and empty tokens are ignored.
    """
    counts = Counter(
        part.strip()
        for entry in series_listed_in.dropna()
        for part in entry.split(',')
        if part.strip()
    )
    return counts.most_common(k)

def top_countries(series_country, k=5):
    """Return the *k* most common countries as ``(country, count)`` pairs.

    Entries may list several countries separated by commas; each one is
    counted separately. Missing entries and empty tokens are ignored.
    """
    counts = Counter(
        part.strip()
        for entry in series_country.dropna()
        for part in str(entry).split(',')
        if part.strip()
    )
    return counts.most_common(k)

def top_words_from_tfidf(series_tfidf, k=10):
    """Return the *k* most common words as ``(word, count)`` pairs.

    *series_tfidf* already holds each document's top words as one
    whitespace-separated string; missing entries are ignored.
    """
    counts = Counter(
        word
        for entry in series_tfidf.dropna()
        for word in str(entry).split()
    )
    return counts.most_common(k)

# --- Per cluster: profile of what the members have in common ---
in_subset = df.index.isin(idx)
unique_clusters = sorted(df.loc[idx, 'cluster'].dropna().unique())
for cid in unique_clusters:
    sub = df.loc[in_subset & (df['cluster'] == cid)]
    size = len(sub)

    # Most frequent genres, countries and top TF-IDF words among the members
    genres = top_genres(sub['listed_in'], k=5)
    countries = top_countries(sub['country'], k=5)
    words = top_words_from_tfidf(sub['description_tfidf'], k=10)

    print(f"\n=== Cluster {cid} | Größe: {size} ===")
    print("Top-Genres:   ", ", ".join([f"{g} ({n})" for g, n in genres]) or "—")
    print("Top-Länder:   ", ", ".join([f"{c} ({n})" for c, n in countries]) or "—")
    print("Top-Wörter:   ", ", ".join([f"{w} ({n})" for w, n in words]) or "—")  # counts of the per-title TF-IDF top words

    # Up to five example titles
    ex_titles = sub['title'].head(5).tolist()
    if ex_titles:
        print("Beispiele:    ", "; ".join(ex_titles))
    else:
        print("Beispiele:    ", "—")






## 13) (Max) Only show the Top-k clusters:




In [None]:
# ------------------------------------------------------------
# Shows the Top-5 largest clusters and their most frequent features
# (genres, countries, TF-IDF words) + example titles
# ------------------------------------------------------------

import numpy as np
from collections import Counter
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster

# ---------- Parameters ----------
np.random.seed(42)
top_k = 5
linkage_method = 'average'
distance_threshold = 0.7
subset_size = min(400, distance_matrix.shape[0])  # for demo; increase if needed

# ---------- Subset indices ----------
idx = np.arange(subset_size)  # or: np.random.choice(df.index, size=subset_size, replace=False)

# ---------- Clustering (only when the subset has no labels yet) ----------
has_labels = ('cluster' in df.columns) and not df.loc[idx, 'cluster'].isna().all()
needs_cluster = not has_labels
if needs_cluster:
    D_sub = distance_matrix[np.ix_(idx, idx)]
    condensed = squareform(D_sub, checks=False)
    Z = linkage(condensed, method=linkage_method)
    labels = fcluster(Z, t=distance_threshold, criterion='distance')
    # Rows outside the subset keep a missing label
    df['cluster'] = np.nan
    df.loc[idx, 'cluster'] = labels
    df['cluster'] = df['cluster'].astype('Int64')

print(f"Subset: {subset_size} | Method: {linkage_method} | Threshold: {distance_threshold}")
sub_df = df.loc[idx]  # consider only the subset

# ---------- Helper functions ----------
def top_genres(series_listed_in, k=5):
    """Count comma-separated genres across the Series and return the top *k*.

    Returns ``(genre, count)`` pairs; missing entries and blank tokens are skipped.
    """
    tally = Counter()
    for entry in series_listed_in.dropna():
        pieces = (piece.strip() for piece in entry.split(','))
        tally.update(piece for piece in pieces if piece)
    return tally.most_common(k)

def top_countries(series_country, k=5):
    """Count comma-separated countries across the Series and return the top *k*.

    Returns ``(country, count)`` pairs; missing entries and blank tokens are skipped.
    """
    tally = Counter()
    for entry in series_country.dropna():
        pieces = (piece.strip() for piece in str(entry).split(','))
        tally.update(piece for piece in pieces if piece)
    return tally.most_common(k)

def top_words_from_tfidf(series_tfidf, k=10):
    """Count whitespace-separated words across the Series and return the top *k*.

    Returns ``(word, count)`` pairs; missing entries are skipped.
    """
    tally = Counter()
    for entry in series_tfidf.dropna():
        tally.update(str(entry).split())
    return tally.most_common(k)

# ---------- Determine Top-K largest clusters ----------
cluster_counts = sub_df['cluster'].value_counts(dropna=True)
sizes = cluster_counts.sort_values(ascending=False)
top_clusters = list(sizes.head(top_k).index)

print(f"Found clusters in subset: {len(sizes)} | Showing Top-{min(top_k, len(sizes))} largest clusters.")

# One profile per cluster, largest first
rank = 0
for cid in top_clusters:
    rank += 1
    sub = sub_df.loc[sub_df['cluster'] == cid]
    size = len(sub)

    genres = top_genres(sub['listed_in'], k=5)
    countries = top_countries(sub['country'], k=5)
    words = top_words_from_tfidf(sub['description_tfidf'], k=10)

    print(f"\n=== #{rank} | Cluster {cid} | Size: {size} ===")
    print("Top genres:    ", ", ".join([f"{g} ({n})" for g, n in genres]) or "—")
    print("Top countries: ", ", ".join([f"{c} ({n})" for c, n in countries]) or "—")
    print("Top words:     ", ", ".join([f"{w} ({n})" for w, n in words]) or "—")
    if size:
        print("Examples:      ", "; ".join(sub['title'].head(5).tolist()))
    else:
        print("Examples:      ", "—")






## 14) (Max) - K-Means auf Embeddings (TF-IDF & Genres)
> K-Means works in feature space (vectors). For cosine similarity, K-Means works well when vectors are L2-normalized (TF-IDF may already do this).



In [None]:
from sklearn.cluster import MiniBatchKMeans
from scipy.sparse import hstack, csr_matrix
import numpy as np
import pandas as pd

# Optional: build a hybrid embedding (TF-IDF + genres/countries).
# The genre/country block is scaled by alpha (e.g. 0.6); fine-tune as needed.
alpha = 0.6
weighted_meta = csr_matrix(genre_country_matrix * alpha)
X_hybrid = hstack([tfidf_matrix, weighted_meta], format='csr')

k = 20  # number of clusters (adjust / later determine via score)
km = MiniBatchKMeans(n_clusters=k, random_state=42, batch_size=2048, n_init='auto')
labels_km = km.fit_predict(X_hybrid)
df['cluster_kmeans'] = labels_km

# Cluster sizes, largest first (computed once, reused for the preview below)
km_sizes = pd.Series(labels_km).value_counts()
print("K-Means cluster sizes (Top 10):")
print(km_sizes.head(10))

# Small preview of the five largest clusters
for cid in km_sizes.index[:5]:
    members = df['cluster_kmeans'] == cid
    sub = df.loc[members, ['title','listed_in','country']].head(5)
    print(f"\n=== K-Means Cluster {cid} ===")
    print(sub.to_string(index=False))

## 15) (Max&Dogukan) - DBSCAN auf Similarity/ Distance
> Runs directly on the precomputed distance matrix, so it fits the `1 - cosine similarity` distance defined above.


In [None]:
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd

# Attention: distance_matrix is (n x n) → O(n^2) memory/time.
# For large n, use only a subset / an approximation if needed.
min_samples = 5
eps = 0.30        # corresponds to: cosine_sim >= 0.70

db = DBSCAN(metric='precomputed', eps=eps, min_samples=min_samples, n_jobs=-1)
labels_db = db.fit_predict(distance_matrix)
df['cluster_dbscan'] = labels_db

# Sizes of the real clusters; label -1 marks noise and is left out
non_noise = labels_db[labels_db != -1]
print("DBSCAN cluster sizes (without noise -1):")
print(pd.Series(non_noise).value_counts().head(10))

# Preview per cluster (without noise -1)
for cid in pd.Series(labels_db).value_counts().index:
    if cid != -1:
        members = df['cluster_dbscan'] == cid
        sub = df.loc[members, ['title','listed_in','country']].head(5)
        print(f"\n=== DBSCAN Cluster {cid} ===")
        print(sub.to_string(index=False))





## 16) (Max&Dogukan) DBSCAN directly on embeddings
> Saves memory because no distance matrix needs to be created.


In [None]:
from sklearn.cluster import DBSCAN

# DBSCAN with the cosine metric: works on the vectors, nothing precomputed.
min_samples = 5    # minimum number of points required within eps
eps = 0.30         # cosine distance threshold for two points to count as neighbours

db_vec = DBSCAN(metric='cosine', eps=eps, min_samples=min_samples, n_jobs=-1)
# Fit directly on the sparse TF-IDF vectors
labels_db_vec = db_vec.fit_predict(tfidf_matrix)

df['cluster_dbscan_vec'] = labels_db_vec  # store the cluster labels
non_noise_vec = labels_db_vec[labels_db_vec != -1]  # label -1 marks noise
print("DBSCAN(Vectors) cluster sizes (without noise -1):")
print(pd.Series(non_noise_vec).value_counts().head(10))

## 17) (Max&Dogukan) Quickly print the Top‑5 clusters
> Profiling/preview is reused.


In [None]:
# Column whose clusters should be inspected.
which = 'cluster_kmeans'  # or 'cluster_dbscan' / 'cluster_dbscan_vec'

# Cluster sizes, with the noise label (-1) removed if it is present
sizes = df[which].value_counts(dropna=True)
sizes = sizes[sizes.index != -1]

if not sizes.empty:
    clusters_top5 = sizes.index[:5]  # IDs of the top-5 (or fewer) clusters

    print(f"Top {len(clusters_top5)} largest clusters ({which}):")
    rank = 0
    for cid in clusters_top5:
        rank += 1
        sub = df.loc[df[which] == cid]

        print(f"=== #{rank} | Cluster {cid} | Size: {len(sub)} ===")
        print(sub[['title','listed_in','country']].head(5).to_string(index=False))
else:
    print(f"No clusters found for column '{which}' (only noise or empty).")
