In [27]:
from ipynb.fs.full.LSH import *
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.cluster import MiniBatchKMeans, DBSCAN
from collections import Counter

In [24]:
# Building the distance matrix 
sample_idx = np.random.choice(len(df), size=400, replace=False)
distance_matrix = build_distance_matrix(sample_idx)


numpy.ndarray

# Hierarchical Clustering

In [19]:
Z = linkage(squareform(distance_matrix, checks=False), method='average')

labels_hier = fcluster(Z, t=0.7, criterion='distance')

df['cluster'] = np.nan
df.loc[sample_idx, 'cluster'] = labels_hier
df['cluster'] = df['cluster'].astype('Int64')

print("Number of clusters:", df['cluster'].nunique())

Number of clusters: 385


---------------

In [20]:
def top_genres(series, k=5):
    tokens = []
    for s in series.dropna():
        tokens.extend([g.strip() for g in s.split(',') if g.strip()])
    return Counter(tokens).most_common(k)

def top_countries(series, k=5):
    tokens = []
    for s in series.dropna():
        tokens.extend([c.strip() for c in str(s).split(',') if c.strip()])
    return Counter(tokens).most_common(k)

def top_words_from_embeddings(indices, k=10):
    vecs = embedding_matrix[indices]
    mean_vec = vecs.mean(axis=0)
    top_dims = np.argsort(mean_vec)[-k:]
    return [f"dim{d}" for d in top_dims]


# Printing the clusters

In [21]:
subset = df.loc[sample_idx]
unique_clusters = sorted(subset['cluster'].dropna().unique())

for cid in unique_clusters[:10]:  # show first 10 clusters
    sub = subset[subset['cluster'] == cid]
    idxs = sub.index.tolist()
    
    print(f"\n=== Cluster {cid} | Size: {len(sub)} ===")
    print("Top Genres:", top_genres(sub['listed_in']))
    print("Top Countries:", top_countries(sub['country']))
    print("Top Embedding Dimensions:", top_words_from_embeddings(idxs))
    print("Example Titles:", "; ".join(sub['title'].head(5).tolist()))



=== Cluster 1 | Size: 1 ===
Top Genres: [('Action & Adventure', 1)]
Top Countries: [('United Kingdom', 1), ('United States', 1)]
Top Embedding Dimensions: ['dim210', 'dim217', 'dim121', 'dim271', 'dim347', 'dim275', 'dim259', 'dim358', 'dim44', 'dim26']
Example Titles: Die Another Day

=== Cluster 2 | Size: 1 ===
Top Genres: [('Comedies', 1), ('International Movies', 1)]
Top Countries: [('India', 1)]
Top Embedding Dimensions: ['dim12', 'dim240', 'dim261', 'dim374', 'dim298', 'dim222', 'dim228', 'dim121', 'dim68', 'dim175']
Example Titles: Welcome 2 Karachi

=== Cluster 3 | Size: 1 ===
Top Genres: [("Kids' TV", 1)]
Top Countries: [('Canada', 1), ('Australia', 1)]
Top Embedding Dimensions: ['dim373', 'dim115', 'dim176', 'dim348', 'dim163', 'dim108', 'dim60', 'dim193', 'dim255', 'dim327']
Example Titles: The Deep

=== Cluster 4 | Size: 1 ===
Top Genres: [("Kids' TV", 1)]
Top Countries: []
Top Embedding Dimensions: ['dim112', 'dim241', 'dim64', 'dim122', 'dim55', 'dim123', 'dim212', 'dim2

# K-Means

In [22]:
from scipy.sparse import csr_matrix, hstack


emb_norm = embedding_matrix / np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
emb_sparse = csr_matrix(emb_norm)

meta = combined_features_matrix.copy()

row_norms = np.sqrt(meta.multiply(meta).sum(axis=1))  # sparse-safe
row_norms = np.asarray(row_norms).flatten()
row_norms[row_norms == 0] = 1.0

meta_norm = meta.multiply(1 / row_norms[:, None])   # stays sparse
meta_sparse = meta_norm * 0.6

X_hybrid = hstack([emb_sparse, meta_sparse], format='csr')

k = 20
km = MiniBatchKMeans(n_clusters=k, random_state=42, batch_size=2048, n_init='auto')
labels_kmeans = km.fit_predict(X_hybrid)

df['cluster_kmeans'] = labels_kmeans
df['cluster_kmeans'].value_counts().head()

MemoryError: Unable to allocate 1.36 GiB for an array with shape (8775, 41697) and data type int32

# DBSCAN

In [None]:
distance_matrix_fixed = np.maximum(distance_matrix, 0)
np.fill_diagonal(distance_matrix_fixed, 0)

db = DBSCAN(metric='precomputed', eps=0.30, min_samples=5, n_jobs=-1)
labels_db = db.fit_predict(distance_matrix_fixed)

df['cluster_dbscan'] = np.nan
df.loc[sample_idx, 'cluster_dbscan'] = labels_db
df['cluster_dbscan'] = df['cluster_dbscan'].astype('Int64')


# Top-K Cluster 

In [None]:
which = 'cluster_kmeans'  # or cluster / cluster_dbscan
sizes = df[which].value_counts(dropna=True)

if -1 in sizes:
    sizes = sizes[sizes.index != -1]

for cid in sizes.index[:5]:
    sub = df[df[which] == cid]
    print(f"\n=== Cluster {cid} | Size: {len(sub)} ===")
    print(sub[['title','listed_in','country']].head(5).to_string(index=False))
