## 11) (Max) Hierarchisches Clustering (Subsample)
> Vollständige Paar‑Distanzen haben einen Aufwand von O(n²). Für große Datenmengen verwendet der Code daher ein Subsample; alternativ lassen sich nur die Top‑k‑Nachbarn berechnen.


In [2]:

# Hierarchical-clustering demo restricted to the leading rows/columns of
# distance_matrix, so the O(n^2) pairwise work stays small.
subset = min(400, distance_matrix.shape[0])  # adjust as needed
if subset < 3:
    # Fewer than 3 objects available: skip the demo.
    print("Not enough items for clustering demo.")
else:
    # squareform() turns the square distance block into the condensed vector
    # that linkage() consumes; checks=False skips symmetry/diagonal validation.
    Z = linkage(
        squareform(distance_matrix[:subset, :subset], checks=False),
        method='average',
    )
    # Flat cluster labels obtained by cutting the tree at distance 0.7.
    labels = fcluster(Z, t=0.7, criterion='distance')
    print("Cluster labels (first few):", labels[:10])

Cluster labels (first few): [ 98  72  48  42  35  70  31 155  13 205]


## 12) (Max) Cluster sichtbar machen
> Nach dem Clustering wird hier sichtbar gemacht, welche Filme in den einzelnen Clustern liegen. Das ist zudem eine einfache Aufgabe, die sich schnell erledigen lässt.

In [3]:
#%% Cluster-Profiling: "Wonach wurde geclustert?"
# ------------------------------------------------
# Voraussetzung:
# - distance_matrix = 1 - cosine_sim   (cosine_sim = 0.7*desc + 0.3*genre/country)
# - df['listed_in'], df['country'], df['description_tfidf'] existieren

import numpy as np
from collections import Counter
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster

# --- Parameters ---
subset_size = min(400, distance_matrix.shape[0])  # raise/lower to cluster more/fewer titles
linkage_method = 'average'
distance_threshold = 0.7
np.random.seed(42)  # only relevant when the random-subset alternative below is used
# idx holds ROW POSITIONS of distance_matrix (not df index labels).
# Default: the first `subset_size` rows. Random alternative:
#   idx = np.random.choice(distance_matrix.shape[0], size=subset_size, replace=False)
idx = np.arange(subset_size)

# --- Extract the sub-matrix and cluster it ---
D_sub = distance_matrix[np.ix_(idx, idx)]
# linkage() expects the condensed form of the square distance block.
Z = linkage(squareform(D_sub, checks=False), method=linkage_method)
labels = fcluster(Z, t=distance_threshold, criterion='distance')

# --- Write the labels into df by POSITION ---
# Assumes row i of distance_matrix describes the i-th row of df — TODO confirm
# against the cell that builds distance_matrix. Writing through a full-length
# array avoids label-based `.loc`, which would hit the wrong rows (or raise)
# whenever df does not carry a default 0..n-1 index.
cluster_by_position = np.full(len(df), np.nan)
cluster_by_position[idx] = labels
df['cluster'] = cluster_by_position
df['cluster'] = df['cluster'].astype('Int64')  # nullable integer dtype; unclustered rows stay <NA>

print(f"Subset: {subset_size} Titel | Cluster-Methode: {linkage_method} | Distanz-Schwelle: {distance_threshold}")
print(f"Anzahl Cluster im Subset: {len(np.unique(labels))}")

# --- Hilfsfunktionen für Cluster-Zusammenfassung ---
def top_genres(series_listed_in, k=5):
    """Return the k most frequent genres as ``(genre, count)`` pairs.

    Args:
        series_listed_in: Series of strings of the form "Genre1, Genre2, ...";
            missing entries are dropped before counting.
        k: Maximum number of pairs to return.

    Returns:
        List of ``(genre, count)`` tuples, most frequent first.
    """
    genre_counts = Counter(
        genre
        for entry in series_listed_in.dropna()
        for genre in map(str.strip, entry.split(','))
        if genre  # skip empty pieces, e.g. from a trailing comma
    )
    return genre_counts.most_common(k)

def top_countries(series_country, k=5):
    """Return the k most frequent countries as ``(country, count)`` pairs.

    Args:
        series_country: Series of country entries; an entry may list several
            countries separated by commas. Missing entries are dropped and
            every remaining entry is converted with ``str()`` before splitting.
        k: Maximum number of pairs to return.

    Returns:
        List of ``(country, count)`` tuples, most frequent first.
    """
    country_counts = Counter(
        country
        for entry in series_country.dropna()
        for country in map(str.strip, str(entry).split(','))
        if country  # skip empty pieces between/after commas
    )
    return country_counts.most_common(k)

def top_words_from_tfidf(series_tfidf, k=10):
    """Return the k most frequent words as ``(word, count)`` pairs.

    Args:
        series_tfidf: Series whose entries already hold each document's top
            words as one whitespace-separated string; missing entries are
            dropped.
        k: Maximum number of pairs to return.

    Returns:
        List of ``(word, count)`` tuples, most frequent first.
    """
    word_counts = Counter(
        word
        for entry in series_tfidf.dropna()
        for word in str(entry).split()
    )
    return word_counts.most_common(k)

# --- Per-cluster profile: which features characterise each cluster ---
# Only rows of the clustered subset carry a non-null 'cluster' value, so
# selecting on that column is enough. This avoids comparing df index labels
# with the positional `idx` array, which only coincides for a default index.
clustered = df.loc[df['cluster'].notna()]
unique_clusters = sorted(clustered['cluster'].unique())
for cid in unique_clusters:
    sub = clustered.loc[clustered['cluster'] == cid]
    size = len(sub)

    genres = top_genres(sub['listed_in'], k=5)
    countries = top_countries(sub['country'], k=5)
    words = top_words_from_tfidf(sub['description_tfidf'], k=10)

    print(f"\n=== Cluster {cid} | Größe: {size} ===")
    print("Top-Genres:   ", ", ".join([f"{g} ({n})" for g, n in genres]) or "—")
    print("Top-Länder:   ", ", ".join([f"{c} ({n})" for c, n in countries]) or "—")
    # Word counts are aggregated from the per-title TF-IDF top-word strings.
    print("Top-Wörter:   ", ", ".join([f"{w} ({n})" for w, n in words]) or "—")

    # Example titles: drop missing values and coerce to str so that join()
    # cannot raise TypeError on a NaN or non-string title.
    ex_titles = sub['title'].dropna().astype(str).head(5).tolist()
    print("Beispiele:    ", "; ".join(ex_titles) if ex_titles else "—")



Subset: 400 Titel | Cluster-Methode: average | Distanz-Schwelle: 0.7
Anzahl Cluster im Subset: 261

=== Cluster 1 | Größe: 1 ===
Top-Genres:    Kids' TV (1), TV Dramas (1), Teen TV Shows (1)
Top-Länder:    Australia (1)
Top-Wörter:    mercy (1), divers (1), cape (1), skillful (1), shores (1), signs (1), mysteriously (1), of (1), investigate (1), missing (1)
Beispiele:     Dive Club

=== Cluster 2 | Größe: 1 ===
Top-Genres:    Kids' TV (1), TV Comedies (1)
Top-Länder:    Australia (1)
Top-Wörter:    meteor (1), squishy (1), stretchy (1), evildoers (1), gooey (1), zoo (1), transforms (1), superheroes (1), crash (1), animals (1)
Beispiele:     Heroes of Goo Jit Zu

=== Cluster 3 | Größe: 1 ===
Top-Genres:    Kids' TV (1), TV Comedies (1)
Top-Länder:    Finland (1)
Top-Wörter:    eggs (1), nest (1), feathered (1), guarding (1), pesky (1), lots (1), pigs (1), chuck (1), birds (1), red (1)
Beispiele:     Angry Birds

=== Cluster 4 | Größe: 2 ===
Top-Genres:    Kids' TV (2), Korean TV Shows (

## 13) (Max) Nur die Top k Cluster zeigen:
 

In [None]:
#%% Top-K größte Cluster: Profiling & Preview
# ------------------------------------------------------------
# Zeigt die Top-5 größten Cluster und ihre häufigsten Merkmale
# (Genres, Länder, TF-IDF-Wörter) + Beispieltitel
# ------------------------------------------------------------

import numpy as np
from collections import Counter
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster

# ---------- Parameters ----------
subset_size = min(400, distance_matrix.shape[0])  # demo size; increase if needed
linkage_method = 'average'
distance_threshold = 0.7
top_k = 5
np.random.seed(42)  # only relevant when the random-subset alternative below is used

# ---------- Subset positions ----------
# idx holds ROW POSITIONS of distance_matrix (not df index labels).
# Random alternative:
#   idx = np.random.choice(distance_matrix.shape[0], size=subset_size, replace=False)
idx = np.arange(subset_size)

# ---------- Cluster (only if needed) ----------
# Clustering is skipped when the subset already has at least one label, so
# labels from an earlier run (possibly with other parameters) are reused as-is.
needs_cluster = ('cluster' not in df.columns) or (df['cluster'].iloc[idx].isna().all())
if needs_cluster:
    D_sub = distance_matrix[np.ix_(idx, idx)]
    # linkage() expects the condensed form of the square distance block.
    Z = linkage(squareform(D_sub, checks=False), method=linkage_method)
    labels = fcluster(Z, t=distance_threshold, criterion='distance')
    # Write labels by position. Assumes row i of distance_matrix describes the
    # i-th row of df — TODO confirm. Label-based `.loc` would hit the wrong
    # rows (or raise) whenever df does not carry a default 0..n-1 index.
    cluster_by_position = np.full(len(df), np.nan)
    cluster_by_position[idx] = labels
    df['cluster'] = cluster_by_position
    df['cluster'] = df['cluster'].astype('Int64')  # nullable integer dtype

print(f"Subset: {subset_size} | Methode: {linkage_method} | Schwelle: {distance_threshold}")
sub_df = df.iloc[idx]  # look at the subset only (selected by position)

# ---------- Hilfsfunktionen ----------
def top_genres(series_listed_in, k=5):
    """Return the k most frequent genres as ``(genre, count)`` pairs.

    Each non-missing entry of ``series_listed_in`` is split on commas; the
    pieces are stripped of surrounding whitespace and empty pieces are ignored.
    """
    genre_counts = Counter()
    for raw in series_listed_in.dropna():
        stripped = (part.strip() for part in raw.split(','))
        genre_counts.update(name for name in stripped if name)
    return genre_counts.most_common(k)

def top_countries(series_country, k=5):
    """Return the k most frequent countries as ``(country, count)`` pairs.

    Each non-missing entry is converted with ``str()`` and split on commas;
    the pieces are stripped of surrounding whitespace and empty pieces are
    ignored.
    """
    country_counts = Counter()
    for raw in series_country.dropna():
        stripped = (part.strip() for part in str(raw).split(','))
        country_counts.update(name for name in stripped if name)
    return country_counts.most_common(k)

def top_words_from_tfidf(series_tfidf, k=10):
    """Return the k most frequent words as ``(word, count)`` pairs.

    Each non-missing entry is converted with ``str()`` and split on
    whitespace; every resulting token is counted.
    """
    word_counts = Counter()
    for raw in series_tfidf.dropna():
        word_counts.update(str(raw).split())
    return word_counts.most_common(k)

# ---------- Determine the top-k largest clusters ----------
# Cluster sizes within the subset, largest first; rows without a label are ignored.
sizes = sub_df['cluster'].value_counts(dropna=True).sort_values(ascending=False)
top_clusters = list(sizes.head(top_k).index)

print(f"Gefundene Cluster im Subset: {len(sizes)} | Zeige Top-{min(top_k, len(sizes))} größte Cluster.")

for rank, cid in enumerate(top_clusters, start=1):
    sub = sub_df.loc[sub_df['cluster'] == cid]
    size = len(sub)

    genres = top_genres(sub['listed_in'], k=5)
    countries = top_countries(sub['country'], k=5)
    words = top_words_from_tfidf(sub['description_tfidf'], k=10)

    # Example titles: drop missing values and coerce to str so that join()
    # cannot raise TypeError on a NaN or non-string title. The placeholder is
    # keyed on the title list itself, because a cluster taken from
    # value_counts() always has at least one row.
    ex_titles = sub['title'].dropna().astype(str).head(5).tolist()

    print(f"\n=== #{rank} | Cluster {cid} | Größe: {size} ===")
    print("Top-Genres:   ", ", ".join([f"{g} ({n})" for g, n in genres]) or "—")
    print("Top-Länder:   ", ", ".join([f"{c} ({n})" for c, n in countries]) or "—")
    print("Top-Wörter:   ", ", ".join([f"{w} ({n})" for w, n in words]) or "—")
    print("Beispiele:    ", "; ".join(ex_titles) if ex_titles else "—")