# Image clustering and label mapping (per_image_descriptive_summary)

This notebook clusters images using features from `per_image_descriptive_summary_pretty.csv` and maps/applies labels via `labels_per_id.csv`.

It will:
- Load and prepare per-image features.
- Standardize and cluster images (K-Means by default).
- Visualize clusters in 2D (PCA).
- Join labels by `image_id` from `labels_per_id.csv`.
- Provide a simple override workflow to apply corrected labels and save them for later comparison.

In [68]:
# Setup and paths
import os, re, json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, normalize as sk_normalize
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from collections import defaultdict, Counter
import networkx as nx
from matplotlib.patches import FancyBboxPatch

# Apply seaborn's 'talk' plotting context and 'whitegrid' style to the figures created below.
sns.set_context('talk')
sns.set_style('whitegrid')

def find_project_root(start: Path):
    """Locate the project root by walking upwards from ``start``.

    A directory counts as the root when it contains either a
    ``labels_per_id.csv`` entry or a ``data_analysis`` entry. ``start`` itself
    is checked first, then each parent in turn (nearest first).

    Returns the first matching directory, or ``start`` unchanged when no
    directory on the way up matches.
    """
    markers = ('labels_per_id.csv', 'data_analysis')
    lineage = (start, *start.parents)
    return next(
        (folder for folder in lineage
         if any((folder / marker).exists() for marker in markers)),
        start,
    )

# Every input/output location is derived from the detected project root,
# starting the search at the current working directory.
nb_dir = Path.cwd()
project_root = find_project_root(nb_dir)

summary_csv = project_root.joinpath(
    'data_analysis', 'descriptive_analysis', 'per_image_descriptive_summary_pretty.csv'
)
labels_csv = project_root.joinpath('labels_per_id.csv')

# All artefacts produced by this notebook go here; create the folder on first run.
out_dir = project_root.joinpath('data_analysis', 'label_analysis', 'outputs')
out_dir.mkdir(parents=True, exist_ok=True)

print(f'Project root: {project_root}')
print(f'Summary CSV: {summary_csv} (exists={summary_csv.exists()})')
print(f'Labels CSV: {labels_csv} (exists={labels_csv.exists()})')
print(f'Outputs: {out_dir}')

Project root: c:\Users\SWixforth\Uni\eye-tracking-ai
Summary CSV: c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\descriptive_analysis\per_image_descriptive_summary_pretty.csv (exists=True)
Labels CSV: c:\Users\SWixforth\Uni\eye-tracking-ai\labels_per_id.csv (exists=True)
Outputs: c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs


In [69]:
# Load the per-image summary table
if not summary_csv.exists():
    raise FileNotFoundError(f'Missing {summary_csv}')
df = pd.read_csv(summary_csv)

# Fail early when the join key is absent
if 'image_id' not in df.columns:
    raise KeyError('image_id column not found in per_image_descriptive_summary_pretty.csv')

# Canonical id format: first run of digits, zero-padded to 3 characters
# (values without any digit become '000').
id_digits = df['image_id'].astype(str).str.extract(r'(\d+)', expand=False)
df['image_id'] = id_digits.fillna('').str.zfill(3)

print(df.shape)
df.head()

(152, 21)


Unnamed: 0,image_id,primary_label,fixations_total,fixations_first_third,fixations_last_third,view_time_total_sum_ms,fixation_duration_mean_ms,fixation_duration_median_ms,fix_dur_mean_first_third_ms,fix_dur_mean_last_third_ms,...,bcea68_mean_px2,bcea95_mean_px2,pupil_mm_mean,pupil_mm_std,pupil_norm_mean,pupil_norm_std,pupil_norm_abs_mean,pupil_norm_rms,pupil_size_norm_n,avg_pupil_size_n
0,1,meme,1159,406,378,425375.0,284.3,232.4,274.6,311.5,...,72009.0,189497.0,3.722,0.172,0.0,1.0,0.795,0.97,23.632653,23.653061
1,2,meme,1284,440,434,452560.0,274.6,232.9,270.6,286.0,...,60359.0,158841.0,3.706,0.146,-0.0,1.0,0.782,0.976,26.204082,26.204082
2,3,meme,1110,415,350,411436.0,296.9,245.3,261.3,375.5,...,40009.0,105288.0,3.839,0.182,0.0,1.0,0.797,0.974,23.104167,23.125
3,4,meme,1354,477,445,469790.0,280.2,216.6,269.5,306.1,...,87693.0,230772.0,3.761,0.164,-0.0,1.0,0.792,0.977,27.632653,27.632653
4,5,meme,1359,480,435,460515.0,265.5,216.4,255.6,281.6,...,90696.0,238674.0,4.008,0.204,-0.0,1.0,0.776,0.975,27.734694,27.734694


In [70]:
# Feature selection and preprocessing
# Every numeric column is a candidate feature; identifier columns are removed by name.
exclude = {'image_id'}
num_cols = list(df.select_dtypes(include=[np.number]).columns)
feature_cols = [col for col in num_cols if col not in exclude]
if len(feature_cols) == 0:
    raise ValueError('No numeric feature columns found for clustering.')

# Impute missing values with the per-column median, then standardize
raw_features = df[feature_cols]
X = raw_features.fillna(raw_features.median(numeric_only=True))
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Two principal components, used only for plotting
pca = PCA(n_components=2, random_state=42)
Z = pca.fit_transform(X_scaled)

print(f'Features used ({len(feature_cols)}):', feature_cols[:10], '...')
pd.DataFrame({'PC1': Z[:, 0], 'PC2': Z[:, 1], 'image_id': df['image_id']}).head()

Features used (19): ['fixations_total', 'fixations_first_third', 'fixations_last_third', 'view_time_total_sum_ms', 'fixation_duration_mean_ms', 'fixation_duration_median_ms', 'fix_dur_mean_first_third_ms', 'fix_dur_mean_last_third_ms', 'scanpath_length_mean_px', 'bcea68_mean_px2'] ...


Unnamed: 0,PC1,PC2,image_id
0,-2.127166,0.321788,1
1,-1.1603,-1.123852,2
2,-3.332609,-0.90518,3
3,-0.295759,0.507351,4
4,-0.325878,1.101427,5


In [71]:
# Optimal k selection using elbow method and silhouette score
def find_optimal_k(X, k_range=range(2, 11), random_state=42):
    """Sweep K-Means over candidate cluster counts and pick k two ways.

    For every k in ``k_range`` a ``KMeans`` model (``n_init=25``) is fitted on
    ``X`` and its inertia and silhouette score are recorded.

    Parameters
    ----------
    X : array-like
        Feature matrix passed unchanged to ``KMeans.fit_predict`` and
        ``silhouette_score``.
    k_range : iterable of int
        Candidate cluster counts, evaluated in the given order.
    random_state : int
        Seed forwarded to ``KMeans``.

    Returns
    -------
    dict
        ``k_values``, ``inertias``, ``silhouettes`` (parallel lists),
        ``elbow_k`` (k with the largest discrete second difference of the
        inertia curve; the first candidate when fewer than 3 are given),
        ``silhouette_k`` (k with the highest silhouette score),
        ``elbow_score`` (silhouette score at ``elbow_k``) and
        ``max_silhouette``.

    Raises
    ------
    ValueError
        If ``k_range`` is empty.
    """
    k_values = list(k_range)
    if not k_values:
        # Previously this surfaced as an opaque IndexError further down.
        raise ValueError('k_range must contain at least one candidate k.')

    inertias = []
    silhouettes = []
    for k in k_values:
        km = KMeans(n_clusters=k, n_init=25, random_state=random_state)
        clusters = km.fit_predict(X)
        # Store built-in floats so the result can always be written with json.dump,
        # whatever NumPy scalar type the estimators hand back.
        inertias.append(float(km.inertia_))
        silhouettes.append(float(silhouette_score(X, clusters)))

    # Elbow: interior point where the inertia curve bends most, approximated by
    # the second difference inertia[i-1] - 2*inertia[i] + inertia[i+1].
    if len(k_values) >= 3:
        second_derivs = [
            inertias[i - 1] - 2 * inertias[i] + inertias[i + 1]
            for i in range(1, len(inertias) - 1)
        ]
        elbow_idx = int(np.argmax(second_derivs)) + 1  # +1: second_derivs starts at index 1
    else:
        # Not enough points for a second difference; fall back to the first candidate.
        elbow_idx = 0

    sil_idx = int(np.argmax(silhouettes))

    return {
        'k_values': k_values,
        'inertias': inertias,
        'silhouettes': silhouettes,
        'elbow_k': k_values[elbow_idx],
        'silhouette_k': k_values[sil_idx],
        'elbow_score': silhouettes[elbow_idx],
        'max_silhouette': max(silhouettes),
    }

# Sweep k from 2 up to at most 11, never exceeding half the number of images
print("Finding optimal k...")
k_upper = min(12, len(df)//2 + 1)
k_analysis = find_optimal_k(X_scaled, k_range=range(2, k_upper))

# Diagnostic figure: inertia (left) and silhouette (right) against k
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
k_grid = k_analysis['k_values']
elbow_line_style = dict(color='red', linestyle='--', alpha=0.7)

# Left panel: elbow curve with the chosen k marked
ax1.plot(k_grid, k_analysis['inertias'], 'bo-')
ax1.axvline(k_analysis['elbow_k'], label=f'Chosen k={k_analysis["elbow_k"]}', **elbow_line_style)
ax1.set(
    xlabel='k (number of clusters)',
    ylabel='Inertia (within-cluster sum of squares)',
    title='Elbow Method (Quantity Selection)',
)
ax1.legend()
ax1.grid(True)

# Right panel: silhouette curve with both the elbow k and the best-silhouette k marked
ax2.plot(k_grid, k_analysis['silhouettes'], 'go-')
ax2.axvline(k_analysis['elbow_k'], label=f'Elbow-chosen k={k_analysis["elbow_k"]}', **elbow_line_style)
ax2.axvline(k_analysis['silhouette_k'], color='orange', linestyle=':', alpha=0.7,
            label=f'Best silhouette k={k_analysis["silhouette_k"]}')
ax2.set(
    xlabel='k (number of clusters)',
    ylabel='Silhouette Score',
    title='Silhouette Analysis (Quality Evaluation)',
)
ax2.legend()
ax2.grid(True)

fig.tight_layout()
k_analysis_png = out_dir / 'k_selection_analysis.png'
fig.savefig(k_analysis_png, dpi=150)
plt.close(fig)
print(f'Saved k analysis plot -> {k_analysis_png}')

# The elbow decides how many clusters to use; the silhouette score is only
# reported afterwards as a quality indicator for that choice.
chosen_k = k_analysis['elbow_k']
elbow_silhouette = k_analysis['elbow_score']

# Final model with the elbow-chosen k
km = KMeans(n_clusters=chosen_k, n_init=25, random_state=42)
clusters = km.fit_predict(X_scaled)
final_sil = silhouette_score(X_scaled, clusters)

print(f"\nK selection results:")
print(f"  Elbow method chose: k={chosen_k} (provides reasonable cluster quantity)")
print(f"  Silhouette score for k={chosen_k}: {final_sil:.3f} (evaluates clustering quality)")
print(f"  Best possible silhouette was: {k_analysis['max_silhouette']:.3f} at k={k_analysis['silhouette_k']}")

# Map the silhouette score to a verbal verdict (first threshold exceeded wins)
quality_bands = [
    (0.7, "Excellent clustering quality"),
    (0.5, "Good clustering quality"),
    (0.25, "Fair clustering quality"),
]
for band_floor, band_text in quality_bands:
    if final_sil > band_floor:
        quality_assessment = band_text
        break
else:
    quality_assessment = "Poor clustering quality - consider different approach"

print(f"  Quality assessment: {quality_assessment}")

# One row per image: cluster id plus the PCA coordinates used for plotting
cluster_df = pd.DataFrame({
    'image_id': df['image_id'],
    'cluster': clusters,
    'PC1': Z[:, 0],
    'PC2': Z[:, 1],
})

# Centroids are stored in original feature units (scaling undone) for reference
centroids_scaled = km.cluster_centers_
centroids_original_units = scaler.inverse_transform(centroids_scaled)
centroids = pd.DataFrame(centroids_original_units, columns=feature_cols)
centroids.insert(0, 'cluster', np.arange(chosen_k))
centroids_out = out_dir / 'cluster_feature_centroids.csv'
centroids.to_csv(centroids_out, index=False)
print(f'Saved centroids -> {centroids_out}')

# Persist the sweep results (everything except the 'k_values' entry)
k_analysis_out = out_dir / 'k_selection_results.json'
k_analysis_payload = {key: value for key, value in k_analysis.items() if key != 'k_values'}
with open(k_analysis_out, 'w') as f:
    json.dump(k_analysis_payload, f, indent=2)
print(f'Saved k analysis -> {k_analysis_out}')

cluster_df.head()

Finding optimal k...
Saved k analysis plot -> c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\k_selection_analysis.png

K selection results:
  Elbow method chose: k=4 (provides reasonable cluster quantity)
  Silhouette score for k=4: 0.190 (evaluates clustering quality)
  Best possible silhouette was: 0.431 at k=2
  Quality assessment: Poor clustering quality - consider different approach
Saved centroids -> c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\cluster_feature_centroids.csv
Saved k analysis -> c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\k_selection_results.json
Saved k analysis plot -> c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\k_selection_analysis.png

K selection results:
  Elbow method chose: k=4 (provides reasonable cluster quantity)
  Silhouette score for k=4: 0.190 (evaluates clustering quality)
  Best possible silhouette was: 0.431 at k=2
  Quality a

Unnamed: 0,image_id,cluster,PC1,PC2
0,1,2,-2.127166,0.321788
1,2,2,-1.1603,-1.123852
2,3,0,-3.332609,-0.90518
3,4,2,-0.295759,0.507351
4,5,2,-0.325878,1.101427


In [72]:
# PCA scatter: one point per image, coloured by cluster and annotated with its id
pc1, pc2 = Z[:, 0], Z[:, 1]
plt.figure(figsize=(8,6))
sns.scatterplot(x=pc1, y=pc2, hue=clusters, palette='tab10', s=60, edgecolor='white', linewidth=0.5)
for x_pos, y_pos, img in zip(pc1, pc2, df['image_id']):
    plt.text(x_pos, y_pos, img, fontsize=7, alpha=0.8, ha='center', va='center')

# Axis labels report how much variance each component explains
var_pc1, var_pc2 = pca.explained_variance_ratio_[0], pca.explained_variance_ratio_[1]
plt.title(f'Images clustered (k={chosen_k}, silhouette={final_sil:.3f})\nPCA 2D projection')
plt.xlabel(f'PC1 ({var_pc1:.1%} variance)')
plt.ylabel(f'PC2 ({var_pc2:.1%} variance)')
plt.legend(title='cluster', bbox_to_anchor=(1.02, 1), loc='upper left')
plt.tight_layout()

pca_png = out_dir / 'image_clusters_pca.png'
plt.savefig(pca_png, dpi=150, bbox_inches='tight')
plt.close()
print(f'Saved {pca_png}')

Saved c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\image_clusters_pca.png


In [73]:
# Join labels_per_id.csv by image_id
def normalize_label(label):
    """Return a canonical form of a label string.

    Parentheses and commas are treated as separators, duplicate tags are
    dropped and the remaining tags are sorted and joined with single spaces,
    e.g. 'person (text)' -> 'person text'. Missing values and labels without
    any tag are returned unchanged.
    """
    if pd.isna(label):
        return label
    tokens = str(label).replace('(', ' ').replace(')', ' ').replace(',', ' ').split()
    return ' '.join(sorted(set(tokens))) if tokens else label


labels_df = None
if labels_csv.exists():
    labels_df = pd.read_csv(labels_csv)

    # Accept a few alternative names for the id column; 'image_id' wins if present.
    id_source = next((c for c in ['image_id', 'id', 'image', 'img_id'] if c in labels_df.columns), None)
    if id_source is None:
        # Without a key the merge below would fail with a KeyError, so skip the join instead.
        print('No image id column (image_id/id/image/img_id) in labels_per_id.csv; continuing without label join')
        labels_df = None
    else:
        # Same canonical id format as the feature table: first digit run, zero-padded to 3.
        labels_df['image_id'] = (
            labels_df[id_source].astype(str).str.extract(r'(\d+)').fillna('').iloc[:, 0].str.zfill(3)
        )

    # Normalize labels: keep low-weight (parenthesised) tags but drop the parentheses,
    # e.g. "meme text (politik)" -> "meme politik text", "person (text)" -> "person text".
    if labels_df is not None and 'labels_txt' in labels_df.columns:
        labels_df['labels_txt_normalized'] = labels_df['labels_txt'].apply(normalize_label)
        print(f"Label normalization examples:")
        # NaN != NaN is True, so missing labels must be excluded explicitly;
        # otherwise they would be listed as "changed".
        changed = labels_df['labels_txt'].notna() & (labels_df['labels_txt'] != labels_df['labels_txt_normalized'])
        examples = labels_df.loc[changed, ['labels_txt', 'labels_txt_normalized']].head(5)
        if not examples.empty:
            for _, row in examples.iterrows():
                print(f"  '{row['labels_txt']}' -> '{row['labels_txt_normalized']}'")
        else:
            print("  No changes needed (no parentheses found)")

    if labels_df is not None:
        print(f'Labels loaded: {labels_df.shape}')
else:
    print('labels_per_id.csv not found; continuing without label join')

merged = cluster_df.copy()
if labels_df is not None:
    merged = merged.merge(labels_df, on='image_id', how='left')
    if len(merged) != len(cluster_df):
        # A left join only grows when an image_id occurs more than once in the labels file.
        print(f'Warning: label join changed row count from {len(cluster_df)} to {len(merged)} '
              f'(duplicate image_id values in labels?)')
merged_out = out_dir / 'image_clusters_with_labels.csv'
merged.to_csv(merged_out, index=False)
print(f'Saved {merged_out} (rows={len(merged)})')
merged.head()

Label normalization examples:
  'ort (text)' -> 'ort text'
  'person (text)' -> 'person text'
  'person (text)' -> 'person text'
  'person (text)' -> 'person text'
  'person (text)' -> 'person text'
Labels loaded: (154, 15)
Saved c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\image_clusters_with_labels.csv (rows=152)


Unnamed: 0,image_id,cluster,PC1,PC2,labels_txt,strong_tags,weak_tags,meme,person,politik,ort,text,meme_weight,person_weight,politik_weight,ort_weight,text_weight,labels_txt_normalized
0,1,2,-2.127166,0.321788,meme,meme,,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0,meme
1,2,2,-1.1603,-1.123852,meme,meme,,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0,meme
2,3,0,-3.332609,-0.90518,meme,meme,,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0,meme
3,4,2,-0.295759,0.507351,meme,meme,,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0,meme
4,5,2,-0.325878,1.101427,meme,meme,,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0,meme


In [74]:
# Cluster vs label crosstab (if a label column is present)
# The first candidate column that exists in `merged` is used.
label_col_guess = next(
    (cand for cand in ['labels_txt_normalized', 'labels_txt', 'primary_label', 'category']
     if cand in merged.columns),
    None,
)

ct = None
if label_col_guess:
    ct = pd.crosstab(merged['cluster'], merged[label_col_guess])
    ct_out = out_dir / 'cluster_label_crosstab.csv'
    ct.to_csv(ct_out)
    print(f'Saved {ct_out} (using {label_col_guess})')
else:
    print('No obvious label column found to crosstab.')

# Jupyter only renders the value of the cell's final top-level expression; a bare
# `ct` inside the `if` body above would never be shown. None renders nothing.
ct

Saved c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\cluster_label_crosstab.csv (using labels_txt_normalized)


### Apply/override labels
Create a CSV template you can edit (column `applied_label`). Reload it to apply overrides and save a final mapping for downstream use.

In [75]:
# Build the overrides template: one row per image with its current label
template_path = out_dir / 'label_overrides_template.csv'
template = merged[['image_id']].drop_duplicates().copy()

# Source column for the current label, in order of preference (normalized labels first)
label_source = next(
    (col for col in ('labels_txt_normalized', 'labels_txt', 'primary_label') if col in merged.columns),
    None,
)
if label_source is None:
    template['current_label'] = ''
else:
    first_label_per_image = merged.groupby('image_id')[label_source].first()
    template['current_label'] = first_label_per_image.reindex(template['image_id']).values

# applied_label starts as a copy of the current label and is meant to be edited by hand
template['applied_label'] = template['current_label']

# Never overwrite an existing file, so manual edits survive re-runs
if template_path.exists():
    print(f'Override template already exists: {template_path}')
else:
    template.to_csv(template_path, index=False)
    print(f'Wrote override template -> {template_path}')
template.head()

Override template already exists: c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\label_overrides_template.csv


Unnamed: 0,image_id,current_label,applied_label
0,1,meme,meme
1,2,meme,meme
2,3,meme,meme
3,4,meme,meme
4,5,meme,meme


In [76]:
# Load overrides (edit the CSV externally, then run this cell)
overrides_path = out_dir / 'label_overrides_template.csv'
if overrides_path.exists():
    overrides = pd.read_csv(overrides_path)
else:
    # No file on disk: fall back to the in-memory template
    overrides = template.copy()

# Re-apply the canonical id format (first digit run, zero-padded to 3 characters)
override_ids = overrides['image_id'].astype(str).str.extract(r'(\d+)', expand=False)
overrides['image_id'] = override_ids.fillna('').str.zfill(3)

# One row per (image, cluster), with the applied label attached where available
image_clusters = merged[['image_id','cluster']].drop_duplicates()
final_map = image_clusters.merge(overrides[['image_id','applied_label']], on='image_id', how='left')

final_out = out_dir / 'image_labels_applied.csv'
final_map.to_csv(final_out, index=False)
print(f'Saved final labels -> {final_out} (rows={len(final_map)})')
final_map.head()

Saved final labels -> c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\image_labels_applied.csv (rows=152)


Unnamed: 0,image_id,cluster,applied_label
0,1,2,meme
1,2,2,meme
2,3,0,meme
3,4,2,meme
4,5,2,meme


## Cluster compositions and feature profiles

The next cells visualize cluster sizes and label distributions, then profile each cluster via z-scored feature centroids and auto-generated explanations.

In [77]:
# Cluster sizes and optional label distribution
assert 'cluster_df' in globals(), 'cluster_df missing; run clustering cell first.'

# If merged (with labels) exists, use it; else fall back to cluster_df
df_for_labels = globals().get('merged', None)
if df_for_labels is None:
    df_for_labels = cluster_df.copy()

# 'applied_label' is the preferred label column below, but it is stored in
# `final_map` (override cell), not in `merged`. Attach it when available so
# manual overrides are actually reflected in the plot.
applied_map = globals().get('final_map', None)
if (
    applied_map is not None
    and {'image_id', 'applied_label'} <= set(applied_map.columns)
    and 'applied_label' not in df_for_labels.columns
):
    applied_unique = applied_map[['image_id', 'applied_label']].drop_duplicates(subset='image_id')
    df_for_labels = df_for_labels.merge(applied_unique, on='image_id', how='left')

# Cluster size counts
counts = cluster_df['cluster'].value_counts().sort_index()
plt.figure(figsize=(6,4))
sns.barplot(x=counts.index.astype(str), y=counts.values, color='steelblue')
plt.title('Cluster sizes (image count)')
plt.xlabel('cluster')
plt.ylabel('images')
plt.tight_layout()
bar_out = out_dir / 'cluster_sizes.png'
plt.savefig(bar_out, dpi=150)
plt.close()
print(f'Saved {bar_out}')

# Label distribution per cluster: first candidate column that exists AND has
# at least one non-missing value (an all-NaN column yields an empty crosstab,
# which cannot be plotted).
label_col_guess = None
for cand in ['applied_label', 'labels_txt_normalized', 'labels_txt', 'primary_label', 'category']:
    if cand in df_for_labels.columns and df_for_labels[cand].notna().any():
        label_col_guess = cand
        break

if label_col_guess:
    ct = pd.crosstab(df_for_labels['cluster'], df_for_labels[label_col_guess])
    # Row-normalise so each cluster's bar sums to 1 (share of labels within the cluster)
    shares = ct.div(ct.sum(axis=1), axis=0)
    plt.figure(figsize=(10, max(4, 0.35*ct.shape[1])))
    shares.plot(kind='bar', stacked=True, ax=plt.gca(), colormap='tab20')
    plt.title(f'Label distribution per cluster ({label_col_guess})')
    plt.xlabel('cluster')
    plt.ylabel('share')
    plt.legend(title=label_col_guess, bbox_to_anchor=(1.02, 1), loc='upper left')
    plt.tight_layout()
    stacked_out = out_dir / 'cluster_label_distribution.png'
    plt.savefig(stacked_out, dpi=150)
    plt.close()
    print(f'Saved {stacked_out}')
else:
    print('No label column found; skipped label distribution plot.')

Saved c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\cluster_sizes.png
Saved c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\cluster_label_distribution.png
Saved c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\cluster_label_distribution.png


In [78]:
# Feature profiles and auto-explanations
# Compute cluster centroids in scaled space and z-score by feature
assert 'km' in globals() and 'feature_cols' in globals(), 'Run clustering first.'

centroids_scaled = km.cluster_centers_
centroids_scaled_df = pd.DataFrame(centroids_scaled, columns=feature_cols)
# z-score across clusters (per feature) for comparability; the small epsilon
# avoids division by zero when a feature has identical centroids everywhere
centroids_z = (centroids_scaled_df - centroids_scaled_df.mean(axis=0)) / (centroids_scaled_df.std(axis=0) + 1e-9)
centroids_z['cluster'] = np.arange(centroids_scaled_df.shape[0])
centroids_z = centroids_z.set_index('cluster').sort_index()

plt.figure(figsize=(max(8, 0.5*len(feature_cols)), 1.2*len(centroids_z)))
sns.heatmap(centroids_z, cmap='coolwarm', center=0, cbar_kws={'label':'z-score (vs other clusters)'})
plt.title('Cluster feature profiles (z-scored centroids)')
plt.xlabel('feature')
plt.ylabel('cluster')
plt.tight_layout()
heatmap_out = out_dir / 'cluster_feature_profiles_heatmap.png'
plt.savefig(heatmap_out, dpi=150)
plt.close()
print(f'Saved {heatmap_out}')

# Auto-generate short per-cluster explanations: the 3 highest and 3 lowest z-scores
explanations = []
for c in range(centroids_z.shape[0]):
    row = centroids_z.loc[c]
    top_pos = row.sort_values(ascending=False).head(3)
    top_neg = row.sort_values(ascending=True).head(3)
    # '+' in the format spec prints the real sign. A hard-coded "+" prefix would
    # render "+-0.20" whenever one of the top-3 values is actually negative.
    pos_feats = ", ".join(f"{name} ({z:+.2f})" for name, z in top_pos.items())
    neg_feats = ", ".join(f"{name} ({z:+.2f})" for name, z in top_neg.items())
    explanations.append({
        'cluster': c,
        'top_positive_features': pos_feats,
        'top_negative_features': neg_feats,
    })

exp_df = pd.DataFrame(explanations)
exp_out = out_dir / 'cluster_explanations.csv'
exp_df.to_csv(exp_out, index=False)
print(f'Saved {exp_out}')
exp_df

Saved c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\cluster_feature_profiles_heatmap.png
Saved c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\cluster_explanations.csv


Unnamed: 0,cluster,top_positive_features,top_negative_features
0,0,"pupil_mm_std (+1.38), pupil_mm_mean (+1.32), f...","bcea95_mean_px2 (-1.16), bcea68_mean_px2 (-1.1..."
1,1,"view_time_total_sum_ms (+1.41), fixations_last...","pupil_norm_abs_mean (-1.32), fixation_duration..."
2,2,"pupil_norm_abs_mean (+0.53), fixation_duration...","pupil_size_norm_n (-0.58), avg_pupil_size_n (-..."
3,3,"bcea68_mean_px2 (+0.96), bcea95_mean_px2 (+0.9...","fix_dur_mean_first_third_ms (-0.71), fixation_..."


In [80]:
# Hierarchical Label Tree Visualization
def create_label_tree_visualization():
    """Draw the label "tree" figure and write label frequency statistics.

    Reads the notebook globals ``merged`` (column ``labels_txt_normalized``)
    and ``out_dir``. Single-tag labels are drawn as a top row of nodes;
    multi-tag combinations are drawn underneath the node of their
    alphabetically first tag and connected to it by a line. Node size grows
    with how often the label occurs.

    Side effects: saves ``label_hierarchy_tree.png`` and
    ``label_hierarchy_stats.json`` into ``out_dir`` and prints progress.

    Returns
    -------
    tuple or None
        ``(label_counts, base_labels)`` where ``label_counts`` is the
        ``value_counts`` Series of normalized labels and ``base_labels`` the
        set of individual tags; ``None`` when ``merged`` or its
        ``labels_txt_normalized`` column is missing.
    """

    # Get label data
    if 'merged' not in globals() or 'labels_txt_normalized' not in merged.columns:
        print("No normalized labels found. Run label processing first.")
        return None

    # Count label frequencies
    label_counts = merged['labels_txt_normalized'].value_counts()
    print(f"Found {len(label_counts)} unique label combinations")

    # Drawing constants shared by every section below. They are defined up front
    # because the child-node loop and the legend use them even when the
    # base-label loop never runs (e.g. no labels at all).
    default_color = '#1f77b4'
    colors = {'meme': '#ff7f0e', 'person': '#2ca02c', 'ort': '#d62728',
              'text': '#9467bd', 'politik': '#8c564b'}
    max_count = label_counts.max() if len(label_counts) else 1

    # Parse labels into hierarchical structure
    label_hierarchy = defaultdict(list)
    base_labels = set()

    for label, count in label_counts.items():
        if pd.isna(label):
            continue

        # Split into individual components
        components = sorted([c.strip() for c in str(label).split() if c.strip()])

        if len(components) == 1:
            # Single label - this is a base/root node
            base_labels.add(components[0])
            label_hierarchy['root'].append((components[0], count))
        else:
            # Every tag of a combination also becomes a base node
            for base in components:
                base_labels.add(base)

            # Combinations are filed under their primary component (first alphabetically)
            primary = components[0]
            label_hierarchy[primary].append((' '.join(components), count))

    # Create the tree visualization
    fig, ax = plt.subplots(1, 1, figsize=(16, 12))

    # Calculate positions
    base_labels_sorted = sorted(base_labels)
    n_base = len(base_labels_sorted)

    # Position base labels horizontally, evenly spread over x in [0.1, 0.9]
    base_positions = {}
    base_y = 0.8
    spacing = 0.8 / max(1, n_base - 1) if n_base > 1 else 0

    for i, base_label in enumerate(base_labels_sorted):
        x_pos = 0.1 + i * spacing
        base_positions[base_label] = (x_pos, base_y)

    # Frequency of each tag when it occurs alone (tags seen only in combinations get 0 later)
    base_label_counts = {}
    for label, count in label_counts.items():
        if pd.isna(label):
            continue
        components = [c.strip() for c in str(label).split() if c.strip()]
        if len(components) == 1:
            base_label_counts[components[0]] = count

    # Draw nodes and connections
    drawn_positions = {}

    # Draw base labels (top level)
    for base_label in base_labels_sorted:
        x, y = base_positions[base_label]
        count = base_label_counts.get(base_label, 0)

        # Node size proportional to frequency
        node_size = 20 + (count / max_count) * 80

        # Color by base label type
        color = colors.get(base_label, default_color)

        # Draw node
        circle = plt.Circle((x, y), node_size/1000, color=color, alpha=0.7, zorder=2)
        ax.add_patch(circle)

        # Label
        ax.text(x, y + 0.05, f'{base_label}\n({count})', ha='center', va='bottom',
               fontsize=10, fontweight='bold', zorder=3)

        drawn_positions[base_label] = (x, y)

    # Draw combination labels (child nodes)
    child_y_offset = 0.3
    for base_label in base_labels_sorted:
        if base_label not in label_hierarchy:
            continue

        combinations = label_hierarchy[base_label]
        if not combinations:
            continue

        base_x, base_y = base_positions[base_label]

        # Position child nodes: centred under the parent, spread over a width of 0.2
        n_children = len(combinations)
        if n_children == 1:
            child_positions = [(base_x, base_y - child_y_offset)]
        else:
            child_spacing = 0.2 / max(1, n_children - 1)
            start_x = base_x - 0.1
            child_positions = [(start_x + i * child_spacing, base_y - child_y_offset)
                             for i in range(n_children)]

        for (combo_label, combo_count), (child_x, child_y) in zip(combinations, child_positions):
            # Skip single labels (already drawn as base)
            if len(combo_label.split()) <= 1:
                continue

            # Node size proportional to frequency
            node_size = 15 + (combo_count / max_count) * 60

            # A combination takes the colour of its first component (drawn more transparent)
            color = colors.get(combo_label.split()[0], default_color)

            # Draw child node
            circle = plt.Circle((child_x, child_y), node_size/1000, color=color, alpha=0.5, zorder=2)
            ax.add_patch(circle)

            # Draw connection line
            ax.plot([base_x, child_x], [base_y - 0.02, child_y + 0.02],
                   'k-', alpha=0.3, linewidth=1, zorder=1)

            # Label
            ax.text(child_x, child_y - 0.08, f'{combo_label}\n({combo_count})',
                   ha='center', va='top', fontsize=8, zorder=3)

    # Additional combinations not tied to single base
    remaining_combos = []
    for label, count in label_counts.items():
        if pd.isna(label):
            continue
        components = [c.strip() for c in str(label).split() if c.strip()]
        if len(components) > 1:
            # Only combinations whose raw text differs from the sorted form filed above land here
            primary = components[0]
            if (label, count) not in label_hierarchy.get(primary, []):
                remaining_combos.append((label, count))

    # Draw remaining complex combinations at the bottom
    if remaining_combos:
        bottom_y = 0.1
        n_remaining = len(remaining_combos)
        remaining_spacing = 0.8 / max(1, n_remaining - 1) if n_remaining > 1 else 0

        for i, (combo_label, combo_count) in enumerate(remaining_combos):
            x_pos = 0.1 + i * remaining_spacing
            node_size = 15 + (combo_count / max_count) * 60

            # Use neutral color for complex combinations
            circle = plt.Circle((x_pos, bottom_y), node_size/1000, color='gray', alpha=0.6, zorder=2)
            ax.add_patch(circle)

            ax.text(x_pos, bottom_y - 0.05, f'{combo_label}\n({combo_count})',
                   ha='center', va='top', fontsize=7, zorder=3)

    # Formatting
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title('Hierarchical Label Structure\n(Node size = frequency, Lines = relationships)',
                fontsize=14, fontweight='bold', pad=20)

    # Legend: one entry per known tag colour that actually occurs in the data
    legend_elements = []
    for label, color in colors.items():
        if label in base_labels:
            legend_elements.append(plt.Line2D([0], [0], marker='o', color='w',
                                            markerfacecolor=color, markersize=10, label=label))

    if legend_elements:
        ax.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(0.98, 0.98))

    plt.tight_layout()
    tree_out = out_dir / 'label_hierarchy_tree.png'
    plt.savefig(tree_out, dpi=150, bbox_inches='tight')
    plt.close()
    print(f'Saved label tree visualization -> {tree_out}')

    # Create summary statistics
    stats = {
        'total_images': len(merged),
        'unique_combinations': len(label_counts),
        'base_labels': sorted(base_labels),
        'most_common': label_counts.head(10).to_dict()
    }

    stats_out = out_dir / 'label_hierarchy_stats.json'
    with open(stats_out, 'w') as f:
        json.dump(stats, f, indent=2)
    print(f'Saved label statistics -> {stats_out}')

    return label_counts, base_labels

# Create the visualization
if 'merged' in globals():
    # The function returns None when no normalized labels are available;
    # unpacking that directly would raise a TypeError.
    tree_result = create_label_tree_visualization()
    if tree_result is not None:
        label_counts, base_labels = tree_result
        print(f"\nLabel hierarchy summary:")
        print(f"Base labels: {', '.join(sorted(base_labels))}")
        print(f"Top 5 combinations: {list(label_counts.head().index)}")
else:
    print("Run label loading first to create tree visualization.")

Found 12 unique label combinations
Saved label tree visualization -> c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\label_hierarchy_tree.png
Saved label statistics -> c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\outputs\label_hierarchy_stats.json

Label hierarchy summary:
Base labels: meme, ort, person, politik, text
Top 5 combinations: ['person', 'ort', 'meme text', 'person politik', 'person politik text']
