# Image clustering and label mapping (per_image_descriptive_summary)

This notebook clusters images using features from `per_image_descriptive_summary.csv` and maps/apply labels via `labels_per_id.csv`.

It will:
- Load and prepare per-image features.
- Standardize and cluster images (K-Means by default).
- Visualize clusters in 2D (PCA).
- Join labels by `image_id` from `labels_per_id.csv`.
- Provide a simple override workflow to apply corrected labels and save them for later comparison.

In [5]:
# Setup and paths
import os, re, json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, normalize as sk_normalize
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

sns.set_context('talk')
sns.set_style('whitegrid')

def find_project_root(start: Path):
    for cand in [start, *start.parents]:
        if (cand / 'labels_per_id.csv').exists() or (cand / 'data_analysis').exists():
            return cand
    return start

nb_dir = Path.cwd()
project_root = find_project_root(nb_dir)
summary_csv = project_root / 'data_analysis' / 'descriptive_analysis' / 'per_image_descriptive_summary.csv'
labels_csv = project_root / 'labels_per_id.csv'
out_dir = project_root / 'data_analysis' / 'label_analysis'
out_dir.mkdir(parents=True, exist_ok=True)

print(f'Project root: {project_root}')
print(f'Summary CSV: {summary_csv} (exists={summary_csv.exists()})')
print(f'Labels CSV: {labels_csv} (exists={labels_csv.exists()})')
print(f'Outputs: {out_dir}')

Project root: c:\Users\SWixforth\Uni\eye-tracking-ai
Summary CSV: c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\descriptive_analysis\per_image_descriptive_summary.csv (exists=True)
Labels CSV: c:\Users\SWixforth\Uni\eye-tracking-ai\labels_per_id.csv (exists=True)
Outputs: c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis


In [6]:
# Load data
if not summary_csv.exists():
    raise FileNotFoundError(f'Missing {summary_csv}')
df = pd.read_csv(summary_csv)
# Ensure image_id as 3-digit string
if 'image_id' in df.columns:
    df['image_id'] = df['image_id'].astype(str).str.extract(r'(\d+)').fillna('').iloc[:,0].str.zfill(3)
else:
    raise KeyError('image_id column not found in per_image_descriptive_summary.csv')
print(df.shape)
df.head()

(152, 15)


Unnamed: 0,image_id,number_of_fixations,view_time_total_sum,fixation_duration_mean_weighted,fixation_duration_median_approx,scanpath_length_mean,BCEA_68_mean,BCEA_95_mean,primary_label_top,pupil_size_norm_mean,pupil_size_norm_std,fix_dur_mean_first_third,fix_dur_mean_last_third,n_fix_first_third,n_fix_last_third
0,1,1159,425375.042,284.32902,232.363,3688.463661,72008.690066,189496.552805,meme,2.101475e-16,1.0,274.59095,311.45011,406,378
1,2,1284,452560.267,274.577679,232.927,2665.561573,60359.429235,158840.603249,meme,-2.377767e-16,1.0,270.570046,286.048076,440,434
2,3,1110,411435.771,296.86351,245.25175,3022.834081,40009.274302,105287.563953,meme,2.772367e-16,1.0,261.323237,375.535697,415,350
3,4,1354,469790.081,280.162546,216.588,3536.890776,87693.362497,230772.006571,meme,-7.328157e-17,1.0,269.514004,306.053531,477,445
4,5,1359,460515.114,265.496762,216.431,3734.527053,90696.081085,238673.897592,meme,-4.174451e-17,1.0,255.631797,281.616679,480,435


In [7]:
# Feature selection and preprocessing
# Pick numeric columns automatically, exclude obvious identifiers
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
exclude = {'image_id'}
feature_cols = [c for c in num_cols if c not in exclude]
if not feature_cols:
    raise ValueError('No numeric feature columns found for clustering.')
X = df[feature_cols].copy()
# Fill missing with column medians
X = X.fillna(X.median(numeric_only=True))
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 2D projection for visualization
pca = PCA(n_components=2, random_state=42)
Z = pca.fit_transform(X_scaled)
print(f'Features used ({len(feature_cols)}):', feature_cols[:10], '...')
pd.DataFrame({'PC1': Z[:,0], 'PC2': Z[:,1], 'image_id': df['image_id']}).head()

Features used (13): ['number_of_fixations', 'view_time_total_sum', 'fixation_duration_mean_weighted', 'fixation_duration_median_approx', 'scanpath_length_mean', 'BCEA_68_mean', 'BCEA_95_mean', 'pupil_size_norm_mean', 'pupil_size_norm_std', 'fix_dur_mean_first_third'] ...


Unnamed: 0,PC1,PC2,image_id
0,-1.431684,0.463348,1
1,-1.289031,-1.055962,2
2,-2.957876,-0.728327,3
3,-0.123038,0.228047,4
4,0.336004,0.421475,5


In [8]:
# Choose number of clusters (k) and fit KMeans
k = 5  # adjust as needed
km = KMeans(n_clusters=k, n_init=25, random_state=42)
clusters = km.fit_predict(X_scaled)
sil = silhouette_score(X_scaled, clusters) if len(np.unique(clusters)) > 1 else np.nan
print(f'KMeans: k={k}, silhouette={sil:.3f}')

# Build per-image cluster assignment (include PCA coords for convenience)
cluster_df = pd.DataFrame({
    'image_id': df['image_id'],
    'cluster': clusters,
    'PC1': Z[:, 0],
    'PC2': Z[:, 1],
})

# Save centroids in original feature space for reference
centroids_scaled = km.cluster_centers_
centroids = pd.DataFrame(scaler.inverse_transform(centroids_scaled), columns=feature_cols)
centroids.insert(0, 'cluster', np.arange(k))
centroids_out = out_dir / 'cluster_feature_centroids.csv'
centroids.to_csv(centroids_out, index=False)
print(f'Saved centroids -> {centroids_out}')

cluster_df.head()

KMeans: k=5, silhouette=0.230
Saved centroids -> c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\cluster_feature_centroids.csv


Unnamed: 0,image_id,cluster,PC1,PC2
0,1,1,-1.431684,0.463348
1,2,3,-1.289031,-1.055962
2,3,3,-2.957876,-0.728327
3,4,1,-0.123038,0.228047
4,5,1,0.336004,0.421475


In [9]:
# PCA scatter colored by cluster
plt.figure(figsize=(7,6))
sns.scatterplot(x=Z[:,0], y=Z[:,1], hue=clusters, palette='tab10', s=50, edgecolor='none')
for i, img in enumerate(df['image_id']):
    plt.text(Z[i,0], Z[i,1], img, fontsize=8, alpha=0.7)
plt.title('Images clustered (PCA 2D)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend(title='cluster', bbox_to_anchor=(1.02, 1), loc='upper left')
plt.tight_layout()
pca_png = out_dir / 'image_clusters_pca.png'
plt.savefig(pca_png, dpi=150)
plt.close()
print(f'Saved {pca_png}')

Saved c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\image_clusters_pca.png


In [10]:
# Join labels_per_id.csv by image_id
labels_df = None
if labels_csv.exists():
    labels_df = pd.read_csv(labels_csv)
    if 'image_id' in labels_df.columns:
        labels_df['image_id'] = labels_df['image_id'].astype(str).str.extract(r'(\d+)').fillna('').iloc[:,0].str.zfill(3)
    else:
        for c in ['id','image','img_id']:
            if c in labels_df.columns:
                labels_df['image_id'] = labels_df[c].astype(str).str.extract(r'(\d+)').fillna('').iloc[:,0].str.zfill(3)
                break
    print(f'Labels loaded: {labels_df.shape if labels_df is not None else None}')
else:
    print('labels_per_id.csv not found; continuing without label join')

merged = cluster_df.copy()
if labels_df is not None:
    merged = merged.merge(labels_df, on='image_id', how='left')
merged_out = out_dir / 'image_clusters_with_labels.csv'
merged.to_csv(merged_out, index=False)
print(f'Saved {merged_out} (rows={len(merged)})')
merged.head()

Labels loaded: (154, 14)
Saved c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\image_clusters_with_labels.csv (rows=152)


Unnamed: 0,image_id,cluster,PC1,PC2,labels_txt,strong_tags,weak_tags,meme,person,politik,ort,text,meme_weight,person_weight,politik_weight,ort_weight,text_weight
0,1,1,-1.431684,0.463348,meme,meme,,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0
1,2,3,-1.289031,-1.055962,meme,meme,,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0
2,3,3,-2.957876,-0.728327,meme,meme,,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0
3,4,1,-0.123038,0.228047,meme,meme,,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0
4,5,1,0.336004,0.421475,meme,meme,,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0


In [11]:
# Cluster vs label crosstab (if a label column is present)
label_col_guess = None
if 'labels_txt' in merged.columns:
    label_col_guess = 'labels_txt'
elif 'primary_label' in merged.columns:
    label_col_guess = 'primary_label'
elif 'category' in merged.columns:
    label_col_guess = 'category'
if label_col_guess:
    ct = pd.crosstab(merged['cluster'], merged[label_col_guess])
    ct_out = out_dir / 'cluster_label_crosstab.csv'
    ct.to_csv(ct_out)
    print(f'Saved {ct_out}')
    ct
else:
    print('No obvious label column found to crosstab.')

Saved c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\cluster_label_crosstab.csv


### Apply/override labels
Create a CSV template you can edit (column `applied_label`). Reload it to apply overrides and save a final mapping for downstream use.

In [12]:
# Create overrides template (one row per image) if not exists
template_path = out_dir / 'label_overrides_template.csv'
template = merged[['image_id']].drop_duplicates().copy()
# carry over existing label guess if available
if 'labels_txt' in merged.columns:
    template['current_label'] = merged.groupby('image_id')['labels_txt'].first().reindex(template['image_id']).values
elif 'primary_label' in merged.columns:
    template['current_label'] = merged.groupby('image_id')['primary_label'].first().reindex(template['image_id']).values
else:
    template['current_label'] = ''
template['applied_label'] = template['current_label']
if not template_path.exists():
    template.to_csv(template_path, index=False)
    print(f'Wrote override template -> {template_path}')
else:
    print(f'Override template already exists: {template_path}')
template.head()

Wrote override template -> c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\label_overrides_template.csv


Unnamed: 0,image_id,current_label,applied_label
0,1,meme,meme
1,2,meme,meme
2,3,meme,meme
3,4,meme,meme
4,5,meme,meme


In [13]:
# Load overrides (edit the CSV externally, then run this cell)
overrides = pd.read_csv(out_dir / 'label_overrides_template.csv') if (out_dir / 'label_overrides_template.csv').exists() else template.copy()
overrides['image_id'] = overrides['image_id'].astype(str).str.extract(r'(\d+)').fillna('').iloc[:,0].str.zfill(3)
final_map = merged[['image_id','cluster']].drop_duplicates().merge(overrides[['image_id','applied_label']], on='image_id', how='left')
final_out = out_dir / 'image_labels_applied.csv'
final_map.to_csv(final_out, index=False)
print(f'Saved final labels -> {final_out} (rows={len(final_map)})')
final_map.head()

Saved final labels -> c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\image_labels_applied.csv (rows=152)


Unnamed: 0,image_id,cluster,applied_label
0,1,1,meme
1,2,3,meme
2,3,3,meme
3,4,1,meme
4,5,1,meme


## Cluster compositions and feature profiles

The next cells visualize cluster sizes and label distributions, then profile each cluster via z-scored feature centroids and auto-generated explanations.

In [14]:
# Cluster sizes and optional label distribution
assert 'cluster_df' in globals(), 'cluster_df missing; run clustering cell first.'

# If merged (with labels) exists, use it; else fall back to cluster_df
df_for_labels = globals().get('merged', None)
if df_for_labels is None:
    df_for_labels = cluster_df.copy()

# Cluster size counts
counts = cluster_df['cluster'].value_counts().sort_index()
plt.figure(figsize=(6,4))
sns.barplot(x=counts.index.astype(str), y=counts.values, color='steelblue')
plt.title('Cluster sizes (image count)')
plt.xlabel('cluster')
plt.ylabel('images')
plt.tight_layout()
bar_out = out_dir / 'cluster_sizes.png'
plt.savefig(bar_out, dpi=150)
plt.close()
print(f'Saved {bar_out}')

# Label distribution per cluster (if a label column exists)
label_col_guess = None
for cand in ['applied_label', 'labels_txt', 'primary_label', 'category']:
    if cand in df_for_labels.columns:
        label_col_guess = cand
        break

if label_col_guess:
    ct = pd.crosstab(df_for_labels['cluster'], df_for_labels[label_col_guess])
    plt.figure(figsize=(10, max(4, 0.35*ct.shape[1])))
    (ct.T / ct.sum(axis=1)).T.plot(kind='bar', stacked=True, ax=plt.gca(), colormap='tab20')
    plt.title(f'Label distribution per cluster ({label_col_guess})')
    plt.xlabel('cluster')
    plt.ylabel('share')
    plt.legend(title=label_col_guess, bbox_to_anchor=(1.02, 1), loc='upper left')
    plt.tight_layout()
    stacked_out = out_dir / 'cluster_label_distribution.png'
    plt.savefig(stacked_out, dpi=150)
    plt.close()
    print(f'Saved {stacked_out}')
else:
    print('No label column found; skipped label distribution plot.')

Saved c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\cluster_sizes.png
Saved c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\cluster_label_distribution.png


In [15]:
# Feature profiles and auto-explanations
# Compute cluster centroids in scaled space and z-score by feature
assert 'km' in globals() and 'feature_cols' in globals(), 'Run clustering first.'

centroids_scaled = km.cluster_centers_
centroids_scaled_df = pd.DataFrame(centroids_scaled, columns=feature_cols)
# z-score across clusters (per feature) for comparability
centroids_z = (centroids_scaled_df - centroids_scaled_df.mean(axis=0)) / (centroids_scaled_df.std(axis=0) + 1e-9)
centroids_z['cluster'] = np.arange(centroids_scaled_df.shape[0])
centroids_z = centroids_z.set_index('cluster').sort_index()

plt.figure(figsize=(max(8, 0.5*len(feature_cols)), 1.2*len(centroids_z)))
sns.heatmap(centroids_z, cmap='coolwarm', center=0, cbar_kws={'label':'z-score (vs other clusters)'})
plt.title('Cluster feature profiles (z-scored centroids)')
plt.xlabel('feature')
plt.ylabel('cluster')
plt.tight_layout()
heatmap_out = out_dir / 'cluster_feature_profiles_heatmap.png'
plt.savefig(heatmap_out, dpi=150)
plt.close()
print(f'Saved {heatmap_out}')

# Auto-generate short per-cluster explanations based on top +/- features
explanations = []
for c in range(centroids_z.shape[0]):
    row = centroids_z.loc[c]
    top_pos = row.sort_values(ascending=False).head(3)
    top_neg = row.sort_values(ascending=True).head(3)
    pos_feats = ", ".join([f"{k} (+{v:.2f})" for k, v in top_pos.items()])
    neg_feats = ", ".join([f"{k} ({v:.2f})" for k, v in top_neg.items()])
    explanations.append({
        'cluster': c,
        'top_positive_features': pos_feats,
        'top_negative_features': neg_feats,
    })

exp_df = pd.DataFrame(explanations)
exp_out = out_dir / 'cluster_explanations.csv'
exp_df.to_csv(exp_out, index=False)
print(f'Saved {exp_out}')
exp_df

Saved c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\cluster_feature_profiles_heatmap.png
Saved c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\label_analysis\cluster_explanations.csv


Unnamed: 0,cluster,top_positive_features,top_negative_features
0,0,"BCEA_68_mean (+1.19), BCEA_95_mean (+1.19), pu...","fixation_duration_median_approx (-0.72), fix_d..."
1,1,"fixation_duration_median_approx (+0.76), fix_d...","n_fix_first_third (-0.66), number_of_fixations..."
2,2,"view_time_total_sum (+1.70), n_fix_last_third ...","fix_dur_mean_last_third (-1.23), fixation_dura..."
3,3,"fix_dur_mean_last_third (+1.44), fix_dur_mean_...","n_fix_first_third (-1.02), scanpath_length_mea..."
4,4,"pupil_size_norm_std (+0.00), fix_dur_mean_last...","pupil_size_norm_mean (-1.37), BCEA_68_mean (-1..."
