# Cluster normalized heatmaps (image_norms) with label lookup

This notebook clusters per-image normalized gaze heatmaps stored in `visualization/visual_exploration/image_norms/` and joins labels from `labels_per_id.csv`.

It will:
- Load each image's normalized PDF (`.npz` files named `norm_pdf_image_{id}.npz`).
- Resize to a common grid, flatten, and L2-normalize feature vectors.
- Run K-Means to get clusters.
- Join labels by `image_id` from `labels_per_id.csv`.
- Save: clusters CSV, PCA scatter plot, and cluster prototype heatmaps.

In [1]:
# Setup and paths
import os, re, json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize as sk_normalize
from PIL import Image

sns.set_context('talk')
sns.set_style('whitegrid')

def find_project_root(start: Path):
    for cand in [start, *start.parents]:
        if (cand / 'labels_per_id.csv').exists() or (cand / 'visualization').exists() or (cand / 'fixations').exists():
            return cand
    return start

nb_dir = Path.cwd()
project_root = find_project_root(nb_dir)
viz_dir = project_root / 'visualization' / 'visual_exploration'
norms_dir = viz_dir / 'image_norms'
out_dir = viz_dir

print(f'Notebook CWD: {nb_dir}')
print(f'Project root: {project_root}')
print(f'Norms dir: {norms_dir} (exists={norms_dir.exists()})')
print(f'Outputs: {out_dir}')

Notebook CWD: c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\visual_exploration
Project root: c:\Users\SWixforth\Uni\eye-tracking-ai
Norms dir: c:\Users\SWixforth\Uni\eye-tracking-ai\visualization\visual_exploration\image_norms (exists=True)
Outputs: c:\Users\SWixforth\Uni\eye-tracking-ai\visualization\visual_exploration


In [2]:
# Helpers: resize with PIL and load norm vectors
def resize_array_bilinear(H: np.ndarray, target_size=(64,64)) -> np.ndarray:
    H = np.asarray(H, dtype=np.float32)
    maxv = float(H.max()) if np.isfinite(H).all() and H.size > 0 else 0.0
    scaled = (H / maxv * 255.0).clip(0,255).astype(np.uint8) if maxv > 0 else (H * 0).astype(np.uint8)
    img = Image.fromarray(scaled, mode='L')
    img_r = img.resize((int(target_size[1]), int(target_size[0])), resample=Image.BILINEAR)
    arr = np.asarray(img_r).astype(np.float32) / 255.0
    return arr

def load_norm_vectors(norm_dir: Path, target_size=(64,64)):
    entries = []
    if not norm_dir.exists():
        print(f'ERROR: norms_dir not found: {norm_dir}')
        return entries
    files = sorted(norm_dir.glob('norm_pdf_image_*.npz'))
    print(f'Found {len(files)} NPZ PDFs in {norm_dir}')
    for fp in files:
        m = re.search(r'norm_pdf_image_(\d+)\.npz$', fp.name)
        if not m:
            continue
        image_id = m.group(1)
        try:
            data = np.load(fp)
            H = data['H']
        except Exception as e:
            print(f'Skip {fp.name}: {e}')
            continue
        if H.ndim != 2:
            print(f'Skip {fp.name}: H is not 2D')
            continue
        H_resized = resize_array_bilinear(H, target_size=target_size).astype(np.float32)
        v = H_resized.flatten()
        v = sk_normalize(v.reshape(1, -1), norm='l2').ravel()
        entries.append({'image_id': image_id, 'vec': v, 'H': H_resized})
    return entries

In [3]:
# Clustering pipeline with label join and outputs
def cluster_norms(norm_dir: Path, k=5, target_size=(64,64)):
    entries = load_norm_vectors(norm_dir, target_size=target_size)
    if not entries:
        print('No norms loaded; aborting clustering.')
        return None
    X = np.stack([e['vec'] for e in entries], axis=0)
    ids = [e['image_id'] for e in entries]
    km = KMeans(n_clusters=k, n_init=25, random_state=42)
    cluster_ids = km.fit_predict(X)
    df = pd.DataFrame({'image_id': ids, 'cluster': cluster_ids})
    # Join labels
    labels_csv = project_root / 'labels_per_id.csv'
    if labels_csv.exists():
        lab = pd.read_csv(labels_csv)
        if 'image_id' in lab.columns:
            lab['image_id'] = lab['image_id'].astype(str).str.extract(r'(\d+)').fillna('').iloc[:,0].str.zfill(3)
        else:
            for c in ['id','image','img_id']:
                if c in lab.columns:
                    lab['image_id'] = lab[c].astype(str).str.extract(r'(\d+)').fillna('').iloc[:,0].str.zfill(3)
                    break
        df = df.merge(lab, on='image_id', how='left')
    # Save CSV
    out_csv = out_dir / 'image_norms_clusters.csv'
    df.to_csv(out_csv, index=False)
    print(f'Saved {out_csv} (k={k}, n={len(df)})')
    # PCA scatter
    pca = PCA(n_components=2, random_state=42)
    Z = pca.fit_transform(X)
    plt.figure(figsize=(7,6))
    sc = plt.scatter(Z[:,0], Z[:,1], c=cluster_ids, cmap='tab10', alpha=0.85, edgecolors='none')
    for i, img in enumerate(ids):
        plt.text(Z[i,0], Z[i,1], img, fontsize=8, alpha=0.75)
    plt.title('PCA of image norms (colored by cluster)')
    plt.tight_layout()
    pca_png = out_dir / 'image_norms_pca_clusters.png'
    plt.savefig(pca_png, dpi=150)
    plt.close()
    print(f'Saved {pca_png}')
    # Cluster prototypes
    proto_dir = out_dir / 'image_norms_cluster_prototypes'
    proto_dir.mkdir(parents=True, exist_ok=True)
    for c_id in sorted(np.unique(cluster_ids)):
        idx = np.where(cluster_ids == c_id)[0]
        if len(idx) == 0:
            continue
        avgH = np.mean([entries[i]['H'] for i in idx], axis=0)
        plt.figure(figsize=(5,4))
        plt.imshow(avgH, cmap='viridis', origin='upper')
        plt.colorbar(fraction=0.046, pad=0.04)
        plt.title(f'Cluster {c_id} (n={len(idx)}) â€” prototype')
        plt.tight_layout()
        fp = proto_dir / f'cluster_{c_id}_prototype.png'
        plt.savefig(fp, dpi=150)
        plt.close()
    return df

In [4]:
# Run
k = 5
target_size = (64,64)
results = cluster_norms(norms_dir, k=k, target_size=target_size)
results.head() if isinstance(results, pd.DataFrame) else results

Found 152 NPZ PDFs in c:\Users\SWixforth\Uni\eye-tracking-ai\visualization\visual_exploration\image_norms


  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mode='L')
  img = Image.fromarray(scaled, mo

Saved c:\Users\SWixforth\Uni\eye-tracking-ai\visualization\visual_exploration\image_norms_clusters.csv (k=5, n=152)
Saved c:\Users\SWixforth\Uni\eye-tracking-ai\visualization\visual_exploration\image_norms_pca_clusters.png


Unnamed: 0,image_id,cluster,labels_txt,strong_tags,weak_tags,meme,person,politik,ort,text,meme_weight,person_weight,politik_weight,ort_weight,text_weight
0,1,4,meme,meme,,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0
1,2,4,meme,meme,,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0
2,3,0,meme,meme,,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0
3,4,4,meme,meme,,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0
4,5,4,meme,meme,,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0
