# Bird‑phrase clustering – result exploration
Interactive notebook that reproduces the quick‑look analyses & plots we discussed.
Just run the cells top‑to‑bottom (or use **`Run All`**) — feel free to tweak paths or add new views.

In [1]:

import hashlib, textwrap
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# prettier, wider tables in notebooks
pd.set_option('display.max_columns', None)
plt.rcParams['figure.dpi'] = 110  # crisper on HiDPI
plt.rcParams['savefig.bbox'] = 'tight'

CORE_COLS = [
    'n_components', 'n_neighbors', 'min_dist', 'metric',
    'min_cluster_size', 'min_samples', 'smoothing_window'
]  # random_state is fixed (42)

def make_config_key(row, cols=CORE_COLS):
    """Short reproducible hash for a config row."""
    tup = tuple(row[c] for c in cols)
    return hashlib.md5(str(tup).encode()).hexdigest()[:10]

def deduplicate_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Remove '.1', '.2' duplicates (keep the first occurrence)."""
    _, first_pos = np.unique(df.columns, return_index=True)
    return df.iloc[:, np.sort(first_pos)]


In [2]:

# ⇩⇩  EDIT: point to your CSV  ⇩⇩
csv_path = Path('/home/george-vengrovski/Documents/projects/tweety_bert_paper/adaptive_search_V4_1M/adaptive_search_ALL_RESULTS_1000k.csv')

df_raw = pd.read_csv(csv_path)
df = deduplicate_columns(df_raw.copy())
df['config_key'] = df.apply(make_config_key, axis=1)

metric = 'total_fer'
df = df[df[metric].notna()].sort_values(metric)

print(f'Loaded {len(df):,} rows from {csv_path.name}')
df.head()


Loaded 2,916 rows from adaptive_search_ALL_RESULTS_1000k.csv


Unnamed: 0,fold_path_str,n_components,n_neighbors,min_dist,metric,random_state,min_cluster_size,min_samples,smoothing_window,total_fer,v_measure,total_fer.1,matched_fer,macro_fer,n_gt_types,pct_types_mapped,pct_frames_mapped,time_umap,time_hdbscan,time_eval_block_all_smoothing,oom_flag_umap,oom_flag_hdbscan,error_message,n_pred_clusters,config_key
1517,files/llb3_fold1.npz,32,50,0.1,cosine,42,5000,50,200,5.3124,0.887564,5.3124,4.862304,26.285686,20,85.0,99.5269,53.73863,48.315413,36.791339,False,False,,17,7672674bb4
1529,files/llb3_fold1.npz,32,50,0.1,cosine,42,5000,5,200,5.3679,0.885237,5.3679,4.918067,26.257538,20,85.0,99.5269,53.73863,51.583015,36.483638,False,False,,17,07f8462b62
1787,files/llb3_fold1.npz,8,50,0.1,cosine,42,5000,50,200,5.3928,0.88602,5.3928,4.943086,26.381817,20,85.0,99.5269,47.083767,47.963671,36.910322,False,False,,17,7a478823ca
1862,files/llb3_fold1.npz,8,100,0.1,euclidean,42,5000,50,200,5.4862,0.883524,5.4862,5.03693,26.575702,20,85.0,99.5269,34.895625,47.097017,36.952816,False,False,,17,6fab543c7d
1516,files/llb3_fold1.npz,32,50,0.1,cosine,42,5000,50,100,5.5137,0.889526,5.5137,5.06456,26.632071,20,85.0,99.5269,53.73863,48.315413,36.791339,False,False,,17,3ba96d3bb2


In [3]:
top_rows = 15
display_cols = ['fold_path_str', *CORE_COLS, metric,
                'v_measure', 'macro_fer', 'n_pred_clusters']
print('Top rows (lowest FER):')
display(df[display_cols].head(top_rows))


Top rows (lowest FER):


Unnamed: 0,fold_path_str,n_components,n_neighbors,min_dist,metric,min_cluster_size,min_samples,smoothing_window,total_fer,v_measure,macro_fer,n_pred_clusters
1517,files/llb3_fold1.npz,32,50,0.1,cosine,5000,50,200,5.3124,0.887564,26.285686,17
1529,files/llb3_fold1.npz,32,50,0.1,cosine,5000,5,200,5.3679,0.885237,26.257538,17
1787,files/llb3_fold1.npz,8,50,0.1,cosine,5000,50,200,5.3928,0.88602,26.381817,17
1862,files/llb3_fold1.npz,8,100,0.1,euclidean,5000,50,200,5.4862,0.883524,26.575702,17
1516,files/llb3_fold1.npz,32,50,0.1,cosine,5000,50,100,5.5137,0.889526,26.632071,17
1528,files/llb3_fold1.npz,32,50,0.1,cosine,5000,5,100,5.54,0.887032,26.522512,17
1790,files/llb3_fold1.npz,8,50,0.1,cosine,5000,5,200,5.5796,0.879946,26.605042,17
1786,files/llb3_fold1.npz,8,50,0.1,cosine,5000,50,100,5.5805,0.888611,26.599826,17
1841,files/llb3_fold1.npz,32,50,0.1,euclidean,5000,5,200,5.6719,0.877453,26.611307,17
1865,files/llb3_fold1.npz,8,100,0.1,euclidean,5000,5,200,5.6953,0.878598,26.491315,17


In [4]:

best_per_fold = (df.loc[df.groupby('fold_path_str')[metric].idxmin()]
                   .sort_values(metric)
                   .reset_index(drop=True))
print('Best config per fold:')
display(best_per_fold[['fold_path_str', metric, *CORE_COLS]])


Best config per fold:


Unnamed: 0,fold_path_str,total_fer,n_components,n_neighbors,min_dist,metric,min_cluster_size,min_samples,smoothing_window
0,files/llb3_fold1.npz,5.3124,32,50,0.1,cosine,5000,50,200
1,files/llb3_fold2.npz,6.0859,32,100,0.1,euclidean,2500,50,200
2,files/llb3_fold4.npz,6.3935,8,100,0.1,euclidean,2500,50,200
3,files/llb11_fold2.npz,7.9244,8,50,0.25,euclidean,2500,50,200
4,files/llb11_fold4.npz,8.4289,8,100,0.1,cosine,5000,50,200
5,files/llb11_fold1.npz,8.7613,8,100,0.1,euclidean,2500,50,200
6,files/llb16_fold2.npz,13.7049,8,15,0.1,euclidean,5000,5,200
7,files/llb16_fold1.npz,13.8128,8,15,0.1,euclidean,500,50,200
8,files/llb16_fold4.npz,19.4205,8,50,0.1,euclidean,5000,50,100


In [5]:
mean_cfg = (df.groupby('config_key')[metric]
              .mean()
              .sort_values()
              .to_frame('mean_'+metric)
              .reset_index())
lookup = df.drop_duplicates('config_key').set_index('config_key')[CORE_COLS]
mean_cfg = mean_cfg.join(lookup, on='config_key')

print('Mean FER by config (top 15):')
display(mean_cfg.head(15))

# Mean macro FER by config
mean_cfg_macro = (df.groupby('config_key')['macro_fer']
                    .mean()
                    .sort_values()
                    .to_frame('mean_macro_fer')
                    .reset_index())
mean_cfg_macro = mean_cfg_macro.join(lookup, on='config_key')

print('Mean Macro FER by config (top 15):')
display(mean_cfg_macro.head(15))

# Mean v_measure by config
mean_cfg_vmeasure = (df.groupby('config_key')['v_measure']
                       .mean()
                       .sort_values(ascending=False)
                       .to_frame('mean_v_measure')
                       .reset_index())
mean_cfg_vmeasure = mean_cfg_vmeasure.join(lookup, on='config_key')

print('Mean V-Measure by config (top 15):')
display(mean_cfg_vmeasure.head(15))


Mean FER by config (top 15):


Unnamed: 0,config_key,mean_total_fer,n_components,n_neighbors,min_dist,metric,min_cluster_size,min_samples,smoothing_window
0,e2e73ed9dd,12.2469,32,15,0.25,euclidean,2500,250,200
1,9946f6ab55,12.5308,32,15,0.25,euclidean,5000,250,200
2,c6db33eef7,12.637278,8,100,0.25,euclidean,5000,50,200
3,907209ea5a,12.6472,32,15,0.25,euclidean,5000,250,100
4,a514f96504,12.700478,8,50,0.1,cosine,5000,50,100
5,61c7ffbea2,12.7795,8,15,0.1,euclidean,5000,250,100
6,443c948b74,12.819111,8,100,0.1,euclidean,5000,50,100
7,cf0329b7f0,12.820233,8,50,0.25,euclidean,5000,50,200
8,30f1ea99b9,12.821644,8,50,0.1,euclidean,5000,50,100
9,c5c03a1658,12.896956,32,50,0.25,cosine,5000,50,100


Mean Macro FER by config (top 15):


Unnamed: 0,config_key,mean_macro_fer,n_components,n_neighbors,min_dist,metric,min_cluster_size,min_samples,smoothing_window
0,44094698e0,16.770556,8,50,0.1,euclidean,2500,250,200
1,83087dcff3,19.620121,8,50,0.1,euclidean,2500,250,100
2,31d79285b6,20.004571,8,100,0.1,euclidean,2500,250,100
3,0fe8491b51,20.193943,8,100,0.25,euclidean,2500,250,200
4,469ea21519,20.361142,32,50,0.1,euclidean,2500,250,200
5,f2d947a170,20.376852,8,50,0.25,euclidean,2500,250,100
6,67fccb6d55,20.382998,8,100,0.1,euclidean,2500,250,200
7,001a499371,20.856644,8,50,0.25,euclidean,2500,250,200
8,bbf9b18028,21.383499,32,50,0.1,euclidean,500,250,100
9,75238ae75d,21.60818,8,100,0.1,euclidean,2500,250,0


Mean V-Measure by config (top 15):


Unnamed: 0,config_key,mean_v_measure,n_components,n_neighbors,min_dist,metric,min_cluster_size,min_samples,smoothing_window
0,c2f6aa1e1b,0.874651,8,50,0.1,euclidean,500,250,100
1,75e6053f2a,0.872702,8,100,0.25,euclidean,500,250,100
2,17a54a2ea0,0.872617,8,50,0.1,euclidean,500,250,200
3,c5fb75895a,0.869506,8,100,0.1,euclidean,5000,250,200
4,83087dcff3,0.86878,8,50,0.1,euclidean,2500,250,100
5,6da2137f3d,0.868212,32,50,0.1,euclidean,5000,250,100
6,e494dd9d83,0.86814,8,100,0.1,euclidean,5000,250,100
7,e23118df5d,0.867031,8,100,0.25,euclidean,500,250,200
8,44094698e0,0.866027,8,50,0.1,euclidean,2500,250,200
9,1ea7856d86,0.865788,32,50,0.1,euclidean,2500,250,100


In [8]:
# Filter to only configs with smoothing_window == 0
lookup_sw0 = lookup[lookup['smoothing_window'] == 0]
sw0_keys = set(lookup_sw0.index)

mean_cfg_sw0 = mean_cfg[mean_cfg['config_key'].isin(sw0_keys)].reset_index(drop=True)
mean_cfg_macro_sw0 = mean_cfg_macro[mean_cfg_macro['config_key'].isin(sw0_keys)].reset_index(drop=True)
mean_cfg_vmeasure_sw0 = mean_cfg_vmeasure[mean_cfg_vmeasure['config_key'].isin(sw0_keys)].reset_index(drop=True)

# 1. extract the ranks for each metric. the index is the rank.
fer_ranks = mean_cfg_sw0[['config_key']].reset_index().rename(columns={'index': 'fer_rank'})
macro_fer_ranks = mean_cfg_macro_sw0[['config_key']].reset_index().rename(columns={'index': 'macro_fer_rank'})
v_measure_ranks = mean_cfg_vmeasure_sw0[['config_key']].reset_index().rename(columns={'index': 'v_measure_rank'})

# 2. merge the ranks into a single dataframe
agg_ranks = fer_ranks.merge(macro_fer_ranks, on='config_key')
agg_ranks = agg_ranks.merge(v_measure_ranks, on='config_key')

# 3. sum the ranks to get the borda score
agg_ranks['total_rank'] = agg_ranks['fer_rank'] + agg_ranks['macro_fer_rank'] + agg_ranks['v_measure_rank']

# 4. sort by the total rank to find the best overall configuration
final_leaderboard = agg_ranks.sort_values('total_rank')

# join with the actual hyperparameter values for interpretability
final_leaderboard = final_leaderboard.join(lookup, on='config_key')

print("Principled Leaderboard using Rank Aggregation (Borda Count) (smoothing_window == 0):")
display(final_leaderboard.head(15))

# Print a second table where the rank columns are replaced with the actual metric values
# Get the actual values for each config_key in the same order as the leaderboard
value_table = final_leaderboard[['config_key']].copy()
value_table = value_table.merge(
    mean_cfg_sw0[['config_key', 'mean_total_fer']], on='config_key', how='left'
).merge(
    mean_cfg_macro_sw0[['config_key', 'mean_macro_fer']], on='config_key', how='left'
).merge(
    mean_cfg_vmeasure_sw0[['config_key', 'mean_v_measure']], on='config_key', how='left'
).set_index(final_leaderboard.index)

# Add hyperparameters for interpretability
value_table = value_table.join(lookup, on='config_key')

print("Leaderboard with actual metric values (smoothing_window == 0):")
display(value_table.head(15))

Principled Leaderboard using Rank Aggregation (Borda Count) (smoothing_window == 0):


Unnamed: 0,fer_rank,config_key,macro_fer_rank,v_measure_rank,total_rank,n_components,n_neighbors,min_dist,metric,min_cluster_size,min_samples,smoothing_window
0,0,f18b707674,1,6,7,8,15,0.1,euclidean,5000,250,0
6,6,dbb8e7099f,6,0,12,8,100,0.1,euclidean,5000,250,0
12,12,cc871cf837,8,1,21,32,50,0.1,euclidean,5000,250,0
8,8,6efb65a5b9,15,13,36,8,100,0.1,euclidean,2500,50,0
24,24,769b182123,11,7,42,8,100,0.25,euclidean,5000,250,0
38,38,75238ae75d,0,5,43,8,100,0.1,euclidean,2500,250,0
20,20,e959965d26,12,15,47,8,50,0.1,euclidean,2500,50,0
15,15,3556a941c5,22,23,60,8,100,0.1,cosine,2500,50,0
2,2,952478a7a9,39,19,60,8,50,0.1,euclidean,5000,50,0
31,31,87f95960c5,21,9,61,8,50,0.25,euclidean,5000,250,0


Leaderboard with actual metric values (smoothing_window == 0):


Unnamed: 0,config_key,mean_total_fer,mean_macro_fer,mean_v_measure,n_components,n_neighbors,min_dist,metric,min_cluster_size,min_samples,smoothing_window
0,f18b707674,13.7184,23.826193,0.841566,8,15,0.1,euclidean,5000,250,0
6,dbb8e7099f,15.3245,26.230819,0.851341,8,100,0.1,euclidean,5000,250,0
12,cc871cf837,16.2362,26.521146,0.847447,32,50,0.1,euclidean,5000,250,0
8,6efb65a5b9,15.5325,28.480731,0.835766,8,100,0.1,euclidean,2500,50,0
24,769b182123,16.9056,27.51518,0.840012,8,100,0.25,euclidean,5000,250,0
38,75238ae75d,17.7466,21.60818,0.84194,8,100,0.1,euclidean,2500,250,0
20,e959965d26,16.713122,28.014351,0.834993,8,50,0.1,euclidean,2500,50,0
15,3556a941c5,16.428144,29.672934,0.831492,8,100,0.1,cosine,2500,50,0
2,952478a7a9,14.655667,31.866626,0.833456,8,50,0.1,euclidean,5000,50,0
31,87f95960c5,17.5047,29.523408,0.838744,8,50,0.25,euclidean,5000,250,0


In [7]:

rt_cols = ['time_umap', 'time_hdbscan']
rt_stats = pd.DataFrame({
    'mean': df[rt_cols].mean(),
    'median': df[rt_cols].median(),
    'p95': df[rt_cols].quantile(0.95)
}).round(2)
rt_stats


Unnamed: 0,mean,median,p95
time_umap,43.21,44.08,53.74
time_hdbscan,45.56,44.97,53.64


In [2]:
import pandas as pd
from pathlib import Path

# ───────────────────────── CONFIG ─────────────────────────
CSV_PATH = Path("/home/george-vengrovski/Documents/projects/tweety_bert_paper/"
                "adaptive_search_V4_1M/adaptive_search_ALL_RESULTS_1000k.csv")
METRIC   = "total_fer"
PARAMS   = ["n_components", "n_neighbors", "min_dist",
            "metric", "min_cluster_size", "min_samples"]
# ───────────────────────────────────────────────────────────

# 1) load data
df = pd.read_csv(CSV_PATH)

# 2) cast numeric parameters where possible
for p in PARAMS:
    if p != "metric":
        try:
            df[p] = pd.to_numeric(df[p])
        except ValueError:
            pass

# 3) compute mean metric per parameter and print in plain text
for p in PARAMS:
    print(f"\n{p}:")
    grouped = df.groupby(p)[METRIC].mean().sort_index()
    for val, mean_val in grouped.items():
        print(f"{val}\t{mean_val:.6f}")



n_components:
2	40.675388
8	21.240676
32	21.476786

n_neighbors:
15	29.753693
50	24.358658
100	23.930150

min_dist:
0.1	26.011366
0.25	25.636083

metric:
cosine	25.672997
euclidean	25.970848

min_cluster_size:
500	41.761065
2500	22.569797
5000	19.518564

min_samples:
5	27.351882
50	23.957326
250	33.740501
