In [2]:
import os
if not os.environ.get("PREAMBLE_RUN", False): 
    %run -i 'preamble.py'

In [3]:
import logging

logging.getLogger('matplotlib').setLevel(logging.ERROR)

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 


from src.constants import  PATH_MIGRATION_SPEECHES_EMBEDDED

In [4]:
df = pd.read_parquet(PATH_MIGRATION_SPEECHES_EMBEDDED)

In [5]:
df.columns

Index(['Unnamed: 0', 'speaker', 'text', 'date', 'agenda', 'speechnumber',
       'procedure_ID', 'partyfacts_ID', 'period', 'chair', 'MEP', 'commission',
       'written', 'multispeaker', 'link', 'translatedText',
       'translationSource', 'year', 'block', 'party', 'migration_prob',
       'jinaai/jina-embeddings-v3', 'jinaai/jina-embeddings-v4',
       'Snowflake/snowflake-arctic-embed-l-v2.0', 'BAAI/bge-m3',
       'sentence-transformers/all-MiniLM-L6-v2',
       'sentence-transformers/all-mpnet-base-v2', 'Qwen/Qwen3-Embedding-0.6B',
       'google/embeddinggemma-300m', 'document', 'Clustering',
       'Classification', 'STS', 'Retrieval', 'Summarization'],
      dtype='object')

# Final Leaderboard

In [6]:
from src.embedding_metrics import compute_predictive_power, get_cohesiveness, get_cluster_quality


In [7]:
MODELS = ["jinaai/jina-embeddings-v3",
           "Snowflake/snowflake-arctic-embed-l-v2.0", "BAAI/bge-m3", 
           'sentence-transformers/all-MiniLM-L6-v2', 
           'sentence-transformers/all-mpnet-base-v2', 
           'Qwen/Qwen3-Embedding-0.6B', 'google/embeddinggemma-300m',
           # different prompts of google embeddings
           'document', 'Clustering',
        'Classification', 'STS', 'Retrieval', 'Summarization'
          ]


In [8]:
from tqdm import tqdm
results = []
for model in tqdm(MODELS):
    for target in ['party', 'block']:
        f1 = compute_predictive_power(df, model, target).mean()
        cohesivness = get_cohesiveness(df, model, target)
        cluster_quality = get_cluster_quality(df, model, target)
        results.append({"model": model, "target": target, "f1": f1, "cohesivness": cohesivness, "v_measure": cluster_quality['v_measure']})
    f1_mig = compute_predictive_power(df, model, "migration_prob", continues=True).mean()
    cohesivness_weighted = get_cohesiveness(df, model, "party", weighted=True)
    cluster_quality_weighted = get_cluster_quality(df, model, "party", weighted=True)
    results.append({"model": model, "target": "weighted", "f1": f1, "cohesivness": cohesivness_weighted, "v_measure": cluster_quality_weighted['v_measure']})

  0%|          | 0/13 [00:00<?, ?it/s]

Predicting party
#Classes 8
Mean Macro F1: 0.38157485311140343
STD Macro F1: 0.016080044315200312
Predicting block
#Classes 6
Mean Macro F1: 0.4000895226287435
STD Macro F1: 0.013410759839280095
Predicting migration_prob
#Classes 9702




Mean Macro F1: 0.41247713461034896
STD Macro F1: 0.010161514418395202


  8%|▊         | 1/13 [01:00<12:01, 60.15s/it]

Predicting party
#Classes 8
Mean Macro F1: 0.40919837728034264
STD Macro F1: 0.012412817150810914
Predicting block
#Classes 6
Mean Macro F1: 0.4318902037289979
STD Macro F1: 0.017964356713270378
Predicting migration_prob
#Classes 9702




Mean Macro F1: 0.42261832037361347
STD Macro F1: 0.015294181244434421


 15%|█▌        | 2/13 [01:50<09:58, 54.40s/it]

Predicting party
#Classes 8
Mean Macro F1: 0.37534168299003057
STD Macro F1: 0.013699573688665242
Predicting block
#Classes 6
Mean Macro F1: 0.39590502730719274
STD Macro F1: 0.01503203608863446
Predicting migration_prob
#Classes 9702




Mean Macro F1: 0.41590510407617076
STD Macro F1: 0.01164295319827708


 23%|██▎       | 3/13 [02:41<08:46, 52.64s/it]

Predicting party
#Classes 8
Mean Macro F1: 0.3460605767972095
STD Macro F1: 0.013141820314790007
Predicting block
#Classes 6
Mean Macro F1: 0.3708802467127568
STD Macro F1: 0.019684857157545366
Predicting migration_prob
#Classes 9702




Mean Macro F1: 0.3871570270124399
STD Macro F1: 0.011220613452735465


 31%|███       | 4/13 [02:56<05:41, 37.93s/it]

Predicting party
#Classes 8
Mean Macro F1: 0.35820165183551006
STD Macro F1: 0.012634622283114997
Predicting block
#Classes 6
Mean Macro F1: 0.37354566196795214
STD Macro F1: 0.012677282184697714
Predicting migration_prob
#Classes 9702




Mean Macro F1: 0.3902937897814082
STD Macro F1: 0.01461335199620246


 38%|███▊      | 5/13 [03:24<04:34, 34.36s/it]

Predicting party
#Classes 8
Mean Macro F1: 0.3989874775726005
STD Macro F1: 0.02014066161775379
Predicting block
#Classes 6
Mean Macro F1: 0.42098331887726703
STD Macro F1: 0.019061059011468567
Predicting migration_prob
#Classes 9702




Mean Macro F1: 0.4234529903266628
STD Macro F1: 0.01623598714689568


 46%|████▌     | 6/13 [04:10<04:28, 38.35s/it]

Predicting party
#Classes 8
Mean Macro F1: 0.4259068978206801
STD Macro F1: 0.007206718325855024
Predicting block
#Classes 6
Mean Macro F1: 0.4397342817772773
STD Macro F1: 0.014968418979386564
Predicting migration_prob
#Classes 9702




Mean Macro F1: 0.42685101562981886
STD Macro F1: 0.012612072419571891


 54%|█████▍    | 7/13 [04:36<03:26, 34.41s/it]

Predicting party
#Classes 8
Mean Macro F1: 0.4265586729836067
STD Macro F1: 0.023722652198589607
Predicting block
#Classes 6
Mean Macro F1: 0.4416571617413763
STD Macro F1: 0.017617465691233986
Predicting migration_prob
#Classes 9702




Mean Macro F1: 0.4236830047205381
STD Macro F1: 0.009254762674881544


 62%|██████▏   | 8/13 [05:02<02:38, 31.75s/it]

Predicting party
#Classes 8
Mean Macro F1: 0.3788779300547699
STD Macro F1: 0.015906399532036743
Predicting block
#Classes 6
Mean Macro F1: 0.38222489215458977
STD Macro F1: 0.01473555604852335
Predicting migration_prob
#Classes 9702




Mean Macro F1: 0.3886102847625391
STD Macro F1: 0.011772504768313959


 69%|██████▉   | 9/13 [05:30<02:02, 30.56s/it]

Predicting party
#Classes 8
Mean Macro F1: 0.4133506718721237
STD Macro F1: 0.01456834603373482
Predicting block
#Classes 6
Mean Macro F1: 0.4194218712723323
STD Macro F1: 0.01935403937192627
Predicting migration_prob
#Classes 9702




Mean Macro F1: 0.41158773031488244
STD Macro F1: 0.004723296058121697


 77%|███████▋  | 10/13 [05:58<01:29, 29.70s/it]

Predicting party
#Classes 8
Mean Macro F1: 0.3870647480468106
STD Macro F1: 0.01720554732694997
Predicting block
#Classes 6
Mean Macro F1: 0.4087893867741509
STD Macro F1: 0.01512747666768819
Predicting migration_prob
#Classes 9702




Mean Macro F1: 0.41923741627809435
STD Macro F1: 0.01411417467008348


 85%|████████▍ | 11/13 [06:25<00:57, 28.70s/it]

Predicting party
#Classes 8
Mean Macro F1: 0.43548599501778745
STD Macro F1: 0.015550786571287011
Predicting block
#Classes 6
Mean Macro F1: 0.44777097828331314
STD Macro F1: 0.01732193851359477
Predicting migration_prob
#Classes 9702




Mean Macro F1: 0.42350618726808376
STD Macro F1: 0.010014291603364287


 92%|█████████▏| 12/13 [06:51<00:28, 28.06s/it]

Predicting party
#Classes 8
Mean Macro F1: 0.42158523989383
STD Macro F1: 0.012734916720740206
Predicting block
#Classes 6
Mean Macro F1: 0.42927249103136556
STD Macro F1: 0.012390714328084624
Predicting migration_prob
#Classes 9702




Mean Macro F1: 0.4164437843267394
STD Macro F1: 0.009451403351853186


100%|██████████| 13/13 [07:20<00:00, 33.87s/it]


In [10]:
leaderboard = pd.DataFrame(results)
leaderboard

Unnamed: 0,model,target,f1,cohesivness,v_measure
0,jinaai/jina-embeddings-v3,party,0.377219,0.014801,0.040203
1,jinaai/jina-embeddings-v3,block,0.400167,0.012534,0.014848
2,jinaai/jina-embeddings-v3,weighted,0.400167,0.016866,0.046818
3,Snowflake/snowflake-arctic-embed-l-v2.0,party,0.413237,0.025893,0.046347
4,Snowflake/snowflake-arctic-embed-l-v2.0,block,0.430251,0.021443,0.015761
5,Snowflake/snowflake-arctic-embed-l-v2.0,weighted,0.430251,0.028102,0.018535
6,BAAI/bge-m3,party,0.377846,0.014498,0.049823
7,BAAI/bge-m3,block,0.391944,0.011695,0.030183
8,BAAI/bge-m3,weighted,0.391944,0.016576,0.052122
9,sentence-transformers/all-MiniLM-L6-v2,party,0.349142,0.022678,0.023183


In [12]:
# normalize metrics across targets
metrics = ["f1", "cohesivness", "v_measure"]
targets = ["party", "block", "weighted"]
for target in targets:
    for metric in metrics:
       leaderboard.loc[leaderboard["target"] == target, metric] /= leaderboard.loc[leaderboard["target"] == target, metric].sum()
    

In [13]:
leaderboard['mean_score'] = leaderboard[metrics].apply(np.mean, axis=1)

In [14]:
leaderboard.groupby('model')['mean_score'].mean()

model
BAAI/bge-m3                                0.066067
Classification                             0.112705
Clustering                                 0.065418
Qwen/Qwen3-Embedding-0.6B                  0.065158
Retrieval                                  0.079643
STS                                        0.097635
Snowflake/snowflake-arctic-embed-l-v2.0    0.072374
Summarization                              0.103926
document                                   0.076831
google/embeddinggemma-300m                 0.081775
jinaai/jina-embeddings-v3                  0.060993
sentence-transformers/all-MiniLM-L6-v2     0.060479
sentence-transformers/all-mpnet-base-v2    0.056997
Name: mean_score, dtype: float64

In [15]:
metrics = ["f1", "cohesivness", "v_measure"]
targets = ["party", "block", "weighted"]
summary = []
for target in targets:
    for metric in metrics:
        best = leaderboard.loc[leaderboard[leaderboard['target'] == target][metric].idxmax()]
        summary.append({"best_model": best['model'], "target": best['target'], "metric": metric, "score": best[metric]})
    

In [16]:
pd.DataFrame(summary).set_index(["target", "metric"])

Unnamed: 0_level_0,Unnamed: 1_level_0,best_model,score
target,metric,Unnamed: 2_level_1,Unnamed: 3_level_1
party,f1,Retrieval,0.084003
party,cohesivness,Retrieval,0.103887
party,v_measure,Classification,0.140625
block,f1,Retrieval,0.082952
block,cohesivness,Classification,0.10608
block,v_measure,Summarization,0.198169
weighted,f1,Retrieval,0.082952
weighted,cohesivness,Retrieval,0.101234
weighted,v_measure,Classification,0.144017


Google's embedding with different prompts have highest scores in each category