In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

import preamble

from src.embedding_metrics import pls_coefficient_of_determination, get_cohesiveness, get_cluster_quality
from src.constants import  PATH_MIGRATION_SPEECHES_EMBEDDED

In [12]:
# Load the speech table through the project-level path constant rather than a
# machine-specific absolute path, so the notebook runs on any checkout.
# NOTE(review): assumes PATH_MIGRATION_SPEECHES_EMBEDDED resolves to
# data/final/speech_embeddings.parquet — confirm in src/constants.py.
df = pd.read_parquet(PATH_MIGRATION_SPEECHES_EMBEDDED)

In [13]:
# List the column names of the loaded frame to see which fields are available.
df.columns

Index(['Unnamed: 0', 'speaker', 'text', 'date', 'agenda', 'speechnumber',
       'procedure_ID', 'partyfacts_ID', 'period', 'chair', 'MEP', 'commission',
       'written', 'multispeaker', 'link', 'translatedText',
       'translationSource', 'year', 'block', 'party', 'migration_prob',
       'jinaai/jina-embeddings-v3', 'jinaai/jina-embeddings-v4',
       'Snowflake/snowflake-arctic-embed-l-v2.0', 'BAAI/bge-m3',
       'sentence-transformers/all-MiniLM-L6-v2',
       'sentence-transformers/all-mpnet-base-v2', 'Qwen/Qwen3-Embedding-0.6B',
       'google/embeddinggemma-300m', 'document', 'Clustering',
       'Classification', 'STS', 'Retrieval', 'Summarization'],
      dtype='object')

# Final Leaderboard

In [33]:
# Column names in `df` that are scored for the leaderboard, one per model.
# Note: `df` also has a "jinaai/jina-embeddings-v4" column that is not listed here.
MODELS = [
    "jinaai/jina-embeddings-v3",
    "Snowflake/snowflake-arctic-embed-l-v2.0",
    "BAAI/bge-m3",
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/all-mpnet-base-v2",
    "Qwen/Qwen3-Embedding-0.6B",
    "google/embeddinggemma-300m",
]


In [34]:
from tqdm import tqdm

# One record per (model, target). Each model contributes three records:
# "party", "block", and a "weighted" record built from the weighted variants.
results = []
for model in tqdm(MODELS):
    for target in ("party", "block"):
        # The three metric helpers are called in a fixed order: R^2,
        # cohesiveness, then cluster quality (only its 'v_measure' entry is kept).
        results.append(
            {
                "model": model,
                "target": target,
                "r_2": pls_coefficient_of_determination(df, model, target),
                "cohesivness": get_cohesiveness(df, model, target),
                "v_measure": get_cluster_quality(df, model, target)['v_measure'],
            }
        )

    # The "weighted" record mixes targets: R^2 is computed against the
    # non-categorical "migration_prob" column, while cohesiveness and cluster
    # quality are the weighted=True variants computed on "party".
    results.append(
        {
            "model": model,
            "target": "weighted",
            "r_2": pls_coefficient_of_determination(df, model, "migration_prob", categorical=False),
            "cohesivness": get_cohesiveness(df, model, "party", weighted=True),
            "v_measure": get_cluster_quality(df, model, "party", weighted=True)['v_measure'],
        }
    )

100%|██████████| 7/7 [00:25<00:00,  3.60s/it]


In [35]:
# Turn the list of per-(model, target) score records into a table and show it.
leaderboard = pd.DataFrame.from_records(results)
leaderboard

Unnamed: 0,model,target,r_2,cohesivness,v_measure
0,jinaai/jina-embeddings-v3,party,0.074308,0.014801,0.040203
1,jinaai/jina-embeddings-v3,block,0.09101,0.012534,0.014848
2,jinaai/jina-embeddings-v3,weighted,0.457962,0.016866,0.046818
3,Snowflake/snowflake-arctic-embed-l-v2.0,party,0.081178,0.025893,0.046347
4,Snowflake/snowflake-arctic-embed-l-v2.0,block,0.107902,0.021443,0.015761
5,Snowflake/snowflake-arctic-embed-l-v2.0,weighted,0.505656,0.028102,0.018535
6,BAAI/bge-m3,party,0.067449,0.014498,0.049602
7,BAAI/bge-m3,block,0.091139,0.011695,0.030183
8,BAAI/bge-m3,weighted,0.506555,0.016576,0.052122
9,sentence-transformers/all-MiniLM-L6-v2,party,0.056338,0.022678,0.023183


In [36]:
# Rescale every metric within each target: each value is divided by the sum of
# that metric over all rows sharing the target, so the rescaled values of a
# (target, metric) pair add up to 1. This overwrites the raw scores in
# `leaderboard` in place.
metrics = ["r_2", "cohesivness", "v_measure"]
targets = ["party", "block", "weighted"]
for target in targets:
    in_target = leaderboard["target"] == target
    for metric in metrics:
        column_total = leaderboard.loc[in_target, metric].sum()
        leaderboard.loc[in_target, metric] = leaderboard.loc[in_target, metric] / column_total
    

In [37]:
# Row-wise average of the metric columns. DataFrame.mean(axis=1) is the
# vectorised equivalent of applying np.mean to each row, without the
# per-row Python call overhead of DataFrame.apply.
leaderboard['mean_score'] = leaderboard[metrics].mean(axis=1)

In [38]:
# Average each model's mean_score over its rows (one row per target) to get a
# single overall score per model. The result is ordered by model name, not by score.
leaderboard.groupby('model')['mean_score'].mean()

model
BAAI/bge-m3                                0.153011
Qwen/Qwen3-Embedding-0.6B                  0.139084
Snowflake/snowflake-arctic-embed-l-v2.0    0.156482
google/embeddinggemma-300m                 0.175837
jinaai/jina-embeddings-v3                  0.134784
sentence-transformers/all-MiniLM-L6-v2     0.123439
sentence-transformers/all-mpnet-base-v2    0.117364
Name: mean_score, dtype: float64

In [39]:
# For every (target, metric) pair, pick the leaderboard row with the highest
# value and record which model it belongs to. Scores are read from
# `leaderboard` as it stands when this cell runs.
metrics = ["r_2", "cohesivness", "v_measure"]
targets = ["party", "block", "weighted"]
summary = []
for target in targets:
    in_target = leaderboard['target'] == target
    for metric in metrics:
        # idxmax returns the index label of the top row within this target.
        top_label = leaderboard.loc[in_target, metric].idxmax()
        top_row = leaderboard.loc[top_label]
        summary.append(
            {
                "best_model": top_row['model'],
                "target": top_row['target'],
                "metric": metric,
                "score": top_row[metric],
            }
        )
    

In [40]:
# Show the winners as a table indexed by (target, metric).
pd.DataFrame(summary).set_index(["target", "metric"])

Unnamed: 0_level_0,Unnamed: 1_level_0,best_model,score
target,metric,Unnamed: 2_level_1,Unnamed: 3_level_1
party,r_2,Snowflake/snowflake-arctic-embed-l-v2.0,0.171052
party,cohesivness,google/embeddinggemma-300m,0.195351
party,v_measure,google/embeddinggemma-300m,0.207206
block,r_2,google/embeddinggemma-300m,0.172119
block,cohesivness,google/embeddinggemma-300m,0.195637
block,v_measure,BAAI/bge-m3,0.232895
weighted,r_2,BAAI/bge-m3,0.156716
weighted,cohesivness,google/embeddinggemma-300m,0.189929
weighted,v_measure,BAAI/bge-m3,0.213106
