In [1]:
import os
import pandas as pd
import pickle
from gensim.models import KeyedVectors
from tqdm import tqdm 

# Set CUDA_VISIBLE_DEVICES to '-1' for this process — presumably to hide all GPUs
# and force CPU-only execution in any CUDA-aware library used later (TODO confirm).
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [2]:
# Input files (paths are relative to the notebook's working directory).
restricted_vocab_path = "../vocabs/final_vocab.pkl"
analogy_questions_path = "../datasets/analogy_questions.txt"

# Embedding models to evaluate, as (display name, vector file) pairs.
# dict() keeps the pair order, which is the order the models are evaluated in.
models = dict([
    ("FastText", "../models/restricted/fasttext_300.vec"),
    ("huBERT_x2", "../models/restricted/hubert.vec"),
    ("EFNILEX", "../models/restricted/efnilex_600.vec"),
    ("HuSpacy", "../models/restricted/huspacy.vec"),
    ("XLM-R_x2", "../models/restricted/roberta.vec"),
    ("ELMO", "../models/restricted/elmo_1024.vec"),
    ("huBERT_de", "../models/restricted/hubert_decontex.vec"),
    ("XLM-R_de", "../models/restricted/roberta_decontex.vec"),
    ("XLM-R_agg", "../models/restricted/roberta_aggregate.vec"),
    ("hubert_agg", "../models/restricted/hubert_aggregate.vec"),
])

## Analogy questions

In [3]:
# Load vocabulary
# NOTE(security): pickle.load can execute arbitrary code; only load vocab files
# that come from a trusted source.
with open(restricted_vocab_path, "rb") as file:
    vocab = pickle.load(file)

# Membership tests on a list/tuple are linear in the vocabulary size; use a set
# for the lookups below. Any other container type is used as-is.
vocab_lookup = set(vocab) if isinstance(vocab, (list, tuple)) else vocab

analogy_columns = ["word1", "word2", "word3", "word4", "category"]

# Collect rows in plain lists and build each DataFrame once at the end.
# Growing a DataFrame with pd.concat inside the loop copies the whole frame on
# every row (quadratic time) and concatenating onto an empty frame is deprecated.
restricted_rows = []  # questions whose four words are all in the vocabulary
original_rows = []    # every well-formed (four-word) question

# Import analogy questions
with open(analogy_questions_path, "r", encoding="utf-8") as f:
    current_category = None  # Most recent ": <category>" header seen

    for line in f:
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        # Category header line (starts with ": ")
        if line.startswith(": "):
            current_category = line[2:].strip()
            continue

        # Analogy line: exactly four whitespace-separated words; anything else is ignored
        words = line.split()
        if len(words) != 4:
            continue

        row = (*words, current_category)
        original_rows.append(row)
        if all(word in vocab_lookup for word in words):
            restricted_rows.append(row)

restricted_df = pd.DataFrame(restricted_rows, columns=analogy_columns)
original_df = pd.DataFrame(original_rows, columns=analogy_columns)

restricted_df

Unnamed: 0,word1,word2,word3,word4,category
0,Budapest,Magyarország,Moszkva,Oroszország,capital-common-countries
1,Budapest,Magyarország,London,Nagy-Britannia,capital-common-countries
2,Budapest,Magyarország,Berlin,Németország,capital-common-countries
3,Budapest,Magyarország,Pozsony,Szlovákia,capital-common-countries
4,Budapest,Magyarország,Helsinki,Finnország,capital-common-countries
...,...,...,...,...,...
13245,vonatkoznak,vonatkozik,léteznek,létezik,gram9-plural-verb
13246,vonatkoznak,vonatkozik,mutatnak,mutat,gram9-plural-verb
13247,szólnak,szól,léteznek,létezik,gram9-plural-verb
13248,szólnak,szól,mutatnak,mutat,gram9-plural-verb


In [4]:
# Number of analogy questions per category before vocabulary filtering.
temp_df = (
    original_df
    .groupby("category", as_index=False)["word1"]
    .count()
    .rename(columns={"word1": "original"})
)

# Number of analogy questions per category after vocabulary filtering.
restricted_counts = (
    restricted_df
    .groupby("category", as_index=False)["word1"]
    .count()
    .rename(columns={"word1": "restricted"})
)

# merge() with no arguments is an inner join on the shared "category" column;
# "ratio" is the percentage of questions that survived the filtering.
comp_df = temp_df.merge(restricted_counts).assign(
    ratio=lambda counts: (counts["restricted"] / counts["original"] * 100).round(1)
)
comp_df

Unnamed: 0,category,original,restricted,ratio
0,capital-common-countries,190,190,100.0
1,capital-world,13695,5995,43.8
2,county-center,171,171,100.0
3,currency,435,406,93.3
4,family,190,136,71.6
5,gram1-adjective-to-adverb,780,780,100.0
6,gram2-opposite,435,435,100.0
7,gram3-comparative,780,780,100.0
8,gram4-superlative,780,780,100.0
9,gram5-present-participle,780,496,63.6


In [5]:
# Shared per-category rank accumulator: {category: {"ranks": [rank, ...]}}.
# It is bound as the default `category_counts` argument of evaluate_analogy_model,
# so it must be mutated in place (not rebound) when it is reset between models.
category_counts = {}

def mean_reciprocal_rank(ranks):
    """
    Return the Mean Reciprocal Rank (MRR) of a list of 1-based ranks.

    A rank that is not positive (e.g. 0 for "not found") contributes 0 to the
    sum but still counts in the denominator. An empty list yields 0.
    """
    if not ranks:
        return 0

    reciprocals = [1.0 / r if r > 0 else 0 for r in ranks]
    return sum(reciprocals) / len(ranks)

def evaluate_analogy_model(model, model_name, analogy_df, vocab_file=restricted_vocab_path, category_counts=category_counts):
    """
    Evaluates an embedding model on the analogy task and returns Mean Reciprocal Rank (MRR) per category.

    For each row the model is queried with ``most_similar(positive=[word3, word2],
    negative=[word1], topn=10)``. The rank of ``word4`` among those 10 predictions
    is recorded (1-based), or 0 when it is not among them, so the MRR is
    truncated at rank 10.

    Parameters
    ----------
    model : object exposing ``most_similar(positive=..., negative=..., topn=...)``
        and returning ``(word, score)`` pairs (e.g. the gensim ``KeyedVectors``
        loaded elsewhere in this notebook).
    model_name : str
        Name of the result column and label of the progress bar.
    analogy_df : pandas.DataFrame
        Must contain the columns ``word1``, ``word2``, ``word3``, ``word4`` and
        ``category``.
    vocab_file : str
        Kept for backward compatibility only; the vocabulary was never used by
        the evaluation, so the file is no longer opened.
    category_counts : dict
        ``{category: {"ranks": [...]}}`` accumulator. It is mutated in place and
        is NOT cleared here, so ranks accumulate across calls unless the caller
        resets it. Categories missing from the dict are added on first use.

    Returns
    -------
    pandas.DataFrame
        Columns ``Category`` and ``model_name``: one row per key of
        ``category_counts`` with its MRR, followed by the rows ``Overall MRR``
        and ``Overall accuracy`` (share of evaluated questions with rank 1).
        Both overall values are 0 when no question could be evaluated.
    """
    overall_ranks = []
    top1_correct = 0

    # Plain tuples are cheaper than the per-row Series produced by iterrows().
    question_rows = analogy_df[["word1", "word2", "word3", "word4", "category"]].itertuples(index=False, name=None)

    # Perform analogy test
    for w1, w2, w3, actual_w4, category in tqdm(question_rows, total=len(analogy_df), desc=f"Evaluating {model_name}"):
        # Only the model query is guarded: a KeyError here means a query word is
        # missing from the model, and such questions are skipped entirely.
        try:
            predictions = model.most_similar(positive=[w3, w2], negative=[w1], topn=10)
        except KeyError:
            continue

        predicted_words = [word for word, _ in predictions]

        # Find rank of actual_w4 (1-based index, or 0 if not found)
        rank = predicted_words.index(actual_w4) + 1 if actual_w4 in predicted_words else 0
        overall_ranks.append(rank)

        # setdefault keeps an unseen category from raising a KeyError that would
        # previously have been swallowed after the rank was already recorded.
        category_counts.setdefault(category, {"ranks": []})["ranks"].append(rank)

        if rank == 1:
            top1_correct += 1

    # Compute overall MRR and top-1 accuracy over the questions actually evaluated
    total_cases = len(overall_ranks)
    overall_mrr = mean_reciprocal_rank(overall_ranks)
    overall_acc = top1_correct / total_cases if total_cases else 0

    # Create DataFrame
    category_mrr_df = pd.DataFrame([
        {"Category": cat, model_name: mean_reciprocal_rank(data["ranks"])}
        for cat, data in category_counts.items()
    ])

    # Append overall MRR and accuracy
    overall_rows = pd.DataFrame([
        {"Category": "Overall MRR", model_name: overall_mrr},
        {"Category": "Overall accuracy", model_name: overall_acc},
    ])
    category_mrr_df = pd.concat([category_mrr_df, overall_rows], ignore_index=True)

    return category_mrr_df

# Evaluate every configured model and join the per-model result columns on "Category".
final_df = None

# The category list does not change between models, so compute it once.
restricted_categories = restricted_df["category"].unique()

for model_name, model_path in tqdm(models.items(), desc="Loading models & running evaluations"):
    print(f"\nLoading {model_name}...")
    model = KeyedVectors.load_word2vec_format(model_path, binary=False)
    print(f"Loaded {model_name}...")

    # Reset the shared accumulator IN PLACE: evaluate_analogy_model holds a
    # reference to this exact dict as a default argument, so rebinding the name
    # would leave the function writing into the old object.
    category_counts.update((cat, {"ranks": []}) for cat in restricted_categories)

    result_df = evaluate_analogy_model(model, model_name, restricted_df)

    # First model seeds the table; later models are outer-merged as new columns.
    final_df = (
        result_df
        if final_df is None
        else final_df.merge(result_df, on="Category", how="outer")
    )

# Display final results
print("\nFinal MRR Results Across Models:")
final_df

Loading models & running evaluations:   0%|          | 0/10 [00:00<?, ?it/s]


Loading FastText...
Loaded FastText...


Evaluating FastText: 100%|██████████| 13250/13250 [02:49<00:00, 78.37it/s]
Loading models & running evaluations:  10%|█         | 1/10 [03:18<29:43, 198.22s/it]


Loading huBERT_x2...
Loaded huBERT_x2...


Evaluating huBERT_x2: 100%|██████████| 13250/13250 [04:57<00:00, 44.51it/s]
Loading models & running evaluations:  20%|██        | 2/10 [09:32<40:12, 301.51s/it]


Loading EFNILEX...
Loaded EFNILEX...


Evaluating EFNILEX: 100%|██████████| 13250/13250 [04:15<00:00, 51.83it/s]
Loading models & running evaluations:  30%|███       | 3/10 [14:42<35:38, 305.56s/it]


Loading HuSpacy...
Loaded HuSpacy...


Evaluating HuSpacy: 100%|██████████| 13250/13250 [02:53<00:00, 76.16it/s]
Loading models & running evaluations:  40%|████      | 4/10 [18:04<26:27, 264.63s/it]


Loading XLM-R_x2...
Loaded XLM-R_x2...


Evaluating XLM-R_x2: 100%|██████████| 13250/13250 [04:57<00:00, 44.59it/s]
Loading models & running evaluations:  50%|█████     | 5/10 [24:16<25:17, 303.57s/it]


Loading ELMO...
Loaded ELMO...


Evaluating ELMO: 100%|██████████| 13250/13250 [05:54<00:00, 37.33it/s]
Loading models & running evaluations:  60%|██████    | 6/10 [31:46<23:33, 353.25s/it]


Loading huBERT_de...
Loaded huBERT_de...


Evaluating huBERT_de: 100%|██████████| 13250/13250 [04:54<00:00, 44.92it/s]
Loading models & running evaluations:  70%|███████   | 7/10 [37:53<17:53, 357.76s/it]


Loading XLM-R_de...
Loaded XLM-R_de...


Evaluating XLM-R_de: 100%|██████████| 13250/13250 [04:45<00:00, 46.38it/s]
Loading models & running evaluations:  80%|████████  | 8/10 [43:51<11:55, 357.73s/it]


Loading XLM-R_agg...
Loaded XLM-R_agg...


Evaluating XLM-R_agg: 100%|██████████| 13250/13250 [04:46<00:00, 46.22it/s]
Loading models & running evaluations:  90%|█████████ | 9/10 [49:49<05:57, 357.99s/it]


Loading hubert_agg...
Loaded hubert_agg...


Evaluating hubert_agg: 100%|██████████| 13250/13250 [04:34<00:00, 48.29it/s]
Loading models & running evaluations: 100%|██████████| 10/10 [55:36<00:00, 333.61s/it]


Final MRR Results Across Models:





Unnamed: 0,Category,FastText,huBERT_x2,EFNILEX,HuSpacy,XLM-R_x2,ELMO,huBERT_de,XLM-R_de,XLM-R_agg,hubert_agg
0,Overall MRR,0.771571,0.584121,0.460185,0.459904,0.451567,0.223469,0.219142,0.017782,0.230488,0.242267
1,Overall accuracy,0.709358,0.486717,0.389132,0.379472,0.371472,0.179623,0.169057,0.012604,0.179547,0.184604
2,capital-common-countries,0.769845,0.582982,0.454582,0.44009,0.398567,0.090551,0.16868,0.009649,0.260253,0.250729
3,capital-world,0.833,0.503311,0.284692,0.246899,0.299127,0.034469,0.079138,0.003213,0.17043,0.225653
4,county-center,0.876218,0.761863,0.307069,0.467799,0.253973,0.089731,0.23722,0.001462,0.074675,0.180286
5,currency,0.305776,0.095262,0.194954,0.152957,0.084856,0.123101,0.068424,0.000821,0.088254,0.065848
6,family,0.655494,0.665625,0.397803,0.586076,0.456629,0.325925,0.295927,0.045781,0.249945,0.218175
7,gram1-adjective-to-adverb,0.631606,0.585044,0.36767,0.606313,0.776716,0.130213,0.248623,0.066223,0.264824,0.197808
8,gram2-opposite,0.434839,0.162315,0.285733,0.23704,0.171484,0.099909,0.074059,0.012874,0.040579,0.013736
9,gram3-comparative,0.755579,0.812676,0.749232,0.744924,0.814748,0.469363,0.384272,0.096281,0.392271,0.295406
