In [None]:
import glob
import pickle
import pandas as pd
from func_emb_metrics import func_emb_metrics

# Compute embedding metrics
# This script processes the output of the morpheme vector pipeline:
# 1) extracts the predicted neighbors
# 2) saves them as an Excel file
# 3) evaluates the neighbors using the func_emb_metrics function

# Collect the candidate pipeline outputs. glob returns matches in arbitrary,
# OS-dependent order, so sort them to make the index below select the same
# file on every run and on every machine.
files = sorted(glob.glob('morphoEmbVec*.pkl'))
if not files:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        "No file matching 'morphoEmbVec*.pkl' in the current working directory"
    )
fileName = files[0]  # CHOOSE WHICH ONE YOU WANT TO EVALUATE
print(f"Processing: {fileName}")

# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by a trusted run of the pipeline.
with open(fileName, 'rb') as f:
    data = pickle.load(f)

# Per-item prediction records; each one is later read through its
# 'testPlural' and 'neighbors' fields.
predictedNeighborsAll = data['predictedNeighborsAll']

# Turn the prediction records into table columns: the expected form of each
# item plus its first ten neighbors, padded with '' when fewer are available.
n = len(predictedNeighborsAll)
expectedForms = []
neighborsTop10 = []

# Records are fetched by integer position (not by iterating the container),
# so any container supporting len() and integer indexing keeps working.
for idx in range(n):
    record = predictedNeighborsAll[idx]
    expectedForms.append(record['testPlural'])
    candidates = record['neighbors']
    available = len(candidates)
    neighborsTop10.append(
        [candidates[pos] if pos < available else '' for pos in range(10)]
    )

# Column names Neighbor_1 .. Neighbor_10.
neighborHeaders = ['Neighbor_' + str(rank) for rank in range(1, 11)]

# Column-oriented mapping: 'Expected' first, then one column per rank.
data_dict = {'Expected': expectedForms}
data_dict.update(
    (header, [cells[col] for cells in neighborsTop10])
    for col, header in enumerate(neighborHeaders)
)

T = pd.DataFrame(data_dict)

# Derive the spreadsheet name from the pickle name by dropping the
# 'morphoEmbVec' marker and the '.pkl' extension, then write the table
# without the DataFrame index column.
baseName = fileName.replace('morphoEmbVec', '')
baseName = baseName.replace('.pkl', '')
tableName = f'predNeighbors{baseName}.xlsx'
print(f"Saving table: {tableName}")
T.to_excel(tableName, index=False)

# Evaluate the saved table (Turkish as an example). The three returned
# values are reported as Top-1 / Top-10 percentages (scaled by 100 here)
# and a mean rank.
metrics = func_emb_metrics(tableName)
t1_tr, t10_tr, rank_tr = metrics
print(f'Turkish Top-1: {t1_tr*100:.2f}%, Top-10: {t10_tr*100:.2f}%, Mean Rank: {rank_tr:.2f}')