# Calculate the evaluation metrics

In [None]:
import os
import pandas as pd
import numpy as np
import pickle

In [None]:
def calculate_mrr_score(prep, pred_col, true_cols):
    """Return the mean reciprocal rank (MRR) of the retrieved QIDs.

    For every row the retrieved list is de-duplicated (first occurrence
    kept) and the reciprocal of the 1-based position of the first retrieved
    QID that is contained in the row's ground-truth collection is taken.
    Rows without any hit contribute 0.

    The input frame is not modified: de-duplication happens on per-row local
    values, so the function is safe to call on a filtered slice of another
    DataFrame.

    Args:
        prep: DataFrame with one row per query.
        pred_col: Name of the column holding the retrieved QIDs; the position
            in the list is used as the rank.
        true_cols: Name of the column holding the collection of correct QIDs.

    Returns:
        The mean of the per-row reciprocal ranks (NaN for an empty frame).
    """
    reciprocal_ranks = []
    for retrieved, relevant in zip(prep[pred_col], prep[true_cols]):
        # dict.fromkeys drops repeated QIDs while keeping first-seen order.
        unique_retrieved = dict.fromkeys(retrieved)
        first_hit = next(
            (
                rank
                for rank, qid in enumerate(unique_retrieved, start=1)
                if qid in relevant
            ),
            None,
        )
        reciprocal_ranks.append(1 / first_hit if first_hit is not None else 0)
    return pd.Series(reciprocal_ranks, dtype=float).mean()

def calculate_ndcg_score(prep, pred_col, true_cols):
    """Return the mean NDCG of the retrieved QIDs using binary relevance.

    Per row, the retrieved list is de-duplicated (first occurrence kept).
    The DCG sums ``1 / log2(rank + 1)`` over the 1-based ranks of retrieved
    QIDs found in the ground truth. The ideal DCG sums the same gain over the
    first ``min(#distinct correct QIDs, #distinct retrieved QIDs)`` ranks.

    Distinct correct QIDs are counted for the ideal DCG because the retrieved
    list is de-duplicated: a repeated ground-truth QID can be matched at most
    once, so counting it twice would make a perfect retrieval score below 1.

    The input frame is not modified. Rows whose ideal DCG is 0 (no correct or
    no retrieved QIDs) yield NaN and are skipped by the final mean.

    Args:
        prep: DataFrame with one row per query.
        pred_col: Name of the column holding the retrieved QIDs; the position
            in the list is used as the rank.
        true_cols: Name of the column holding the collection of correct QIDs.

    Returns:
        The mean of the per-row NDCG values.
    """
    dcg_values = []
    idcg_values = []
    for retrieved, relevant in zip(prep[pred_col], prep[true_cols]):
        # dict.fromkeys drops repeats while keeping first-seen order.
        unique_retrieved = list(dict.fromkeys(retrieved))
        unique_relevant = dict.fromkeys(relevant)

        dcg_values.append(
            sum(
                1 / np.log2(rank + 1)
                for rank, qid in enumerate(unique_retrieved, start=1)
                if qid in unique_relevant
            )
        )

        ideal_hits = min(len(unique_relevant), len(unique_retrieved))
        idcg_values.append(
            sum(1 / np.log2(rank + 1) for rank in range(1, ideal_hits + 1))
        )

    dcg = pd.Series(dcg_values, dtype=float)
    idcg = pd.Series(idcg_values, dtype=float)
    return (dcg / idcg).mean()

def calculate_accuracy_score(df):
    """Return the fraction of rows whose top-scored QID is the correct one.

    For each row, the QID at the position of the highest value in
    'Retrieval Score' is picked from 'Retrieval QIDs' and compared with
    'Correct QID'.

    Rows are paired positionally rather than through index labels, so the
    function also works when the frame's index contains repeated labels.

    Args:
        df: DataFrame with the columns 'Retrieval QIDs', 'Retrieval Score'
            and 'Correct QID'.

    Returns:
        The share of rows with a correct top prediction (NaN for an empty
        frame).
    """
    is_correct = [
        bool(qids[int(np.argmax(scores))] == correct_qid)
        for qids, scores, correct_qid in zip(
            df['Retrieval QIDs'], df['Retrieval Score'], df['Correct QID']
        )
    ]
    return pd.Series(is_correct, dtype=bool).mean()

def calculate_log_odds_ratio_score(df):
    """Return the mean log-odds difference between correct and wrong QIDs.

    For each row, the highest 'Retrieval Score' attached to 'Correct QID' and
    the highest one attached to 'Wrong QID' are each turned into log-odds,
    ``log(s / (1 - s))``, and the wrong value is subtracted from the correct
    one. The result is the mean of these differences over all rows.

    NOTE(review): a QID that does not occur in 'Retrieval QIDs' falls back to
    a score of 0, whose log-odds is -inf; a score of exactly 1 divides by
    zero. Confirm the scores are always strictly between 0 and 1.
    """
    def best_log_odds(row, target_qid):
        # Highest score among the retrieved entries matching target_qid.
        best_score = max(
            (
                score
                for qid, score in zip(row['Retrieval QIDs'], row['Retrieval Score'])
                if qid == target_qid
            ),
            default=0,
        )
        return np.log(best_score / (1 - best_score))

    def log_odds_ratio(row):
        correct_log_odds = best_log_odds(row, row['Correct QID'])
        wrong_log_odds = best_log_odds(row, row['Wrong QID'])
        return correct_log_odds - wrong_log_odds

    # One log-odds difference per row, averaged over the frame.
    per_row = df.apply(log_odds_ratio, axis=1)
    return per_row.mean()

In [None]:
# Evaluate MRR and NDCG on a single retrieval-results pickle (Mintaka file).
filename = "../data/Evaluation Data/retrieval_results_Mintaka-wikidata_test_v1-en.pkl"
# NOTE: unpickling can execute arbitrary code; only load result files that
# were produced by this project.
with open(filename, "rb") as results_file:
    prep = pickle.load(results_file)

# A row is incomplete when its retrieval result is missing (None/NaN, which
# have no length) or is an empty collection. Stop if any such row exists.
incomplete = prep['Retrieval QIDs'].apply(lambda x: not hasattr(x, '__len__') or len(x) == 0)
if incomplete.any():
    raise RuntimeError("Evaluation not complete")

# For Mintaka, LC_QuAD, and RuBQ
# Keep only rows where every 'Question in Wikipedia' and 'Answer in Wikipedia'
# flag is truthy; copy so that adding a column does not write into a slice.
prep = prep[prep.apply(lambda x: all(x['Question in Wikipedia'] + x['Answer in Wikipedia']), axis=1)].copy()
prep['Correct QIDs'] = prep.apply(lambda x: x['Question QIDs'] + x['Answer QIDs'], axis=1)

# For REDFM
# prep = prep[prep['Correct in Wikipedia']]
# prep['Correct QIDs'] = prep['Correct QID'].apply(lambda x: [x])

print("Size Data: ", len(prep))
print("MRR:")
print(calculate_mrr_score(prep, 'Retrieval QIDs', 'Correct QIDs'))
print("NDCG:")
print(calculate_ndcg_score(prep, 'Retrieval QIDs', 'Correct QIDs'))

In [None]:
# Evaluate MRR and NDCG for every results pickle in one directory.
directory = '../data/Evaluation Data/Language Results Balanced/REDFM-noentity'
# os.listdir returns entries in arbitrary order; sort for reproducible output.
for file in sorted(os.listdir(directory)):
    print(file)
    filename = f"{directory}/{file}"
    # NOTE: unpickling can execute arbitrary code; only load result files
    # that were produced by this project.
    with open(filename, "rb") as results_file:
        prep = pickle.load(results_file)

    # A row is incomplete when its retrieval result is missing (None/NaN,
    # which have no length) or is an empty collection. Skip such files.
    incomplete = prep['Retrieval QIDs'].apply(lambda x: not hasattr(x, '__len__') or len(x) == 0)
    if incomplete.any():
        print("Evaluation not complete")
        continue

    # For Mintaka, LC_QuAD, and RuBQ
    # prep = prep[prep.apply(lambda x: all(x['Question in Wikipedia'] + x['Answer in Wikipedia']), axis=1)]
    # prep['Correct QIDs'] = prep.apply(lambda x: x['Question QIDs'] + x['Answer QIDs'], axis=1)

    # For REDFM
    # Keep rows flagged in 'Correct in Wikipedia'; copy so that adding a
    # column does not write into a slice of the loaded frame.
    prep = prep[prep['Correct in Wikipedia']].copy()
    # Wrap the single correct QID in a list, the shape the metrics expect.
    prep['Correct QIDs'] = prep['Correct QID'].apply(lambda x: [x])

    print("Size Data: ", len(prep))
    print("MRR:")
    print(calculate_mrr_score(prep, 'Retrieval QIDs', 'Correct QIDs'))
    print("NDCG:")
    print(calculate_ndcg_score(prep, 'Retrieval QIDs', 'Correct QIDs'))
    print()

In [None]:
# Evaluate accuracy and log-odds ratio on a single retrieval-results pickle
# (Wikidata-Disamb file).
filename = "../data/Evaluation Data/retrieval_results_Wikidata-Disamb-wikidata_test_v1-en.pkl"
# NOTE: unpickling can execute arbitrary code; only load result files that
# were produced by this project.
with open(filename, "rb") as results_file:
    prep = pickle.load(results_file)

# A row is incomplete when its retrieval result is missing (None/NaN, which
# have no length) or is an empty collection. Stop if any such row exists.
incomplete = prep['Retrieval QIDs'].apply(lambda x: not hasattr(x, '__len__') or len(x) == 0)
if incomplete.any():
    raise RuntimeError("Evaluation not complete")

print("Size Data: ", len(prep))
print("Accuracy:")
print(calculate_accuracy_score(prep))
print("Log Odds:")
print(calculate_log_odds_ratio_score(prep))