In [None]:
import os
from tqdm import tqdm
import re
import fasttext
import numpy as np
from numpy.linalg import norm
from scipy import stats
import time

In [None]:
def cosineSimilarity(vector1, vector2):
    """Return the cosine similarity of two vectors.

    The similarity is the dot product of the vectors divided by the product
    of their norms. When that product is zero, 0 is returned instead of
    performing the division.
    """
    norm_product = norm(vector1) * norm(vector2)
    if norm_product != 0:
        return np.dot(vector1, vector2) / norm_product

    # Degenerate case: the norm product is zero, so the ratio is undefined.
    return 0

In [None]:
def get_rankings(fname, model):
    """Read a tab-separated pair file and score every pair with the model.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is split on tabs and read as:
    source id, source title, target id, target title, mean score.

    Pairs for which the model does not contain the source id or the target id
    are reported on stdout and left out of both results.

    Args:
        fname: Path of the tab-separated input file.
        model: Object that supports ``id in model`` and
            ``model.get_word_vector(id)`` (a fastText model in this notebook).

    Returns:
        A ``(groundtruth_ranking, embedding_ranking)`` tuple of dicts. Both
        are keyed by ``(int(source id), int(target id), line index)`` where
        the line index is 0-based and counts the header. The first dict maps
        to the mean score from the file, the second to the cosine similarity
        of the two vectors returned by the model.
    """
    groundtruth_ranking = {}; embedding_ranking = {}
    # The context manager closes the file even if parsing a line raises.
    with open(fname) as fin:
        for lnum, line in enumerate(tqdm(fin)):
            if lnum == 0: #Skipping first line, which is the header!
                continue
            if not line.strip():
                # Blank (e.g. trailing) lines carry no pair; skip them instead
                # of failing on the missing columns.
                continue
            fields = line.strip().split("\t")
            src_pid = fields[0]; src_ptitle = fields[1]
            trgt_pid = fields[2]; trgt_ptitle = fields[3]
            mean_score = float(fields[4])
            if src_pid not in model or trgt_pid not in model:
                print(f'Either the source: {(src_pid, src_ptitle)} or the target: {(trgt_pid, trgt_ptitle)} is missing!')
                continue
            src_emb = model.get_word_vector(src_pid); trgt_emb = model.get_word_vector(trgt_pid)
            # The line index is part of the key so that a pair repeated in the
            # file gets its own entry instead of overwriting an earlier one.
            pair_key = (int(src_pid), int(trgt_pid), lnum)
            groundtruth_ranking[pair_key] = mean_score
            embedding_ranking[pair_key] = cosineSimilarity(src_emb, trgt_emb)
    return groundtruth_ranking, embedding_ranking

In [None]:
def get_rank_correlation(groundtruth_ranking, embedding_ranking):
    """Compute the Spearman rank correlation between two score dictionaries.

    Both arguments map a pair key to a score. The correlation is computed
    over the keys present in both dictionaries, comparing for each key its
    ground-truth score with its embedding score.

    The scores are handed to ``stats.spearmanr`` directly, aligned by key.
    Sorting each dictionary by score and correlating the resulting sequences
    of ids is NOT equivalent: that compares which id sits at each position
    rather than which rank each pair received, it generally yields a different
    coefficient, and it breaks ties arbitrarily by sort order.

    Args:
        groundtruth_ranking: dict mapping pair key -> ground-truth score.
        embedding_ranking: dict mapping pair key -> embedding similarity.

    Returns:
        ``(rho, pval, len(groundtruth_ranking), len(embedding_ranking))``.
        ``rho`` and ``pval`` are NaN when fewer than two keys are shared,
        because a rank correlation is undefined in that case.
    """
    num_pairs_gt = len(groundtruth_ranking)
    num_pairs_emb = len(embedding_ranking)

    shared_keys = [key for key in groundtruth_ranking if key in embedding_ranking]
    if len(shared_keys) < 2:
        return float("nan"), float("nan"), num_pairs_gt, num_pairs_emb

    groundtruth_scores = [groundtruth_ranking[key] for key in shared_keys]
    embedding_scores = [embedding_ranking[key] for key in shared_keys]
    rho, pval = stats.spearmanr(groundtruth_scores, embedding_scores)
    return rho, pval, num_pairs_gt, num_pairs_emb

In [None]:
def print_and_write_output(eval_type, fout, emb_type, d, c, rho, pval, num_pairs_gt, num_pairs_emb):
    """Report one evaluation result on stdout and append it to a results file.

    Three human-readable lines are printed: the task name, the embedding
    configuration, and the correlation statistics. A single tab-separated row
    with the fields emb_type, d, c, rho, pval, num_pairs_gt, num_pairs_emb
    (in that order) followed by a newline is written to ``fout``.

    Args:
        eval_type: Task name inserted into the first printed line.
        fout: Writable text file object that receives the row.
        emb_type, d, c: Embedding type, dimensionality and context window size.
        rho, pval: Correlation coefficient and its p-value.
        num_pairs_gt, num_pairs_emb: Pair counts reported next to the result.
    """
    report_lines = (
        f'Spearman Rank Correlation for the WikiSRS {eval_type} task!',
        f'Embedding Details. InfSource: {emb_type}, Dim: {d}, ContextWindowSize: {c}',
        f'rho: {rho}, pvalue: {pval}, num_queries_gt: {num_pairs_gt}, num_queries_emb: {num_pairs_emb}',
    )
    for report_line in report_lines:
        print(report_line)

    row_fields = (emb_type, d, c, rho, pval, num_pairs_gt, num_pairs_emb)
    fout.write('\t'.join(f'{field}' for field in row_fields) + '\n')

In [None]:
# Project root: the parent of the current working directory.
root_dir = os.path.abspath(os.pardir)
PATH_IN = root_dir

# Language codes used to build the per-language input and output file names.
langlist = [
    'en', 'ru', 'ja', 'de',
    'fr', 'it', 'pl', 'fa',
]

# Embedding variants; each name is part of the model file name that is loaded.
emb_types = [
    'real_nav',
    'gen_clickstream_private',
    'gen_clickstream_public',
    'gen_graph',
]

In [None]:
# For every language, evaluate each embedding variant on the relatedness and
# the similarity file and write one results TSV per language and task.
for lang in langlist:
    relatedness_fname = os.path.join(PATH_IN, 'data', 'relatedness', f'{lang}_WikiSRS_relatedness.tsv')
    similarity_fname = os.path.join(PATH_IN, 'data', 'relatedness', f'{lang}_WikiSRS_similarity.tsv')

    model_path = os.path.join(PATH_IN, 'data', 'navigation_embeddings', lang)
    results_dir = os.path.join(PATH_IN, 'downstream_tasks', 'relatedness_results')
    results_header = 'EmbeddingType\tDimensionality\tContextWindowSize\tRankCorrelation\tPvalue\tNumQueries_GT\tNumQueries_Emb\n'

    # Context managers close both result files even if loading a model or
    # evaluating a file raises part-way through the language.
    with open(os.path.join(results_dir, f'{lang}wiki_relatedness.tsv'), "w") as fout_relatedness_results, \
         open(os.path.join(results_dir, f'{lang}wiki_similarity.tsv'), "w") as fout_similarity_results:
        fout_relatedness_results.write(results_header)
        fout_similarity_results.write(results_header)

        for emb_type in emb_types:
            for d in [128]:
                # Only context window size 5 is evaluated; extend this list
                # (e.g. [1, 3, 5, 7]) to sweep over more window sizes.
                for c in [5]:
                    tmodel = time.time()
                    model_fname = os.path.join(model_path, f'article_representations_{emb_type}_{d}_{c}.bin')
                    model = fasttext.load_model(model_fname)
                    print(f'Loaded model {model_fname} in {time.time()-tmodel} seconds!')

                    groundtruth_ranking_rel, embedding_ranking_rel = get_rankings(relatedness_fname, model)
                    rho_rel, pval_rel, num_pairs_gt_rel, num_pairs_emb_rel = get_rank_correlation(groundtruth_ranking_rel, embedding_ranking_rel)
                    print_and_write_output('relatedness', fout_relatedness_results, emb_type, d, c, rho_rel, pval_rel, num_pairs_gt_rel, num_pairs_emb_rel)
                    # Flush after every row so partial results survive a later failure.
                    fout_relatedness_results.flush()

                    groundtruth_ranking_sim, embedding_ranking_sim = get_rankings(similarity_fname, model)
                    rho_sim, pval_sim, num_pairs_gt_sim, num_pairs_emb_sim = get_rank_correlation(groundtruth_ranking_sim, embedding_ranking_sim)
                    print_and_write_output('similarity', fout_similarity_results, emb_type, d, c, rho_sim, pval_sim, num_pairs_gt_sim, num_pairs_emb_sim)
                    fout_similarity_results.flush()