In [None]:
import sensegram
from __future__ import print_function
import gensim
print(gensim.__version__) 
import pandas as pd
from scipy.stats import pearsonr, spearmanr, kendalltau, linregress
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText
import os

In [None]:
def calc_results(df, missing, model, results):
    """
    Append correlation scores for both rating sets ("ESL" and "SL") to ``results``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns "ESL", "SL" and "model"; it is passed
        unchanged to ``evaluate``.
    missing
        Value recorded in the "missing" field of each new row.
    model
        Value recorded in the "model" field of each new row.
    results : pandas.DataFrame
        Rows collected so far.

    Returns
    -------
    pandas.DataFrame
        ``results`` with one additional row per rating set, as returned by
        ``evaluate``.
    """
    # evaluate() takes (df, results, sim_set, model, missing). The model must be
    # passed explicitly; leaving it out shifts `missing` into the `model` slot
    # and the call fails with a TypeError for the absent fifth argument.
    for sim_set in ("ESL", "SL"):
        results = evaluate(df, results, sim_set, model, missing)
    return results
    
    
    
def evaluate(df, results, sim_set, model, missing):
    """
    Correlate one rating column with the model scores and record the outcome.

    Pearson, Spearman and Kendall coefficients are computed between
    ``df[sim_set]`` and ``df["model"]``, each rounded to three decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column named by ``sim_set`` and a "model" column.
    results : pandas.DataFrame
        Rows collected so far. It is not modified.
    sim_set : str
        Name of the column in ``df`` to correlate with ``df["model"]``.
    model
        Value stored in the "model" field of the new row.
    missing
        Value stored in the "missing" field of the new row.

    Returns
    -------
    pandas.DataFrame
        A new frame consisting of ``results`` followed by one row with the
        keys "model", "sim_set", "pearson", "spearman", "kendall", "missing".
    """
    # Each scipy function returns (statistic, p-value); only the statistic is kept.
    pearson = round(pearsonr(df[sim_set], df["model"])[0], 3)
    spearman = round(spearmanr(df[sim_set], df["model"])[0], 3)
    kendall = round(kendalltau(df[sim_set], df["model"])[0], 3)

    new_row = pd.DataFrame([{
        "model": model,
        "sim_set": sim_set,
        "pearson": pearson,
        "spearman": spearman,
        "kendall": kendall,
        "missing": missing,
    }])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement and likewise returns a new frame with a fresh 0..n-1 index.
    return pd.concat([results, new_row], ignore_index=True)


In [None]:
# Directory that is listed to obtain the model names; every entry is expected
# to be a folder containing the sense vectors file named below.
MODELS_DIR = "F:\\models\\models"
SENSE_VECTORS_FILE = "ettenten.txt.sense_vectors"

models = os.listdir(MODELS_DIR)
data = pd.read_excel("Ratings.xlsx")
results = pd.DataFrame(columns=["model", "sim_set", "pearson", "spearman", "kendall", "missing"])

# The "model" column holds the similarity produced by the current model,
# because evaluate() correlates df["model"] against the rating columns.
score_columns = ["sõna1", "sõna2", "model", "ESL", "SL"]
similarity_scores = pd.DataFrame(columns=score_columns)

for model in models:
    sense_vector = os.path.join(MODELS_DIR, model, SENSE_VECTORS_FILE)
    sv = sensegram.SenseGram.load_word2vec_format(sense_vector, binary=False)

    missing = 0
    # Collected fresh for every model so that one model's correlations are
    # never computed over word pairs scored by a previously loaded model.
    pair_records = []
    for i, row in data.iterrows():
        s1 = row["sõna 1"]
        s2 = row["sõna 2"]
        esl = row["Average"]
        sl = row["SimLex999"]

        s1_senses = sv.get_senses(s1)
        s2_senses = sv.get_senses(s2)

        # A pair is scored only when get_senses() returns something for both
        # words; otherwise it is counted as missing for this model.
        if len(s1_senses) != 0 and len(s2_senses) != 0:
            sim = sv.max_pairwise_sim(s1, s2)
            pair_records.append({"sõna1": s1, "sõna2": s2, "model": sim, "ESL": esl,
                                 "SL": sl})
        else:
            missing += 1

    # Build the frame once per model instead of appending row by row
    # (DataFrame.append was removed in pandas 2.0 and was quadratic anyway).
    similarity_scores = pd.DataFrame(pair_records, columns=score_columns)

    results = calc_results(similarity_scores, missing, model, results)

In [None]:
# Write the collected correlation table to an Excel workbook.
results.to_excel(excel_writer="sense_embeddings.xlsx")