# Evaluating various pre-trained models on SimLex-999

This notebook calculates the correlations between the similarity scores of various pre-trained models and the SimLex-999 similarity scores, as well as the correlations between the models' similarity scores and the similarity ratings obtained from Estonian raters. 
<br> 
The EstSimLex-999 data set is also filtered by part of speech (POS) and by concreteness level, and the correlations are calculated for these filtered subsets as well. 

## Imports


In [None]:
from __future__ import print_function
import gensim
print(gensim.__version__) 
import pandas as pd
from scipy.stats import pearsonr, spearmanr, kendalltau, linregress
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText
import os


## Defining all the methods

In [None]:
def evaluate(model, modelname, data, saved_correlations):
    """
    Score the word pairs of ``data`` with ``model`` and record the correlations.

    For every row of ``data`` whose two words are both contained in ``model``
    (``word in model``), the similarity ``model.similarity(word1, word2)`` is
    computed. Rows with at least one word that is not in the model are skipped
    and counted as missing. ``calc_correlations`` is then called once for each
    of six subsets of the scored pairs, each call adding one result row:

    * ``<modelname>_orig`` - all scored pairs (its "missing" cell holds the
      number of skipped pairs, all other subsets get "-"),
    * ``<modelname>_A`` / ``_N`` / ``_V`` - pairs whose "POS" is "A"
      (adjective), "N" (noun) or "V" (verb),
    * ``<modelname>_conc`` - the first 250 pairs when sorted by "conc(w1)",
      then "conc(w2)", in descending order (labelled "concrete"),
    * ``<modelname>_abst`` - the first 250 pairs when sorted by the same
      columns in ascending order (labelled "abstract").

    Parameters
    ----------
    model
        Object supporting ``word in model`` and ``model.similarity(w1, w2)``,
        e.g. a gensim ``KeyedVectors`` instance.
    modelname : str
        Prefix used for the "model" column of the result rows.
    data : pandas.DataFrame
        Must contain the columns "sõna 1", "sõna 2", "Average", "SimLex999",
        "conc(w1)", "conc(w2)" and "POS". "Average" is stored as the
        EstSimLex999 score.
    saved_correlations : pandas.DataFrame
        Results collected so far; passed on to ``calc_correlations``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last ``calc_correlations`` call, i.e. the
        given results with six additional rows.
    """
    columns = ["sõna1", "sõna2", "SimLex999", "similarity_from_model",
               "EstSimLex999", "POS", "conc(w1)", "conc(w2)"]

    # Collect the rows in a plain list and build the frame once afterwards:
    # DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas >= 2.0.
    rows = []
    missing = 0
    for _, row in data.iterrows():
        s1 = row["sõna 1"]
        s2 = row["sõna 2"]
        if s1 in model and s2 in model:
            rows.append({
                "sõna1": s1,
                "sõna2": s2,
                "SimLex999": row["SimLex999"],
                "similarity_from_model": model.similarity(s1, s2),
                "EstSimLex999": row["Average"],
                "POS": row["POS"],
                "conc(w1)": row["conc(w1)"],
                "conc(w2)": row["conc(w2)"],
            })
        else:
            missing += 1
    sims_from_model = pd.DataFrame(rows, columns=columns)

    # All scored pairs; only this row reports how many pairs were skipped.
    saved_correlations = calc_correlations(
        sims_from_model, saved_correlations, modelname + "_orig", missing)

    # Filtering by POS (A-adjective, N-noun, V-verb).
    for pos_tag in ("A", "N", "V"):
        subset = sims_from_model[sims_from_model["POS"] == pos_tag]
        saved_correlations = calc_correlations(
            subset, saved_correlations, modelname + "_" + pos_tag, "-")

    # Filtering by concreteness (conc-concrete, abst-abstract): the 250 pairs
    # at either end of the ordering by conc(w1), then conc(w2).
    conc_columns = ["conc(w1)", "conc(w2)"]
    most_concrete = sims_from_model.sort_values(conc_columns, ascending=False)[:250]
    saved_correlations = calc_correlations(
        most_concrete, saved_correlations, modelname + "_conc", "-")
    most_abstract = sims_from_model.sort_values(conc_columns, ascending=True)[:250]
    saved_correlations = calc_correlations(
        most_abstract, saved_correlations, modelname + "_abst", "-")

    return saved_correlations
                
        
def calc_correlations(sims_from_model, saved_correlations, name, missing):
    """
    Add one row of correlation coefficients to ``saved_correlations``.

    The Pearson, Spearman and Kendall tau correlations are computed between
    the model's similarity scores and (a) the SimLex999 scores and (b) the
    EstSimLex999 scores. Only the coefficient (not the p-value) is kept, and
    it is rounded to three decimals.

    Parameters
    ----------
    sims_from_model : pandas.DataFrame
        Must contain the columns "similarity_from_model", "SimLex999" and
        "EstSimLex999".
    saved_correlations : pandas.DataFrame
        Results collected so far. It is not modified.
    name : str
        Value written to the "model" column of the new row.
    missing
        Value written to the "missing" column of the new row (the callers in
        this file pass either a count or "-").

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``saved_correlations`` followed by the
        new row, with a fresh 0..n-1 index.
    """
    model_scores = sims_from_model.similarity_from_model
    measures = (("pearson", pearsonr), ("spearman", spearmanr), ("kendall", kendalltau))

    row = {"model": name, "missing": missing}
    # SL = SimLex999 scores, ESL = EstSimLex999 scores.
    for prefix, column in (("SL", "SimLex999"), ("ESL", "EstSimLex999")):
        human_scores = sims_from_model[column]
        for label, correlation in measures:
            # Element 0 of each scipy result is the correlation coefficient.
            row[prefix + "_" + label] = round(correlation(model_scores, human_scores)[0], 3)

    new_row = pd.DataFrame([row], columns=[
        "model", "SL_pearson", "SL_spearman", "SL_kendall",
        "ESL_pearson", "ESL_spearman", "ESL_kendall", "missing",
    ])

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([saved_correlations, new_row], ignore_index=True)

    
            
        
        

Load EstSimLex-999 into a dataframe and create a new, empty dataframe in which all the correlations will be saved. 

In [None]:
# Word pairs with their human similarity scores, read from the Excel sheet.
data = pd.read_excel(io="EstSimLex999.xlsx")

# Empty results table: one row per evaluated model/subset will be added to it.
df = pd.DataFrame(
    columns=[
        "model",
        "SL_pearson",
        "SL_spearman",
        "SL_kendall",
        "ESL_pearson",
        "ESL_spearman",
        "ESL_kendall",
        "missing",
    ]
)

## Evaluating Facebook research fastText vectors 

Facebook research fastText vectors for English and Estonian were downloaded from https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md.
These pre-trained vectors were trained using CBOW with position-weights, in dimension 300, with character n-grams of length 5, a window of size 5 and 10 negatives. 
The similarity measure used is cosine similarity. The acquired similarities are compared with the EstSimLex-999 and SimLex-999 ones, and the correlation between them is computed.

In [None]:
# loading pretrained vectors
# specify the path, where the vectors can be accessed
# (raw string: the backslashes of the Windows path must stay literal, and
# "\m" / "\w" are invalid escape sequences in a normal string literal)
path_to_model = r'F:\models\wiki.et.vec'

# No `binary` flag is passed, so the file is read with gensim's default setting.
wiki_model_est = KeyedVectors.load_word2vec_format(path_to_model)

In [None]:
# Evaluate the Estonian fastText vectors. `df` is the results table created
# earlier; the frame returned here is what the later cells keep extending.
correlations = evaluate(
    model=wiki_model_est,
    modelname="wiki_model_est",
    data=data,
    saved_correlations=df,
)

## EstNLTK word2vec models

The EstNLTK models were trained with the word2vec software on the Estonian Reference Corpus. The embeddings can be downloaded from https://github.com/estnltk/word2vec-models. <br>
Four of the models were trained with CBOW (two of them on the lemmatized version of the corpus). <br>
The other four models were trained with Skip-gram (two of them on the lemmatized version of the corpus). 

In [None]:
# Iterating over all the models and evaluating them, saving results to the same dataframe
# The directory is named once and reused for listing and for building the paths.
estnltk_dir = "F:\\models\\estnltk models"

# os.listdir returns the names in arbitrary order; sorting keeps the order of
# the result rows reproducible between runs.
estnltk_models = sorted(os.listdir(estnltk_dir))
for model_name in estnltk_models:
    path = os.path.join(estnltk_dir, model_name)
    # These files are loaded in the binary word2vec format.
    model = KeyedVectors.load_word2vec_format(path, binary=True)
    correlations = evaluate(model, model_name, data, correlations)

## Pretrained word and multi-sense embeddings for Estonian
Eleri Aedmaa's word and sense vectors can be downloaded from https://github.com/eleriaedmaa/embeddings. 
<br>
A description of the models can also be found in that GitHub repository. All the embeddings were trained on the lemmatized etTenTen: Corpus of the Estonian Web.

In [None]:
# evaluating all the models (only the word vectors, not sense vectors), saving to the same dataframe
# Raw string: keeps the backslashes literal ("\m" is an invalid escape
# sequence in a normal string literal).
ettenten_dir = r"F:\models\models"

# os.listdir returns the names in arbitrary order; sorting keeps the order of
# the result rows reproducible between runs.
models = sorted(os.listdir(ettenten_dir))
for model_name in models:
    # Every entry of the directory is expected to be a folder that contains
    # the word-vector file "ettenten.txt.word_vectors".
    path = os.path.join(ettenten_dir, model_name, "ettenten.txt.word_vectors")
    model = KeyedVectors.load_word2vec_format(path)
    correlations = evaluate(model, model_name, data, correlations)

Now that all the models have been evaluated and the results are in the dataframe, let's save it to an Excel file. 

In [None]:
# Write the collected correlation table to an Excel workbook.
correlations.to_excel(excel_writer="all_correlations.xlsx")