# Evaluating various pre-trained models on SimLex-999

In this file correlations between various pretrained models similarity and SimLex-999 similarity are calculated, also correlations between the models' similarity score and similarity attained from Estonian raters are calculated. 
<br> 
Lastly the SimLex-999 dataset is also filtered by POS group and the correlations with these filtered datasets are calculated. 

## Imports


In [35]:
from __future__ import print_function
import gensim
print(gensim.__version__) 
import pandas as pd
from scipy.stats import pearsonr, spearmanr, kendalltau, linregress
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText
import os


3.7.0


## Facebook research fastText vectors 

Facebook research fastText vectors for English and Estonian were downloaded from https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md
These pre-trained vectors were trained using CBOW with position-weights, in dimension 300, character n-grams of length 5, window size 5 and 10 negaitves. 
Similarity between concepts are derived from the vectors. Used similarity measure is cosine similarity. This aquired similarities are compared with the SimLex-999 ones and correlation between them are found. Also similarity between different POS groups are found and also the results are compared with the SimLex-999. 

In [2]:
# loading pretrained vectors
wiki_model_est = KeyedVectors.load_word2vec_format('wiki.et.vec')
wiki_model_eng = KeyedVectors.load_word2vec_format('wiki.en.vec')

In [53]:
# loading the translated Simlex-999 to dataframe
file_name = "SimLex-999-eng-est.xlsx"
data = pd.read_excel(file_name)
data.head()

Unnamed: 0,word 1,word 2,sõna 1,sõna 2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex),Average
0,old,new,vana,uus,A,1.58,2.72,2.81,2,7.25,1,0.41,0.0
1,smart,intelligent,tark,intelligentne,A,9.2,1.75,2.46,1,7.11,1,0.67,9.0
2,hard,difficult,kõva,raske,A,8.77,3.76,2.21,2,5.94,1,1.19,1.5
3,happy,cheerful,õnnelik,rõõmsameelne,A,9.55,2.56,2.34,1,5.85,1,2.18,9.25
4,hard,easy,kõva,lihtne,A,0.95,3.76,2.07,2,5.82,1,0.93,0.0


## All the methods used

In [45]:
def getSimilarityFromModel(model, data):
    # finds the similarity between concepts from the model, drops all the concept pairs, that are not represented in the model
    # returns 2 dataframes- one for the similarities and the other for missing pairs. 
    df = pd.DataFrame(columns=["word1", "word2","Sõna1", "Sõna2", "SimLex999", "SimFromModel", "POS", "simest"])
    missing = pd.DataFrame(columns=["word1", "word2", "Sõna 1", "Sõna 2"])
    for index, row in data.iterrows():
        word1 = row["sõna 1"].lower().strip()
        word2 = row["sõna 2"].lower().strip()
        eword1 = row["word 1"]
        eword2 = row["word 2"]
        simlex = row["SimLex999"]
        pos = row["POS"]
        try: 
            sim = model.similarity(word1, word2)
            df = df.append({"word1":eword1, "word2": eword2,"Sõna1":word1, "Sõna2":word2, "SimLex999":simlex,"simest":row["Average"], "SimFromModel":sim, "POS":pos}, ignore_index=True)
        except: 
            missing = missing.append({"word1":eword1, "word2": eword2, "Sõna 1":word1, "Sõna 2":word2}, ignore_index=True)
    return df, missing


def correlations(df):
    # Finds correlations between SimLex-999 and model
    # returns the correlations
    pearson = pearsonr(df.SimLex999, df.SimFromModel)
    spearman = spearmanr(df.SimLex999, df.SimFromModel)
    kendall = kendalltau(df.SimLex999, df.SimFromModel)
    
    pearsonEst = pearsonr(df.simest, df.SimFromModel)
    spearmanEst = spearmanr(df.simest, df.SimFromModel)
    kendallEst = kendalltau(df.simest, df.SimFromModel)
    return {"pearson":pearson, "spearman":spearman, "kendall": kendall, "pearsonEst":pearsonEst, "spearmanEst":spearmanEst, "kendallEst": kendallEst}


def transform(row):
    # scaales the similarity from the model into the 0-10 scale 
    return 10*(row["SimFromModel"] - minV)/(maxV-minV)

def printCorrelations(dic):
    # Prints the correlations
    print("Pearson correlation: " + str(dic["pearson"][0]))
    print("Spearman correlation: " + str(dic["spearman"][0]))
    print("Kendall tau correlation: " + str(dic["kendall"][0]))
    print("Pearson correlation between estsim and model: " + str(dic["pearsonEst"][0]))
    print("Spearman correlation between estsim and model: " + str(dic["spearmanEst"][0]))
    print("Kendall tau correlation between estsim and model: " + str(dic["kendallEst"][0]))

def evaluateModel(model, name, data):
    # uses all the other methods to evaluate the model and saves to an excel file
    p,m = getSimilarityFromModel(model, data)
    p["SimFromModel"] = p.apply(transform, axis=1)
    print(name)
    print("Missing pairs: "+ str(m.shape[0]))
    printCorrelations(correlations(p))
    writer = pd.ExcelWriter(name+'_similarity.xlsx', engine='xlsxwriter')
    p.to_excel(writer, sheet_name="similarities")
    m.to_excel(writer, sheet_name="missing pairs")


def evaluateModelsList(models):
    
    for model in models: 
        evaluateModel(model, )

In [54]:
minV, maxV = -1, 1
evaluateModel(wiki_model_est, "wiki_model_est_fastText", data)


wiki_model_est_fastText
Missing pairs: 19
Pearson correlation: 0.29742956551538074
Spearman correlation: 0.3048684145507242
Kendall tau correlation: 0.20820516039731474
Pearson correlation between estsim and model: 0.3689550410571098
Spearman correlation between estsim and model: 0.3820545254678176
Kendall tau correlation between estsim and model: 0.2677731193935729


## word2vec models

Models were trained with word2vec software, on an Estonian Reference corpus. 

In [21]:
# loading the models 
lemma_est_model_cbow100 = KeyedVectors.load_word2vec_format('lemmas.cbow.s100.w2v.bin', binary=True)
lemma_est_model_sg100 = KeyedVectors.load_word2vec_format('lemmas.sg.s100.w2v.bin', binary=True)
lemma_est_model_cbow200 = KeyedVectors.load_word2vec_format('lemmas.cbow.s200.w2v.bin', binary=True)
lemma_est_model_sg200 = KeyedVectors.load_word2vec_format('lemmas.sg.s200.w2v.bin', binary=True)
word_est_model_cbow100 = KeyedVectors.load_word2vec_format('words.cbow.s200.w2v.bin', binary=True)
word_est_model_sg100 = KeyedVectors.load_word2vec_format('words.sg.s100.w2v.bin', binary=True)
word_est_model_cbow200 = KeyedVectors.load_word2vec_format('words.cbow.s200.w2v.bin', binary=True)
word_est_model_sg200 = KeyedVectors.load_word2vec_format('words.sg.s200.w2v.bin', binary=True)

In [55]:
# evaluating and writing results to an excel file. 
evaluateModel(lemma_est_model_cbow100, "lemma_est_model_cbow100", data)
print()
evaluateModel(lemma_est_model_sg100, "lemma_est_model_sg100", data)
print()
evaluateModel(lemma_est_model_cbow200, "lemma_est_model_cbow200", data)
print()
evaluateModel(lemma_est_model_sg200, "lemma_est_model_sg200", data)
print()
evaluateModel(word_est_model_sg100, "word_est_model_sg100", data)
print()
evaluateModel(word_est_model_cbow200, "word_est_model_cbow200", data)
print()
evaluateModel(word_est_model_sg200, "word_est_model_sg200", data)
print()
evaluateModel(word_est_model_cbow100, "word_est_model_cbow100", data)


lemma_est_model_cbow100
Missing pairs: 33
Pearson correlation: 0.25817643949076735
Spearman correlation: 0.2529187888277373
Kendall tau correlation: 0.16941882690587085
Pearson correlation between estsim and model: 0.36041696812729196
Spearman correlation between estsim and model: 0.37671405198929303
Kendall tau correlation between estsim and model: 0.2608281172339406

lemma_est_model_sg100
Missing pairs: 33
Pearson correlation: 0.2832954780415281
Spearman correlation: 0.27542236275746
Kendall tau correlation: 0.18512203471798525
Pearson correlation between estsim and model: 0.3957702397870942
Spearman correlation between estsim and model: 0.4134090338274725
Kendall tau correlation between estsim and model: 0.2872655994285016

lemma_est_model_cbow200
Missing pairs: 33
Pearson correlation: 0.27786883210839214
Spearman correlation: 0.27432792876156525
Kendall tau correlation: 0.18484261820887288
Pearson correlation between estsim and model: 0.3827605127946673
Spearman correlation between

## Pretrained word and multi-sense embeddings for Estonian

In [56]:
# evaluating all the models (only the word vectors, not sense vectors)
models = os.listdir("models")
for modelname in models: 
    model = KeyedVectors.load_word2vec_format("models/"+modelname+"/"+"ettenten.txt.word_vectors")
    evaluateModel(model, modelname, data)

cbow_100_5_10_20
Missing pairs: 17
Pearson correlation: 0.2954388567231574
Spearman correlation: 0.29219959653986355
Kendall tau correlation: 0.1969216189969967
Pearson correlation between estsim and model: 0.3840266905346261
Spearman correlation between estsim and model: 0.41335158930072824
Kendall tau correlation between estsim and model: 0.28791074238552683
cbow_150_15_10_20
Missing pairs: 17
Pearson correlation: 0.31061896457898386
Spearman correlation: 0.318086510964076
Kendall tau correlation: 0.21623585744990617
Pearson correlation between estsim and model: 0.41190654903024576
Spearman correlation between estsim and model: 0.44818150386039374
Kendall tau correlation between estsim and model: 0.3148940453249254
cbow_150_15_5_20
Missing pairs: 15
Pearson correlation: 0.31182441120328663
Spearman correlation: 0.3193273770411882
Kendall tau correlation: 0.21681421958919792
Pearson correlation between estsim and model: 0.4111472245183176
Spearman correlation between estsim and model:

skip_300_5_10_10
Missing pairs: 17
Pearson correlation: 0.3110974350109412
Spearman correlation: 0.31882591876541594
Kendall tau correlation: 0.21680575851991446
Pearson correlation between estsim and model: 0.42253872274562315
Spearman correlation between estsim and model: 0.4573052439987142
Kendall tau correlation between estsim and model: 0.323469641916045
skip_300_5_10_20
Missing pairs: 17
Pearson correlation: 0.3168856380502366
Spearman correlation: 0.3267330905545313
Kendall tau correlation: 0.2219764667682381
Pearson correlation between estsim and model: 0.42881444843548855
Spearman correlation between estsim and model: 0.46581185033761563
Kendall tau correlation between estsim and model: 0.329811437323094
skip_300_5_10_5
Missing pairs: 17
Pearson correlation: 0.3130262580733771
Spearman correlation: 0.32019110848138554
Kendall tau correlation: 0.2172799827679506
Pearson correlation between estsim and model: 0.4261855994844989
Spearman correlation between estsim and model: 0.461

## Correlation, when filtered by POS 

Dataset is now filtered by the POS of the concepts and after that the correlation is find. Reason for this is to find if models can find similarity from certain POS group better. 

In [57]:
# filtering the data set 
dataA = data[data['POS']=='A']
dataN = data[data['POS'] =='N']
dataV = data[data['POS']=='V']

In [58]:
dataV.head()

Unnamed: 0,word 1,word 2,sõna 1,sõna 2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex),Average
777,go,come,minema,tulema,V,2.42,3.15,2.72,2,5.75,1,1.51,2.25
778,take,steal,võtma,varastama,V,6.18,3.06,3.84,2,3.28,1,1.69,6.0
779,listen,hear,kuulama,kuulma,V,8.17,3.47,3.66,2,3.22,1,1.72,6.0
780,think,rationalize,mõtlema,põhjendama,V,8.25,2.41,1.52,1,3.17,1,1.13,3.25
781,occur,happen,ilmnema,juhtuma,V,9.32,1.84,1.78,1,2.87,1,1.34,7.0


In [59]:
evaluateModel(wiki_model_est, "wiki_model_est_fastText_A", dataA)
print()
evaluateModel(wiki_model_est, "wiki_model_est_fastText_N", dataN)
print()
evaluateModel(wiki_model_est, "wiki_model_est_fastText_V", dataV)

wiki_model_est_fastText_A
Missing pairs: 0
Pearson correlation: 0.19942352221591134
Spearman correlation: 0.22603745056542013
Kendall tau correlation: 0.1533789947383818
Pearson correlation between estsim and model: 0.2697437314615472
Spearman correlation between estsim and model: 0.254643514221926
Kendall tau correlation between estsim and model: 0.18774964451071988

wiki_model_est_fastText_N
Missing pairs: 8
Pearson correlation: 0.37321923634180215
Spearman correlation: 0.38413798711574537
Kendall tau correlation: 0.2630880942449072
Pearson correlation between estsim and model: 0.4874698456400136
Spearman correlation between estsim and model: 0.5219775714034443
Kendall tau correlation between estsim and model: 0.3720482526193223

wiki_model_est_fastText_V
Missing pairs: 11
Pearson correlation: 0.1307391578570072
Spearman correlation: 0.1491977079888748
Kendall tau correlation: 0.10420445592542614
Pearson correlation between estsim and model: 0.24017698108769425
Spearman correlation b

In [61]:
# evaluating and writing results to an excel file. 
evaluateModel(lemma_est_model_cbow100, "lemma_est_model_cbow100", dataA)
evaluateModel(lemma_est_model_cbow100, "lemma_est_model_cbow100", dataN)
evaluateModel(lemma_est_model_cbow100, "lemma_est_model_cbow100", dataV)
print()
evaluateModel(lemma_est_model_sg100, "lemma_est_model_sg100", dataA)
evaluateModel(lemma_est_model_sg100, "lemma_est_model_sg100", dataN)
evaluateModel(lemma_est_model_sg100, "lemma_est_model_sg100", dataV)
print()
evaluateModel(lemma_est_model_cbow200, "lemma_est_model_cbow200", dataA)
evaluateModel(lemma_est_model_cbow200, "lemma_est_model_cbow200", dataN)
evaluateModel(lemma_est_model_cbow200, "lemma_est_model_cbow200", dataV)
print()
evaluateModel(lemma_est_model_sg200, "lemma_est_model_sg200", dataA)
evaluateModel(lemma_est_model_sg200, "lemma_est_model_sg200", dataN)
evaluateModel(lemma_est_model_sg200, "lemma_est_model_sg200", dataV)
print()
evaluateModel(word_est_model_sg100, "word_est_model_sg100", dataA)
evaluateModel(word_est_model_sg100, "word_est_model_sg100", dataN)
evaluateModel(word_est_model_sg100, "word_est_model_sg100", dataV)
print()
evaluateModel(word_est_model_cbow200, "word_est_model_cbow200", dataA)
evaluateModel(word_est_model_cbow200, "word_est_model_cbow200", dataN)
evaluateModel(word_est_model_cbow200, "word_est_model_cbow200", dataV)

print()
evaluateModel(word_est_model_sg200, "word_est_model_sg200", dataA)
evaluateModel(word_est_model_sg200, "word_est_model_sg200", dataN)
evaluateModel(word_est_model_sg200, "word_est_model_sg200", dataV)
print()
evaluateModel(word_est_model_cbow100, "word_est_model_cbow100", dataA)
evaluateModel(word_est_model_cbow100, "word_est_model_cbow100", dataN)
evaluateModel(word_est_model_cbow100, "word_est_model_cbow100", dataV)

lemma_est_model_cbow100
Missing pairs: 0
Pearson correlation: 0.18443311245855473
Spearman correlation: 0.21838505534104458
Kendall tau correlation: 0.14943777859949403
Pearson correlation between estsim and model: 0.24049162777885355
Spearman correlation between estsim and model: 0.22409274752533495
Kendall tau correlation between estsim and model: 0.16085531646657325
lemma_est_model_cbow100
Missing pairs: 30
Pearson correlation: 0.3194940528183155
Spearman correlation: 0.29748311980635556
Kendall tau correlation: 0.1995324694355773
Pearson correlation between estsim and model: 0.4058688426144968
Spearman correlation between estsim and model: 0.4230691662466578
Kendall tau correlation between estsim and model: 0.2931383546625192
lemma_est_model_cbow100
Missing pairs: 3
Pearson correlation: 0.16739164125731498
Spearman correlation: 0.15349946006296128
Kendall tau correlation: 0.1045632192414878
Pearson correlation between estsim and model: 0.3035986711637522
Spearman correlation betwee

word_est_model_cbow100
Missing pairs: 5
Pearson correlation: 0.17424211816235866
Spearman correlation: 0.21385698795529956
Kendall tau correlation: 0.14364800041243103
Pearson correlation between estsim and model: 0.3408412744945516
Spearman correlation between estsim and model: 0.36358405330074195
Kendall tau correlation between estsim and model: 0.2571174062485928


In [60]:
models = os.listdir("models")
for modelname in models: 
    model = KeyedVectors.load_word2vec_format("models/"+modelname+"/"+"ettenten.txt.word_vectors")
    evaluateModel(model, modelname+"_A", dataA)
    evaluateModel(model, modelname+"_N", dataN)
    evaluateModel(model, modelname+"_V", dataV)

cbow_100_5_10_20_A
Missing pairs: 0
Pearson correlation: 0.2458993547683711
Spearman correlation: 0.2841368571972757
Kendall tau correlation: 0.21085506343049487
Pearson correlation between estsim and model: 0.33209757374474047
Spearman correlation between estsim and model: 0.32162087524749144
Kendall tau correlation between estsim and model: 0.24358090779223948
cbow_100_5_10_20_N
Missing pairs: 14
Pearson correlation: 0.38007774709674663
Spearman correlation: 0.35037217788763536
Kendall tau correlation: 0.23678029627363983
Pearson correlation between estsim and model: 0.43769389371920187
Spearman correlation between estsim and model: 0.4593966802542283
Kendall tau correlation between estsim and model: 0.3208255438811138
cbow_100_5_10_20_V
Missing pairs: 3
Pearson correlation: 0.17649384488397046
Spearman correlation: 0.1652504454332173
Kendall tau correlation: 0.11371040294720386
Pearson correlation between estsim and model: 0.28681201797601324
Spearman correlation between estsim and 

cbow_300_15_10_20_V
Missing pairs: 3
Pearson correlation: 0.19058226957051186
Spearman correlation: 0.17811597491904793
Kendall tau correlation: 0.12378069693514811
Pearson correlation between estsim and model: 0.2917883355674789
Spearman correlation between estsim and model: 0.2620530619317673
Kendall tau correlation between estsim and model: 0.1844503561759277
cbow_300_15_10_5_A
Missing pairs: 0
Pearson correlation: 0.31263485752725756
Spearman correlation: 0.32845405431091834
Kendall tau correlation: 0.24337009657631883
Pearson correlation between estsim and model: 0.4026341302953911
Spearman correlation between estsim and model: 0.3787211866637683
Kendall tau correlation between estsim and model: 0.28409261712456163
cbow_300_15_10_5_N
Missing pairs: 14
Pearson correlation: 0.4040741430822529
Spearman correlation: 0.3988170622464197
Kendall tau correlation: 0.272797607378498
Pearson correlation between estsim and model: 0.49365370489605653
Spearman correlation between estsim and mod

cbow_300_5_2_20_N
Missing pairs: 7
Pearson correlation: 0.3913653856824442
Spearman correlation: 0.3792510655162148
Kendall tau correlation: 0.2577387030575681
Pearson correlation between estsim and model: 0.4689559066662208
Spearman correlation between estsim and model: 0.4997730270846608
Kendall tau correlation between estsim and model: 0.3520256843207181
cbow_300_5_2_20_V
Missing pairs: 1
Pearson correlation: 0.18281089306373974
Spearman correlation: 0.1637222145406748
Kendall tau correlation: 0.11136530731078015
Pearson correlation between estsim and model: 0.2847196129982242
Spearman correlation between estsim and model: 0.25368688580686166
Kendall tau correlation between estsim and model: 0.17876865229279928
cbow_300_5_5_20_A
Missing pairs: 0
Pearson correlation: 0.2943825965077936
Spearman correlation: 0.30443939888488214
Kendall tau correlation: 0.22202184249067686
Pearson correlation between estsim and model: 0.3818358008803705
Spearman correlation between estsim and model: 0.

skip_300_5_10_10_A
Missing pairs: 0
Pearson correlation: 0.26781799934083633
Spearman correlation: 0.26627834311286724
Kendall tau correlation: 0.19673237226614707
Pearson correlation between estsim and model: 0.35656273004596767
Spearman correlation between estsim and model: 0.3062747520631402
Kendall tau correlation between estsim and model: 0.23370830635831225
skip_300_5_10_10_N
Missing pairs: 14
Pearson correlation: 0.3847640923695623
Spearman correlation: 0.38290348362732207
Kendall tau correlation: 0.26143713929796725
Pearson correlation between estsim and model: 0.4851710930409651
Spearman correlation between estsim and model: 0.5217655305401097
Kendall tau correlation between estsim and model: 0.36883496452379966
skip_300_5_10_10_V
Missing pairs: 3
Pearson correlation: 0.20638931313457054
Spearman correlation: 0.17827135984956372
Kendall tau correlation: 0.12352893958544948
Pearson correlation between estsim and model: 0.2988448506141814
Spearman correlation between estsim and 