In [1]:
%matplotlib inline

import os
import numpy as np
from deepsign.rp.index import TrieSignIndex as Index
from deepsign.io.datasets.wordsim import WordSim353Reader
import matplotlib.pyplot as plt

from scipy.stats import spearmanr


from sklearn.decomposition import PCA
from deepsign.utils.measure import cosine


# Data and model locations, all rooted at the current user's home directory.
# os.path.expanduser("~") is used rather than os.getenv("HOME") because getenv
# returns None when HOME is unset (e.g. on Windows), which would make the
# string concatenations below raise TypeError.
home = os.path.expanduser("~")
data_dir = home + "/data/datasets/"
result_dir = home + "/data/results/"
# Models currently live directly in result_dir (empty sub-directory suffix).
model_dir = result_dir + ""
model_file = model_dir + "model_bnc"
embeddings_file = model_dir + "embeddings.npy"
index_file = model_dir + "index.hdf5"

In [2]:
# Load the sign index (deepsign TrieSignIndex, imported as Index) from index_file.
index = Index.load(index_file)

In [3]:
# Load the embeddings array stored at embeddings_file; get_vector multiplies
# each word's index vector by this array.
embeddings = np.load(embeddings_file)

def get_vector(word):
    """Return the embedding of ``word``.

    The word's entry is fetched from the global ``index`` with ``get_ri``,
    converted with ``to_vector()``, and matrix-multiplied by the global
    ``embeddings`` array.
    """
    return np.matmul(index.get_ri(word).to_vector(), embeddings)

# WordSim353 Correlation

In [4]:
# Load the WordSim353 similarity and relatedness gold-standard files.
print_questions = False  # not read in this cell; kept in case other cells use it
sim_file = data_dir + "wordsim/sim.csv"
rel_file = data_dir + "wordsim/rel.csv"

wordsim = WordSim353Reader(sim_file, rel_file)


def _score_pairs(pairs):
    """Correlate the model's cosine similarities with gold scores.

    Pairs in which either word is missing from the global ``index`` are
    dropped, since ``get_vector`` is only called for indexed words.

    Args:
        pairs: iterable of ``(w1, w2, score)`` tuples.

    Returns:
        A tuple ``(in_corpus, model_scores, gold_scores, correlation)``:
        the retained pairs, the cosine of the two word vectors for each
        retained pair, the matching gold scores, and the result of
        ``spearmanr(model_scores, gold_scores)``.
    """
    in_corpus = [
        (w1, w2, score)
        for (w1, w2, score) in pairs
        if index.contains(w1) and index.contains(w2)
    ]
    model = [cosine(get_vector(w1), get_vector(w2)) for (w1, w2, _) in in_corpus]
    gold = [score for (_, _, score) in in_corpus]
    return in_corpus, model, gold, spearmanr(model, gold)


# Relatedness subset. model_scores / gold_scores stay module-level names, as
# before, and are overwritten by the similarity run below.
rel_in_corpus, model_scores, gold_scores, rel_correlation = _score_pairs(wordsim.rel)
print("rel correlation: ",rel_correlation)

# Similarity subset.
sim_in_corpus, model_scores, gold_scores, sim_correlation = _score_pairs(wordsim.sim)
print("sim correlation: ",sim_correlation)

rel correlation:  SpearmanrResult(correlation=0.013858755210829469, pvalue=0.82671418969391386)


sim correlation:  SpearmanrResult(correlation=0.13009108040907949, pvalue=0.064322770247891875)
