In [5]:
%matplotlib inline

import os
import numpy as np
from deepsign.rp.index import TrieSignIndex as Index
from deepsign.io.datasets.toefl import TOEFLReader

import matplotlib.pyplot as plt



from sklearn.decomposition import PCA
from deepsign.utils.measure import cosine
import sklearn.preprocessing as process


# model params
# These three values only select which saved artefacts are loaded (they are
# baked into model_suffix below). Presumably k = random-index dimension,
# s = number of active entries, h_dim = embedding size — TODO confirm.
k = 1000
s = 10
h_dim = 300

# Resolve the home directory with expanduser rather than os.getenv("HOME"):
# getenv returns None when HOME is unset (e.g. on Windows), and the string
# concatenations below would then fail with a TypeError.
home = os.path.expanduser("~")
# Both directories keep their trailing slash because file names are appended
# to them by plain string concatenation.
data_dir = home + "/data/datasets/"
result_dir = home + "/data/results/nrp/sparsemax/"

# Every artefact of one run shares the suffix "<k>_<s>_h<h_dim>".
model_suffix = "{k}_{s}_h{h}".format(k=k, s=s, h=h_dim)
index_file = result_dir + "index_" + model_suffix + ".hdf5"
model_file = result_dir + "model_" + model_suffix
embeddings_file = result_dir + "embeddings_" + model_suffix + ".npy"

In [6]:
# load index
# Load the sign index (TrieSignIndex, imported as Index) from the .hdf5 file
# named by index_file. Later cells query it with contains() and get_ri().
index = Index.load(index_file)

In [12]:
# load embeddings
# Read the saved embedding matrix from the .npy file named by embeddings_file.
# NOTE(review): get_vector multiplies a random-index vector by this matrix, so
# its first axis presumably matches the random-index dimension — confirm.
embeddings = np.load(embeddings_file)
# Optional L2 normalisation of the embeddings, currently disabled:
#embeddings = process.normalize(embeddings, norm='l2')


def get_vector(word):
    """Return the embedding-space vector for ``word``.

    The word's random index is fetched from the module-level ``index``,
    converted with ``to_vector()`` and multiplied by the module-level
    ``embeddings`` matrix.

    NOTE(review): what ``index.get_ri`` does for a word missing from the
    index is not visible here — callers should check ``index.contains``
    first.
    """
    random_index = index.get_ri(word)
    ri_vector = random_index.to_vector()
    return np.matmul(ri_vector, embeddings)

# TOEFL Synonym test

In [13]:
# load toefl
# Set print_questions to True for a per-question report (word, correct option,
# model's choice) in addition to the similarity dump printed for every question.
print_questions = False
questions_file = data_dir + "toefl/questions.csv"
answers_file = data_dir + "toefl/answers.csv"

toefl = TOEFLReader(questions_file=questions_file, answers_file=answers_file)

num_correct = 0
num_questions = 0
# words in TOEFL and not in index
toefl_remove = {w for w in toefl.words if not index.contains(w)}
for (i, question) in enumerate(toefl.questions):
    # each question is (question word, list of candidate answers)
    question_w = question[0]
    answer_ws = question[1]
    # position of the correct option inside answer_ws
    answer_index = toefl.answer(i)

    words = set([question_w] + answer_ws)

    # ignore questions for which we have no word data: every word of the
    # question (target and all candidates) must be present in the index
    if not words.isdisjoint(toefl_remove):
        continue

    # question word vector and one vector per candidate answer
    question_vector = get_vector(question_w)
    answer_vectors = [get_vector(word) for word in answer_ws]
    sims = [cosine(question_vector, v) for v in answer_vectors]
    print(question_w)
    print(answer_ws)
    print(sims)
    # the model picks the candidate with the highest cosine similarity
    model_answer = answer_ws[np.argmax(sims)]

    if print_questions:
        print("question ",i+1)
        print("word:", question_w)
        # previously indexed with the undefined name `answer_w` (NameError)
        print("correct: ",answer_ws[answer_index])
        # previously printed the index of the correct answer under this label
        print("model answer: ",model_answer)
        print("="*80)

    num_questions += 1
    if model_answer == answer_ws[answer_index]:
        num_correct += 1

# Accuracy in percent over the questions actually evaluated. If no question
# was fully covered by the index there is nothing to average: report NaN
# instead of failing with ZeroDivisionError.
if num_questions:
    accuracy = num_correct / num_questions * 100
else:
    accuracy = float("nan")
print("Model Accuracy: ",accuracy)

enormously
['appropriately', 'uniquely', 'tremendously', 'decidedly']
[0.011491636501584022, 0.0096802328695792961, 0.012534525460166429, 0.024238033276728944]
provisions
['stipulations', 'interrelations', 'jurisdictions', 'interpretations']
[0.063762033373898061, -0.03698908893707914, -0.0562622270887723, -0.037127123406473519]
haphazardly
['dangerously', 'densely', 'randomly', 'linearly']
[-0.026113214519812872, 0.17344900355399578, 0.20291160428725652, 0.14188036035959223]
prominent
['battered', 'ancient', 'mysterious', 'conspicuous']
[0.011622830388636056, 0.093300520241635862, -0.1087420642341055, 0.16289149896679572]
zenith
['completion', 'pinnacle', 'outset', 'decline']
[0.072456842771212421, -0.0041126617238175798, -0.081147676801483284, -0.063799093481665792]
flawed
['tiny', 'imperfect', 'lustrous', 'crude']
[-0.099065349886862664, -0.022550864492755188, 0.097163234183635858, -0.01915697621573418]
urgently
['typically', 'conceivably', 'tentatively', 'desperately']
[-0.10864484

prospective
['particular', 'prudent', 'potential', 'prominent']
[-0.029001906744342238, -0.086839050802023338, -0.07302869590690908, 0.039499455896703763]
generally
['descriptively', 'broadly', 'controversially', 'accurately']
[0.0416395177362145, 0.018378258606046822, -0.055608890246930269, -0.11499566385767082]
sustained
['prolonged', 'refined', 'lowered', 'analyzed']
[-0.043896387677343961, 0.074534793747404723, -0.049674870491764382, -0.017342548847280318]
perilous
['binding', 'exciting', 'offensive', 'dangerous']
[-0.096765202836478562, 0.025543169430227292, -0.074423093192798412, -0.091669186001358843]
tranquillity
['peacefulness', 'harshness', 'weariness', 'happiness']
[0.17965325928856765, -0.030594793826545268, 0.10631271866854436, -0.077253335625902642]
dissipate
['disperse', 'isolate', 'disguise', 'photograph']
[0.10587985423109807, -0.10962844663633191, 0.045634482789033981, 0.050937910003636018]
primarily
['occasionally', 'cautiously', 'consistently', 'chiefly']
[-0.025489