In [1]:
%matplotlib inline

import os
import numpy as np
from deepsign.rp.index import TrieSignIndex as Index
from deepsign.io.datasets.toefl import TOEFLReader
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd


from sklearn.decomposition import PCA
from deepsign.utils.measure import cosine


# Locations of the datasets and of the trained model artifacts, all resolved
# relative to the current user's home directory.
# os.path.expanduser("~") is used rather than os.getenv("HOME") because the
# latter returns None when HOME is not set, which would make the string
# concatenations below fail with a TypeError.
home = os.path.expanduser("~")
# Directory prefixes keep their trailing "/" because file names are appended
# to them by plain string concatenation.
data_dir = home + "/data/datasets/"
result_dir = home + "/data/results/"
# The model files currently sit directly in the results directory.
model_dir = result_dir + ""
model_file = model_dir + "model_bnc"
embeddings_file = model_dir + "embeddings.npy"
index_file = model_dir + "index.hdf5"

In [2]:
# Load the sign index (TrieSignIndex, imported as Index) from index_file.
# Later cells query it through index.get_ri(...) and index.contains(...).
index = Index.load(index_file)

In [3]:
# read the embeddings array stored in embeddings_file (.npy)
embeddings = np.load(embeddings_file)

def get_vector(word):
    """Return the embedding vector for `word`.

    The entry returned by `index.get_ri(word)` is converted with
    `to_vector()` and multiplied by the module-level `embeddings` array
    using `np.matmul`.
    """
    return np.matmul(index.get_ri(word).to_vector(), embeddings)

# TOEFL Synonym test

In [5]:
# Evaluate the embeddings on the TOEFL synonym test: for every question the
# model picks the candidate word whose vector gets the highest `cosine` score
# against the question word's vector.
print_questions = False  # set to True to print every evaluated question
questions_file = data_dir + "toefl/questions.csv"
answers_file = data_dir + "toefl/answers.csv"

toefl = TOEFLReader(questions_file=questions_file, answers_file=answers_file)

num_correct = 0
num_questions = 0
# words in TOEFL that are not in the index
toefl_remove = set(w for w in toefl.words if not index.contains(w))
for (i, question) in enumerate(toefl.questions):
    question_w = question[0]
    answer_ws = question[1]
    # position of the correct candidate inside answer_ws
    answer_index = toefl.answer(i)

    words = set([question_w] + answer_ws)

    # ignore questions for which we have no word data
    if words.isdisjoint(toefl_remove):
        question_vector = get_vector(question_w)
        answer_vectors = [get_vector(word) for word in answer_ws]
        sims = [cosine(question_vector, v) for v in answer_vectors]
        model_answer = answer_ws[np.argmax(sims)]

        if print_questions:
            print("question ", i + 1)
            print("word:", question_w)
            print("correct: ", answer_ws[answer_index])
            print("model answer: ", model_answer)
            print("=" * 80)

        num_questions += 1
        if model_answer == answer_ws[answer_index]:
            num_correct += 1

# If every question was skipped there is nothing to average over; report NaN
# instead of failing with a ZeroDivisionError.
if num_questions > 0:
    accuracy = num_correct / num_questions * 100
else:
    accuracy = float("nan")
print("Model Accuracy: ",accuracy)

Model Accuracy:  26.582278481012654
