In [2]:
%matplotlib inline

import os
import numpy as np
from deepsign.rp.index import TrieSignIndex as Index
from deepsign.io.datasets.toefl import TOEFLReader
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd


from sklearn.decomposition import PCA
from deepsign.utils.measure import cosine


# Locations of the datasets and of the trained model artefacts.
# os.path.expanduser("~") is used instead of os.getenv("HOME") because the
# latter returns None when HOME is not set, which would make the string
# concatenations below fail with a TypeError.
# Every directory string keeps its trailing "/" because later cells build
# file paths by plain concatenation (e.g. data_dir + "toefl/questions.csv").
home = os.path.expanduser("~")
data_dir = home + "/data/datasets/"
result_dir = home + "/data/results/"
model_dir = result_dir + "nrp/300d_reg_all/"
model_file = model_dir + "model_bnc"
embeddings_file = model_dir + "embeddings.npy"
index_file = model_dir + "index.hdf5"

In [4]:
# Load the sign index from the file at index_file; the resulting object is
# used below to test word membership (contains) and to fetch per-word
# entries (get_ri).
index = Index.load(index_file)

  0%|          | 0/468560 [00:00<?, ?it/s]

  2%|▏         | 7843/468560 [00:00<00:05, 78421.85it/s]

  4%|▍         | 18645/468560 [00:00<00:05, 85444.74it/s]

  6%|▌         | 28210/468560 [00:00<00:04, 88269.54it/s]

  8%|▊         | 37492/468560 [00:00<00:04, 89586.85it/s]

 10%|▉         | 44680/468560 [00:00<00:07, 57239.46it/s]

 11%|█▏        | 52861/468560 [00:00<00:06, 62906.12it/s]

 13%|█▎        | 61168/468560 [00:00<00:06, 67846.34it/s]

 15%|█▌        | 71367/468560 [00:00<00:05, 75420.87it/s]

 17%|█▋        | 80174/468560 [00:01<00:04, 78813.30it/s]

 19%|█▉        | 90289/468560 [00:01<00:04, 84403.39it/s]

 22%|██▏       | 102167/468560 [00:01<00:03, 92426.07it/s]

 24%|██▍       | 111911/468560 [00:01<00:03, 89653.33it/s]

 26%|██▌       | 121884/468560 [00:01<00:03, 92455.37it/s]

 29%|██▊       | 133571/468560 [00:01<00:03, 98636.39it/s]

 31%|███       | 144329/468560 [00:01<00:03, 101154.87it/s]

 33%|███▎      | 154678/468560 [00:01<00:03, 91364.10it/s] 

 35%|███▌      | 164920/468560 [00:01<00:03, 94419.77it/s]

 37%|███▋      | 174763/468560 [00:01<00:03, 94181.81it/s]

 40%|███▉      | 186507/468560 [00:02<00:02, 100128.40it/s]

 42%|████▏     | 196754/468560 [00:02<00:02, 95987.10it/s] 

 44%|████▍     | 206553/468560 [00:02<00:03, 68400.99it/s]

 46%|████▋     | 217266/468560 [00:02<00:03, 76721.34it/s]

 49%|████▉     | 229069/468560 [00:02<00:02, 85718.93it/s]

 51%|█████     | 238822/468560 [00:02<00:02, 80105.55it/s]

 53%|█████▎    | 247992/468560 [00:02<00:02, 83262.51it/s]

 55%|█████▍    | 256981/468560 [00:02<00:02, 85033.10it/s]

 57%|█████▋    | 267542/468560 [00:03<00:02, 90310.62it/s]

 59%|█████▉    | 276994/468560 [00:03<00:02, 88851.37it/s]

 61%|██████    | 286179/468560 [00:03<00:02, 87697.13it/s]

 63%|██████▎   | 295601/468560 [00:03<00:01, 89555.65it/s]

 65%|██████▌   | 305924/468560 [00:03<00:01, 93260.83it/s]

 67%|██████▋   | 315654/468560 [00:03<00:01, 94433.41it/s]

 70%|██████▉   | 326137/468560 [00:03<00:01, 97326.21it/s]

 72%|███████▏  | 335969/468560 [00:03<00:01, 92062.15it/s]

 74%|███████▍  | 345773/468560 [00:03<00:01, 93775.89it/s]

 76%|███████▌  | 355249/468560 [00:04<00:01, 89610.16it/s]

 78%|███████▊  | 367051/468560 [00:04<00:01, 96584.23it/s]

 80%|████████  | 376940/468560 [00:04<00:01, 91154.82it/s]

 83%|████████▎ | 387627/468560 [00:04<00:00, 95358.24it/s]

 85%|████████▍ | 397371/468560 [00:04<00:01, 66549.55it/s]

 87%|████████▋ | 407972/468560 [00:04<00:00, 74913.92it/s]

 89%|████████▉ | 418182/468560 [00:04<00:00, 81416.89it/s]

 92%|█████████▏| 430299/468560 [00:04<00:00, 90304.92it/s]

 94%|█████████▍| 442246/468560 [00:05<00:00, 97439.45it/s]

 97%|█████████▋| 452885/468560 [00:05<00:00, 95095.06it/s]

 99%|█████████▉| 464068/468560 [00:05<00:00, 99562.15it/s]

100%|██████████| 468560/468560 [00:05<00:00, 88619.75it/s]




In [7]:
# Load the embeddings array stored in NumPy .npy format at embeddings_file.
# get_vector() multiplies each word's index vector by this array.
embeddings = np.load(embeddings_file)

def get_vector(word):
    """Return the embedding vector for ``word``.

    The word's entry is fetched from the module-level ``index`` with
    ``get_ri`` (presumably its random index -- confirm against deepsign),
    converted with ``to_vector()`` and multiplied by the module-level
    ``embeddings`` array.

    :param word: the word to look up in ``index``.
    :return: the result of ``np.matmul(<index vector>, embeddings)``.
    """
    return np.matmul(index.get_ri(word).to_vector(), embeddings)

# TOEFL Synonym test

In [8]:
# Evaluate the embeddings on the TOEFL synonym test: for every question the
# model picks the candidate answer whose vector scores highest under cosine()
# against the question word's vector.
questions_file = data_dir + "toefl/questions.csv"
answers_file = data_dir + "toefl/answers.csv"

toefl = TOEFLReader(questions_file=questions_file, answers_file=answers_file)

# words that appear in TOEFL but are missing from the index
toefl_remove = set(w for w in toefl.words if not index.contains(w))

# Counters are initialised once, before the loop, so that they accumulate
# over all questions (previously num_questions was reset on every iteration
# and never updated, so no score could be reported).
num_questions = 0
num_correct = 0

print("="*80)
for (i, question) in enumerate(toefl.questions):
    question_w = question[0]
    answer_ws = question[1]
    # position of the correct answer inside answer_ws
    answer_w = toefl.answer(i)

    words = set([question_w] + answer_ws)
    # ignore questions for which we have no word data
    if words.isdisjoint(toefl_remove):
        num_questions += 1

        print("question ",i+1)
        # vectors for the question word and for each candidate answer
        question_vector = get_vector(question_w)
        answer_vectors = [get_vector(word) for word in answer_ws]
        sims = [cosine(question_vector,v) for v in answer_vectors]
        # the candidate with the highest score is the model's answer
        model_answer = answer_ws[np.argmax(sims)]
        if model_answer == answer_ws[answer_w]:
            num_correct += 1

        print("word:", question_w)
        print("correct: ",answer_ws[answer_w])
        print("model answer: ",model_answer)

        print("="*80)

# Summary over the questions that could actually be evaluated.
if num_questions > 0:
    print("accuracy: {}/{} ({:.2%})".format(
        num_correct, num_questions, num_correct / num_questions))
else:
    print("no TOEFL question could be evaluated with this index")

question  1
word: enormously
correct:  tremendously
model answer:  tremendously
question  2
word: provisions
correct:  stipulations
model answer:  interrelations
question  3
word: haphazardly
correct:  randomly
model answer:  randomly
question  4
word: prominent
correct:  conspicuous
model answer:  conspicuous
question  5
word: zenith
correct:  pinnacle
model answer:  completion
question  6
word: flawed
correct:  imperfect
model answer:  lustrous
question  7
word: urgently
correct:  desperately
model answer:  conceivably
question  8
word: consumed
correct:  eaten
model answer:  bred
question  9
word: advent
correct:  coming
model answer:  stability
question  10
word: concisely
correct:  succinctly
model answer:  succinctly
question  11
word: salutes
correct:  greetings
model answer:  ceremonies
question  12
word: solitary
correct:  alone
model answer:  alone
question  13
word: hasten
correct:  accelerate
model answer:  accompany
question  14
word: perseverance
correct:  endurance
model

word: debate
correct:  argument
model answer:  election
question  27
word: narrow
correct:  thin
model answer:  freezing
question  28
word: arranged
correct:  planned
model answer:  planned
question  29
word: infinite
correct:  limitless
model answer:  unusual
question  30
word: showy
correct:  striking
model answer:  incidental
question  31
word: levied
correct:  imposed
model answer:  correlated
question  32
word: deftly
correct:  skillfully
model answer:  occasionally
question  33
word: distribute
correct:  circulate
model answer:  research
question  34
word: discrepancies
correct:  differences
model answer:  differences
question  35
word: prolific
correct:  productive
model answer:  capable
question  37
word: peculiarly
correct:  uniquely
model answer:  uniquely
question  38
word: hue
correct:  color
model answer:  color
question  39
word: hind
correct:  rear
model answer:  muscular
question  40
word: highlight
correct:  accentuate
model answer:  accentuate
question  41
word: hasti

hurriedly
question  42
word: temperate
correct:  mild
model answer:  windy
question  43
word: grin
correct:  smile
model answer:  exercise
question  44
word: verbally
correct:  orally
model answer:  verbosely
question  45
word: physician
correct:  doctor
model answer:  pharmacist
question  46
word: essentially
correct:  basically
model answer:  basically
question  47
word: keen
correct:  sharp
model answer:  famous
question  48
word: situated
correct:  positioned
model answer:  emptying
question  49
word: principal
correct:  major
model answer:  most
question  50
word: slowly
correct:  gradually
model answer:  gradually
question  51
word: built
correct:  constructed
model answer:  proposed
question  52
word: tasks
correct:  jobs
model answer:  materials
question  53
word: unlikely
correct:  improbable
model answer:  disagreeable
question  55
word: annals
correct:  chronicles
model answer:  trails
question  56
word: wildly
correct:  furiously
model answer:  furiously
question  57
word: 

word: command
correct:  mastery
model answer:  love
question  59
word: concocted
correct:  devised
model answer:  supervised
question  60
word: prospective
correct:  potential
model answer:  prominent
question  61
word: generally
correct:  broadly
model answer:  broadly
question  62
word: sustained
correct:  prolonged
model answer:  analyzed
question  63
word: perilous
correct:  dangerous
model answer:  exciting
question  64
word: tranquillity
correct:  peacefulness
model answer:  happiness
question  65
word: dissipate
correct:  disperse
model answer:  disguise
question  66
word: primarily
correct:  chiefly
model answer:  consistently
question  67
word: colloquial
correct:  conversational
model answer:  incorrect
question  68
word: resolved
correct:  settled
model answer:  forgotten
question  69
word: feasible
correct:  possible
model answer:  equitable
question  70


word: expeditiously
correct:  rapidly
model answer:  rapidly
question  71
word: percentage
correct:  proportion
model answer:  sample
question  72
word: terminated
correct:  ended
model answer:  posed
question  73
word: uniform
correct:  alike
model answer:  sharp
question  74
word: figure
correct:  solve
model answer:  list
question  75
word: sufficient
correct:  enough
model answer:  physiological
question  76
word: fashion
correct:  manner
model answer:  manner
question  77
word: marketed
correct:  sold
model answer:  sold
question  78
word: bigger
correct:  larger
model answer:  larger
question  79
word: roots
correct:  origins
model answer:  rituals
question  80
word: normally
correct:  ordinarily
model answer:  permanently
