In [12]:
# Filesystem locations (relative to the notebook directory) of the artifacts
# loaded by the cells below.
# NOTE(review): index_path and nmslib_params are not referenced by any cell
# visible here — presumably intended for an NMSLib index; confirm or remove.
index_path = '../output/monosemous.bin'
nmslib_params = {'method': 'hnsw', 'space': 'cosinesimil'}
# Checkpoint prefix and vocabulary handed to LSTMLanguageModel.
model_path = '../output/model-h2048p512/lstm-wsd-gigaword-google'
vocab_path = '../output/vocab.2018-05-10-7d764e7.pkl'
# Archive providing the 'mono_words', 'mono_embs' and 'mono_hdn_lists' arrays.
mono_path = '../output/monosemous-context-embeddings.2018-05-27-5cd9bb6.npz'
# Pickled mapping used to turn a tuple of HDN identifiers into an integer id.
hdn_list_vocab = '../output/hdn-list-vocab.2018-05-18-f48a06c.pkl'

## Testing NMSLib

In [2]:
import numpy as np
monos = np.load(mono_path)

In [3]:
mono_words, mono_embs, mono_hdn_lists = monos['mono_words'], monos['mono_embs'], monos['mono_hdn_lists']
mono_embs.shape

(904288, 512)

In [4]:
from sklearn.utils import resample
# Draw 10 (word, embedding) pairs without replacement. The seed is fixed so
# that re-running the notebook samples the same rows; previously the draw was
# unseeded and changed on every execution.
some_words, some_emds = resample(mono_words, mono_embs, replace=False, n_samples=10, random_state=0)
some_emds.shape

(10, 512)

In [5]:
%%time
from sklearn.metrics.pairwise import cosine_similarity
s = cosine_similarity(some_emds, mono_embs)

CPU times: user 3.7 s, sys: 1.67 s, total: 5.37 s
Wall time: 5.1 s


## Querying by sentences

In [6]:
from model import LSTMLanguageModel

  from ._conv import register_converters as _register_converters


In [7]:
import tensorflow as tf
sess = tf.InteractiveSession()

In [8]:
lm = LSTMLanguageModel(sess, model_path, vocab_path)

INFO:tensorflow:Restoring parameters from ../output/model-h2048p512/lstm-wsd-gigaword-google


In [9]:
sent = 'I study computer science'.split()
target_index = 3
embs = lm.get_embeddings_sentence(sess, sent, target_index)

In [54]:
sent[target_index]

'science'

In [55]:
from nltk.corpus import wordnet as wn
from evaluate.wn_utils import synsets_graph_info

def get_hdns(lemma):
    """Return a dict mapping each HDN of the noun `lemma` to its synset key.

    Calls `synsets_graph_info` for `lemma` as a noun with version '30' and
    keeps only the entries whose 'under_lcs' value is truthy. The dict keys
    are those 'under_lcs' values; the dict values are the corresponding keys
    of the graph-info mapping.

    NOTE(review): presumably both are synset identifier strings such as
    'eng-30-05616246-n' — confirm against evaluate.wn_utils.
    """
    per_synset = synsets_graph_info(
        wn_instance=wn,
        wn_version='30',
        lemma=lemma,
        pos='n',
    )
    hdn_to_synset = {}
    for synset_key, details in per_synset.items():
        if details['under_lcs']:
            # A later synset sharing the same HDN overwrites an earlier one.
            hdn_to_synset[details['under_lcs']] = synset_key
    return hdn_to_synset
    
# Must be a hashable, canonically ordered tuple: it is used as a dictionary key
# into hdn_list2id further down (a list would raise TypeError there), and
# disambiguate() builds the same key with tuple(sorted(...)).
hdn_list = tuple(sorted(get_hdns(sent[target_index])))

In [13]:
import pickle
with open(hdn_list_vocab, 'rb') as f:
    hdn_list2id = pickle.load(f)

In [16]:
cases_of_same_hdn_list = (mono_hdn_lists == hdn_list2id[hdn_list])
len(cases_of_same_hdn_list.nonzero()[0])

32

In [17]:
import numpy as np
import pickle
# vocab_path points at a .pkl file, so read it with pickle, the same way the
# hdn_list_vocab file is loaded. np.load only unpickles when allow_pickle=True,
# which is no longer the default in recent NumPy releases.
# NOTE: pickle executes arbitrary code on load — only use trusted files here.
with open(vocab_path, 'rb') as f:
    word2id = pickle.load(f)
# Inverse lookup: integer id -> word string.
id2word = {i: w for w, i in word2id.items()}

In [24]:
mono_words[cases_of_same_hdn_list]

array([ 16256, 434195,  38315,  94648,  74175,   5926,  25121, 103031,
        72460,  38315,  39247,  22974,  30083,  32603, 177796, 231428,
        85757,  85968,  59113,  40817, 133936,  86803, 178378, 264997,
        83053,  58708,  46880,  25483,  65293, 792661, 132156,  30083],
      dtype=int32)

In [22]:
hdn_list

('eng-30-05616246-n', 'eng-30-05809192-n')

In [52]:
# Imported in this cell so that it runs on a fresh kernel in top-to-bottom
# order; otherwise synset2identifier is only brought into scope by a later cell.
from evaluate.wn_utils import synset2identifier
# Lookup from the identifier produced by synset2identifier(s, '30') to the
# NLTK noun synset object it was computed from.
id2synset = {synset2identifier(s, '30'): s for s in wn.all_synsets('n')}

In [53]:
[id2synset[h].name() for h in hdn_list]

['ability.n.02', 'content.n.05']

In [43]:
relevant_words = [id2word[i] for i in mono_words[cases_of_same_hdn_list]]

In [46]:
from evaluate.wn_utils import synset2identifier
# For each word in relevant_words, record which entry of hdn_list appears on
# its hypernym path. relevant_hdns ends up aligned index-by-index with
# relevant_words.
relevant_hdns = []
for w in relevant_words:
    # Only the first noun synset of the word, and only the first of that
    # synset's hypernym paths, is inspected.
    hypernyms = [synset2identifier(s, '30') for s in wn.synsets(w, 'n')[0].hypernym_paths()[0]]
    # Single-target unpacking: raises ValueError unless exactly one identifier
    # on the path is contained in hdn_list.
    rel_hdn, = [h for h in hypernyms if h in hdn_list]
    relevant_hdns.append(rel_hdn)

In [38]:
s = cosine_similarity([embs], mono_embs[cases_of_same_hdn_list])

In [39]:
s

array([[ 0.07344373,  0.01642235,  0.11265934, -0.00393955,  0.03521454,
        -0.06778327,  0.06247198, -0.00956155,  0.01389751, -0.07077594,
         0.11973589,  0.11337093,  0.05293576,  0.04902532,  0.13012496,
         0.00500624,  0.08833097,  0.02415848,  0.03868581,  0.01956294,
        -0.05338352, -0.02618037,  0.03757791, -0.11224613, -0.04880265,
        -0.04376756, -0.01514762, -0.07666067,  0.01267537,  0.00895502,
        -0.0066125 , -0.07674665]], dtype=float32)

In [47]:
sorted(zip(relevant_words, relevant_hdns, s[0]), key=lambda t: t[2])

[('griffin', 'eng-30-05616246-n', -0.112246126),
 ('folklore', 'eng-30-05809192-n', -0.07674665),
 ('psychiatry', 'eng-30-05809192-n', -0.07666067),
 ('misconception', 'eng-30-05809192-n', -0.07077594),
 ('economics', 'eng-30-05809192-n', -0.06778327),
 ('razzmatazz', 'eng-30-05809192-n', -0.053383518),
 ('bioengineering', 'eng-30-05809192-n', -0.04880265),
 ('ergonomics', 'eng-30-05809192-n', -0.043767557),
 ('geriatrics', 'eng-30-05809192-n', -0.026180368),
 ('aptitude', 'eng-30-05616246-n', -0.015147625),
 ('doppelganger', 'eng-30-05616246-n', -0.009561549),
 ('technicolor', 'eng-30-05616246-n', -0.0066124965),
 ('endocrinology', 'eng-30-05809192-n', -0.0039395466),
 ('podiatry', 'eng-30-05809192-n', 0.005006239),
 ('onomastics', 'eng-30-05809192-n', 0.00895502),
 ('prosthetics', 'eng-30-05809192-n', 0.012675365),
 ('sorcery', 'eng-30-05809192-n', 0.013897514),
 ('counterplan', 'eng-30-05809192-n', 0.016422346),
 ('deity', 'eng-30-05809192-n', 0.019562941),
 ('antihero', 'eng-30-056

In [59]:
from collections import defaultdict

def disambiguate(sentence, target_index):
    """Choose a sense for ``sentence[target_index]`` from similar monosemous contexts.

    Steps:
      1. Look up the HDNs of the target noun with ``get_hdns``.
      2. Select the stored monosemous examples whose HDN-list id equals the
         id of the target's sorted HDN tuple.
      3. For each selected example, find which of the target's HDNs lies on
         the first hypernym path of the example word's first noun synset.
      4. Embed the target in context with the language model and sum, per
         HDN, the positive cosine similarities to the selected examples.
      5. Return the synset associated with the highest-scoring HDN.

    Args:
        sentence: sequence of tokens.
        target_index: position in ``sentence`` of the word to disambiguate.

    Returns:
        The value stored in ``get_hdns(...)`` for the winning HDN. The
        notebook uses it as a key into ``id2synset``.

    Raises:
        KeyError: if the sorted HDN tuple is not a key of ``hdn_list2id``.
        ValueError: if an example word does not match exactly one HDN, or if
            no selected example has a positive similarity to the target.
    """
    # Use the function's own argument. The previous version read the
    # notebook-global `sent`, so whatever the caller passed was ignored.
    hdn2synset = get_hdns(sentence[target_index])
    hdn_list = tuple(sorted(hdn2synset))
    cases_of_same_hdn_list = (mono_hdn_lists == hdn_list2id[hdn_list])
    relevant_words = [id2word[i] for i in mono_words[cases_of_same_hdn_list]]

    relevant_hdns = []
    for w in relevant_words:
        hypernyms = [synset2identifier(s, '30') for s in wn.synsets(w, 'n')[0].hypernym_paths()[0]]
        # Strict unpacking: exactly one HDN is expected on the path.
        rel_hdn, = [h for h in hypernyms if h in hdn2synset]
        relevant_hdns.append(rel_hdn)

    embs = lm.get_embeddings_sentence(sess, sentence, target_index)
    sims = cosine_similarity([embs], mono_embs[cases_of_same_hdn_list])[0]

    # Only positive similarities vote; non-positive ones are ignored.
    hdn2score = defaultdict(float)
    for hdn, sim in zip(relevant_hdns, sims):
        if sim > 0:
            hdn2score[hdn] += sim

    if not hdn2score:
        # Same exception type max() would raise on an empty dict, but with a
        # message that explains the actual cause.
        raise ValueError(
            'no monosemous example with positive similarity for %r'
            % (sentence[target_index],))
    return hdn2synset[max(hdn2score, key=hdn2score.get)]

id2synset[disambiguate('I study computer science'.split(), 3)]

Synset('science.n.01')

In [60]:
id2synset[disambiguate('I study computer science'.split(), 2)]

Synset('computer.n.01')

In [62]:
id2synset['eng-30-13293625-n']

Synset('costs.n.01')