In [20]:
import torch

from jenkspy import jenks_breaks
from scipy.spatial import distance
from textblob import TextBlob

In [2]:
# Load the pickled DocEmbedder checkpoint onto the CPU. Passing the string
# 'cpu' remaps storages from every saved device (not only 'cuda:0'), so the
# checkpoint loads on a CPU-only machine regardless of which GPU it came from.
# NOTE(review): this unpickles a full model object, which can execute arbitrary
# code — only load checkpoint files from a trusted source.
model = torch.load(
    '../../data/coref-embeds-5-8-am.model',
    map_location='cpu',
)



In [5]:
# Put the model in evaluation mode before inference (the architecture printed
# below includes a Dropout layer); the call returns the module, which the
# notebook displays as this cell's output.
model.eval()

DocEmbedder(
  (embeddings): WordEmbedding(39414, 300)
  (lstm): LSTM(300, 500, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.5)
  (embed): Sequential(
    (0): Linear(in_features=1000, out_features=200, bias=True)
    (1): Tanh()
    (2): Linear(in_features=200, out_features=200, bias=True)
    (3): Tanh()
    (4): Linear(in_features=200, out_features=50, bias=True)
  )
)

In [81]:
# Tokenize a sample sentence with TextBlob and keep the tokens as plain strings.
blob = TextBlob('Mr. Trump told Mr. Obama to give Mr. Flynn the document.')
tokens = list(map(str, blob.tokens))

In [82]:
# Show each token next to its position so later cells can pick a seed by index.
[(position, token) for position, token in enumerate(tokens)]

[(0, 'Mr.'),
 (1, 'Trump'),
 (2, 'told'),
 (3, 'Mr.'),
 (4, 'Obama'),
 (5, 'to'),
 (6, 'give'),
 (7, 'Mr.'),
 (8, 'Flynn'),
 (9, 'the'),
 (10, 'document'),
 (11, '.')]

In [83]:
# Run the model on a batch holding just this one token list and take the first
# result; presumably that is one embedding per token — TODO confirm.
first_doc = model([tokens])[0]
# Detach each vector from the autograd graph and convert it to a NumPy array.
embeds = [vector.detach().numpy() for vector in first_doc]

In [84]:
# Use the embedding at index 1 ('Trump' in the token listing) as the seed and
# print every token with its cosine distance to that seed.
seed = embeds[1]
distances_to_seed = (distance.cosine(seed, vector) for vector in embeds)
for word, dist in zip(tokens, distances_to_seed):
    print(word, dist)

Mr. 0.0034539103508
Trump 0.0
told 1.02693848312
Mr. 0.292731106281
Obama 1.04116371274
to 1.02700069174
give 1.02696195804
Mr. 0.00970602035522
Flynn 0.39230453968
the 1.08124744892
document 1.12407740951
. 1.0269189924


In [78]:
# Take the first token's embedding as the reference vector and collect the
# cosine distance from it to every (token, embedding) pair.
seed = embeds[0]
ds = [
    distance.cosine(seed, vector)
    for _word, vector in zip(tokens, embeds)
]

In [79]:
# Split the distances into two Jenks natural-breaks classes; the element at
# index 1 of the returned breaks is kept as the threshold `b`
# (presumably the boundary between the near and far class — TODO confirm).
breaks = jenks_breaks(ds, 2)
b = breaks[1]

In [80]:
# For every token, treat its embedding as the seed, split the cosine distances
# to all tokens into two Jenks classes, and print the tokens whose distance
# falls below the break (i.e. the tokens closest to the seed).
# NOTE(review): the saved output under this cell was produced from a different
# input sentence than the one tokenized earlier in the notebook (the execution
# counts are out of order) — restart and run all cells to refresh it.
for i, seed in enumerate(embeds):
    # Compute each distance once; it feeds both the break and the filter below.
    ds = [distance.cosine(seed, embed) for _token, embed in zip(tokens, embeds)]
    # Index 1 of the 2-class breaks is used as the near/far threshold.
    b = jenks_breaks(ds, 2)[1]
    print(tokens[i], [token for token, d in zip(tokens, ds) if d < b])

David ['David', 'she']
sold ['sold', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
Mary ['Mary']
a ['a']
book ['book']
and ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
a ['a']
car ['sold', 'and', 'a', 'car', '.', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
. ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
She ['She', 'she']
drove ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
it ['it']
to ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
California ['California']
, ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
and ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
read ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
it ['it']
as ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.'