In [20]:
import torch

from jenkspy import jenks_breaks
from scipy.spatial import distance
from textblob import TextBlob

In [2]:
# Load the pickled model onto the CPU no matter which device it was saved
# from, so the notebook also runs on machines without a GPU. (A dict such as
# {'cuda:0': 'cpu'} only remaps that one device tag.)
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load
# checkpoint files from a trusted source. Recent PyTorch releases default to
# weights_only=True, which may reject a fully pickled module like this one;
# confirm against the installed version.
model = torch.load(
    '../../data/coref-embeds-5-8-am.model',
    map_location='cpu',
)



In [5]:
# Put the model in evaluation mode before running inference; the returned
# module's repr is displayed as this cell's output.
model.eval()

DocEmbedder(
  (embeddings): WordEmbedding(39414, 300)
  (lstm): LSTM(300, 500, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.5)
  (embed): Sequential(
    (0): Linear(in_features=1000, out_features=200, bias=True)
    (1): Tanh()
    (2): Linear(in_features=200, out_features=200, bias=True)
    (3): Tanh()
    (4): Linear(in_features=200, out_features=50, bias=True)
  )
)

In [85]:
# Example passage to tokenize.
# Alternative input to try:
#   'Mr. Trump told Mr. Obama to give Mr. Flynn the document.'
blob = TextBlob('David woke up. He went to the store. He called Kara. She was at work.')
# Convert TextBlob's token objects to plain strings.
tokens = [str(t) for t in blob.tokens]

In [86]:
# Display every token paired with its position in the token list.
list(enumerate(tokens))

[(0, 'David'),
 (1, 'woke'),
 (2, 'up'),
 (3, '.'),
 (4, 'He'),
 (5, 'went'),
 (6, 'to'),
 (7, 'the'),
 (8, 'store'),
 (9, '.'),
 (10, 'He'),
 (11, 'called'),
 (12, 'Kara'),
 (13, '.'),
 (14, 'She'),
 (15, 'was'),
 (16, 'at'),
 (17, 'work'),
 (18, '.')]

In [87]:
# Run the model on a batch containing just this token list and convert each
# element of the first result to a NumPy array (presumably one vector per
# token -- confirm against the model's forward()).
# Inference only, so disable autograd tracking for the forward pass; .detach()
# is kept as a guard in case an output still requires grad.
with torch.no_grad():
    embeds = [e.detach().numpy() for e in model([tokens])[0]]

In [89]:
# Print each token next to the cosine distance between its embedding and the
# embedding of the first token.
seed = embeds[0]
for word, vec in zip(tokens, embeds):
    dist = distance.cosine(seed, vec)
    print(word, dist)

David 0.0
woke 1.0058727758
up 1.00549658062
. 1.00549863791
He 0.00493949651718
went 1.00540671451
to 1.00556744961
the 1.16329556704
store 1.17760272324
. 1.00546445884
He 0.00540935993195
called 1.005152022
Kara 0.958745159209
. 1.00548970839
She 0.00691163539886
was 1.00538213924
at 1.00550431246
work 1.00560775306
. 1.00547126634


In [78]:
# Collect the cosine distance from the first embedding to every embedding
# that has a matching token.
seed = embeds[0]
ds = []
for _tok, vec in zip(tokens, embeds):
    ds.append(distance.cosine(seed, vec))

In [79]:
# Split the distances into two natural-breaks classes and keep the value at
# index 1 of the returned bounds (presumably the boundary between the two
# classes -- confirm against the jenkspy docs).
breaks = jenks_breaks(ds, 2)
b = breaks[1]

In [80]:
# For each token, compute the cosine distance from its embedding to every
# embedding, split those distances into two Jenks classes, and print the
# tokens whose distance falls below the break.
# NOTE(review): the output recorded below this cell lists tokens such as
# 'sold' and 'Mary' that are not in the current `tokens`, so it appears to be
# from an earlier run on a different sentence -- re-run to refresh it.
for i, seed in enumerate(embeds):

    # Distances are computed once here and reused for both the break and the
    # filter below.
    ds = [distance.cosine(seed, embed) for embed in embeds]
    b = jenks_breaks(ds, 2)[1]
    near = [token for token, d in zip(tokens, ds) if d < b]
    print(tokens[i], near)

David ['David', 'she']
sold ['sold', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
Mary ['Mary']
a ['a']
book ['book']
and ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
a ['a']
car ['sold', 'and', 'a', 'car', '.', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
. ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
She ['She', 'she']
drove ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
it ['it']
to ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
California ['California']
, ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
and ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
read ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.']
it ['it']
as ['and', '.', 'drove', 'to', ',', 'and', 'read', 'as', 'fast', 'as', 'could', '.'