In [1]:
import numpy as np
import pandas as pd

from scipy.spatial.distance import cosine
from gensim.models import Word2Vec

In [2]:
pd.options.display.max_rows = 1000

In [3]:
ne = Word2Vec.load('../data/northeast.bin')

In [4]:
mw = Word2Vec.load('../data/midwest.bin')

In [44]:
ne_vocab = set([k for k, v in ne.wv.vocab.items() if v.count > 10000])

In [45]:
mw_vocab = set([k for k, v in mw.wv.vocab.items() if v.count > 10000])

In [46]:
vocab = set.intersection(ne_vocab, mw_vocab)

In [47]:
vocab = sorted(vocab)

In [48]:
len(vocab)

4346

In [49]:
ne_m = np.stack([ne[t] for t in vocab])

In [50]:
mw_m = np.stack([mw[t] for t in vocab])

In [51]:
ne_m.shape

(4346, 200)

In [52]:
mw_m.shape

(4346, 200)

In [53]:
basevecs = ne_m - ne_m.mean(0)

In [54]:
othervecs = mw_m - mw_m.mean(0)

In [55]:
m = othervecs.T.dot(basevecs)

In [56]:
u, _, v = np.linalg.svd(m)

In [57]:
ortho = u.dot(v)

In [58]:
fixedvecs = othervecs.dot(ortho)

In [59]:
data = []
for i, token in enumerate(vocab):
    d = cosine(ne_m[i], fixedvecs[i])
    data.append((token, d))

In [60]:
df = pd.DataFrame(data, columns=('token', 'distance'))

In [62]:
df.sort_values('distance', ascending=False).head(200)

Unnamed: 0,token,distance
38,#kcapinoystar,0.809416
1129,cr,0.772725
1157,ct,0.559044
90,#truth,0.554499
4226,wi,0.548607
2469,ma,0.543152
2448,louis,0.518994
2612,mo,0.468071
18,#etsy,0.454011
2850,pa,0.439254


In [36]:
def compare(token):
    for t, _ in ne.most_similar(token, topn=20):
        print(t)
    print('\n')
    for t, _ in mw.most_similar(token, topn=20):
        print(t)

In [64]:
compare('bone')

bones
bruise
tissue
wrist
arm
chest
bruised
claw
neck
skull
flesh
ribs
hammer
mold
elbow
knee
lump
groin
chin
thumb


bones
knuckles
wrist
neck
gums
tooth
claws
chest
elbow
joints
bun
claw
rag
bruised
meat
cheek
bruise
strain
vein
sock
