In [18]:
import numpy as np
import pandas as pd

from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from tqdm import tqdm_notebook

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline
mpl.style.use('bmh')

In [3]:
pd.options.display.max_rows = 1000

In [4]:
m1 = KeyedVectors.load_word2vec_format('../data/models/v2/glove/new-england.txt')

In [5]:
m2 = KeyedVectors.load_word2vec_format('../data/models/v2/glove/deep-south.txt')

In [6]:
m1_vocab = set(m1.wv.vocab.keys())

In [7]:
m2_vocab = set(m2.wv.vocab.keys())

In [8]:
vocab = set.intersection(m1_vocab, m2_vocab)

In [9]:
vocab = sorted(vocab)

In [10]:
m1_m = np.stack([m1[t] for t in vocab])

In [11]:
m2_m = np.stack([m2[t] for t in vocab])

In [12]:
bv = m1_m - m1_m.mean(0)

In [13]:
ov = m2_m - m2_m.mean(0)

In [14]:
m = ov.T.dot(bv)

In [15]:
u, _, v = np.linalg.svd(m)

In [16]:
ortho = u.dot(v)

In [17]:
m2f = ov.dot(ortho)

In [19]:
data = []
for i, token in enumerate(vocab):
    d = cosine(m1_m[i], m2f[i])
    data.append((token, d))

In [20]:
df = pd.DataFrame(data, columns=('token', 'd'))

In [21]:
len(df)

8027

In [23]:
df.sort_values('d', ascending=False).head(200)

Unnamed: 0,token,d
7801,دعاء,0.987818
7830,ومن,0.968014
7778,أسألك,0.936057
7788,الذكر,0.934368
7887,👉,0.870812
3442,ian,0.867553
2075,diesel,0.86486
7805,شر,0.864511
144,0.0mph,0.854693
5943,sec,0.8542
