In [1]:
import numpy as np
import pandas as pd

from scipy.spatial.distance import cosine
from gensim.models.keyedvectors import Vocab, KeyedVectors
from tqdm import tqdm_notebook

from sources import parse_glove_vocab

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline
mpl.style.use('bmh')

In [3]:
m1 = KeyedVectors.load_word2vec_format('../data/corpora/north-south/new-england.w2v.txt')

In [4]:
m2 = KeyedVectors.load_word2vec_format('../data/corpora/north-south/deep-south.w2v.txt')

In [5]:
vocab = parse_glove_vocab('../data/corpora/north-south/vocab.txt')

In [6]:
m1_m = np.stack([m1[t] for t in vocab])

In [7]:
m2_m = np.stack([m2[t] for t in vocab])

In [8]:
bv = m1_m - m1_m.mean(0)

In [9]:
ov = m2_m - m2_m.mean(0)

In [10]:
m = ov.T.dot(bv)

In [11]:
u, _, v = np.linalg.svd(m)

In [12]:
ortho = u.dot(v)

In [13]:
m2_m_rotated = ov.dot(ortho)

In [14]:
vector_size = m2_m_rotated.shape[1]

m2r = KeyedVectors()
m2r.vector_size = vector_size
m2r.syn0 = np.zeros((len(m2_m_rotated), vector_size))

In [15]:
for i, (token, count) in enumerate(vocab.items()):
    m2r.syn0[i] = m2_m_rotated[i]
    m2r.index2word.append(token)
    m2r.vocab[token] = Vocab(index=i, count=count)

In [17]:
m1['man']

array([  1.24884002e-01,   1.44332007e-01,  -1.88838005e-01,
         2.19178006e-01,   4.67552006e-01,   7.02872992e-01,
        -4.54820991e-01,   7.89109990e-02,   2.21050009e-02,
        -1.60408005e-01,  -9.28110033e-02,  -5.89340985e-01,
         5.51970005e-01,  -2.07999998e-04,   5.00438988e-01,
         3.33081990e-01,  -2.30620001e-02,  -3.67978990e-01,
        -4.61189985e-01,   3.37197989e-01,   9.11249965e-02,
         6.53964996e-01,  -1.36445001e-01,   9.43920016e-02,
         5.32660000e-02,   3.07516992e-01,  -1.25073001e-01,
        -2.74180006e-02,  -3.13892007e-01,   1.32769998e-02,
        -4.54355001e-01,  -1.69514999e-01,  -9.88499969e-02,
         9.55646992e-01,  -3.02067995e-01,   9.23769996e-02,
        -4.34289984e-02,   3.95390004e-01,  -3.92509997e-01,
         4.26539987e-01,   4.32933986e-01,   3.51092994e-01,
        -6.86294019e-01,   1.72132000e-01,   6.70535028e-01,
         4.32944000e-01,  -9.54480022e-02,  -4.73724008e-01,
         3.77716005e-01,

In [18]:
m2['man']

array([  3.61759998e-02,   1.78911000e-01,   8.37259963e-02,
         8.59999971e-04,   3.38272005e-01,   3.93436998e-01,
        -3.72593999e-01,   2.39196002e-01,   3.26780006e-02,
        -4.70229983e-02,  -2.03178003e-01,  -7.92306006e-01,
         6.76219985e-02,  -2.17079997e-01,   2.01519996e-01,
         7.96975017e-01,   2.41005003e-01,   9.37729999e-02,
        -1.34233996e-01,   2.56938010e-01,   3.97998005e-01,
        -7.23100007e-02,  -9.37659964e-02,  -1.54007003e-01,
         5.87633014e-01,   3.12397987e-01,  -5.98330013e-02,
        -1.78242996e-01,  -1.24691002e-01,   2.16599002e-01,
         3.12950015e-02,  -1.08886003e-01,  -2.74172008e-01,
         5.61015010e-01,   3.52872014e-01,   2.51222014e-01,
        -2.18035996e-01,  -5.87499999e-02,  -1.68040007e-01,
         8.26089978e-02,   9.37233984e-01,   2.01159995e-02,
        -5.38096011e-01,   5.60199982e-03,   3.17501992e-01,
         2.18234003e-01,   3.70988995e-01,  -6.88943982e-01,
        -7.52444029e-01,

In [42]:
m1.similar_by_vector(m1['education'] - m2r['education'], topn=50)

[('freaks', 0.2652928829193115),
 ('republic', 0.25170430541038513),
 ('halloween', 0.24341872334480286),
 ('sells', 0.23881211876869202),
 ('casting', 0.2370520532131195),
 ('plug', 0.23281434178352356),
 ('leggings', 0.2281881868839264),
 ('backwards', 0.2279973328113556),
 ('vagina', 0.2261863797903061),
 ('0mph', 0.22344925999641418),
 ('stranger', 0.22120890021324158),
 ('cannabis', 0.2177310287952423),
 ('bun', 0.21749362349510193),
 ('springs', 0.2149505615234375),
 ('colorado', 0.2144516110420227),
 ('puzzle', 0.21308490633964539),
 ('tops', 0.2121865302324295),
 ('tall', 0.2101905345916748),
 ('gpa', 0.20801201462745667),
 ('ripped', 0.2071993350982666),
 ('til', 0.2043541818857193),
 ('pumpkin', 0.20427757501602173),
 ('ea', 0.20329827070236206),
 ('pissing', 0.2030630260705948),
 ('equality', 0.20233789086341858),
 ('toys', 0.20051315426826477),
 ('pisses', 0.1980123668909073),
 ('grandpa', 0.19729438424110413),
 ('cinnamon', 0.19712798297405243),
 ('cotton', 0.1964168548583

In [43]:
cosine(m1['a'], m2r['a'])

0.081728379883631797