In [33]:
import numpy as np
import pandas as pd

from scipy.spatial.distance import cosine
from gensim.models.keyedvectors import Vocab, KeyedVectors
from tqdm import tqdm_notebook

from sources import parse_glove_vocab

In [6]:
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline
mpl.style.use('bmh')

In [7]:
m1 = KeyedVectors.load_word2vec_format('../data/corpora/north-south/new-england.w2v.txt')

In [8]:
m2 = KeyedVectors.load_word2vec_format('../data/corpora/north-south/deep-south.w2v.txt')

In [9]:
vocab = parse_glove_vocab('../data/corpora/north-south/vocab.txt')

In [12]:
m1_m = np.stack([m1[t] for t in vocab])

In [13]:
m2_m = np.stack([m2[t] for t in vocab])

In [14]:
bv = m1_m - m1_m.mean(0)

In [15]:
ov = m2_m - m2_m.mean(0)

In [16]:
m = ov.T.dot(bv)

In [17]:
u, _, v = np.linalg.svd(m)

In [18]:
ortho = u.dot(v)

In [19]:
m2_m_rotated = ov.dot(ortho)

In [29]:
m2_m_rotated.shape[1]

200

In [35]:
vector_size = m2_m_rotated.shape[1]

m2r = KeyedVectors()
m2r.vector_size = vector_size
m2r.syn0 = np.zeros((len(m2_m_rotated), vector_size))

In [36]:
for i, (token, count) in enumerate(vocab.items()):
    m2r.syn0[i] = m2_m_rotated[i]
    m2r.index2word.append(token)
    m2r.vocab[token] = Vocab(index=i, count=count)

In [37]:
m2r.most_similar('man')

[('dude', 0.6034173965454102),
 ('bruh', 0.5527576208114624),
 ('woman', 0.5370033979415894),
 ('boy', 0.5194164514541626),
 ('wtf', 0.4986564517021179),
 ('bro', 0.4967013895511627),
 ('nigga', 0.49316155910491943),
 ('damn', 0.4929620623588562),
 ('smh', 0.46880608797073364),
 ('lmao', 0.4658861756324768)]

In [38]:
m2.most_similar('man')

[('dude', 0.5719186067581177),
 ('bruh', 0.525306224822998),
 ('woman', 0.509016215801239),
 ('boy', 0.4833117723464966),
 ('wtf', 0.4695777893066406),
 ('bro', 0.46308958530426025),
 ('nigga', 0.45372098684310913),
 ('damn', 0.4516459107398987),
 ('smh', 0.4298434853553772),
 ('😂😂😂', 0.4285019636154175)]

In [51]:
m1.similar_by_vector(m1['liberal'] - m2r['liberal'])

[('rookie', 0.33358442783355713),
 ('autograph', 0.32332780957221985),
 ('♡', 0.3129580020904541),
 ('amo', 0.3115907609462738),
 ('vou', 0.3086968660354614),
 ('nem', 0.30774611234664917),
 ('mint', 0.30079948902130127),
 ('❤❤', 0.2997928559780121),
 ('brandon', 0.2959497272968292),
 ('express', 0.291858971118927)]