In [1]:
import numpy as np
import pandas as pd

from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from tqdm import tqdm_notebook

from sources import parse_glove_vocab

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline
mpl.style.use('bmh')

In [3]:
# Raise pandas' displayed-row limit to 1000 rows.
pd.set_option('display.max_rows', 1000)

In [4]:
# First embedding model: vectors read from the word2vec-format text file
# named 'new-england' in the north-south corpus directory.
m1 = KeyedVectors.load_word2vec_format(
    '../data/corpora/north-south/new-england.w2v.txt'
)

In [5]:
# Second embedding model: vectors read from the word2vec-format text file
# named 'deep-south' in the north-south corpus directory.
m2 = KeyedVectors.load_word2vec_format(
    '../data/corpora/north-south/deep-south.w2v.txt'
)

In [6]:
# Vocabulary file that sits alongside the two models.
# NOTE(review): presumably a GloVe-style vocab listing — confirm the return
# type against sources.parse_glove_vocab.
vocab = parse_glove_vocab(
    '../data/corpora/north-south/vocab.txt'
)

In [7]:
# Top 50 nearest neighbours of 'earth' in the first model (m1).
m1.most_similar(positive=['earth'], topn=50)

[('planet', 0.6611870527267456),
 ('flat', 0.4506067931652069),
 ('heaven', 0.4433976411819458),
 ('mother', 0.42221304774284363),
 ('humans', 0.3965595066547394),
 ('moon', 0.3851078748703003),
 ('world', 0.37977415323257446),
 ('mars', 0.3729405403137207),
 ('nasa', 0.367029070854187),
 ('ocean', 0.3626718521118164),
 ('universe', 0.3579068183898926),
 ('nature', 0.3503480553627014),
 ('alien', 0.3462546765804291),
 ('science', 0.32632237672805786),
 ('worlds', 0.3172537088394165),
 ('exist', 0.3115980327129364),
 ('god', 0.3109760880470276),
 ('happiest', 0.3101898431777954),
 ('destroy', 0.30653250217437744),
 ('human', 0.2999586760997772),
 ('mothers', 0.2968764007091522),
 ('🌎', 0.29605579376220703),
 ('humanity', 0.2941015660762787),
 ('the', 0.2896028757095337),
 ('species', 0.2877427637577057),
 ('day', 0.28529858589172363),
 ('life', 0.2831520736217499),
 ('scientists', 0.2808951735496521),
 ('google', 0.28033196926116943),
 ('sea', 0.27928292751312256),
 ('literally', 0.2787

In [8]:
# Top 50 nearest neighbours of 'earth' in the second model (m2).
m2.most_similar(positive=['earth'], topn=50)

[('planet', 0.6582320928573608),
 ('heaven', 0.4366685748100281),
 ('humans', 0.38414978981018066),
 ('mother', 0.3830600082874298),
 ('god', 0.37863433361053467),
 ('nasa', 0.37285831570625305),
 ('nature', 0.37076854705810547),
 ('happiest', 0.36943674087524414),
 ('dust', 0.366108775138855),
 ('world', 0.3646332621574402),
 ('aliens', 0.364328533411026),
 ('dance', 0.3249739408493042),
 ('space', 0.31283432245254517),
 ('human', 0.31199026107788086),
 ('greatest', 0.3072046637535095),
 ('found', 0.3051074743270874),
 ('🌎', 0.29443633556365967),
 ('mars', 0.2938433587551117),
 ('above', 0.29314345121383667),
 ('the', 0.29308223724365234),
 ('living', 0.29220685362815857),
 ('mysterious', 0.2908473014831543),
 ('jesus', 0.29000988602638245),
 ('upon', 0.28865545988082886),
 ('proof', 0.2861965000629425),
 ('land', 0.28618866205215454),
 ('flat', 0.28550148010253906),
 ('lord', 0.28357982635498047),
 ('shadow', 0.283281534910202),
 ('sized', 0.2814238667488098),
 ('alien', 0.2802904844

In [317]:
# Centroid (element-wise mean) of m1's vectors for the 50 words that m1
# ranks closest to 'trump'.
m1_avg = np.mean(
    [m1[word] for word, _score in m1.most_similar('trump', topn=50)],
    axis=0,
)

In [318]:
# Element-wise mean of the vectors for the 50 words that m2 ranks closest to 'trump'.
# NOTE(review): the neighbour list comes from m2, but each vector is looked up
# in m1 (`m1[t]`). This may be deliberate — it keeps this average in m1's vector
# space so it can be subtracted from m1_avg and queried against m1 — or it may
# be a copy-paste slip for `m2[t]`. Confirm the intent; also note `m1[t]` will
# raise KeyError for any neighbour word that is missing from m1.
m2_avg = np.array([m1[t] for t, _ in m2.most_similar('trump', topn=50)]).mean(0)

In [320]:
# Top 30 words in m1 nearest to the difference of the two 'trump'
# neighbourhood averages (m1_avg minus m2_avg).
m1.similar_by_vector(vector=m1_avg - m2_avg, topn=30)

[('he', 0.4565543234348297),
 ('knows', 0.41218113899230957),
 ('himself', 0.3997402787208557),
 ('him', 0.3861508369445801),
 ('asked', 0.3673762381076813),
 ('his', 0.36597931385040283),
 ('said', 0.3444696366786957),
 ('was', 0.34165677428245544),
 ('brady', 0.33608150482177734),
 ('guy', 0.33410316705703735),
 ('knew', 0.3338080644607544),
 ('did', 0.3228428065776825),
 ('wasn', 0.32150840759277344),
 ('she', 0.3203584849834442),
 ('told', 0.3135398030281067),
 ('hasn', 0.299806147813797),
 ('nobody', 0.2962963581085205),
 ('jr', 0.29586708545684814),
 ('client', 0.29453492164611816),
 ('loves', 0.294242262840271),
 ('missed', 0.29114487767219543),
 ('met', 0.2908840775489807),
 ('someone', 0.29007488489151),
 ('tells', 0.2887221574783325),
 ('surprised', 0.28761881589889526),
 ('gave', 0.2841348946094513),
 ('stole', 0.2833779752254486),
 ('saw', 0.28315651416778564),
 ('flynn', 0.28083711862564087),
 ('shawn', 0.27943333983421326)]