In [1]:
import numpy as np
import pandas as pd

from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from tqdm import tqdm_notebook

from sources import parse_glove_vocab

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'bmh' style sheet to all figures created after this point.
mpl.style.use('bmh')

In [3]:
# Vectors read from the new-england word2vec text file; `m1` is the first
# model passed to concept_diff by north() further down.
m1 = KeyedVectors.load_word2vec_format('../data/corpora/north-south/new-england.w2v.txt')

In [4]:
# Vectors read from the deep-south word2vec text file; `m2` is the first
# model passed to concept_diff by south() further down.
m2 = KeyedVectors.load_word2vec_format('../data/corpora/north-south/deep-south.w2v.txt')

In [5]:
# Vectors read from the combined word2vec text file. concept_diff() looks up
# neighbour vectors and runs its final similarity query against this object.
# NOTE(review): presumably trained on both regional corpora together -- confirm.
combined = KeyedVectors.load_word2vec_format('../data/corpora/north-south/combined.w2v.txt')

In [6]:
# Parse the vocabulary file with the project helper from `sources`.
# NOTE(review): the structure returned by parse_glove_vocab is not visible
# here, and `vocab` is not referenced in the cells shown below -- confirm it
# is still needed.
vocab = parse_glove_vocab('../data/corpora/north-south/vocab.txt')

In [178]:
# Seed word for the neighbour lookups in this cell and the next one.
token = 'full'
# 30 nearest neighbours of the seed in m1 (the new-england vectors).
m1.most_similar(token, topn=30)

[('length', 0.5049359798431396),
 ('complete', 0.38903945684432983),
 ('hd', 0.35253140330314636),
 ('stack', 0.3423921763896942),
 ('opening', 0.3404300808906555),
 ('blown', 0.33776575326919556),
 ('of', 0.33743369579315186),
 ('view', 0.3183603584766388),
 ('depth', 0.3104381859302521),
 ('zip', 0.3082897365093231),
 ('highlights', 0.30475419759750366),
 ('whole', 0.3038095235824585),
 ('mode', 0.30309295654296875),
 ('review', 0.29942089319229126),
 ('part', 0.29878121614456177),
 ('ii', 0.2890329957008362),
 ('moon', 0.2883172631263733),
 ('release', 0.2859869599342346),
 ('free', 0.28577253222465515),
 ('half', 0.2857567071914673),
 ('packed', 0.2844902276992798),
 ('version', 0.278579980134964),
 ('room', 0.2767868638038635),
 ('swing', 0.2740824222564697),
 ('body', 0.27293661236763),
 ('apply', 0.27210545539855957),
 ('click', 0.2709727883338928),
 ('video', 0.26954761147499084),
 ('trailer', 0.26850101351737976),
 ('frame', 0.2656404972076416)]

In [179]:
# Same query against m2 (the deep-south vectors); relies on `token` assigned
# in the previous cell.
m2.most_similar(token, topn=30)

[('length', 0.435407817363739),
 ('blown', 0.41423729062080383),
 ('complete', 0.38919636607170105),
 ('view', 0.3815538287162781),
 ('whole', 0.34214094281196594),
 ('☆★☆', 0.3286590874195099),
 ('ii', 0.3158103823661804),
 ('half', 0.3125581741333008),
 ('of', 0.3048214912414551),
 ('rn', 0.3010605573654175),
 ('pocket', 0.29466959834098816),
 ('opening', 0.28971874713897705),
 ('part', 0.28435128927230835),
 ('circle', 0.28204166889190674),
 ('entire', 0.2769243121147156),
 ('hd', 0.27291879057884216),
 ('price', 0.2722204625606537),
 ('fit', 0.27114593982696533),
 ('therapist', 0.2707825005054474),
 ('details', 0.2702878713607788),
 ('effect', 0.2700255215167999),
 ('production', 0.26974785327911377),
 ('massage', 0.26726457476615906),
 ('nurse', 0.26591360569000244),
 ('meal', 0.26584041118621826),
 ('course', 0.26400595903396606),
 ('gallery', 0.26332783699035645),
 ('mod', 0.26061469316482544),
 ('packed', 0.25941383838653564),
 ('apply', 0.2587586045265198)]

In [167]:
def concept_diff(m1, m2, seed, depth=30, topn=30, reference=None):
    """Contrast the neighbourhood of `seed` in two embedding models.

    The `depth` nearest neighbours of `seed` are collected from `m1` and from
    `m2` independently. Each neighbour list is averaged in a shared reference
    space, and the tokens of that space closest to
    ``mean(m1 neighbours) - mean(m2 neighbours)`` are returned.

    Parameters
    ----------
    m1, m2 : objects exposing ``most_similar(seed, topn=...)`` that yields
        ``(token, score)`` pairs (e.g. gensim ``KeyedVectors``).
    seed : token looked up in both models.
    depth : int, number of neighbours taken from each model.
    topn : int, number of results requested from the reference space.
    reference : optional object supporting ``reference[token]`` and
        ``reference.similar_by_vector(vector, topn=...)``. When omitted, the
        notebook-level ``combined`` model is used, which is what this
        function always did before the parameter existed.

    Returns
    -------
    Whatever ``reference.similar_by_vector`` returns; for the models used in
    this notebook that is a list of ``(token, similarity)`` pairs.

    Notes
    -----
    Every neighbour token is looked up in `reference` without a membership
    check, so a neighbour missing from that space propagates the lookup error.
    """
    if reference is None:
        # Backward-compatible default: fall back to the notebook global so
        # existing callers (north/south) behave exactly as before.
        reference = combined

    def neighbourhood_mean(model):
        # Scores are discarded; only the neighbour tokens are needed.
        tokens = [tok for tok, _ in model.most_similar(seed, topn=depth)]
        # Average the neighbours' vectors in the shared space so the two
        # models' neighbourhoods are comparable.
        return np.mean([reference[tok] for tok in tokens], axis=0)

    direction = neighbourhood_mean(m1) - neighbourhood_mean(m2)
    return reference.similar_by_vector(direction, topn=topn)

In [168]:
def north(seed):
    """Run concept_diff for `seed` with m1 as the minuend and m2 as the subtrahend.

    m1 holds the new-england vectors and m2 the deep-south vectors, so the
    result leans toward what surrounds `seed` in m1 but not in m2.
    """
    m1_minus_m2 = concept_diff(m1, m2, seed)
    return m1_minus_m2

In [169]:
def south(seed):
    """Run concept_diff for `seed` with m2 as the minuend and m1 as the subtrahend.

    This is the mirror image of north(): the models are passed in the
    opposite order, so the result leans toward what surrounds `seed` in m2
    (deep-south vectors) but not in m1 (new-england vectors).
    """
    m2_minus_m1 = concept_diff(m2, m1, seed)
    return m2_minus_m1

In [170]:
# Seed word shared by this cell and the south(token) cell that follows.
token = 'liberal'
# Tokens closest to (m1 neighbourhood mean - m2 neighbourhood mean) for the seed.
north(token)

[('radical', 0.39017266035079956),
 ('christian', 0.32963770627975464),
 ('authors', 0.3166189193725586),
 ('social', 0.3155028820037842),
 ('politicians', 0.31452250480651855),
 ('leaders', 0.3060218095779419),
 ('groups', 0.3048692047595978),
 ('organizations', 0.2976757287979126),
 ('community', 0.29292958974838257),
 ('muslim', 0.2926967144012451),
 ('islamic', 0.2864874005317688),
 ('schools', 0.2828742563724518),
 ('become', 0.2812657356262207),
 ('jewish', 0.27893736958503723),
 ('terrorism', 0.2784363031387329),
 ('media', 0.2780627906322479),
 ('traditional', 0.27261096239089966),
 ('communities', 0.27225446701049805),
 ('islam', 0.2653965651988983),
 ('conservative', 0.2653827369213104),
 ('muslims', 0.2650068998336792),
 ('events', 0.25680944323539734),
 ('native', 0.2549709677696228),
 ('terrorist', 0.2545480728149414),
 ('countries', 0.24893291294574738),
 ('programs', 0.24809890985488892),
 ('lgbt', 0.2474164515733719),
 ('nations', 0.246967613697052),
 ('christians', 0.2

In [171]:
# Reverse contrast (m2 neighbourhood mean - m1 neighbourhood mean) for the
# same `token` assigned in the previous cell.
south(token)

[('lolol', 0.40383780002593994),
 ('goddamn', 0.383350133895874),
 ('😂😂😂😂😂😂', 0.382423460483551),
 ('pathetic', 0.37693485617637634),
 ('😂😂😂😂😂', 0.3667609691619873),
 ('lmaoooooo', 0.35614970326423645),
 ('hahahahahaha', 0.35565024614334106),
 ('unreal', 0.353347510099411),
 ('😂😂😂😂😂😂😂', 0.35003501176834106),
 ('lmaooooo', 0.3487037420272827),
 ('ikr', 0.34290167689323425),
 ('😂😂😂😂', 0.34179380536079407),
 ('😮', 0.34064120054244995),
 ('lmfaoooooo', 0.340437114238739),
 ('yooo', 0.3394015431404114),
 ('😭😭😭', 0.3392985165119171),
 ('dumbass', 0.3390021324157715),
 ('lmfaoo', 0.3383569121360779),
 ('lmfao', 0.3377176523208618),
 ('😂😂😂', 0.33715498447418213),
 ('omfg', 0.336296945810318),
 ('😭😭', 0.3326728940010071),
 ('kidding', 0.33137011528015137),
 ('crying', 0.3301336169242859),
 ('hahah', 0.3274979591369629),
 ('smfh', 0.3258247673511505),
 ('freakin', 0.32321611046791077),
 ('hahahahaha', 0.32275211811065674),
 ('xd', 0.3216325044631958),
 ('😭😭😭😭', 0.3209562301635742)]