In [1]:
import numpy as np
import pandas as pd

from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from tqdm import tqdm_notebook

from sources import parse_glove_vocab

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'bmh' style sheet to every figure drawn after this point.
mpl.style.use('bmh')

In [4]:
# Word vectors stored in word2vec text format for the New England corpus;
# used as `m1` by the comparison cells further down.
m1 = KeyedVectors.load_word2vec_format('../data/corpora/north-south/new-england.w2v.txt')

In [5]:
# Word vectors stored in word2vec text format for the Deep South corpus;
# used as `m2` by the comparison cells further down.
m2 = KeyedVectors.load_word2vec_format('../data/corpora/north-south/deep-south.w2v.txt')

In [6]:
# Word vectors from the `combined.w2v.txt` file. `concept_diff` reads this
# notebook-level name as the shared space in which neighbour vectors are
# averaged and queried.
# NOTE(review): presumably trained on both regional corpora together — confirm.
combined = KeyedVectors.load_word2vec_format('../data/corpora/north-south/combined.w2v.txt')

In [7]:
# Parse the north-south vocabulary file with the project helper.
# NOTE(review): the return type of `parse_glove_vocab` is not visible here, and
# `vocab` is not referenced by any of the cells shown below — confirm it is still needed.
vocab = parse_glove_vocab('../data/corpora/north-south/vocab.txt')

In [450]:
# 50 nearest neighbours of 'clinton' in the New England vectors (m1),
# displayed as (token, similarity) pairs.
m1.most_similar('clinton', topn=50)

[('hillary', 0.872848391532898),
 ('hrc', 0.738634467124939),
 ('obama', 0.6157892346382141),
 ('dnc', 0.5850938558578491),
 ('bush', 0.5727822184562683),
 ('lynch', 0.5668103098869324),
 ('comey', 0.5617506504058838),
 ('russia', 0.551996111869812),
 ('sanders', 0.5402530431747437),
 ('bernie', 0.5353250503540039),
 ('fbi', 0.5284608006477356),
 ('campaign', 0.5271608829498291),
 ('emails', 0.5074602365493774),
 ('election', 0.5070749521255493),
 ('russians', 0.49625164270401),
 ('investigation', 0.49359989166259766),
 ('chelsea', 0.49229925870895386),
 ('trump', 0.4837045967578888),
 ('democrats', 0.48069459199905396),
 ('crooked', 0.46750450134277344),
 ('donald', 0.4618498980998993),
 ('soros', 0.4610463082790375),
 ('dems', 0.4570835828781128),
 ('supporters', 0.44571083784103394),
 ('corruption', 0.445248007774353),
 ('foundation', 0.4442990720272064),
 ('collusion', 0.43900346755981445),
 ('investigate', 0.43104639649391174),
 ('djt', 0.42890894412994385),
 ('voters', 0.42492023

In [451]:
# Same query against the Deep South vectors (m2), for side-by-side comparison
# with the m1 result.
m2.most_similar('clinton', topn=50)

[('hillary', 0.8677021265029907),
 ('hrc', 0.692702054977417),
 ('obama', 0.6362740993499756),
 ('lynch', 0.5855323076248169),
 ('bush', 0.5737659931182861),
 ('comey', 0.5527466535568237),
 ('dnc', 0.5266940593719482),
 ('bernie', 0.5158839225769043),
 ('russia', 0.5033775568008423),
 ('trump', 0.49705690145492554),
 ('foundation', 0.49584123492240906),
 ('emails', 0.49530476331710815),
 ('fbi', 0.4947221875190735),
 ('sanders', 0.4731174111366272),
 ('chelsea', 0.4705349802970886),
 ('soros', 0.46832138299942017),
 ('scandal', 0.4634433388710022),
 ('wikileaks', 0.4606429934501648),
 ('pelosi', 0.4545612633228302),
 ('democrats', 0.44992542266845703),
 ('admin', 0.44540145993232727),
 ('campaign', 0.4451872706413269),
 ('collusion', 0.4429398477077484),
 ('bill', 0.44080936908721924),
 ('investigation', 0.4387202858924866),
 ('dems', 0.4365312159061432),
 ('president', 0.4329547584056854),
 ('presidency', 0.43273335695266724),
 ('election', 0.4311751425266266),
 ('corruption', 0.4293

In [459]:
def concept_diff(m1, m2, seed, depth=50, topn=30, space=None):
    """Find words that separate ``seed``'s neighbourhood in ``m1`` from the one in ``m2``.

    The ``depth`` nearest neighbours of ``seed`` are looked up in each model.
    Each neighbour list is turned into a centroid by averaging the neighbours'
    vectors taken from a shared reference space, and the words of that space
    closest to ``centroid(m1) - centroid(m2)`` are returned.

    Parameters
    ----------
    m1, m2 : objects exposing ``most_similar(seed, topn=...)``
        Must yield ``(token, score)`` pairs. ``m1`` is the "positive" side of
        the difference, ``m2`` the "negative" side.
    seed : str
        Token whose neighbourhoods are compared.
    depth : int, default 50
        Number of neighbours requested from each model.
    topn : int, default 30
        Number of results requested from the reference space.
    space : optional
        Reference vectors supporting ``token in space``, ``space[token]`` and
        ``space.similar_by_vector(vector, topn=...)``. When omitted, the
        notebook-level ``combined`` model is used, so existing three-argument
        calls behave as before.

    Returns
    -------
    The result of ``space.similar_by_vector`` for the difference vector (in
    this notebook, a list of ``(token, similarity)`` pairs).

    Raises
    ------
    KeyError
        If none of a model's neighbours exists in the reference space. Errors
        raised by ``most_similar`` itself are propagated unchanged.
    """
    # Resolve the shared space lazily so the global is only needed when the
    # caller does not supply one.
    if space is None:
        space = combined

    def centroid(model):
        neighbours = [term for term, _ in model.most_similar(seed, topn=depth)]
        # A neighbour can be absent from the shared space (the vocabularies are
        # built separately); skip it instead of failing on the first miss.
        shared = [term for term in neighbours if term in space]
        if not shared:
            raise KeyError(
                'none of the neighbours of %r are present in the reference space' % (seed,)
            )
        return np.mean([space[term] for term in shared], axis=0)

    direction = centroid(m1) - centroid(m2)
    return space.similar_by_vector(direction, topn=topn)

In [460]:
def north(seed):
    """Return the words that lean toward ``m1``'s sense of ``seed``.

    Thin wrapper over ``concept_diff`` using the notebook-level models: ``m1``
    (loaded from the New England vectors) is the positive side and ``m2``
    (loaded from the Deep South vectors) the negative side, with the default
    ``depth`` and ``topn``.
    """
    positive_model, negative_model = m1, m2
    return concept_diff(positive_model, negative_model, seed)

In [461]:
def south(seed):
    """Return the words that lean toward ``m2``'s sense of ``seed``.

    Thin wrapper over ``concept_diff`` using the notebook-level models: ``m2``
    (loaded from the Deep South vectors) is the positive side and ``m1``
    (loaded from the New England vectors) the negative side, with the default
    ``depth`` and ``topn``.
    """
    positive_model, negative_model = m2, m1
    return concept_diff(positive_model, negative_model, seed)

In [462]:
# Seed word for the regional comparison; kept in a notebook-level variable so
# the following cell can run the opposite direction on the same word.
token = 'blood'
# Words pulled toward the m1 (New England) neighbourhood of the seed.
north(token)

[('reduce', 0.5296963453292847),
 ('impact', 0.4177272915840149),
 ('managing', 0.40812787413597107),
 ('improve', 0.38893210887908936),
 ('increase', 0.38399115204811096),
 ('boost', 0.3774690330028534),
 ('levels', 0.36981403827667236),
 ('costs', 0.34772640466690063),
 ('affect', 0.3443049192428589),
 ('uncharacteristical', 0.34284472465515137),
 ('prevent', 0.34240657091140747),
 ('rate', 0.3396638333797455),
 ('financial', 0.3256698250770569),
 ('stress', 0.3254662752151489),
 ('research', 0.3230990171432495),
 ('overall', 0.32018959522247314),
 ('energy', 0.31914854049682617),
 ('risk', 0.3188474178314209),
 ('cancer', 0.317279189825058),
 ('cost', 0.31415891647338867),
 ('rates', 0.3122498691082001),
 ('budget', 0.31180518865585327),
 ('nutrition', 0.3105454444885254),
 ('benefit', 0.30983519554138184),
 ('affects', 0.30666130781173706),
 ('symptoms', 0.3046736717224121),
 ('improving', 0.3041445016860962),
 ('savings', 0.3028523623943329),
 ('ensure', 0.30241209268569946),
 ('m

In [463]:
# Opposite direction for the same seed: words pulled toward the m2 (Deep South)
# neighbourhood.
south(token)

[('jesus', 0.4181080758571625),
 ('christ', 0.4088263511657715),
 ('hood', 0.38924920558929443),
 ('washed', 0.3270508348941803),
 ('god', 0.31922203302383423),
 ('choir', 0.3154239058494568),
 ('boy', 0.3121204376220703),
 ('playing', 0.3067709803581238),
 ('mud', 0.3029404282569885),
 ('stood', 0.2963900864124298),
 ('came', 0.29512736201286316),
 ('lamb', 0.29344671964645386),
 ('walked', 0.2925993800163269),
 ('lord', 0.2911379039287567),
 ('window', 0.28959545493125916),
 ('faithful', 0.28499525785446167),
 ('ugly', 0.2847456932067871),
 ('died', 0.282840758562088),
 ('turned', 0.2780376970767975),
 ('uncle', 0.27798259258270264),
 ('gospel', 0.2725037932395935),
 ('glory', 0.27133122086524963),
 ('nigga', 0.26875853538513184),
 ('john', 0.2675343155860901),
 ('bun', 0.2638452649116516),
 ('shook', 0.2626761794090271),
 ('dirt', 0.26195985078811646),
 ('brother', 0.2613903284072876),
 ('gang', 0.25983330607414246),
 ('brothers', 0.25879186391830444)]