In [1]:
import numpy as np
import pandas as pd

from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from tqdm import tqdm_notebook

from sources import parse_glove_vocab

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline
# Apply the 'bmh' matplotlib style sheet to figures drawn after this point.
mpl.style.use('bmh')

In [3]:
# Load the word2vec-format text vectors stored under the "northeast" regions path.
# m1 is the first model passed to concept_diff() by north() further down.
m1 = KeyedVectors.load_word2vec_format('../data/corpora/regions/northeast.w2v.txt')

In [19]:
# Load the word2vec-format text vectors stored under the "west" regions path.
# m2 is the first model passed to concept_diff() by west() further down.
m2 = KeyedVectors.load_word2vec_format('../data/corpora/regions/west.w2v.txt')

In [20]:
# 20 nearest neighbours of 'earth' in m1 (northeast vectors), to eyeball against m2.
m1.most_similar('earth', topn=20)

[('planet', 0.690495491027832),
 ('world', 0.45149412751197815),
 ('humans', 0.42681193351745605),
 ('universe', 0.4267774820327759),
 ('nasa', 0.4212183654308319),
 ('mars', 0.41061821579933167),
 ('nature', 0.3988841772079468),
 ('flat', 0.3964163362979889),
 ('🌎', 0.3931402564048767),
 ('alien', 0.3761330842971802),
 ('aliens', 0.3748101592063904),
 ('beings', 0.36514487862586975),
 ('heaven', 0.36490702629089355),
 ('creatures', 0.35865694284439087),
 ('moon', 0.3538973033428192),
 ('existence', 0.33656996488571167),
 ('exist', 0.3312588334083557),
 ('god', 0.3269472122192383),
 ('science', 0.32379549741744995),
 ('happiest', 0.32316672801971436)]

In [21]:
# 20 nearest neighbours of 'earth' in m2 (west vectors), to eyeball against m1.
m2.most_similar('earth', topn=20)

[('planet', 0.604150652885437),
 ('nasa', 0.4618653357028961),
 ('happiest', 0.43611377477645874),
 ('world', 0.42698001861572266),
 ('heaven', 0.42271146178245544),
 ('aliens', 0.4211609959602356),
 ('mars', 0.4162817597389221),
 ('🌎', 0.4160650074481964),
 ('beings', 0.4086928963661194),
 ('nature', 0.39674466848373413),
 ('humans', 0.38687264919281006),
 ('universe', 0.37297430634498596),
 ('moon', 0.36792293190956116),
 ('atmosphere', 0.35663923621177673),
 ('alien', 0.34907543659210205),
 ('god', 0.34403759241104126),
 ('science', 0.34125030040740967),
 ('existence', 0.340227335691452),
 ('scientists', 0.32537758350372314),
 ('exist', 0.32372450828552246)]

In [96]:
def concept_diff(m1, m2, seed, depth=50, topn=30):
    """Return the tokens that set `seed`'s neighbourhood in `m1` apart from `m2`.

    The `depth` nearest neighbours of `seed` are collected from each model,
    each neighbour list is averaged into a centroid, and the tokens of `m1`
    closest to ``centroid(m1 neighbours) - centroid(m2 neighbours)`` are
    returned.

    Both centroids are built from `m1`'s vectors, and the final lookup is
    done in `m1` as well.
    NOTE(review): presumably this is deliberate because the two models are
    trained independently and their vector spaces are not aligned -- confirm.

    Parameters
    ----------
    m1 : keyed-vectors model
        Model whose vector space is used for every lookup. Must support
        ``most_similar``, ``similar_by_vector``, ``m1[token]`` and
        ``token in m1``.
    m2 : keyed-vectors model
        Model used only to obtain a second list of neighbours of `seed`.
    seed : str
        Token whose neighbourhoods are compared.
    depth : int, default 50
        Number of nearest neighbours taken from each model.
    topn : int, default 30
        Number of results requested from ``m1.similar_by_vector``.

    Returns
    -------
    Whatever ``m1.similar_by_vector`` returns for the difference vector
    (a list of ``(token, similarity)`` pairs for gensim models).

    Raises
    ------
    KeyError
        If `seed` is unknown to either model (raised by ``most_similar``),
        or if none of `m2`'s neighbours exists in `m1`'s vocabulary.
    """
    m1_sim = [t for t, _ in m1.most_similar(seed, topn=depth)]
    m2_sim = [t for t, _ in m2.most_similar(seed, topn=depth)]

    # m2's neighbours are looked up in m1, so any token that m1 has never
    # seen would raise KeyError and abort the whole comparison. Skip those
    # tokens instead; m1's own neighbours are in m1 by construction.
    m2_shared = [t for t in m2_sim if t in m1]
    if not m2_shared:
        raise KeyError(
            "none of the neighbours of %r in m2 are present in m1" % (seed,)
        )

    m1_avg = np.array([m1[t] for t in m1_sim]).mean(0)
    m2_avg = np.array([m1[t] for t in m2_shared]).mean(0)

    return m1.similar_by_vector(m1_avg - m2_avg, topn=topn)

In [97]:
def north(seed):
    """Run concept_diff for `seed` with m1 as the primary model and m2 as the contrast."""
    distinctive = concept_diff(m1, m2, seed)
    return distinctive

In [98]:
def west(seed):
    """Run concept_diff for `seed` with m2 as the primary model and m1 as the contrast."""
    distinctive = concept_diff(m2, m1, seed)
    return distinctive

In [105]:
# Seed word to contrast between the two models; the west(token) cell below reuses it.
token = 'memories'
# Tokens near (m1 neighbours of the seed) minus (m2 neighbours of the seed), looked up in m1.
north(token)

[('reveals', 0.36275801062583923),
 ('industry', 0.30881744623184204),
 ('feature', 0.3047802448272705),
 ('document', 0.29537874460220337),
 ('google', 0.2897847890853882),
 ('search', 0.2826789617538452),
 ('users', 0.2806784510612488),
 ('report', 0.27290117740631104),
 ('largest', 0.27096864581108093),
 ('allows', 0.26808664202690125),
 ('insider', 0.2626686692237854),
 ('explained', 0.26097261905670166),
 ('surveillance', 0.2536519765853882),
 ('companies', 0.24881863594055176),
 ('highlights', 0.2480195164680481),
 ('features', 0.24758067727088928),
 ('history', 0.24740788340568542),
 ('spotlight', 0.2470731884241104),
 ('honors', 0.24666543304920197),
 ('facebook', 0.2461358606815338),
 ('stories', 0.24385713040828705),
 ('leaked', 0.2431403398513794),
 ('exclusive', 0.24285122752189636),
 ('map', 0.241590216755867),
 ('infographic', 0.24089494347572327),
 ('according', 0.2403336763381958),
 ('selected', 0.23973076045513153),
 ('exec', 0.23928532004356384),
 ('launches', 0.23918

In [106]:
# Same seed as the north(token) cell above, with the model roles swapped (m2 primary, m1 contrast).
west(token)

[('awww', 0.6390535235404968),
 ('aww', 0.6155736446380615),
 ('bby', 0.6132889986038208),
 ('awwww', 0.6097736358642578),
 ('aw', 0.5771084427833557),
 ('😘😘', 0.5452617406845093),
 ('😘', 0.5419496893882751),
 ('😘😘😘', 0.5250160694122314),
 ('💕', 0.5246809720993042),
 ('ily', 0.5228071212768555),
 ('sis', 0.5146504640579224),
 ('💕💕', 0.5104515552520752),
 ('💞', 0.5060237050056458),
 ('💖', 0.5051789879798889),
 ('ahhh', 0.49821737408638),
 ('😚', 0.4979534149169922),
 ('☺', 0.49489375948905945),
 ('💕💕💕', 0.49404066801071167),
 ('💓', 0.49381452798843384),
 ('💜', 0.48939692974090576),
 ('❤❤', 0.48870134353637695),
 ('sweetie', 0.48577821254730225),
 ('💗', 0.48352888226509094),
 ('❤❤❤', 0.4802273213863373),
 ('😍😍', 0.47827261686325073),
 ('😍😍😍', 0.4772385358810425),
 ('❤', 0.47641900181770325),
 ('😻', 0.47278863191604614),
 ('😍', 0.4721750020980835),
 ('darling', 0.47158902883529663)]