In [1]:
import numpy as np
import pandas as pd

from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from tqdm import tqdm_notebook

from sources import parse_glove_vocab

from IPython.display import Markdown, display

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline
# Apply matplotlib's 'bmh' style sheet to figures created from here on.
mpl.style.use('bmh')

In [3]:
# Word vectors loaded from the New England corpus file (word2vec format).
# `m1` is the model that `north` / `compare` treat as the "New England" side.
m1 = KeyedVectors.load_word2vec_format('../../data/corpora/north-south/new-england.w2v.txt')

In [4]:
# Word vectors loaded from the Deep South corpus file (word2vec format).
# `m2` is the model that `south` / `compare` treat as the "South" side.
m2 = KeyedVectors.load_word2vec_format('../../data/corpora/north-south/deep-south.w2v.txt')

In [5]:
# Word vectors loaded from the combined corpus file (word2vec format).
# `concept_diff` reads this module-level name directly: it is the shared space
# in which neighbour vectors are averaged and the final similarity query runs.
# NOTE(review): presumably trained on both regional corpora together — confirm.
combined = KeyedVectors.load_word2vec_format('../../data/corpora/north-south/combined.w2v.txt')

In [6]:
# Vocabulary parsed from the north-south vocab file by the project helper.
# NOTE(review): `vocab` is not referenced by any later cell visible in this
# notebook, and the helper's return type is not shown here — confirm it is
# still needed.
vocab = parse_glove_vocab('../../data/corpora/north-south/vocab.txt')

In [7]:
def concept_diff(m1, m2, seed, depth=50, topn=20, reference=None):
    """Return tokens nearest to the difference between two models' views of ``seed``.

    For each of ``m1`` and ``m2`` the ``depth`` most similar tokens to ``seed``
    are collected. Those tokens are then looked up in a single shared
    ``reference`` embedding (so both neighbourhoods live in the same vector
    space), averaged into one centroid per model, and the tokens of
    ``reference`` closest to ``centroid(m1) - centroid(m2)`` are returned.

    Parameters
    ----------
    m1, m2 : objects exposing ``most_similar(seed, topn=...)``
        The two models being contrasted. The result points *towards* ``m1``'s
        neighbourhood and *away from* ``m2``'s.
    seed : str
        Token whose neighbourhoods are compared.
    depth : int, default 50
        How many nearest neighbours to take from each model.
    topn : int, default 20
        How many results to return from the final similarity query.
    reference : optional
        Shared embedding supporting ``reference[token]`` and
        ``similar_by_vector(vector, topn=...)``. When omitted, the
        notebook-level ``combined`` model is used, which is what the function
        always did before this parameter existed.

    Returns
    -------
    Whatever ``reference.similar_by_vector`` returns; the callers in this
    notebook unpack it as ``(token, score)`` pairs.

    Notes
    -----
    A lookup error is expected if ``seed`` is unknown to ``m1``/``m2`` or a
    neighbour token is missing from ``reference`` (presumably ``KeyError`` for
    gensim ``KeyedVectors`` - not handled here).
    """
    # Resolve the shared space lazily so the global is only touched when the
    # caller did not supply one explicitly.
    space = combined if reference is None else reference

    def neighbourhood_centroid(model):
        # Neighbour tokens come from `model`, but their vectors come from the
        # shared space, which makes the two centroids directly comparable.
        neighbours = [token for token, _ in model.most_similar(seed, topn=depth)]
        return np.array([space[token] for token in neighbours]).mean(0)

    direction = neighbourhood_centroid(m1) - neighbourhood_centroid(m2)
    return space.similar_by_vector(direction, topn=topn)

In [8]:
def north(seed, *args, **kwargs):
    """Tokens that characterise ``seed`` in the New England model relative to the Deep South one.

    Calls ``concept_diff`` with ``m1`` (New England) as the side to point
    towards and ``m2`` (Deep South) as the side to point away from. Any extra
    positional or keyword arguments are forwarded unchanged.
    """
    toward, away_from = m1, m2
    return concept_diff(toward, away_from, seed, *args, **kwargs)

In [9]:
def south(seed, *args, **kwargs):
    """Tokens that characterise ``seed`` in the Deep South model relative to the New England one.

    Calls ``concept_diff`` with ``m2`` (Deep South) as the side to point
    towards and ``m1`` (New England) as the side to point away from. Any extra
    positional or keyword arguments are forwarded unchanged.
    """
    toward, away_from = m2, m1
    return concept_diff(toward, away_from, seed, *args, **kwargs)

In [10]:
def compare(token, *args, **kwargs):
    """Render the regional contrast for ``token`` in the notebook output.

    Shows a Markdown heading with the token, then a "New England" section
    listing the tokens returned by ``north`` and a "South" section listing the
    tokens returned by ``south``, each as one space-separated line. Extra
    arguments are forwarded to both helpers. Returns ``None``.
    """
    display(Markdown(f'# {token}'))

    display(Markdown('#### New England'))
    northern_hits = north(token, *args, **kwargs)
    print(' '.join(word for word, _score in northern_hits))
    # Spacer between the two sections (a newline string plus print's own newline).
    print('\n')

    display(Markdown('#### South'))
    southern_hits = south(token, *args, **kwargs)
    print(' '.join(word for word, _score in southern_hits))

In [11]:
# Regional contrast for the seed 'tv', using concept_diff's default depth/topn.
compare('tv')

# tv

#### New England

commercials nbc cameras ads cbs commercial abc livestream outlet coverage msnbc cnn rooms ad machines boycott vegas footage snap chills




#### South

actor supporter singer carter actress speak fi laura sci educated empire although role pen legend fellow mentor ali sterling gemini


In [14]:
# Regional contrast for the seed 'entertainment', using concept_diff's default depth/topn.
compare('entertainment')

# entertainment

#### New England

ps4 xbox pc console playstation 🔴 controller fidget keyboard arcade spinner minecraft vr sony ipad nintendo ios overwatch twitch tablet




#### South

leadership housing meeting council article magazine leaders policy fashion environmental institute events regarding industry jewish organization independent area hotel development


In [16]:
# Regional contrast for the seed 'media', using concept_diff's default depth/topn.
compare('media')

# media

#### New England

corrupt lying dems traitor democrats potus democrat msm gop he politicians elected djt idiot republican bs obama disgrace hrc asshole




#### South

wordpress website tools photography seo instagram tips apps online design pinterest app analytics sites blog web pages page site linkedin


In [18]:
# Regional contrast for the seed 'movie', using concept_diff's default depth/topn.
compare('movie')

# movie

#### New England

character characters comics authors comic villain actor politician known whose mlk writer marvel fellow establishment dc thrown artists who artist




#### South

hd video clip playlist surveillance drone download clips tube vid audio wireless compilation premium 4k tutorial liked livestream dvd setup


In [20]:
# Regional contrast for the seed 'book', using concept_diff's default depth/topn.
compare('book')

# book

#### New England

picture essay bc wrote writing whose written timeline drawing liking pictures paper staring letters thread letter were writers tweet page




#### South

strategies ebook killer 99 prepare webinar basics available ron domestic publish via promote tactics rite automation tips sell learn pros


In [21]:
# Regional contrast for the seed 'film', using concept_diff's default depth/topn.
compare('film')

# film

#### New England

batman superman little lego wars liked found dragon minecraft beast episode frog comic star theater spider discovery saw gem mario




#### South

producers awards managing hiring industry careers outstanding agency investors excellence consulting applications athletes meetings hire managers seeking operations association executive


In [22]:
# Regional contrast for the seed 'hollywood', using concept_diff's default depth/topn.
compare('hollywood')

# hollywood

#### New England

ca san diego angeles california dc santa francisco los comic antonio counseling fe con utah superman az colorado tx kansas




#### South

cnn fake abc msnbc lifestyle fame unbelievable facts news kathy spotify apple narrative proof anchor timeline comment 🎧 blowing nbc


In [26]:
# Regional contrast for the seed 'music', using concept_diff's default depth/topn.
compare('music')

# music

#### New England

pop soul blues band mix cafe folk j jazz magic rock urban purple jungle chic art 94 vintage rubber pink




#### South

comments soundcloud submit respond tweeting reply deleted posting comment stated youtube replies message viral views 100k tweet users over earlier


In [29]:
# Regional contrast for the seed 'nintendo', using concept_diff's default depth/topn.
compare('nintendo')

# nintendo

#### New England

blu dvd arcade hd classic ray billy mix mens christmas includes feat edition featuring mini adventure g remix camp kelly




#### South

ignoring questions removed customers professor facing removing answering point could further politician argument dragged firm otherwise user question asking dms


In [30]:
# Regional contrast for the seed 'game', using concept_diff's default depth/topn.
compare('game')

# game

#### New England

hockey nhl refs dicks brad celtics criticize ot scored men boston suck yankees large 2b bradley jersey teams sox boys




#### South

💥 video ⚡ episode 🏈 ep watch livestream vlog ✨ viral preview jungle youtube irma dawn ♥ promise ❄ 🙏🏽


In [32]:
# Regional contrast for the seed 'song', using concept_diff's default depth/topn.
compare('song')

# song

#### New England

lipstick lego wars color potter thumb wear metal nails comics makeup outfits themed costume heroes band wearing socks wore rubber




#### South

diss gospel rapper produced soundcloud stops minaj record recorded tracks grind beats mill producer meek nicki word rappers respond remy


In [35]:
# Regional contrast for the seed 'celebrity', using concept_diff's default depth/topn.
compare('celebrity')

# celebrity

#### New England

copper shop shipping clothing burnt towel marijuana sizes applies goods finger pot brass usd sale trend maryland enforcement pipe waist




#### South

moments childhood appearance footage episode entertainment reality photos experience legendary ruin hollywood stars television show nights hall liked director athlete
