In [1]:
import numpy as np
import pandas as pd

from scipy.spatial.distance import cosine
from gensim.models import KeyedVectors

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline
mpl.style.use('bmh')

In [3]:
pd.options.display.max_rows = 1000

In [4]:
m1 = KeyedVectors.load_word2vec_format('../data/models/v2/glove/new-england.txt')

In [5]:
m2 = KeyedVectors.load_word2vec_format('../data/models/v2/glove/deep-south.txt')

In [65]:
m1.most_similar('thin', topn=20)

[('blue', 0.5567487478256226),
 ('thick', 0.5558913946151733),
 ('line', 0.5137444734573364),
 ('grass', 0.4914484918117523),
 ('tan', 0.47100830078125),
 ('orange', 0.45093637704849243),
 ('sky', 0.44146430492401123),
 ('skin', 0.4269309341907501),
 ('lines', 0.4232107996940613),
 ('line,', 0.4153229296207428),
 ('color', 0.3929116129875183),
 ('muddy', 0.38477015495300293),
 ('red', 0.3771873116493225),
 ('measuring', 0.3756137490272522),
 ('eyes.', 0.374064177274704),
 ('dark', 0.3739303648471832),
 ('soft', 0.36622288823127747),
 ('yellow', 0.36135002970695496),
 ('waters', 0.3612261414527893),
 ('green', 0.3596183657646179)]

In [66]:
m2.most_similar('thin', topn=20)

[('lizzy', 0.7598437666893005),
 ('town"', 0.6060155630111694),
 ('*the', 0.5849156975746155),
 ('playing*', 0.58421790599823),
 ('begins', 0.5485669374465942),
 ('quiet', 0.5261313319206238),
 ('lizzy*', 0.518750011920929),
 ('song,', 0.5141802430152893),
 ('the\xa0boys\xa0are\xa0back\xa0in', 0.5109759569168091),
 ('"the', 0.5106024146080017),
 ('boys', 0.4991893768310547),
 ('the\xa0boys\xa0are\xa0back', 0.4950389266014099),
 ('back\xa0in', 0.4942794442176819),
 ('the\xa0boys\xa0are', 0.48644423484802246),
 ('back!)', 0.48218199610710144),
 ('in\xa0town!', 0.47584760189056396),
 ('boys\xa0are\xa0back\xa0in', 0.46684083342552185),
 ('back\xa0in\xa0town!', 0.46266141533851624),
 ('are\xa0back', 0.46150702238082886),
 ('boys\xa0are\xa0back', 0.4523363709449768)]

In [19]:
token = 'work'

m1.n_similarity(
    [t for t, _ in m1.most_similar(token, topn=20)],
    [t for t, _ in m2.most_similar(token, topn=20) if t in m1]
)

0.88868590050221241

In [67]:
data = []
for token in m1.wv.vocab.keys():
    
    if token not in m2:
        continue
    
    score = m1.n_similarity(
        [t for t, _ in m1.most_similar(token, topn=20)],
        [t for t, _ in m2.most_similar(token, topn=20) if t in m1]
    )
    
    data.append((token, score))

In [68]:
df1 = pd.DataFrame(data, columns=('token', 'score'))

In [69]:
len(df1)

8027

In [74]:
df1.sort_values('score').head(100)

Unnamed: 0,token,sim
835,left.,-0.011111
7996,......,-0.004701
6688,ian,0.011648
7669,👉,0.020226
1149,beyond,0.04222
3897,levels,0.054737
6977,machines,0.058727
3352,ton,0.085474
1296,including,0.097276
7106,bra,0.102435


In [75]:
data = []
for token in m1.wv.vocab.keys():
    
    if token not in m2:
        continue
        
    sim1 = [t for t, _ in m1.most_similar(token, topn=20)]
    sim2 = [t for t, _ in m2.most_similar(token, topn=20) if t in m1]
    
    if sim2 < sim1:
        continue
        
    score = m1.n_similarity(sim1, sim2)
    
    data.append((token, score))

In [76]:
df2 = pd.DataFrame(data, columns=('token', 'score'))

In [77]:
len(df2)

4004

In [89]:
df2.sort_values('score', ascending=True).head(300)

Unnamed: 0,token,score
412,left.,-0.011111
3991,......,-0.004701
1691,ton,0.085474
635,including,0.097276
1227,sec,0.128824
1598,armed,0.162364
1966,skip,0.180703
1506,unit,0.205814
2945,"co,",0.21367
3989,tide,0.217262


In [81]:
def compare(token):
    for t, _ in m1.most_similar(token, topn=20):
        print(t)
    print('\n')
    for t, _ in m2.most_similar(token, topn=20):
        print(t)

In [97]:
compare('finest')

boston's
garnett
patrice
bergeron
rc
autograph
iconic
psa
philadelphia
rookie
sip
pacific
mint
❤❤
auto
olynyk
gem
topps
pierce
🗣


world's
largest
fastest
worlds
duo
وما
coolest
ابو
celebrates
مبروك
compliment
افتح
atlanta's
honesty
يا
💙💙
mothers
cakes
part.
رسول
