In [1]:
from gensim.models import Word2Vec

In [2]:
# Load the Word2Vec model saved at ../models/AL.bin; bound to `al` for the rest of the notebook.
al = Word2Vec.load('../models/AL.bin')

In [3]:
# Load the Word2Vec model saved at ../models/MA.bin; bound to `ma` for the rest of the notebook.
ma = Word2Vec.load('../models/MA.bin')

In [4]:
# Nearest neighbours of 'money' in the AL model (no topn given, so the library default applies).
al.wv.most_similar(['money'])

[('dollars', 0.6398943066596985),
 ('taxes', 0.6398506164550781),
 ('groceries', 0.6265619993209839),
 ('profits', 0.6176314353942871),
 ('funds', 0.6121464371681213),
 ('millions', 0.6062153577804565),
 ('cash', 0.6023549437522888),
 ('assets', 0.6020326018333435),
 ('shit', 0.5996403694152832),
 ('them', 0.5995004177093506)]

In [5]:
# Same query against the MA model, for comparison with the AL result.
ma.wv.most_similar(['money'])

[('cash', 0.6781432628631592),
 ('taxes', 0.6553000807762146),
 ('billions', 0.6355156898498535),
 ('funds', 0.6341602802276611),
 ('dollars', 0.6300170421600342),
 ('millions', 0.6215152740478516),
 ('them', 0.6183197498321533),
 ('debt', 0.6090303063392639),
 ('attention', 0.5968717336654663),
 ('taxpayers', 0.5736443996429443)]

In [6]:
from wordfreq import top_n_list

In [7]:
# Top 1000 English entries from wordfreq's top_n_list, held as a set so it can be intersected with the model vocabularies.
words = set(top_n_list('en', 1000))

In [8]:
# Tokens known to the MA model, as a set.
# gensim 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`; use the
# new mapping when it exists and fall back to `vocab` on gensim 3.x.
ma_vocab = set(ma.wv.key_to_index if hasattr(ma.wv, 'key_to_index') else ma.wv.vocab)

In [9]:
# Tokens known to the AL model, as a set.
# gensim 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`; use the
# new mapping when it exists and fall back to `vocab` on gensim 3.x.
al_vocab = set(al.wv.key_to_index if hasattr(al.wv, 'key_to_index') else al.wv.vocab)

In [10]:
# Keep only the frequent words that both models have a vector for.
vocab = ma_vocab.intersection(al_vocab, words)

In [11]:
# How many of the frequent words are present in both model vocabularies.
len(vocab)

968

In [12]:
import pandas as pd

In [13]:
# Raise pandas' row display limit to 1000 so long result tables are shown in full.
pd.set_option('display.max_rows', 1000)

In [14]:
# For every shared word, measure how much its neighbourhood overlaps between
# the two models: Jaccard similarity of the sets of nearest neighbours.
NEIGHBOUR_TOPN = 20  # neighbours retrieved per model for each word

data = []
# sorted(): iterating a set of strings follows hash order, which can differ
# between kernel sessions; a fixed order keeps the resulting row labels and
# the ordering of tied scores reproducible under Restart & Run All.
for word in sorted(vocab):
    ma_topn = ma.wv.most_similar([word], topn=NEIGHBOUR_TOPN)
    al_topn = al.wv.most_similar([word], topn=NEIGHBOUR_TOPN)

    # Only the neighbour tokens matter here; the similarity scores are dropped.
    ma_words = {w for w, _ in ma_topn}
    al_words = {w for w, _ in al_topn}

    # |intersection| / |union|: 0.0 means disjoint neighbourhoods, 1.0 identical.
    jacc = len(ma_words & al_words) / len(ma_words | al_words)

    data.append((word, jacc))

In [15]:
# Tabulate the (word, jaccard) pairs collected above.
df = pd.DataFrame(data=data, columns=['word', 'jaccard'])

In [16]:
# Ascending sort puts the words whose neighbourhoods overlap least between the two models first; show the first 100.
df.sort_values('jaccard').head(100)

Unnamed: 0,word,jaccard
843,co,0.0
122,town,0.0
511,boys,0.0
31,0000,0.0
905,gone,0.0
522,al,0.0
713,dr,0.0
446,terms,0.025641
37,section,0.025641
238,central,0.025641


In [17]:
def compare(word, topn=20):
    """Print the nearest neighbours of ``word`` in the MA model, then the AL model.

    Parameters
    ----------
    word : str
        Query token, passed to ``most_similar`` on each model's ``wv``.
    topn : int, optional
        Number of neighbours listed per model. Defaults to 20, the value that
        was previously hard-coded.

    Returns
    -------
    None
        Output is printed: an ``MA`` header followed by one ``neighbour score``
        pair per line, a blank line, then the same under an ``AL`` header.
    """
    # Reads the notebook-level models `ma` and `al`. The newline is part of the
    # second header so that the blank separator appears only between sections.
    for header, model in (('MA', ma), ('\nAL', al)):
        print(header)
        for neighbour, score in model.wv.most_similar([word], topn=topn):
            print(neighbour, score)

In [18]:
# Print the neighbours of 'capital' in MA and then AL.
compare('capital')

MA
sector 0.6949245929718018
corporate 0.6882071495056152
biotech 0.6808187365531921
bureau 0.6758202910423279
coalition 0.6654890775680542
sustainability 0.6644855737686157
corporation 0.6624889969825745
agriculture 0.6616630554199219
corp 0.6571483612060547
distribution 0.656872570514679
tourism 0.6567668914794922
equity 0.6545354723930359
homeownership 0.6532920598983765
#alternativeinvestments 0.6528520584106445
cooperative 0.6443458795547485
merger 0.6434593200683594
pension 0.643139123916626
litigation 0.6384045481681824
global 0.6372753977775574
largest 0.6337259411811829

AL
kurdistan 0.7466413974761963
tourism 0.7251983880996704
mosul 0.7110778093338013
historic 0.7071679830551147
iraq 0.7031489014625549
sanctuary 0.6999616026878357
largest 0.6932883262634277
bureau 0.6923047304153442
alliance 0.686123788356781
prisons 0.684540867805481
delaware 0.682929515838623
massachusetts 0.6810706257820129
parliament 0.6807454824447632
council 0.680150032043457
coalition 0.68001556396484

In [19]:
# Print the neighbours of 'earth' in MA and then AL.
compare('earth')

MA
planet 0.7850744724273682
species 0.5984479188919067
oceans 0.5953711271286011
soil 0.5706795454025269
trees 0.5625171661376953
brink 0.5569252967834473
world 0.5553345680236816
continent 0.553207278251648
surface 0.5431884527206421
mars 0.5381508469581604
moon 0.5334579944610596
endangered 0.5331406593322754
occupation 0.5209196209907532
forests 0.5204536318778992
rise 0.51918625831604
volcano 0.518470287322998
planets 0.5156018733978271
flesh 0.5155590176582336
antarctica 0.5152111649513245
arctic 0.5121740102767944

AL
planet 0.7479363679885864
darkness 0.5628431439399719
heaven 0.5573424100875854
world 0.5498610734939575
wrath 0.5298027992248535
purpose 0.5209683179855347
christ 0.5110281705856323
god 0.5087047219276428
walls 0.5073636174201965
plague 0.5038360357284546
enemy 0.49878764152526855
evil 0.49704599380493164
one 0.49643099308013916
the 0.49623388051986694
land 0.4949355125427246
israel 0.4943498373031616
destruction 0.49155545234680176
nature 0.48888880014419556
egyp

In [20]:
# Print the neighbours of 'property' in MA and then AL.
compare('property')

MA
properties 0.6026852130889893
acre 0.5590474605560303
unit 0.5498247742652893
broker 0.5492644906044006
units 0.5436503887176514
listing 0.541427731513977
condo 0.5365622043609619
entities 0.5304471850395203
acres 0.5216138958930969
vehicle 0.5210677981376648
tenant 0.5144335031509399
homes 0.51262366771698
wealth 0.5089935660362244
landlords 0.5061561465263367
fee 0.49798768758773804
tax 0.4955253303050995
townhouse 0.49433666467666626
fha 0.48944324254989624
theft 0.4881579577922821
apts 0.485747754573822

AL
homes 0.7195132970809937
properties 0.7113392353057861
construction 0.6297256946563721
vehicles 0.6280474662780762
buildings 0.6272759437561035
prisons 0.6253973245620728
company 0.6161502003669739
assets 0.6155925989151001
materials 0.6155424118041992
ownership 0.6120445728302002
equipment 0.6115008592605591
carriers 0.6105365753173828
government 0.6098369359970093
housing 0.6048632860183716
businesses 0.6036931276321411
residence 0.6015585660934448
employment 0.599377095699

In [21]:
# Print the neighbours of 'future' in MA and then AL.
compare('future')

MA
workforce 0.6082973480224609
industry 0.598998486995697
ramifications 0.5942939519882202
ecosystem 0.5935578346252441
environment 0.5877271890640259
importance 0.5829315185546875
evolution 0.5780459046363831
trajectory 0.5735111832618713
implications 0.5703089833259583
impact 0.5670745968818665
midst 0.5619704127311707
world 0.5604044198989868
convergence 0.5602703094482422
innovation 0.5563632249832153
economy 0.5548859238624573
role 0.5532742738723755
disruption 0.5531015396118164
careers 0.5417039394378662
complexity 0.5396655201911926
transformation 0.5382096767425537

AL
presence 0.5954444408416748
purpose 0.5791889429092407
world 0.5680302381515503
life 0.5492112636566162
success 0.5473106503486633
profession 0.5458510518074036
journey 0.5428240299224854
greatness 0.5380560159683228
career 0.5335182547569275
wealth 0.529577374458313
path 0.5270237922668457
confidence 0.5233656764030457
process 0.5228111743927002
abilities 0.5212066769599915
vision 0.518271803855896
business 0.

In [22]:
# Print the neighbours of 'success' in MA and then AL.
compare('success')

MA
roi 0.6661114692687988
successes 0.6624370813369751
achieving 0.6609929800033569
simplicity 0.654789924621582
effectiveness 0.65130215883255
growth 0.6444323062896729
wealth 0.6381429433822632
resilience 0.6357244253158569
relevance 0.6310155987739563
execution 0.6289172172546387
maximizing 0.6284192204475403
leadership 0.6241458654403687
measurable 0.6240534782409668
successful 0.6210318207740784
confidence 0.6210116744041443
longevity 0.6187664270401001
#digitaltransformation 0.618420422077179
#customerexperience 0.6165258884429932
branding 0.6161686182022095
mastery 0.6153881549835205

AL
goal 0.6438810229301453
wealth 0.6422476768493652
creativity 0.6396653652191162
failure 0.6210229396820068
achieving 0.6116781234741211
strength 0.6112919449806213
growth 0.6072525978088379
improving 0.6066861152648926
skill 0.6057469844818115
experience 0.6048508882522583
confidence 0.6045737266540527
journey 0.6042675971984863
ability 0.6025824546813965
purpose 0.6005719900131226
greatness 0.5