In [1]:
from gensim.models import Word2Vec

In [2]:
bos = Word2Vec.load('../models/boston.bin')

In [3]:
sf = Word2Vec.load('../models/sf.bin')

In [4]:
bos.wv.most_similar(['money'])

[('cash', 0.706450343132019),
 ('taxes', 0.6839010715484619),
 ('billions', 0.6621408462524414),
 ('dollars', 0.6601486206054688),
 ('millions', 0.648537814617157),
 ('funds', 0.6466509103775024),
 ('debt', 0.6296778321266174),
 ('taxpayers', 0.6152437329292297),
 ('attention', 0.610807478427887),
 ('them', 0.607587456703186)]

In [5]:
sf.wv.most_similar(['money'])

[('cash', 0.7404167056083679),
 ('billions', 0.6718142032623291),
 ('taxes', 0.6593451499938965),
 ('dollars', 0.6488584876060486),
 ('millions', 0.637386679649353),
 ('funds', 0.6361412405967712),
 ('debt', 0.6185427904129028),
 ('profits', 0.6088622212409973),
 ('profit', 0.5983096361160278),
 ('assets', 0.5972768664360046)]

In [6]:
from wordfreq import top_n_list

In [7]:
words = set(top_n_list('en', 1000))

In [8]:
bos_vocab = set(bos.wv.vocab.keys())

In [9]:
sf_vocab = set(sf.wv.vocab.keys())

In [10]:
vocab = set.intersection(bos_vocab, sf_vocab, words)

In [11]:
len(vocab)

968

In [12]:
import pandas as pd

In [13]:
pd.set_option('display.max_rows', 1000)

In [14]:
data = []
for word in vocab:
    
    bos_topn = bos.wv.most_similar([word], topn=20)
    sf_topn = sf.wv.most_similar([word], topn=20)
    
    bos_words = set([w for w, _ in bos_topn])
    sf_words = set([w for w, _ in sf_topn])
    
    jacc = len(bos_words.intersection(sf_words)) / len(bos_words.union(sf_words))
    
    data.append((word, jacc))

In [15]:
df = pd.DataFrame(data, columns=('word', 'jaccard'))

In [16]:
df.sort_values('jaccard').head(100)

Unnamed: 0,word,jaccard
878,0000,0.0
718,model,0.025641
215,al,0.025641
54,air,0.025641
807,style,0.052632
674,george,0.052632
824,king,0.052632
485,dr,0.052632
780,park,0.052632
935,daily,0.052632


In [17]:
def compare(word):
    print('Boston')
    for w, d in bos.wv.most_similar([word], topn=20):
        print(w, d)
    print('\nSan Francisco')
    for w, d in sf.wv.most_similar([word], topn=20):
        print(w, d)

In [18]:
compare('capital')

Boston
holdings 0.811380922794342
decreased 0.7312884330749512
shareholder 0.7207496166229248
corp 0.720212996006012
pharmaceuticals 0.7181622982025146
advisors 0.7139804363250732
boosted 0.7127933502197266
ltd 0.7065377235412598
trimmed 0.7011086940765381
lowered 0.6967998743057251
nasdaq 0.6917380094528198
plc 0.6840825080871582
valuation 0.6839246153831482
sector 0.6804944276809692
nyse 0.6741302013397217
investments 0.6728320121765137
bancorp 0.6690026521682739
investment 0.6653239727020264
biotech 0.6641702651977539
corporation 0.6638432145118713

San Francisco
financing 0.7117353677749634
equity 0.6867775321006775
capitalists 0.6722977161407471
venture 0.6668517589569092
ventures 0.6553739905357361
fintech 0.6542747616767883
fund 0.6522723436355591
funds 0.6442632675170898
investment 0.6415426731109619
sector 0.6396785974502563
financial 0.6373595595359802
investments 0.6340592503547668
blockchain 0.6267150640487671
regulator 0.6231181621551514
wealth 0.6183961629867554
vc 0.6128

In [19]:
compare('shot')

Boston
shots 0.694328784942627
layup 0.6762422323226929
possession 0.6700683832168579
shooting 0.6353663206100464
penalty 0.6223310232162476
buzzer 0.5987129211425781
rebound 0.5935577154159546
foul 0.5929661989212036
rim 0.5929230451583862
timeout 0.589382529258728
shoots 0.5819135904312134
fastball 0.5727157592773438
bat 0.5696253776550293
game 0.5637671947479248
ball 0.5627437829971313
rifle 0.5600715279579163
breakaway 0.5574859976768494
goal 0.5504113435745239
bullets 0.5476298332214355
pistol 0.5399275422096252

San Francisco
shots 0.6442517638206482
shootout 0.6164708733558655
shooting 0.5918394327163696
shoots 0.5855886936187744
picture 0.5672692060470581
wounded 0.5631301403045654
possession 0.5614088177680969
injured 0.561119794845581
night 0.5569995641708374
pic 0.5542809963226318
knife 0.5529394149780273
photo 0.5509947538375854
robbed 0.5363732576370239
thief 0.5349283218383789
stabbed 0.5255791544914246
scene 0.5243826508522034
shoot 0.5219007730484009
struck 0.5173529386

In [20]:
compare('team')

Boston
teams 0.7349029183387756
player 0.6952635645866394
franchise 0.6867581605911255
division 0.6672658920288086
organization 0.6663256883621216
players 0.6537505388259888
roster 0.6509213447570801
game 0.638468861579895
league 0.6359627842903137
teammates 0.6356001496315002
qb 0.6275639533996582
base 0.6260645389556885
talent 0.6141942143440247
offense 0.6122673749923706
staff 0.6101247072219849
goal 0.603513240814209
tribe 0.6013587713241577
company 0.6012178063392639
position 0.5998313426971436
country 0.5931286215782166

San Francisco
organization 0.7090767621994019
teams 0.668716311454773
partner 0.652695894241333
community 0.6482599973678589
client 0.6288882493972778
company 0.6168371438980103
staff 0.6124565005302429
opponent 0.5941076874732971
crew 0.5878932476043701
position 0.5779956579208374
program 0.5744198560714722
platform 0.5731533765792847
audience 0.57221919298172
career 0.5702283382415771
network 0.5686354041099548
talent 0.5683847665786743
product 0.56074726581573

In [21]:
compare('space')

Boston
spaces 0.7127925157546997
buildings 0.6166949272155762
room 0.611009955406189
storage 0.6034278273582458
amenities 0.5898017883300781
energy 0.5822328329086304
rooms 0.5797927379608154
equipment 0.5628229975700378
garage 0.5570880174636841
exterior 0.5557072162628174
capacity 0.5548709034919739
apartment 0.5528761148452759
place 0.549321711063385
electricity 0.5478377342224121
electronics 0.5475473403930664
fabric 0.5443395376205444
sensors 0.5433973670005798
vehicles 0.5416244864463806
sunlight 0.5394332408905029
transport 0.5371584296226501

San Francisco
spaces 0.6850579977035522
satellites 0.5531437397003174
mission 0.5527688264846802
outer 0.5495072603225708
storage 0.5470969676971436
light 0.5430710315704346
astronauts 0.5308605432510376
room 0.5255872011184692
rooms 0.5151591300964355
telescope 0.5127924680709839
area 0.5123414993286133
transit 0.5113630294799805
spacecraft 0.5008309483528137
power 0.49769437313079834
vcloud 0.4969034790992737
drones 0.49267855286598206
s