In [26]:
from gensim.models import Word2Vec

In [27]:
bos = Word2Vec.load('../models/boston.bin')

In [28]:
sf = Word2Vec.load('../models/sf.bin')

In [29]:
bos.wv.most_similar(['money'])

[('cash', 0.706450343132019),
 ('taxes', 0.6839010715484619),
 ('billions', 0.6621408462524414),
 ('dollars', 0.6601486206054688),
 ('millions', 0.648537814617157),
 ('funds', 0.6466509103775024),
 ('debt', 0.6296778321266174),
 ('taxpayers', 0.6152437329292297),
 ('attention', 0.610807478427887),
 ('them', 0.607587456703186)]

In [30]:
sf.wv.most_similar(['money'])

[('cash', 0.7404167056083679),
 ('billions', 0.6718142032623291),
 ('taxes', 0.6593451499938965),
 ('dollars', 0.6488584876060486),
 ('millions', 0.637386679649353),
 ('funds', 0.6361412405967712),
 ('debt', 0.6185427904129028),
 ('profits', 0.6088622212409973),
 ('profit', 0.5983096361160278),
 ('assets', 0.5972768664360046)]

In [31]:
from wordfreq import top_n_list

In [32]:
words = set(top_n_list('en', 1000))

In [33]:
bos_vocab = set(bos.wv.vocab.keys())

In [34]:
sf_vocab = set(sf.wv.vocab.keys())

In [35]:
vocab = set.intersection(bos_vocab, sf_vocab, words)

In [36]:
len(vocab)

968

In [37]:
import pandas as pd

In [38]:
pd.set_option('display.max_rows', 1000)

In [39]:
data = []
for word in vocab:
    
    bos_topn = bos.wv.most_similar([word], topn=20)
    sf_topn = sf.wv.most_similar([word], topn=20)
    
    bos_words = set([w for w, _ in bos_topn])
    sf_words = set([w for w, _ in sf_topn])
    
    jacc = len(bos_words.intersection(sf_words)) / len(bos_words.union(sf_words))
    
    data.append((word, jacc))

In [40]:
df = pd.DataFrame(data, columns=('word', 'jaccard'))

In [41]:
df.sort_values('jaccard').head(100)

Unnamed: 0,word,jaccard
878,0000,0.0
492,air,0.025641
46,model,0.025641
965,al,0.025641
901,size,0.052632
448,daily,0.052632
906,park,0.052632
95,george,0.052632
638,king,0.052632
303,style,0.052632


In [43]:
def compare(word):
    print('Boston')
    for w, d in bos.wv.most_similar([word], topn=20):
        print(w, d)
    print('\nSan Francisco')
    for w, d in sf.wv.most_similar([word], topn=20):
        print(w, d)

In [60]:
for word in df.sort_values('jaccard').head(100)['word']:
    print('\n', word)
    compare(word)
    print('---------')


 0000
Boston
#malottery 0.8216317296028137
#makeno 0.8191916942596436
063 0.79896479845047
rnskov 0.7981466054916382
010 0.7883666753768921
684 0.7879969477653503
971 0.7824335098266602
506 0.7820887565612793
717 0.7778519988059998
584 0.7770617008209229
661 0.7770025134086609
587 0.7767542600631714
573 0.7737782001495361
#entsports 0.7718541622161865
pukul 0.7713130712509155
075 0.7700620293617249
55555 0.769437849521637
372 0.7680801153182983
471 0.7663791179656982
493 0.7650988101959229

San Francisco
26s 0.7848165035247803
@molto 0.7827160954475403
@pl4news 0.7808850407600403
594 0.7801806330680847
@saudinews50 0.7616879940032959
@soyguu4 0.7580990791320801
@e3aziz 0.7571420073509216
36s 0.7565902471542358
@sahil 0.7564978003501892
697 0.7555118799209595
#maryvoto 0.7553219795227051
elhigawy 0.7548400163650513
vxu17 0.7526546716690063
51m 0.7513240575790405
@rachelbrom 0.7510630488395691
nikooo 0.7507759928703308
@yunahurutsuki 0.7491456270217896
@dcwastaken 0.7488631010055542
@sa

city 0.6179165244102478
property 0.5911345481872559
range 0.5745447874069214
electricity 0.5638565421104431
forests 0.5541895627975464
cities 0.5525177717208862
collapse 0.5471006035804749
territory 0.5427159070968628
habitat 0.5365992784500122
farm 0.5347812175750732
homes 0.5339690446853638
extinction 0.532280445098877
country 0.5315366983413696
tunnels 0.5312346816062927
houses 0.5300989151000977
state 0.5248299241065979
housing 0.5189083814620972
buildings 0.5181988477706909
fossil 0.5173025131225586
authority 0.5158241987228394
---------

 performance
Boston
production 0.6501346230506897
performances 0.6118725538253784
career 0.6062182188034058
presentation 0.5921624302864075
selection 0.5716707706451416
competition 0.5564935207366943
performer 0.5525826215744019
experience 0.5464810132980347
masterclass 0.5431725978851318
achievement 0.5328694581985474
stage 0.5326017141342163
scoring 0.527654230594635
velocity 0.5262007713317871
success 0.5256083011627197
progression 0.524202108

systems 0.4980274438858032
racial 0.4977461099624634
consumption 0.49668651819229126
anti 0.49571719765663147
displacement 0.4951900243759155

San Francisco
labeling 0.5807060599327087
domestic 0.5764105319976807
heavily 0.552498996257782
primarily 0.5453433990478516
limiting 0.5439968109130859
targeting 0.5350682139396667
eg 0.5343496799468994
inherently 0.5321123600006104
discrimination 0.5279086232185364
traditional 0.5271385908126831
cultural 0.5267617106437683
basic 0.5263444185256958
etc 0.5254875421524048
reinforcing 0.525192141532898
binary 0.5221467018127441
male 0.5207228660583496
shoddy 0.5204254388809204
groups 0.5185104608535767
mostly 0.5172061324119568
passive 0.5161009430885315
---------

 related
Boston
linked 0.7256054878234863
affecting 0.6444903612136841
relating 0.6324243545532227
similar 0.631294846534729
attributed 0.618959903717041
traumatic 0.6070696115493774
involving 0.5834884643554688
regarding 0.5719761848449707
specific 0.5546267628669739
compromising 0.54

towns 0.5843605995178223
church 0.5842469930648804
sf 0.5680181384086609
downtown 0.559093713760376
@sfcity 0.543919563293457
streets 0.5430628061294556
suburb 0.533321738243103
hotel 0.5297962427139282
paradise 0.5294846296310425
party 0.5285468101501465
nyc 0.5273250341415405
tourist 0.5256274938583374
country 0.5215440988540649
residence 0.5203524827957153
socal 0.5192265510559082
basement 0.5172737836837769
boat 0.5167876482009888
---------

 0
Boston
div 0.6827199459075928
5b 0.6714518070220947
1 0.6611159443855286
rout 0.6387614011764526
94 0.629062294960022
2 0.6173661947250366
6b 0.6148043870925903
1q 0.5976235866546631
5m 0.5888688564300537
pt 0.5864851474761963
105 0.5849291086196899
pp 0.5847907066345215
splatoon 0.5843316912651062
nil 0.5841560959815979
q3 0.5833656191825867
sog 0.5826030969619751
timeout 0.5787252187728882
@bostonlaxnet 0.5787181854248047
9m 0.5766912698745728
rbis 0.5755777955055237

San Francisco
v1 0.6990424990653992
v0 0.6879900097846985
5x 0.664968073

definitive 0.551648736000061
master 0.5197420716285706
total 0.4872318208217621
binding 0.4813922345638275
full 0.47877904772758484
final 0.46720090508461
comprehensive 0.4611433744430542
collector 0.459723562002182
ultimate 0.4529586434364319
disaster 0.4508955776691437
practical 0.4493893086910248
monstrous 0.44160962104797363
essential 0.44123375415802
manual 0.4404970407485962
recurring 0.4393296241760254
survival 0.4359338879585266
powerful 0.43463027477264404
execution 0.4333978295326233
guides 0.4271375834941864
creation 0.42528021335601807
---------

 act
Boston
acting 0.62640380859375
acts 0.607728123664856
feel 0.5635513067245483
sound 0.5567253828048706
treat 0.5494229793548584
behave 0.5429298877716064
move 0.5162726640701294
defend 0.501920759677887
oppression 0.49970683455467224
ideology 0.49880605936050415
rhetoric 0.49609309434890747
establishment 0.4955988824367523
serve 0.4941655993461609
pretend 0.49259674549102783
discriminate 0.48836857080459595
fight 0.48379188776

circumstances 0.5473394393920898
intent 0.545245885848999
responses 0.5316944122314453
context 0.5305548906326294
form 0.5302412509918213
consent 0.5231456756591797
correct 0.5225217938423157
reporting 0.516541600227356
terms 0.5155776739120483
actions 0.5155678987503052
validity 0.5152487754821777
moral 0.5114071369171143
reasoning 0.507281482219696
objective 0.5065199136734009
integrity 0.5060268640518188
document 0.5057175159454346
principle 0.5056890845298767
consensus 0.5025424957275391
structure 0.4971768260002136
bias 0.49595195055007935
---------

 economic
Boston
environmental 0.7874346971511841
uncertainty 0.7621890306472778
declining 0.7594515681266785
inequality 0.7566649913787842
sector 0.7453415989875793
institutional 0.7425094842910767
increasing 0.7330743670463562
regulatory 0.7315494418144226
#climatechange 0.726742148399353
technological 0.7221850752830505
growth 0.7169259786605835
disruption 0.7145304083824158
infrastructure 0.7141036987304688
globalization 0.7137683

loom 0.5965268611907959
durable 0.5717490911483765
horizontal 0.5712660551071167
smaller 0.5679486989974976
complex 0.5613641142845154
micro 0.5556274652481079
thermal 0.5536093711853027
concrete 0.5434786081314087
huge 0.5428553819656372
distinct 0.537462592124939
vertical 0.5362114906311035
tribal 0.5321404337882996
medium 0.5306832790374756
quantities 0.5290963053703308
significant 0.5279491543769836
massive 0.5255883932113647
synthetic 0.5222872495651245
carbon 0.5205459594726562
---------

 mark
Boston
shania 0.5417073965072632
jonathan 0.5122659802436829
matthew 0.507895827293396
henry 0.5013517141342163
bishop 0.5012912750244141
greg 0.48637861013412476
gary 0.4739862382411957
author 0.47107547521591187
mike 0.46986454725265503
brian 0.46819227933883667
christopher 0.46798139810562134
ceo 0.46441468596458435
corey 0.4624536335468292
michael 0.4602887034416199
richard 0.4584369361400604
eric 0.4568008482456207
founder 0.4545097351074219
willie 0.45449304580688477
williams 0.45149

interventions 0.6534116864204407
curriculum 0.6514061689376831

San Francisco
study 0.7546467781066895
genetics 0.7214548587799072
survey 0.7154235243797302
biology 0.7032018899917603
findings 0.6995872259140015
genomic 0.6855931282043457
studies 0.6789861917495728
evaluation 0.6724969744682312
scientific 0.6679754853248596
breakthrough 0.6678256988525391
imaging 0.6625401973724365
technology 0.6610836982727051
behavioral 0.6604061126708984
genetic 0.6600304245948792
neuroscience 0.6562884449958801
molecular 0.6562504768371582
analysis 0.6488516926765442
researcher 0.6441352367401123
genomics 0.6351022720336914
biological 0.6321078538894653
---------


In [61]:
compare('space')

Boston
spaces 0.7127925157546997
buildings 0.6166949272155762
room 0.611009955406189
storage 0.6034278273582458
amenities 0.5898017883300781
energy 0.5822328329086304
rooms 0.5797927379608154
equipment 0.5628229975700378
garage 0.5570880174636841
exterior 0.5557072162628174
capacity 0.5548709034919739
apartment 0.5528761148452759
place 0.549321711063385
electricity 0.5478377342224121
electronics 0.5475473403930664
fabric 0.5443395376205444
sensors 0.5433973670005798
vehicles 0.5416244864463806
sunlight 0.5394332408905029
transport 0.5371584296226501

San Francisco
spaces 0.6850579977035522
satellites 0.5531437397003174
mission 0.5527688264846802
outer 0.5495072603225708
storage 0.5470969676971436
light 0.5430710315704346
astronauts 0.5308605432510376
room 0.5255872011184692
rooms 0.5151591300964355
telescope 0.5127924680709839
area 0.5123414993286133
transit 0.5113630294799805
spacecraft 0.5008309483528137
power 0.49769437313079834
vcloud 0.4969034790992737
drones 0.49267855286598206
s

In [62]:
compare('team')

Boston
teams 0.7349029183387756
player 0.6952635645866394
franchise 0.6867581605911255
division 0.6672658920288086
organization 0.6663256883621216
players 0.6537505388259888
roster 0.6509213447570801
game 0.638468861579895
league 0.6359627842903137
teammates 0.6356001496315002
qb 0.6275639533996582
base 0.6260645389556885
talent 0.6141942143440247
offense 0.6122673749923706
staff 0.6101247072219849
goal 0.603513240814209
tribe 0.6013587713241577
company 0.6012178063392639
position 0.5998313426971436
country 0.5931286215782166

San Francisco
organization 0.7090767621994019
teams 0.668716311454773
partner 0.652695894241333
community 0.6482599973678589
client 0.6288882493972778
company 0.6168371438980103
staff 0.6124565005302429
opponent 0.5941076874732971
crew 0.5878932476043701
position 0.5779956579208374
program 0.5744198560714722
platform 0.5731533765792847
audience 0.57221919298172
career 0.5702283382415771
network 0.5686354041099548
talent 0.5683847665786743
product 0.56074726581573

In [64]:
compare('original')

Boston
animated 0.6296606063842773
exquisite 0.6011489629745483
beatles 0.6007251143455505
avengers 0.579468309879303
edited 0.569148063659668
noir 0.5676820278167725
abstract 0.5638761520385742
iconic 0.55681312084198
genesis 0.5531405210494995
origin 0.5436378121376038
arc 0.5365482568740845
cd 0.5290073156356812
marvel 0.5269842147827148
musical 0.5260090827941895
infamous 0.5239936113357544
explicit 0.5230308771133423
acoustic 0.5228502750396729
obscure 0.5220116972923279
theme 0.5208238959312439
vinyl 0.5142929553985596

San Francisco
nes 0.6052721738815308
beatles 0.603911280632019
soundtrack 0.5788899660110474
classic 0.575417697429657
illustrated 0.5707852840423584
unreleased 0.5530076622962952
framed 0.547962486743927
poster 0.5475499629974365
art 0.5387270450592041
manga 0.5321853160858154
animated 0.5301929116249084
remastered 0.5201306343078613
1984 0.5195710062980652
deluxe 0.5183162093162537
1974 0.5173275470733643
80s 0.5171765685081482
vinyl 0.5167392492294312
remake 0.

In [63]:
compare('shot')

Boston
shots 0.694328784942627
layup 0.6762422323226929
possession 0.6700683832168579
shooting 0.6353663206100464
penalty 0.6223310232162476
buzzer 0.5987129211425781
rebound 0.5935577154159546
foul 0.5929661989212036
rim 0.5929230451583862
timeout 0.589382529258728
shoots 0.5819135904312134
fastball 0.5727157592773438
bat 0.5696253776550293
game 0.5637671947479248
ball 0.5627437829971313
rifle 0.5600715279579163
breakaway 0.5574859976768494
goal 0.5504113435745239
bullets 0.5476298332214355
pistol 0.5399275422096252

San Francisco
shots 0.6442517638206482
shootout 0.6164708733558655
shooting 0.5918394327163696
shoots 0.5855886936187744
picture 0.5672692060470581
wounded 0.5631301403045654
possession 0.5614088177680969
injured 0.561119794845581
night 0.5569995641708374
pic 0.5542809963226318
knife 0.5529394149780273
photo 0.5509947538375854
robbed 0.5363732576370239
thief 0.5349283218383789
stabbed 0.5255791544914246
scene 0.5243826508522034
shoot 0.5219007730484009
struck 0.5173529386

In [65]:
compare('capital')

Boston
holdings 0.811380922794342
decreased 0.7312884330749512
shareholder 0.7207496166229248
corp 0.720212996006012
pharmaceuticals 0.7181622982025146
advisors 0.7139804363250732
boosted 0.7127933502197266
ltd 0.7065377235412598
trimmed 0.7011086940765381
lowered 0.6967998743057251
nasdaq 0.6917380094528198
plc 0.6840825080871582
valuation 0.6839246153831482
sector 0.6804944276809692
nyse 0.6741302013397217
investments 0.6728320121765137
bancorp 0.6690026521682739
investment 0.6653239727020264
biotech 0.6641702651977539
corporation 0.6638432145118713

San Francisco
financing 0.7117353677749634
equity 0.6867775321006775
capitalists 0.6722977161407471
venture 0.6668517589569092
ventures 0.6553739905357361
fintech 0.6542747616767883
fund 0.6522723436355591
funds 0.6442632675170898
investment 0.6415426731109619
sector 0.6396785974502563
financial 0.6373595595359802
investments 0.6340592503547668
blockchain 0.6267150640487671
regulator 0.6231181621551514
wealth 0.6183961629867554
vc 0.6128