In [1]:
from corpus import StateCorpus

In [2]:
corpus = StateCorpus('../data/AL-MA/')

In [3]:
al_counts = corpus.word_counts('AL')

In [8]:
al_counts['boys']

120590

In [18]:
ma_counts = corpus.word_counts('MA')

In [10]:
from gensim.models import Word2Vec

In [11]:
al = Word2Vec.load('../models/AL.bin')

In [12]:
ma = Word2Vec.load('../models/MA.bin')

In [17]:
al.wv.most_similar(['boys'])

[('oo', 0.5985551476478577),
 ('stack', 0.4874264597892761),
 ('endzone', 0.45280131697654724),
 ('lizzy', 0.4360118508338928),
 ('thin', 0.41837841272354126),
 ('town', 0.41499826312065125),
 ('trenches', 0.41488319635391235),
 ('brining', 0.4122316837310791),
 ('violets', 0.4024427533149719),
 ('redwoods', 0.39452260732650757)]

In [16]:
ma.wv.most_similar(['boys'])

[('girls', 0.8680894374847412),
 ('varsity', 0.6474319696426392),
 ('softball', 0.6404802799224854),
 ('kids', 0.6369673609733582),
 ('ladies', 0.6289759278297424),
 ('volleyball', 0.6269707679748535),
 ('hoop', 0.6260010004043579),
 ('jv', 0.6162349581718445),
 ('bball', 0.6102374196052551),
 ('tennis', 0.6093944311141968)]

In [42]:
from wordfreq import top_n_list

In [43]:
words = set(top_n_list('en', 1000))

In [44]:
ma_vocab = set(ma.wv.vocab.keys())

In [45]:
al_vocab = set(al.wv.vocab.keys())

In [46]:
vocab = set.intersection(ma_vocab, al_vocab, words)

In [47]:
len(vocab)

968

In [48]:
import pandas as pd

In [49]:
pd.set_option('display.max_rows', 1000)

In [50]:
data = []
for word in vocab:
    
    ma_topn = ma.wv.most_similar([word], topn=20)
    al_topn = al.wv.most_similar([word], topn=20)
    
    ma_words = set([w for w, _ in ma_topn])
    al_words = set([w for w, _ in al_topn])
    
    jacc = len(ma_words.intersection(al_words)) / len(ma_words.union(al_words))
    
    data.append((word, jacc))

In [51]:
df = pd.DataFrame(data, columns=('word', 'jaccard'))

In [52]:
df.sort_values('jaccard').head(100)

Unnamed: 0,word,jaccard
764,0000,0.0
917,gone,0.0
857,dr,0.0
168,boys,0.0
626,al,0.0
326,co,0.0
867,town,0.0
165,section,0.025641
63,daily,0.025641
705,central,0.025641


In [53]:
ma.wv.most_similar(['gone'], topn=20)

[('settled', 0.6982407569885254),
 ('flown', 0.6852797865867615),
 ('done', 0.6726028919219971),
 ('taken', 0.6571210026741028),
 ('moved', 0.642339825630188),
 ('eliminated', 0.6013427376747131),
 ('gotten', 0.5939329862594604),
 ('shifted', 0.5866950750350952),
 ('passed', 0.5809922218322754),
 ('thrown', 0.5778520107269287),
 ('consumed', 0.5778185725212097),
 ('disappeared', 0.5759103894233704),
 ('beaten', 0.5742921233177185),
 ('doubled', 0.5736864805221558),
 ('awoken', 0.5695406198501587),
 ('dealt', 0.5650726556777954),
 ('risen', 0.5639537572860718),
 ('cleared', 0.5638896226882935),
 ('sprung', 0.5602065324783325),
 ('shown', 0.5599979162216187)]

In [54]:
al.wv.most_similar(['gone'], topn=20)

[('gon', 0.8416994214057922),
 ('gonna', 0.8148479461669922),
 ('finna', 0.7731435298919678),
 ('tryna', 0.7458698153495789),
 ('gotta', 0.7425788640975952),
 ('ima', 0.6913616061210632),
 ('ll', 0.6889411807060242),
 ('going', 0.6753376126289368),
 ('imma', 0.6539572477340698),
 ('trynna', 0.6297451257705688),
 ('goin', 0.6278444528579712),
 ('will', 0.6153505444526672),
 ('might', 0.6103156805038452),
 ('wanna', 0.6055372953414917),
 ('tryin', 0.602297306060791),
 ('gunna', 0.5976340770721436),
 ('supposed', 0.5942718982696533),
 ('could', 0.5873080492019653),
 ('dont', 0.5803220272064209),
 ('bout', 0.5773816108703613)]

In [55]:
ma.wv.most_similar(['town'], topn=20)

[('city', 0.8087241649627686),
 ('neighborhood', 0.7586129903793335),
 ('village', 0.6459503769874573),
 ('district', 0.6052602529525757),
 ('southie', 0.6028234958648682),
 ('courthouse', 0.5918200612068176),
 ('downtown', 0.5885204076766968),
 ('area', 0.5857598781585693),
 ('maine', 0.5819447636604309),
 ('residence', 0.5783712863922119),
 ('mattapan', 0.5770481824874878),
 ('delaware', 0.5758880972862244),
 ('castle', 0.573813796043396),
 ('church', 0.5717982053756714),
 ('office', 0.5701369047164917),
 ('selectmen', 0.5662739276885986),
 ('state', 0.566063642501831),
 ('library', 0.5628957748413086),
 ('plains', 0.5617846846580505),
 ('revitalization', 0.5561991930007935)]

In [58]:
al.wv.most_similar(['boys'], topn=20)

[('oo', 0.5985551476478577),
 ('stack', 0.4874264597892761),
 ('endzone', 0.45280131697654724),
 ('lizzy', 0.4360118508338928),
 ('thin', 0.41837841272354126),
 ('town', 0.41499826312065125),
 ('trenches', 0.41488319635391235),
 ('brining', 0.4122316837310791),
 ('violets', 0.4024427533149719),
 ('redwoods', 0.39452260732650757),
 ('meantime', 0.3823511600494385),
 ('columns', 0.3809135854244232),
 ('rarest', 0.380619615316391),
 ('12s', 0.3793204128742218),
 ('stabbers', 0.3784700632095337),
 ('backstreets', 0.3729916214942932),
 ('hallway', 0.37096071243286133),
 ('holes', 0.3703182339668274),
 ('#supportoursis', 0.3677912950515747),
 ('steeping', 0.3655661940574646)]