In [1]:
import gensim
import json

# Load Corpus and Train Model

In [2]:
# Path of the corpus file that is opened and JSON-decoded when loading the tokens.
INFILE = "tokenized_corpus.json"

In [3]:
# Load the tokenized corpus from INFILE.
# The encoding is pinned to UTF-8 (the standard encoding for JSON) so the result
# does not depend on the platform's default locale encoding.
with open(INFILE, "r", encoding="utf-8") as infile:
    # Only the first line is parsed, so the JSON document must sit on a single line.
    tokens = json.loads(infile.readline())

In [4]:
# Hyperparameters are fixed here; the corpus itself is supplied in the two
# explicit steps below (vocabulary pass, then training pass).
model = gensim.models.Word2Vec(min_count=2, window=10)

# First pass over the corpus: build the vocabulary.
model.build_vocab(corpus_iterable=tokens)

# Second pass: train, reusing the example count and epoch setting stored on the model.
# Kept as the cell's last expression so the value returned by train() is displayed.
model.train(
    corpus_iterable=tokens,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(6928126, 8518270)

# 1. Exploration of Corpus Through the Model

## 1.1 Words Similar to Keywords (`for`, `if`...)

In [10]:
# Ten closest tokens to the `for` keyword in the trained embedding.
model.wv.most_similar(positive=['for'], topn=10)

[('case', 0.5832850337028503),
 ('userisdefined', 0.4873828887939453),
 ('x_height', 0.48287323117256165),
 ('results', 0.48111656308174133),
 ('isusedby', 0.4804070293903351),
 ('is_base', 0.47911837697029114),
 ('tool_child', 0.47467532753944397),
 ('cancast', 0.4686744809150696),
 ('range', 0.46649882197380066),
 ('zip', 0.4658733606338501)]

In [11]:
# Ten closest tokens to the `if` keyword in the trained embedding.
model.wv.most_similar(positive=['if'], topn=10)

[('elif', 0.7684581279754639),
 ('func_meth', 0.6381980776786804),
 ('else', 0.6246802806854248),
 ('outin', 0.6098721027374268),
 ('features_to', 0.6000626683235168),
 ('tok', 0.5956555008888245),
 ('continue', 0.5858328938484192),
 ('warn_external', 0.5808218717575073),
 ('iscode', 0.5804414749145508),
 ('branches', 0.5699276328086853)]

In [12]:
# Ten closest tokens to the `def` keyword in the trained embedding.
model.wv.most_similar(positive=['def'], topn=10)

[('class', 0.5676499009132385),
 ('mth', 0.5496786236763),
 ('testgauss', 0.5416885018348694),
 ('orthogonality', 0.5096248984336853),
 ('schar', 0.48761487007141113),
 ('with_wrap', 0.4817056357860565),
 ('self', 0.48054298758506775),
 ('test_fft', 0.47273823618888855),
 ('a_del', 0.46983879804611206),
 ('vendor_id', 0.4671586751937866)]

In [13]:
# Ten closest tokens to `self` in the trained embedding.
model.wv.most_similar(positive=['self'], topn=10)

[('super', 0.7258020639419556),
 ('setter', 0.6831666231155396),
 ('set_axes', 0.6730579137802124),
 ('oldopts', 0.6446407437324524),
 ('get_pad', 0.6311656832695007),
 ('coeffs', 0.6230141520500183),
 ('op_dict', 0.6202643513679504),
 ('inherited', 0.6167618036270142),
 ('get_minpos', 0.6099565029144287),
 ('height_arg', 0.60246741771698)]

## 1.2 Finding the Outlier

In [19]:
# Ask the model which of these type-like tokens fits least well with the others.
model.wv.doesnt_match(
    ['int', 'float', 'double', 'ndarray']
)

'ndarray'

In [20]:
# Ask the model which token is the odd one out among definition/class-related
# tokens and `else`.
model.wv.doesnt_match(
    ['def', 'class', 'self', 'super', 'else']
)

'else'

## 1.3 Exploring Vector Definitions

In [32]:
# What is numpy without ndarrays? Scipy! (see 5th result)
# also note: pallets (flask) in list
# NOTE(review): the query here is a raw difference vector rather than a token, and
# 'numpy' itself comes back as the top hit in the recorded output — presumably
# because gensim only filters query terms out of the results when they are passed
# as keys (e.g. via positive=/negative=); confirm against the gensim docs.
model.wv.most_similar(model.wv['numpy'] - model.wv['ndarray'])

[('numpy', 0.5947498083114624),
 ('needs_sphinx', 0.5481728315353394),
 ('doug', 0.534336268901825),
 ('fftpack', 0.5310518741607666),
 ('scipy', 0.5253139734268188),
 ('merges', 0.5177391171455383),
 ('september', 0.5167557001113892),
 ('packaging', 0.5018081665039062),
 ('pallets', 0.49546921253204346),
 ('reported', 0.4953809082508087)]

## 1.4 Most Common Tokens

In [34]:
# Show the first ten entries of the model's vocabulary list.
# NOTE(review): index_to_key is presumably ordered most-frequent-first (gensim's
# default vocabulary sort), which is what makes this a "most common" list — confirm.
y = model.wv.index_to_key
y[:10] # np is a really common modifier within the numpy codebase, too!

['the', 'self', 'np', 'if', 'def', 'in', 'is', 'of', 'to', 'for']