# Word2Vec Tutorial
Following https://rare-technologies.com/word2vec-tutorial/

In [2]:
import gensim, logging

In [3]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
sentences = [['first', 'sentence'], ['second', 'sentence']]

In [5]:
# train word2Vec on the 2 sentences
model = gensim.models.Word2Vec(sentences, min_count=1)

2020-04-28 19:07:59,580 : INFO : collecting all words and their counts
2020-04-28 19:07:59,581 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-28 19:07:59,581 : INFO : collected 3 word types from a corpus of 4 raw words and 2 sentences
2020-04-28 19:07:59,582 : INFO : Loading a fresh vocabulary
2020-04-28 19:07:59,582 : INFO : effective_min_count=1 retains 3 unique words (100% of original 3, drops 0)
2020-04-28 19:07:59,582 : INFO : effective_min_count=1 leaves 4 word corpus (100% of original 4, drops 0)
2020-04-28 19:07:59,583 : INFO : deleting the raw counts dictionary of 3 items
2020-04-28 19:07:59,583 : INFO : sample=0.001 downsamples 3 most-common words
2020-04-28 19:07:59,583 : INFO : downsampling leaves estimated 0 word corpus (5.7% of prior 4)
2020-04-28 19:07:59,584 : INFO : estimated required memory for 3 words and 100 dimensions: 3900 bytes
2020-04-28 19:07:59,584 : INFO : resetting layer weights
2020-04-28 19:07:59,585 : INFO : training mo

In [6]:
model.save('/tmp/mymodel')

2020-04-28 19:27:35,801 : INFO : saving Word2Vec object under /tmp/mymodel, separately None
2020-04-28 19:27:35,802 : INFO : not storing attribute vectors_norm
2020-04-28 19:27:35,803 : INFO : not storing attribute cum_table
2020-04-28 19:27:35,805 : INFO : saved /tmp/mymodel


In [7]:
new_model = gensim.models.Word2Vec.load('/tmp/mymodel')

2020-04-28 19:27:45,650 : INFO : loading Word2Vec object from /tmp/mymodel
2020-04-28 19:27:45,651 : INFO : loading wv recursively from /tmp/mymodel.wv.* with mmap=None
2020-04-28 19:27:45,652 : INFO : setting ignored attribute vectors_norm to None
2020-04-28 19:27:45,652 : INFO : loading vocabulary recursively from /tmp/mymodel.vocabulary.* with mmap=None
2020-04-28 19:27:45,653 : INFO : loading trainables recursively from /tmp/mymodel.trainables.* with mmap=None
2020-04-28 19:27:45,653 : INFO : setting ignored attribute cum_table to None
2020-04-28 19:27:45,653 : INFO : loaded /tmp/mymodel


In [8]:
model.similarity('first', 'second')

  """Entry point for launching an IPython kernel.


-0.069061086

In [9]:
model.similarity('first', 'sentence')

  """Entry point for launching an IPython kernel.


-0.0078044254

In [10]:
model['first']

  """Entry point for launching an IPython kernel.


array([-5.1557651e-04, -2.6061535e-03, -2.2290575e-03, -3.5140431e-05,
       -3.5001715e-03, -3.2925140e-03, -3.2704154e-03,  2.9731002e-03,
        2.8313864e-03,  1.7160014e-04,  3.0552052e-04, -4.1806176e-03,
       -2.5628280e-04, -2.7225162e-03,  2.9707816e-04,  8.3405717e-04,
       -9.7546814e-04, -3.9576604e-03, -1.8852463e-03, -8.5335545e-04,
       -4.1023158e-03,  1.4865092e-03,  1.8346042e-04,  4.3650451e-03,
        6.1661669e-04,  8.9580764e-04,  4.7157751e-03, -4.4615977e-03,
        1.1433013e-03, -4.9245879e-03, -2.5632298e-03, -2.8972991e-03,
        1.2104673e-03,  1.3076308e-03, -1.3947985e-04,  4.7516855e-03,
        3.5329117e-03,  3.4518333e-04,  3.5774116e-03, -4.3694666e-03,
       -4.0800171e-03,  3.5351724e-03,  3.4319721e-03, -2.6409563e-03,
       -3.5992175e-04,  4.8676943e-03,  1.4481074e-03, -4.6682209e-03,
       -2.6874815e-03,  9.9697325e-04,  7.8563811e-04, -1.0971050e-04,
       -1.8902803e-03,  3.3627409e-03, -4.3316414e-03,  4.4663106e-03,
      

## From gensim docs
https://radimrehurek.com/gensim/models/word2vec.html

In [12]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [13]:
model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)

2020-04-28 20:32:04,979 : INFO : collecting all words and their counts
2020-04-28 20:32:04,980 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-28 20:32:04,980 : INFO : collected 12 word types from a corpus of 29 raw words and 9 sentences
2020-04-28 20:32:04,981 : INFO : Loading a fresh vocabulary
2020-04-28 20:32:04,981 : INFO : effective_min_count=1 retains 12 unique words (100% of original 12, drops 0)
2020-04-28 20:32:04,982 : INFO : effective_min_count=1 leaves 29 word corpus (100% of original 29, drops 0)
2020-04-28 20:32:04,982 : INFO : deleting the raw counts dictionary of 12 items
2020-04-28 20:32:04,983 : INFO : sample=0.001 downsamples 12 most-common words
2020-04-28 20:32:04,983 : INFO : downsampling leaves estimated 3 word corpus (12.1% of prior 29)
2020-04-28 20:32:04,983 : INFO : estimated required memory for 12 words and 100 dimensions: 15600 bytes
2020-04-28 20:32:04,984 : INFO : resetting layer weights
2020-04-28 20:32:04,986 : INFO :

In [14]:
vector = model.wv['computer']

In [15]:
vector

array([-1.6001988e-03,  3.6201663e-03,  1.2796778e-03,  4.0890765e-03,
        3.2890588e-03,  3.0807797e-03,  1.9979298e-03, -1.1946845e-04,
        2.8702940e-03,  4.8626680e-03, -2.6661153e-03,  8.9778932e-04,
       -4.3168568e-04,  3.5303696e-03,  2.1694896e-04,  8.7484223e-04,
       -2.7474156e-03,  2.3072974e-03, -4.0924000e-03,  3.6694556e-03,
       -1.3320621e-03, -3.0234435e-03, -2.5829002e-03, -6.7849323e-04,
        4.4032671e-03, -4.9507194e-03, -6.1436801e-04, -1.5792623e-03,
       -1.2434268e-03, -1.0983588e-03,  1.4996656e-03, -3.4952483e-03,
        4.6523707e-03,  3.9034693e-03, -4.6174503e-03, -5.3702452e-04,
       -3.4897919e-03,  2.5301194e-03, -4.2174128e-03, -2.3708383e-03,
       -9.7744912e-04, -1.7065967e-03, -3.3377951e-03,  7.5072236e-04,
        3.3937241e-03,  3.8625572e-03, -1.7226865e-03,  5.1567581e-04,
       -1.9285055e-03,  2.0000597e-03, -3.4114390e-03,  4.7271914e-04,
       -2.4166233e-03,  4.5798719e-03, -2.6406355e-03, -9.7798940e-04,
      

In [16]:
avocado = model.wv['avocado']

KeyError: "word 'avocado' not in vocabulary"

In [17]:
cheese = model.wv['cheese']

KeyError: "word 'cheese' not in vocabulary"

In [18]:
from gensim.models.word2vec import BrownCorpus

In [21]:
model = Word2Vec(BrownCorpus('/home/ubuntu/nltk_data/corpora/brown'))

2020-04-28 21:00:01,760 : INFO : collecting all words and their counts
2020-04-28 21:00:01,761 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-28 21:00:02,004 : INFO : PROGRESS: at sentence #10000, processed 184896 words, keeping 21493 word types
2020-04-28 21:00:02,238 : INFO : PROGRESS: at sentence #20000, processed 361784 words, keeping 30751 word types
2020-04-28 21:00:02,465 : INFO : PROGRESS: at sentence #30000, processed 533189 words, keeping 38041 word types
2020-04-28 21:00:02,706 : INFO : PROGRESS: at sentence #40000, processed 713064 words, keeping 44354 word types
2020-04-28 21:00:02,932 : INFO : PROGRESS: at sentence #50000, processed 881301 words, keeping 50158 word types
2020-04-28 21:00:03,112 : INFO : collected 54294 word types from a corpus of 1008788 raw words and 57160 sentences
2020-04-28 21:00:03,112 : INFO : Loading a fresh vocabulary
2020-04-28 21:00:03,144 : INFO : effective_min_count=5 retains 15079 unique words (27% of origi

In [22]:
avocado = model.wv['avocado']

KeyError: "word 'avocado' not in vocabulary"

In [24]:
avocado = model.wv['president']

KeyError: "word 'president' not in vocabulary"

In [23]:
avocado = model.wv['cheese']

KeyError: "word 'cheese' not in vocabulary"

In [25]:
model.wv

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f3cd384a250>

In [28]:
model.wv.vocab

{'image/nn': <gensim.models.keyedvectors.Vocab at 0x7f3cd396ded0>,
 'intensification/nn': <gensim.models.keyedvectors.Vocab at 0x7f3cd3cfc650>,
 'is/be': <gensim.models.keyedvectors.Vocab at 0x7f3cd3832f50>,
 'applied/vb': <gensim.models.keyedvectors.Vocab at 0x7f3cd3832990>,
 'under/in': <gensim.models.keyedvectors.Vocab at 0x7f3cd384a410>,
 'conditions/nn': <gensim.models.keyedvectors.Vocab at 0x7f3cd384a3d0>,
 'of/in': <gensim.models.keyedvectors.Vocab at 0x7f3cd384a350>,
 'low/jj': <gensim.models.keyedvectors.Vocab at 0x7f3cd384a490>,
 'incident/jj': <gensim.models.keyedvectors.Vocab at 0x7f3cd384a450>,
 'light/nn': <gensim.models.keyedvectors.Vocab at 0x7f3cd384a310>,
 'levels/nn': <gensim.models.keyedvectors.Vocab at 0x7f3cd384a5d0>,
 'whenever/wr': <gensim.models.keyedvectors.Vocab at 0x7f3cd384a610>,
 'the/at': <gensim.models.keyedvectors.Vocab at 0x7f3cd384a590>,
 'integration/nn': <gensim.models.keyedvectors.Vocab at 0x7f3cd384a4d0>,
 'time/nn': <gensim.models.keyedvectors.Vo

In [29]:
model.wv['avocado/nn']

array([ 0.13542426,  0.08641184, -0.06718861, -0.06150314,  0.04078102,
       -0.01311649,  0.02429487,  0.05043231,  0.0688144 ,  0.05826308,
       -0.08768259,  0.06050111,  0.01696055, -0.07378364, -0.08366   ,
       -0.04544665,  0.00342775, -0.01289376,  0.03894362, -0.0227945 ,
        0.03141955,  0.18336345, -0.01032996, -0.03418918, -0.03222478,
        0.05456274,  0.20482992,  0.11831625,  0.0024343 , -0.04893686,
        0.03770311, -0.0399111 , -0.15655242,  0.01034668, -0.06910104,
       -0.06439083, -0.09890756,  0.01149493, -0.02454964,  0.04047146,
        0.04330104,  0.09573743, -0.02907959,  0.00096501,  0.10755651,
        0.05820831,  0.01706162, -0.03298414,  0.02794273,  0.05917836,
       -0.01910414, -0.06989597,  0.04259958, -0.02260439,  0.09474701,
        0.08870851, -0.08878886,  0.02434646,  0.05695599, -0.03341379,
       -0.0671249 ,  0.029426  ,  0.05028586, -0.00200809,  0.10357421,
       -0.07066309,  0.07806551,  0.15439007, -0.02634114, -0.00

In [30]:
model.wv.similar_by_word('avocado/nn')

2020-04-28 21:04:33,722 : INFO : precomputing L2-norms of word weight vectors


[('encouragement/nn', 0.9326245784759521),
 ('wise/jj', 0.9317049384117126),
 ('mess/nn', 0.9286695718765259),
 ('attempt/nn', 0.9277729392051697),
 ('weaken/vb', 0.926093578338623),
 ('guys/nn', 0.925919234752655),
 ('enjoy/vb', 0.9254236817359924),
 ('practice/vb', 0.9252655506134033),
 ('repel/vb', 0.9240432977676392),
 ('grant/vb', 0.9239420890808105)]

In [31]:
model.wv.similar_by_word('cheese/nn')

[('lime/nn', 0.9784870743751526),
 ('editors/nn', 0.9694792032241821),
 ('vivid/jj', 0.968917727470398),
 ('jokes/nn', 0.9682754278182983),
 ('grapes/nn', 0.9673639535903931),
 ('uneven/jj', 0.9670487642288208),
 ('corn/nn', 0.9668615460395813),
 ('carpenter/nn', 0.9665806293487549),
 ('loose/jj', 0.9654962420463562),
 ('brutal/jj', 0.9653011560440063)]

## TODO
* Write generator that produces sentences by
 * iterating over a json file
 * iterating over a directory of json files

In [36]:
DATA_HOME = '/home/ubuntu/XCS224U-Project/data/raw_web_joined'

In [74]:
from os import path, listdir

In [41]:
f1 = path.join(DATA_HOME, '703_00156_2019-11-13_2_2641647_joined.json')

In [43]:
import json

In [87]:
from ir import normalize_word

In [88]:
class ReceiptIter:
    def __init__(self):
        self.data_path = '/home/ubuntu/XCS224U-Project/data/raw_web_joined'
        
    def __iter__(self):
        for fname in listdir(self.data_path):
            file_path = path.join(self.data_path, fname)
            with open(file_path, 'r') as f:                
                receipts = json.load(f)
                for receipt in receipts:
                    web_sent = [normalize_word(w) for w in receipt['web'].lower().split()]
                    yield web_sent
                    raw_sent = [normalize_word(w) for w in receipt['raw'].lower().split()]
                    yield raw_sent

In [91]:
ri = ReceiptIter()

In [92]:
sents = [sent for sent in ri]

In [93]:
sents[:10]

[['bobs', 'blue', 'cheese', 'dressing'],
 ['bobs', 'drsng'],
 ['carrot'],
 ['carrots', 'clip', 'top'],
 ['challenge', 'unsalted', 'butter'],
 ['challenge', 'butter'],
 ['cilantro'],
 ['cilantro'],
 ['egglands', 'best', 'grade', 'a', 'large', 'eggs'],
 ['egglands', 'best', 'eggs']]

In [94]:
len(sents)

1422

In [95]:
model = Word2Vec(ReceiptIter())

2020-04-28 21:59:20,386 : INFO : collecting all words and their counts
2020-04-28 21:59:20,387 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-28 21:59:20,400 : INFO : collected 1024 word types from a corpus of 6082 raw words and 1422 sentences
2020-04-28 21:59:20,400 : INFO : Loading a fresh vocabulary
2020-04-28 21:59:20,401 : INFO : effective_min_count=5 retains 262 unique words (25% of original 1024, drops 762)
2020-04-28 21:59:20,401 : INFO : effective_min_count=5 leaves 4724 word corpus (77% of original 6082, drops 1358)
2020-04-28 21:59:20,403 : INFO : deleting the raw counts dictionary of 1024 items
2020-04-28 21:59:20,403 : INFO : sample=0.001 downsamples 107 most-common words
2020-04-28 21:59:20,403 : INFO : downsampling leaves estimated 2937 word corpus (62.2% of prior 4724)
2020-04-28 21:59:20,404 : INFO : estimated required memory for 262 words and 100 dimensions: 340600 bytes
2020-04-28 21:59:20,404 : INFO : resetting layer weights
2020-

In [97]:
model.wv.vocab

{'blue': <gensim.models.keyedvectors.Vocab at 0x7f3cd35fbcd0>,
 'cheese': <gensim.models.keyedvectors.Vocab at 0x7f3cd37002d0>,
 'dressing': <gensim.models.keyedvectors.Vocab at 0x7f3cd39b5990>,
 'unsalted': <gensim.models.keyedvectors.Vocab at 0x7f3cd372ae50>,
 'butter': <gensim.models.keyedvectors.Vocab at 0x7f3cd3d7b390>,
 'egglands': <gensim.models.keyedvectors.Vocab at 0x7f3cd3d7bc90>,
 'best': <gensim.models.keyedvectors.Vocab at 0x7f3cd37b63d0>,
 'grade': <gensim.models.keyedvectors.Vocab at 0x7f3cd37b6f50>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7f3cd3d7b4d0>,
 'large': <gensim.models.keyedvectors.Vocab at 0x7f3cd3d7b610>,
 'eggs': <gensim.models.keyedvectors.Vocab at 0x7f3cd37b6a10>,
 'fresh': <gensim.models.keyedvectors.Vocab at 0x7f3cd37b6690>,
 'gourmet': <gensim.models.keyedvectors.Vocab at 0x7f3cd37b67d0>,
 'crispy': <gensim.models.keyedvectors.Vocab at 0x7f3cd37b60d0>,
 'jalapenos': <gensim.models.keyedvectors.Vocab at 0x7f3cd37b66d0>,
 'fgor': <gensim.models.keyed

In [98]:
model.wv.similar_by_word('cheese')

2020-04-28 21:59:49,665 : INFO : precomputing L2-norms of word weight vectors


[('feast', 0.7685936093330383),
 ('french', 0.7592940330505371),
 ('smoked', 0.7346859574317932),
 ('beyond', 0.7333604097366333),
 ('grade', 0.7325740456581116),
 ('ground', 0.7305741310119629),
 ('wet', 0.7256548404693604),
 ('', 0.7255364656448364),
 ('hot', 0.7243200540542603),
 ('-', 0.7217936515808105)]

In [99]:
model.wv.similar_by_word('kro')

[('pacf', 0.5085748434066772),
 ('with', 0.4650784730911255),
 ('chopped', 0.4405178427696228),
 ('natural', 0.4364372193813324),
 ('lmtd', 0.43595147132873535),
 ('food', 0.4341128170490265),
 ('ranch', 0.43262341618537903),
 ('a', 0.42758044600486755),
 ('gravy', 0.42737022042274475),
 ('dairy-free', 0.42728391289711)]

In [100]:
model.wv.similar_by_word('kroger')

[('feast', 0.5828536152839661),
 ('hot', 0.5772860646247864),
 ('french', 0.5749461650848389),
 ('original', 0.5731462836265564),
 ('fancy', 0.5688536167144775),
 ('', 0.5653416514396667),
 ('dairy-free', 0.5573346614837646),
 ('organic', 0.5519124269485474),
 ('creamer', 0.5517327189445496),
 ('meat', 0.5507047772407532)]

In [101]:
model.wv.similar_by_word('jalapenos')

[('egglands', 0.39589881896972656),
 ('roasted', 0.3936736285686493),
 ('ground', 0.36728888750076294),
 ('avocado', 0.364783376455307),
 ('uncured', 0.3543057143688202),
 ('free', 0.3492000102996826),
 ('gourmet', 0.33909812569618225),
 ('naturally', 0.3387349843978882),
 ('a', 0.3379022777080536),
 ('bynd', 0.3332291543483734)]

In [103]:
model2 = Word2Vec(ReceiptIter(), size=10)

2020-04-28 22:15:56,877 : INFO : collecting all words and their counts
2020-04-28 22:15:56,878 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-28 22:15:56,891 : INFO : collected 1024 word types from a corpus of 6082 raw words and 1422 sentences
2020-04-28 22:15:56,892 : INFO : Loading a fresh vocabulary
2020-04-28 22:15:56,893 : INFO : effective_min_count=5 retains 262 unique words (25% of original 1024, drops 762)
2020-04-28 22:15:56,893 : INFO : effective_min_count=5 leaves 4724 word corpus (77% of original 6082, drops 1358)
2020-04-28 22:15:56,894 : INFO : deleting the raw counts dictionary of 1024 items
2020-04-28 22:15:56,895 : INFO : sample=0.001 downsamples 107 most-common words
2020-04-28 22:15:56,895 : INFO : downsampling leaves estimated 2937 word corpus (62.2% of prior 4724)
2020-04-28 22:15:56,896 : INFO : estimated required memory for 262 words and 10 dimensions: 151960 bytes
2020-04-28 22:15:56,896 : INFO : resetting layer weights
2020-0

In [104]:
model2.wv.similar_by_word('eggs')

2020-04-28 22:16:36,466 : INFO : precomputing L2-norms of word weight vectors


[('french', 0.8073101043701172),
 ('qty', 0.7961218357086182),
 ('naturally', 0.7954999208450317),
 ('flavor', 0.7710254192352295),
 ('turkey', 0.7475664615631104),
 ('tender', 0.7101244926452637),
 ('with', 0.6879206895828247),
 ('jalapenos', 0.6788337230682373),
 ('food', 0.6744784116744995),
 ('pure', 0.6739354729652405)]

In [105]:
model.wv.similar_by_word('eggs')

[('turkey', 0.6665092706680298),
 ('feast', 0.6415066719055176),
 ('', 0.633222758769989),
 ('fancy', 0.6275984644889832),
 ('a', 0.6264246106147766),
 ('bacon', 0.6144722700119019),
 ('large', 0.6093783378601074),
 ('french', 0.6074662208557129),
 ('qty', 0.6065540313720703),
 ('grade', 0.6045123338699341)]