In [58]:
import pandas as pd
import pickle
import numpy as np

# Read the cleaned and tokenized bar reviews from the pickle written by an
# earlier step. NOTE(review): unpickling executes whatever the file contains,
# so this must only ever be pointed at a trusted, locally produced file.
review = pd.read_pickle('../output/bar_reviews_cleaned_and_tokenized.pickle')
# Preview the first two rows.
review.head(n=2)

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes_cool,votes_funny,votes_useful,cleaned_tokenized
10,UsFtqoBl7naz8AVUBZMjQQ,2013-11-08,Di3exaUCFNw1V4kSNW5pgA,5,All the food is great here. But the best thing...,review,uK8tzraOp4M5u3uYrqIBXg,0,0,0,"[[food, great], [best, thing, wing], [wing, si..."
11,UsFtqoBl7naz8AVUBZMjQQ,2014-03-29,0Lua2-PbqEQMjD9r89-asw,3,We checked this place out this past Monday for...,review,I_47G-R2_egp7ME5u_ltew,0,0,0,"[[checked, place, past, monday, wing-night], [..."


In [98]:


%load_ext autoreload
%autoreload 2


import gensim
from itertools import chain
import sys
sys.path.append('../vectorsearch/')
import nltk_helper
import doc2vec
from gensim.models.doc2vec import TaggedDocument


n_epochs = 10
n_docs = 1000 # -1 for almost all of them...

# Collapse each review to a 1D list of words.
review_flatten = [list(chain.from_iterable(doc)) for doc in review.cleaned_tokenized[:n_docs]]

# docs = [TaggedDocument(words, ['SENT_%i'%index,])
#                              for index, words in enumerate(review_flatten)]

docs = [TaggedDocument(words, [review.review_id.iloc[index]])
                             for index, words in enumerate(review_flatten)]


# A list of words for each review  
sentences = [doc.words for doc in docs]

print '\nFirst Doc: \n-----------------\n', docs[0]


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

First Doc: 
-----------------
TaggedDocument(['food', 'great', 'best', 'thing', u'wing', u'wing', 'simply', 'fantastic', 'wet', 'cajun', 'best', 'most', 'popular', 'also', 'like', 'seasoned', 'salt', u'wing', 'wing-night', 'monday', 'wednesday', 'night', '075', 'whole', u'wing', 'dining', 'area', 'nice', 'very', 'family', 'friendly', 'bar', 'very', 'nice', 'well', 'place', 'truly', 'yinzers', 'dream', 'pittsburgh', 'dad', 'would', 'love', 'place', 'nat'], [u'Di3exaUCFNw1V4kSNW5pgA'])


In [99]:
import copy 

# Instantiate the Doc2Vec model from the local `doc2vec` module, with the
# hyper-parameters listed one per line.
model = doc2vec.Doc2Vec(
    min_count=1,
    window=10,
    size=100,
    sample=1e-4,
    negative=5,
    workers=12,
)
# Build the vocabulary from the tagged reviews before any training happens.
model.build_vocab(docs)
# Independent copy of the corpus that the training cell shuffles in place,
# leaving the order of `docs` untouched.
docs_shuffled = copy.deepcopy(docs)


In [114]:
from random import shuffle

for epoch in range(1):
    print '\rTraining Epoch %i, alpha %1.4f'%(epoch+1, model.alpha),
    #model.train(np.random.permutation(docs))
    shuffle(docs_shuffled)
    model.train(docs_shuffled)
    model.alpha -= 0.001 # decrease the learning rate
    model.min_alpha = model.alpha # fix the learning rate, no decay


# Normalize the word vectors.
vec_norms = np.sqrt(np.sum(model.syn0**2, axis=1))
model.syn0 = (model.syn0/vec_norms[:, numpy.newaxis])
# Normalize the doc vectors.
vec_norms = np.sqrt(np.sum(model.docvecs.doctag_syn0**2, axis=1))
model.docvecs.doctag_syn0 = (model.docvecs.doctag_syn0/vec_norms[:, numpy.newaxis])




Training Epoch 1, alpha 0.0240


In [113]:
# Shape of the document-vector matrix. The recorded output (1000, 100) lines
# up with n_docs = 1000 tagged reviews and size=100.
model.docvecs.doctag_syn0.shape

(1000, 100)

In [101]:
# Can find similar documents..
print model.docvecs.most_similar(positive=docs[0][1]), '\n'

# Can find similar words...
print model.most_similar(positive=['beer']), '\n'

# Can find documents that are most similar to keywords.... 
print model.docvecs.most_similar(positive=[model['beer'], model['music']]), '\n'

# Can find words that are most common in documents
print " ".join(docs[0][0])
print model.most_similar(positive=[model.docvecs[docs[0][1][0]], ]), '\n'


[(u'MNczjKfOZ8VD4Q9YzWNBbQ', 0.8875299692153931), (u'QcAMcIZgS4_gtU18GaWjiA', 0.8867901563644409), (u'DkqhyLlc7nkt7Du0RYfz3w', 0.8855961561203003), (u'yM4tNnuNuiH7MU54Ul128Q', 0.8844144344329834), (u'oyPN_upWhqekHRGIpKwp4g', 0.883427083492279), (u'_TlCk3xdz8RQCBQJAM55ZA', 0.8828277587890625), (u'gHKXQT4xVWohNGkCR6swog', 0.8823326230049133), (u'32dbp8Tkc1DvgTMZuXH7lQ', 0.8809983730316162), (u'JXILRnFAOBauVnDibjGrDA', 0.8803400993347168), (u'vXfyHC4vS6uyL7r2wBk4UA', 0.8800426125526428)] 

[('food', 0.9956748485565186), ('at-least', 0.9953977465629578), ('pittsburgh', 0.9950786828994751), ('not', 0.994979202747345), ('menu', 0.9948906898498535), ('dont', 0.9948253631591797), ('place', 0.9948084354400635), ('very', 0.9947399497032166), ('try', 0.9947223663330078), ('nice', 0.9946841597557068)] 

[(u'MsoASFxCmOOkOBOqZ80ngQ', 0.9963570833206177), (u'hnnT9vXu-m2PZeOKpqX-6Q', 0.9949337244033813), (u'c_OfFAXTywYkhsV4DgTpSw', 0.9945775270462036), (u'pKcQEu1QMRLWRuBBLqFU3A', 0.9941195249557495), 

set([u'UsFtqoBl7naz8AVUBZMjQQ', u'mVHrayjG3uZ_RLHkLj-AMg'])
[-0.0038604951, -0.00059710076]


In [104]:
# print model.docvecs['KUinHkKyGhznElgIzx0yIw']*2
# print get_mean_doc_vector(['KUinHkKyGhznElgIzx0yIw', 'KUinHkKyGhznElgIzx0yIw'], model)

for rev in review.review_id.iloc[:10]:
    print rev
    print np.dot(model.docvecs[rev], model.docvecs[rev])

Di3exaUCFNw1V4kSNW5pgA
0.0042853
0Lua2-PbqEQMjD9r89-asw
0.0122367
7N9j5YbBHBW6qguE5DAeyA
0.00195272
mjCJR33jvUNt41iJCxDU_g
0.00155523
6w6gMZ3iBLGcUM4RBIuifQ
0.0221827
jVVv_DA5mCDB6mediuwHAw
0.00289149
3Es8GsjkssusYgeU6_ZVpQ
0.0126018
KAkcn7oQP1xX8KsZ-XmktA
0.00472332
BZNJkkP0bXnwQ2-sCqat2Q
0.014506
VDTIbR3G5_IPkpXbo2MutA
0.00847885


In [1]:
for word, sim in model.most_similar('beer'):
    print np.dot(model[word], model[word])

NameError: name 'model' is not defined

In [78]:
# Index the model with every vocabulary word. The lookup result is discarded,
# so the loop itself displays nothing.
for key in model.vocab:
    model[key]

{'fawn': <gensim.models.word2vec.Vocab at 0x7f994aeb0bd0>,
 'raining': <gensim.models.word2vec.Vocab at 0x7f996d984e90>,
 'bypassed': <gensim.models.word2vec.Vocab at 0x7f996d2de450>,
 'cussed': <gensim.models.word2vec.Vocab at 0x7f99be5c1650>,
 'blackend': <gensim.models.word2vec.Vocab at 0x7f99be5c1e50>,
 '5-diamond': <gensim.models.word2vec.Vocab at 0x7f99be5c1290>,
 'yellow': <gensim.models.word2vec.Vocab at 0x7f99be5c1c50>,
 'four': <gensim.models.word2vec.Vocab at 0x7f99be5c12d0>,
 'prefix': <gensim.models.word2vec.Vocab at 0x7f99be5c1150>,
 'deelish': <gensim.models.word2vec.Vocab at 0x7f99656b1d50>,
 'hanging': <gensim.models.word2vec.Vocab at 0x7f99be5c1d90>,
 'bistroid': <gensim.models.word2vec.Vocab at 0x7f99be5c1a50>,
 'woody': <gensim.models.word2vec.Vocab at 0x7f99be5c1c90>,
 'aggression': <gensim.models.word2vec.Vocab at 0x7f99be5c1950>,
 'conjure': <gensim.models.word2vec.Vocab at 0x7f99be5c1dd0>,
 'frou-frou': <gensim.models.word2vec.Vocab at 0x7f99be5c1f10>,
 'crooned

In [107]:
# Shape of the word-vector matrix. The recorded output is (7626, 100); the 100
# columns match size=100, and the rows are presumably one per vocabulary word
# (TODO confirm).
model.syn0.shape

(7626, 100)

In [108]:
# Shape of the word-vector matrix again (same expression as the previous
# cell); the recorded output is unchanged at (7626, 100).
model.syn0.shape

(7626, 100)

In [110]:
# Display the vector the model stores for the word 'beer'.
model['beer']

array([-0.11914355, -0.00848406, -0.25816774,  0.05004114, -0.13797277,
       -0.07304802,  0.07144317, -0.00289828,  0.00988765, -0.0521464 ,
       -0.02484666, -0.05874219,  0.16049536, -0.00942174,  0.15264362,
        0.0524993 ,  0.01947534, -0.09520859, -0.04392029, -0.17615482,
       -0.19804238, -0.09196329, -0.0707287 ,  0.17948456,  0.03011344,
       -0.13420026, -0.07682815, -0.06674536, -0.06478921,  0.05593799,
       -0.22613293,  0.01537087,  0.05933586, -0.07667404,  0.08785174,
       -0.04805563, -0.10572113,  0.01903196,  0.06023778, -0.15284617,
        0.00767768,  0.07501449, -0.01899237,  0.11723676, -0.00583145,
       -0.04982071,  0.04778826, -0.11280042,  0.0237955 ,  0.08502073,
       -0.04627986, -0.0168891 , -0.06349624, -0.01229718, -0.02730742,
        0.14085445, -0.0117617 , -0.00330494, -0.06652286,  0.0688545 ,
        0.09140737,  0.06150964,  0.1517535 , -0.10411895,  0.29116744,
       -0.10556895, -0.12878124,  0.05843589,  0.13523191, -0.03

In [112]:
# Words ranked most similar to 'beer'. The recorded similarities are all about
# 0.995 and the neighbours are generic words ('food', 'not', 'very', ...).
# NOTE(review): presumably the model is still under-trained — confirm.
model.most_similar(['beer'])

[('food', 0.9956748485565186),
 ('at-least', 0.9953977465629578),
 ('pittsburgh', 0.9950786828994751),
 ('not', 0.994979202747345),
 ('menu', 0.9948906898498535),
 ('dont', 0.9948253631591797),
 ('place', 0.9948084354400635),
 ('very', 0.9947399497032166),
 ('try', 0.9947223663330078),
 ('nice', 0.9946841597557068)]