In [None]:
import pandas as pd
import pickle
import numpy as np

# Read the pickled bar/restaurant review table (cleaned and tokenized, per the
# file name) and preview its first two rows.
reviews_pickle_path = '../output/bar_restaurant_reviews_cleaned_and_tokenized.pickle'
review = pd.read_pickle(reviews_pickle_path)
review.head(2)

In [2]:


%load_ext autoreload
%autoreload 2


import gensim
from itertools import chain
import sys
sys.path.append('../vectorsearch/')
import nltk_helper
import doc2vec
from gensim.models.doc2vec import TaggedDocument

n_epochs = 10
n_docs = -1 # -1 for almost all of them...

# Collapse each review to a 1D list of words.
review_flatten = [list(chain.from_iterable(doc)) for doc in review.cleaned_tokenized[:n_docs]]


# Generate the tagged document list. 
docs = [TaggedDocument(words, [review.review_id.iloc[index]])
                             for index, words in enumerate(review_flatten)]


# A list of words for each review  
sentences = [doc.words for doc in docs]

print '\nFirst Doc: \n-----------------\n', docs[0]



First Doc: 
-----------------
TaggedDocument(['food', 'great', 'best', 'thing', u'wing', u'wing', 'simply', 'fantastic', 'wet', 'cajun', 'best', 'most', 'popular', 'also', 'like', 'seasoned', 'salt', u'wing', 'wing-night', 'monday', 'wednesday', 'night', '075', 'whole', u'wing', 'dining', 'area', 'nice', 'very', 'family', 'friendly', 'bar', 'very', 'nice', 'well', 'place', 'truly', 'yinzers', 'dream', 'pittsburgh', 'dad', 'would', 'love', 'place', 'nat'], [u'Di3exaUCFNw1V4kSNW5pgA'])


In [3]:
# Hyper-parameters handed to the Doc2Vec constructor, gathered in one place.
doc2vec_params = dict(
    min_count=5,
    window=8,
    size=100,
    sample=1e-4,
    negative=5,
    workers=12,
)
model = doc2vec.Doc2Vec(**doc2vec_params)

# Build the vocabulary from the tagged documents.
model.build_vocab(docs)



In [4]:
from random import shuffle

for epoch in range(10):
    print '\rTraining Epoch %i, alpha %1.4f'%(epoch+1, model.alpha),
    #model.train(np.random.permutation(docs))
    shuffle(docs)
    model.train(docs)
    model.alpha -= 0.001 # decrease the learning rate
    model.min_alpha = model.alpha # fix the learning rate, no decay

model.init_sims(replace=True)    
# # Normalize the word vectors.
# vec_norms = np.sqrt(np.sum(model.syn0**2, axis=1))
# model.syn0 = (model.syn0/vec_norms[:, numpy.newaxis])
# # Normalize the doc vectors.
# vec_norms = np.sqrt(np.sum(model.docvecs.doctag_syn0**2, axis=1))
# model.docvecs.doctag_syn0 = (model.docvecs.doctag_syn0/vec_norms[:, numpy.newaxis])

model.save('../output/doc2vec_bars.model')

Training Epoch 10, alpha 0.0160


In [5]:
# Can find similar documents..
print model.docvecs.most_similar(positive=['KUinHkKyGhznElgIzx0yIw']), '\n'

# Can find similar words...Re: Dream Companies and contact from recruiters
print model.most_similar(positive=['beer']), '\n'

# Can find documents that are most similar to keywords.... 
print model.docvecs.most_similar(positive=[model['beer'], model['music']]), '\n'

# Can find words that are most common in documents
print review.text[review.review_id=='KUinHkKyGhznElgIzx0yIw'].values
print model.most_similar(positive=[model.docvecs['KUinHkKyGhznElgIzx0yIw']]), '\n'


[(u'Thj-jeeY3rpu3dMJvTwTKg', 0.6003409624099731), (u'qpKFRaQvtrFRhnaQiR9I6g', 0.5944679975509644), (u'AkYXLQgc8OiHzh6srSTQCg', 0.58905029296875), (u'5Ipcu5mDhR__OPFXBJA_1g', 0.5879070162773132), (u'DNzrVCKa9jTKe9MQMxiXew', 0.5869959592819214), (u'CsBC1lyGlAcKxO1DI0XY6w', 0.5863107442855835), (u'quH_YU1n3NEHgUeo_IM5LQ', 0.5834709405899048), (u'IGL-pUN_vHdyZeqV24S-9Q', 0.5767616033554077), (u'VOhV6DvuwxnVpl6az7smmQ', 0.5720300674438477), (u'796ZD0lWIbHsipRfvZaDqQ', 0.571638822555542)] 

[('wine', 0.6038611531257629), (u'import', 0.5595511198043823), ('microbrews', 0.552147626876831), (u'draft', 0.5492744445800781), (u'craft', 0.5180332064628601), ('domestic', 0.5174193978309631), ('tap', 0.5162851810455322), ('chardonnay', 0.49968481063842773), ('whiskey', 0.48812004923820496), ('lager', 0.47685831785202026)] 

[(u'dayAuooCnXLB06lBpWtLyQ', 0.554182767868042), (u'_yNyxUfsLOfGOyneVxM9lw', 0.5318312644958496), (u'XeRek8yFWv3IN69BudilEQ', 0.5191066265106201), (u'J90iqydSFvYHArQ6lp1xxQ', 0.51