In [1]:
import pandas as pd
import pickle
import numpy as np

# Location of the pre-processed bar reviews (a pickled DataFrame that already
# carries the `cleaned_tokenized` column used further down).
reviews_path = '../output/bar_reviews_cleaned_and_tokenized.pickle'

# NOTE(review): unpickling can execute arbitrary code -- only point this at
# files produced by our own preprocessing step.
review = pd.read_pickle(reviews_path)

# Peek at the first two rows as a quick sanity check.
review.head(n=2)

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes_cool,votes_funny,votes_useful,cleaned_tokenized
10,UsFtqoBl7naz8AVUBZMjQQ,2013-11-08,Di3exaUCFNw1V4kSNW5pgA,5,All the food is great here. But the best thing...,review,uK8tzraOp4M5u3uYrqIBXg,0,0,0,"[[food, great], [best, thing, wing], [wing, si..."
11,UsFtqoBl7naz8AVUBZMjQQ,2014-03-29,0Lua2-PbqEQMjD9r89-asw,3,We checked this place out this past Monday for...,review,I_47G-R2_egp7ME5u_ltew,0,0,0,"[[checked, place, past, monday, wing-night], [..."


In [2]:


%load_ext autoreload
%autoreload 2


import gensim
from itertools import chain
import sys
sys.path.append('../vectorsearch/')
import nltk_helper
import doc2vec
from gensim.models.doc2vec import TaggedDocument


n_epochs = 10
n_docs = -1 # -1 for almost all of them...

# Collapse each review to a 1D list of words.
review_flatten = [list(chain.from_iterable(doc)) for doc in review.cleaned_tokenized[:n_docs]]

# Generate the tagged document list. 
docs = [TaggedDocument(words, [review.review_id.iloc[index]])
                             for index, words in enumerate(review_flatten)]


# A list of words for each review  
sentences = [doc.words for doc in docs]

print '\nFirst Doc: \n-----------------\n', docs[0]



First Doc: 
-----------------
TaggedDocument(['food', 'great', 'best', 'thing', u'wing', u'wing', 'simply', 'fantastic', 'wet', 'cajun', 'best', 'most', 'popular', 'also', 'like', 'seasoned', 'salt', u'wing', 'wing-night', 'monday', 'wednesday', 'night', '075', 'whole', u'wing', 'dining', 'area', 'nice', 'very', 'family', 'friendly', 'bar', 'very', 'nice', 'well', 'place', 'truly', 'yinzers', 'dream', 'pittsburgh', 'dad', 'would', 'love', 'place', 'nat'], [u'Di3exaUCFNw1V4kSNW5pgA'])


In [5]:
# Instantiate the model through the project-local `doc2vec` module
# (NOTE(review): presumably a thin wrapper around gensim's Doc2Vec -- confirm).
model = doc2vec.Doc2Vec(
    min_count=2,
    window=8,
    size=100,
    sample=1e-4,
    negative=5,
    workers=12
)

# Build the vocabulary from the tagged review documents (`docs`), not from the
# plain `sentences` list.
model.build_vocab(docs)






In [6]:
from random import shuffle

for epoch in range(10):
    print '\rTraining Epoch %i, alpha %1.4f'%(epoch+1, model.alpha),
    #model.train(np.random.permutation(docs))
    shuffle(docs)
    model.train(docs)
    model.alpha -= 0.001 # decrease the learning rate
    model.min_alpha = model.alpha # fix the learning rate, no decay

model.init_sims(replace=True)    
# # Normalize the word vectors.
# vec_norms = np.sqrt(np.sum(model.syn0**2, axis=1))
# model.syn0 = (model.syn0/vec_norms[:, numpy.newaxis])
# # Normalize the doc vectors.
# vec_norms = np.sqrt(np.sum(model.docvecs.doctag_syn0**2, axis=1))
# model.docvecs.doctag_syn0 = (model.docvecs.doctag_syn0/vec_norms[:, numpy.newaxis])

model.save('../output/doc2vec_bars.model')

Training Epoch 10, alpha 0.0160





In [None]:
# Can find similar documents..
print model.docvecs.most_similar(positive=['KUinHkKyGhznElgIzx0yIw']), '\n'

# Can find similar words...
print model.most_similar(positive=['beer']), '\n'

# Can find documents that are most similar to keywords.... 
print model.docvecs.most_similar(positive=[model['beer'], model['music']]), '\n'

# Can find words that are most common in documents
print review.text[review.review_id=='KUinHkKyGhznElgIzx0yIw'].values
print model.most_similar(positive=[model.docvecs['KUinHkKyGhznElgIzx0yIw']]), '\n'
