This notebook runs the analysis on 100% of the data.

In [1]:
import os
import codecs
import json
import spacy
import numpy as np
import pandas as pd
import itertools as it
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

No handlers could be found for logger "gensim.models.word2vec"


In [2]:
# Locations of the raw Yelp review dump and of the derived text files.
# Label scheme used in star_text.txt:
#   1 or 2 stars -> '1', 3 stars -> '2', 4 or 5 stars -> '3'
data_directory = 'dataset'
result_directory = 'result-3categories'
review_filepath = os.path.join(data_directory, 'yelp_academic_dataset_review.json')
review_txt_filepath = os.path.join(result_directory, 'review_text.txt')
star_txt_filepath = os.path.join(result_directory, 'star_text.txt')

# The extraction loop below is commented out; review_text.txt and
# star_text.txt are expected to already exist in result_directory.
# It writes one review text and one label per line, in the same order.
# review_count = 0
# with codecs.open(star_txt_filepath, 'w', encoding='utf_8') as star_txt_file:
#     with codecs.open(review_txt_filepath, 'w', encoding='utf_8') as review_txt_file:
#         with codecs.open(review_filepath, encoding='utf_8') as review_file:
#             for review_json in review_file:
#                 review = json.loads(review_json)
#                 review_txt_file.write(review[u'text'].replace('\n', ' ').replace('\r', ' ') + '\n')
#                 if (review[u'stars'] == 1) or (review[u'stars'] == 2):
#                     star_txt_file.write('1' + '\n')
#                 if (review[u'stars'] == 3):
#                     star_txt_file.write('2' + '\n')
#                 if (review[u'stars'] == 4) or (review[u'stars'] == 5):
#                     star_txt_file.write('3' + '\n')
#                 review_count += 1

In [3]:
# Load spaCy's English pipeline once; later cells reuse `nlp` to parse and lemmatize reviews.
nlp = spacy.load('en')

In [4]:
def punct_space(token):
    """Tell whether a token is punctuation or whitespace.

    token: object exposing ``is_punct`` and ``is_space`` attributes
    (the callers in this notebook pass tokens parsed by ``nlp``).

    Returns ``token.is_punct`` when it is truthy, otherwise ``token.is_space``.
    """
    punctuation_flag = token.is_punct
    return punctuation_flag if punctuation_flag else token.is_space

def line_review(filename):
    """Lazily yield the reviews stored one per line in a UTF-8 text file.

    filename: path of a text file holding one review per line.

    Yields each line as text, including its trailing line feed, with every
    two-character sequence backslash + "n" replaced by a real line feed.

    The file is read through ``io.open`` with the line feed as the only line
    terminator. Iterating a ``codecs.open`` reader instead also breaks lines
    on other Unicode line boundaries (for example U+2028, U+0085, form feed),
    which would cut one review into several and shift every following review
    against the per-line label file.
    """
    import io  # local import keeps this fix self-contained in the cell

    with io.open(filename, encoding='utf_8', newline='\n') as review_file:
        for raw_line in review_file:
            yield raw_line.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """Yield every sentence of every review in ``filename`` as a lemma string.

    Reviews are read with ``line_review`` and parsed by ``nlp.pipe``
    (batch_size=10000, n_threads=4). For each sentence of a parsed review,
    the lemmas of all tokens that are neither punctuation nor whitespace are
    joined with single spaces and yielded as one unicode string.
    """
    parsed_reviews = nlp.pipe(line_review(filename), batch_size=10000, n_threads=4)
    for parsed_review in parsed_reviews:
        for sent in parsed_review.sents:
            lemmas = []
            for token in sent:
                if punct_space(token):
                    continue
                lemmas.append(token.lemma_)
            yield u' '.join(lemmas)

In [5]:
# Output file for the lemmatized corpus (one sentence per line).
unigram_sentences_filepath = os.path.join(
    result_directory, 'unigram_sentences_all.txt')

In [6]:
# Lemmatize every review and store the result one sentence per line.
with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as unigram_sentences_file:
    for sentence in lemmatized_sentence_corpus(review_txt_filepath):
        unigram_sentences_file.write(sentence + '\n')

In [7]:
# Wrap the lemmatized sentence file so it can be iterated as one token sequence per line.
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [8]:
# Preview ten lemmatized sentences (positions 230-239), each followed by a blank line.
for unigram_sentence in it.islice(unigram_sentences, 230, 240):
    print(u' '.join(unigram_sentence) + u'\n')

kind of like cheers but in a blue collar neighborhood in the 1950 's

fan freakin tastic

i could feel at home here

you definitely want to hit mapquest or plug in your gp though

i be not sure that i could find it again on my own it really be a hidden gem

i will be make my friend take me back until i can memorize where the heck it be

addendum 2nd visit for the fish sandwich

excellent

truly

a pound of fish on a fish shape bun as oppose to da burgh 's seemingly popular hamburger bun



In [9]:
# Where the trained bigram phrase model is saved.
bigram_model_filepath = os.path.join(
    result_directory, 'bigram_model_all')

In [10]:
%%time
# Train a phrase (bigram) model on the unigram sentences and save it to disk.

bigram_model = Phrases(unigram_sentences)
bigram_model.save(bigram_model_filepath)

# Reload the saved model from disk; later cells use this loaded copy.
bigram_model = Phrases.load(bigram_model_filepath)

CPU times: user 8min 42s, sys: 4.02 s, total: 8min 46s
Wall time: 8min 47s


In [11]:
# Output file for sentences after bigram phrase merging (one sentence per line).
bigram_sentences_filepath = os.path.join(
    result_directory, 'bigram_sentences_all.txt')

In [12]:
# Apply the bigram phrase model to each unigram sentence and save the result,
# one space-joined sentence per line.
with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as bigram_sentences_file:
    for unigram_sentence in unigram_sentences:
        bigram_tokens = bigram_model[unigram_sentence]
        bigram_sentences_file.write(u' '.join(bigram_tokens) + '\n')

In [13]:
# Wrap the bigram sentence file so it can be iterated as one token sequence per line.
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [14]:
# Preview the same ten sentences (positions 230-239) after bigram merging,
# each followed by a blank line.
for bigram_sentence in it.islice(bigram_sentences, 230, 240):
    print(u' '.join(bigram_sentence) + u'\n')

kind of like cheers but in a blue_collar neighborhood in the 1950_'s

fan freakin_tastic

i could feel at home here

you definitely want to hit mapquest or plug in your gp though

i be not sure that i could find it again on my own it really be a hidden_gem

i will be make my friend take me back until i can memorize where the heck it be

addendum 2nd visit for the fish sandwich

excellent

truly

a pound of fish on a fish shape bun as oppose to da_burgh 's seemingly popular hamburger_bun



In [15]:
# Where the trained trigram phrase model is saved.
trigram_model_filepath = os.path.join(
    result_directory, 'trigram_model_all')

In [16]:
# Train a second phrase model on the bigram sentences, so that already merged
# bigrams can combine with a neighbouring token.
trigram_model = Phrases(bigram_sentences)

trigram_model.save(trigram_model_filepath)

# Reload the saved model from disk; later cells use this loaded copy.
trigram_model = Phrases.load(trigram_model_filepath)

In [17]:
# Output file for sentences after trigram phrase merging (one sentence per line).
trigram_sentences_filepath = os.path.join(
    result_directory, 'trigram_sentences_all.txt')

In [18]:
# Apply the trigram phrase model to each bigram sentence and save the result,
# one space-joined sentence per line.
with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as trigram_sentences_file:
    for bigram_sentence in bigram_sentences:
        trigram_tokens = trigram_model[bigram_sentence]
        trigram_sentences_file.write(u' '.join(trigram_tokens) + '\n')

In [19]:
# Wrap the trigram sentence file so it can be iterated as one token sequence per line.
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [20]:
# Preview the same ten sentences (positions 230-239) after trigram merging,
# each followed by a blank line.
for trigram_sentence in it.islice(trigram_sentences, 230, 240):
    print(u' '.join(trigram_sentence) + u'\n')

kind of like cheers but in a blue_collar neighborhood in the 1950_'s

fan_freakin_tastic

i could feel at home here

you definitely want to hit mapquest or plug in your gp though

i be not sure that i could find it again on my own it really be a hidden_gem

i will be make my friend take me back until i can memorize where the heck it be

addendum 2nd visit for the fish sandwich

excellent

truly

a pound of fish on a fish shape bun as oppose to da_burgh 's seemingly popular hamburger_bun



In [21]:
# Output file for the fully transformed reviews (one review per line).
trigram_reviews_filepath = os.path.join(
    result_directory, 'trigram_transformed_reviews_all.txt')

In [22]:
# Rewrite every review as a single line of lemmatized, phrase-merged,
# stopword-free terms.
with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as trigram_reviews_file:
    parsed_reviews = nlp.pipe(line_review(review_txt_filepath),
                              batch_size=10000, n_threads=4)

    for parsed_review in parsed_reviews:
        # lemmas of all tokens except punctuation and whitespace
        unigram_review = [token.lemma_ for token in parsed_review
                          if not punct_space(token)]

        # bigram model first, then the trigram model on its output
        phrased_review = trigram_model[bigram_model[unigram_review]]

        # drop the stopwords that are still present after phrase merging
        kept_terms = [term for term in phrased_review
                      if term not in spacy.en.STOPWORDS]

        # one review per line; a review with no remaining terms gives an empty line
        trigram_reviews_file.write(u' '.join(kept_terms) + '\n')

In [23]:
# Show one review (line index 3645) before and after the transformation.
print(u'Original:' + u'\n')

for review in it.islice(line_review(review_txt_filepath), 3645, 3646):
    print(review)

print(u'----' + u'\n')
print(u'Transformed:' + u'\n')

with codecs.open(trigram_reviews_filepath, encoding='utf_8') as transformed_reviews_file:
    for review in it.islice(transformed_reviews_file, 3645, 3646):
        print(review)

Original:

Lucky to have such a wonderful museum in the city I study now, I have spent so many hours wandering around the paintings and the amazing fossils!

----

Transformed:

lucky wonderful museum city study spend hour wander_around painting amazing fossil



In [24]:
# Use the trigram-transformed reviews as the feature text X, one review per row.
# The file is free text, so it is parsed literally:
#   * skip_blank_lines=False keeps reviews that became empty after stopword
#     removal, so rows stay aligned with the label file;
#   * na_filter=False stops pandas from turning empty lines and reviews made
#     only of an NA marker (e.g. "null", "nan") into NaN, which a later
#     .astype('U') would silently turn into the literal token "nan";
#   * quoting=3 (csv.QUOTE_NONE) stops a stray double quote from opening a
#     quoted field that can swallow the following lines.
X_list = pd.read_table(trigram_reviews_filepath, skip_blank_lines=False,
                       names=['text'], na_filter=False, quoting=3)
X = X_list.text

In [25]:
# Class labels, one per line of star_text.txt, read into a 'stars' column.
y_list = pd.read_table(star_txt_filepath, names=['stars'])
y = y_list.stars

In [26]:
# Features and labels should have the same number of rows.
print(X.shape)
print(y.shape)

(2225213,)
(2225213,)


In [27]:
# Split into train and test sets; random_state=1 makes the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1)

In [28]:
# Bag-of-words features on the trigram-transformed reviews: the vocabulary is
# learned from the training reviews only, then reused to encode the test reviews.
# .astype('U') converts the values to unicode strings before vectorizing.
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train.values.astype('U'))
X_test_dtm = vect.transform(X_test.values.astype('U'))

In [29]:
# Dimensions of the test document-term matrix: (test reviews, vocabulary terms).
X_test_dtm.shape

(556304, 373740)

In [30]:
# Multinomial naive Bayes on the CountVectorizer features; report test accuracy.
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_nb = nb.predict(X_test_dtm)

print(metrics.accuracy_score(y_test, y_pred_nb))

0.775462696655


In [31]:
# Per-class precision/recall/F1 and the confusion matrix for the naive Bayes predictions.
print(metrics.classification_report(y_test, y_pred_nb))
print(metrics.confusion_matrix(y_test, y_pred_nb))

             precision    recall  f1-score   support

          1       0.71      0.68      0.69    112540
          2       0.37      0.49      0.42     70773
          3       0.90      0.86      0.88    372991

avg / total       0.79      0.78      0.78    556304

[[ 76737  25453  10350]
 [ 11568  34332  24873]
 [ 20318  32349 320324]]


In [None]:
# Random forest baseline on the bag-of-words features.
# random_state pins the forest's randomness so the reported accuracy can be
# reproduced on a re-run; the value matches the seed of the train/test split.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_dtm, y_train)
y_pred_rf = rf.predict(X_test_dtm)

print(metrics.accuracy_score(y_test, y_pred_rf))

0.770729672985


In [None]:
from sklearn import svm

# Support vector classifier with scikit-learn's default settings.
# NOTE(review): this cell has no recorded output; kernel SVC training is
# documented to scale at least quadratically with the number of samples --
# confirm it is feasible on the full training matrix.
svc = svm.SVC()
svc.fit(X_train_dtm, y_train)
y_pred_svm = svc.predict(X_test_dtm)

print(metrics.accuracy_score(y_test, y_pred_svm))

In [None]:
# Grid search over the naive Bayes smoothing strength and class-prior setting
# with 5-fold cross-validation on the training matrix.
# NOTE(review): alpha=0 means no smoothing -- confirm it is meant to be searched.
from sklearn.grid_search import GridSearchCV

param_grid = {
    'alpha': [0, .5, 1],
    'fit_prior': [True, False],
}
nb_rf = GridSearchCV(estimator=nb, param_grid=param_grid, cv=5)
nb_rf.fit(X_train_dtm, y_train)
print(nb_rf.best_params_)

In [None]:
# Refit multinomial naive Bayes with its default parameters and report test accuracy.
nb = MultinomialNB().fit(X_train_dtm, y_train)
y_pred_nb = nb.predict(X_test_dtm)

print(metrics.accuracy_score(y_test, y_pred_nb))

In [None]:
# Grid search over random forest settings with 5-fold cross-validation on the
# training matrix; prints the best combination found.
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
}
gs_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
gs_rf.fit(X_train_dtm, y_train)
print(gs_rf.best_params_)

In [None]:
# Random forest with explicitly chosen settings; report test accuracy.
# random_state pins the forest's randomness so the accuracy can be reproduced
# on a re-run; the value matches the seed of the train/test split.
rf = RandomForestClassifier(min_samples_split=2, n_estimators=50,
                            bootstrap=False, criterion='entropy',
                            max_depth=None, random_state=1)
rf.fit(X_train_dtm, y_train)
y_pred_class = rf.predict(X_test_dtm)

print(metrics.accuracy_score(y_test, y_pred_class))

In [None]:
# Grid search over the linear SVM's regularization strength C and class
# weighting; prints the best combination found.
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'class_weight': [None, 'balanced'],
}
gs_svc = GridSearchCV(svm.SVC(kernel='linear'), param_grid)
gs_svc.fit(X_train_dtm, y_train)
print(gs_svc.best_params_)

In [None]:
# Linear SVM with explicitly chosen C and balanced class weights; report test accuracy.
svc = svm.SVC(kernel='linear', C=0.01, class_weight='balanced')
svc.fit(X_train_dtm, y_train)
y_pred_class = svc.predict(X_test_dtm)

print(metrics.accuracy_score(y_test, y_pred_class))