In [2]:
import os
import codecs
import json
import spacy
import numpy as np
import pandas as pd
import itertools as it
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

No handlers could be found for logger "gensim.models.word2vec"


In [3]:
data_directory = os.path.join('dataset')
review_filepath = os.path.join(data_directory,'reviews2.json')
result_directory = os.path.join('result-3categories-minidata')
review_txt_filepath = os.path.join(result_directory,'review_text.txt')
star_txt_filepath = os.path.join(result_directory,'star_text.txt')
# review_count = 0
# with codecs.open(star_txt_filepath, 'w', encoding='utf_8') as star_txt_file:
#     with codecs.open(review_txt_filepath, 'w', encoding='utf_8') as review_txt_file:
#         with codecs.open(review_filepath, encoding='utf_8') as review_file:
#             for review_json in review_file:
#                 review = json.loads(review_json)
#                 review_txt_file.write(review[u'text'].replace('\n', ' ').replace('\r', ' ') + '\n')
#                 if (review[u'stars'] == 1) or (review[u'stars'] == 2):
#                     star_txt_file.write('1' + '\n')
#                 if (review[u'stars'] == 3):
#                     star_txt_file.write('2' + '\n')
#                 if  (review[u'stars'] == 4) or (review[u'stars'] == 5):
#                     star_txt_file.write('3' + '\n')
#                 review_count += 1

In [4]:
nlp = spacy.load('en')

In [5]:
def punct_space(token):
    return token.is_punct or token.is_space

def line_review(filename):
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    for parsed_review in nlp.pipe(line_review(filename), batch_size=10000, n_threads=4):        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [6]:
unigram_sentences_filepath = os.path.join(result_directory,'unigram_sentences_all.txt')

In [6]:
with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
    for sentence in lemmatized_sentence_corpus(review_txt_filepath):
        f.write(sentence + '\n')

In [7]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [8]:
for unigram_sentence in it.islice(unigram_sentences, 230, 240):
    print u' '.join(unigram_sentence)
    print u''

they have about 20+ thing on tap

and if you do not count thing that be not beer e.g. bud they still have about 12+ beer on tap

that make me really happy

the casual pizza/beer atmosphere be wonderful for a late dinner last night

the folk be really nice and the customer be not super crazy either like you would expect sometimes in a college town

also they have thing like pool table and arcade machine which make them pretty awesome in my book

the pizza be really good

we have a special pizza with feta tomato and basil and a basket of fried mushroos and zuccini

the wait be reasonable for a pizza make from scratch

the pizza be delicious but surprisingly not fill



In [9]:
bigram_model_filepath = os.path.join(result_directory, 'bigram_model_all')

In [10]:
%%time

bigram_model = Phrases(unigram_sentences)
bigram_model.save(bigram_model_filepath)

bigram_model = Phrases.load(bigram_model_filepath)

CPU times: user 4.43 s, sys: 60.1 ms, total: 4.49 s
Wall time: 4.48 s


In [11]:
bigram_sentences_filepath = os.path.join(result_directory,'bigram_sentences_all.txt')

In [12]:
with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:

    for unigram_sentence in unigram_sentences:

        bigram_sentence = u' '.join(bigram_model[unigram_sentence])

        f.write(bigram_sentence + '\n')

In [13]:
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [14]:
for bigram_sentence in it.islice(bigram_sentences, 230, 240):
    print u' '.join(bigram_sentence)
    print u''

they have about 20+ thing on_tap

and if_you do not count thing that be not beer e.g. bud they still have about 12+ beer on_tap

that make me really happy

the casual pizza/beer atmosphere be wonderful for a late dinner last_night

the folk be really nice and the customer be not super crazy either like you would expect sometimes in a college_town

also they have thing like pool_table and arcade machine which make them pretty awesome in my book

the pizza be really good

we have a special pizza with feta tomato and basil and a basket of fried mushroos and zuccini

the wait be reasonable for a pizza make from_scratch

the pizza be delicious but surprisingly not fill



In [7]:
trigram_model_filepath = os.path.join(result_directory,'trigram_model_all')

In [16]:
trigram_model = Phrases(bigram_sentences)

trigram_model.save(trigram_model_filepath)
    
# load the finished model from disk
trigram_model = Phrases.load(trigram_model_filepath)

In [3]:
trigram_sentences_filepath = os.path.join(result_directory,'trigram_sentences_all.txt')

In [18]:
with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:

    for bigram_sentence in bigram_sentences:

        trigram_sentence = u' '.join(trigram_model[bigram_sentence])

        f.write(trigram_sentence + '\n')

In [19]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [20]:
for trigram_sentence in it.islice(trigram_sentences, 230, 240):
    print u' '.join(trigram_sentence)
    print u''

they have about 20+ thing on_tap

and if_you do_not count thing that be not beer e.g. bud they still have about 12+ beer_on_tap

that make me really happy

the casual pizza/beer atmosphere be wonderful for a late dinner last_night

the folk be really nice and the customer be not super crazy either like you would expect sometimes in a college_town

also they have thing like pool_table and arcade machine which make them pretty awesome in my_book

the pizza be really good

we have a special pizza with feta tomato and basil and a basket of fried mushroos and zuccini

the wait be reasonable for a pizza make from_scratch

the pizza be delicious but surprisingly not fill



In [4]:
trigram_reviews_filepath = os.path.join(result_directory,'trigram_transformed_reviews_all.txt')

In [22]:
with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:

    for parsed_review in nlp.pipe(line_review(review_txt_filepath),
                                  batch_size=10000, n_threads=4):

        # lemmatize the text, removing punctuation and whitespace
        unigram_review = [token.lemma_ for token in parsed_review
                          if not punct_space(token)]

        # apply the first-order and second-order phrase models
        bigram_review = bigram_model[unigram_review]
        trigram_review = trigram_model[bigram_review]

        # remove any remaining stopwords
        trigram_review = [term for term in trigram_review
                          if term not in spacy.en.STOPWORDS]

        # write the transformed review as a line in the new file
        trigram_review = u' '.join(trigram_review)
        f.write(trigram_review + '\n')

In [23]:
print u'Original:' + u'\n'

for review in it.islice(line_review(review_txt_filepath), 3645, 3646):
    print review

print u'----' + u'\n'
print u'Transformed:' + u'\n'

with codecs.open(trigram_reviews_filepath, encoding='utf_8') as f:
    for review in it.islice(f, 3645, 3646):
        print review

Original:

The Free Speech Movement Cafe has Mario Savio's immortal words etched in metal that you can read while waiting in line:  "To me, freedom of speech is something that represents the very dignity of what a human being is. ... It is the thing that marks us as just below the angels."  That being said, FSM is the second-best cafe in Berkeley to engage in that practice known as "eye-f*cking."  Any student who has ever sat in a cafe to study during midterms has done it.  You're sitting at a table, enjoying your hot grilled chicken panini and fresh green salad, allowing your Americano to cool, when you see an attractive person sitting across the way from you.  S/he is typing away at a laptop, but every so often--so you think--s/he looks up and gazes intently into your eyes.  Maybe s/he is looking THROUGH you to the grayscale images of the Free Speech Movement behind you; maybe the person is simply seeking the camaraderie of another weary, exam-riddled soul.  But you know better.  S/h

In [5]:
X_list=pd.read_table(trigram_reviews_filepath, skip_blank_lines=False ,names=['text'])
X=X_list.text

In [6]:
y_list=pd.read_table(star_txt_filepath,names=['stars'])
y=y_list.stars

In [10]:
print X.shape
print y.shape

(17396,)
(17396,)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train.values.astype('U'))
X_test_dtm = vect.transform(X_test.values.astype('U'))
# X_test_dtm2 = vect.transform(["ddgdhrrrgh00"])

In [9]:
X_test_dtm.shape

(4349, 25061)

In [14]:
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_nb = nb.predict(X_test_dtm)

print metrics.accuracy_score(y_test, y_pred_nb)
#print y_pred_class

0.711887790297


In [15]:
print metrics.classification_report(y_test, y_pred_nb)
print metrics.confusion_matrix(y_test, y_pred_nb)

             precision    recall  f1-score   support

          1       0.70      0.57      0.63       735
          2       0.49      0.37      0.42      1031
          3       0.77      0.89      0.83      2583

avg / total       0.69      0.71      0.70      4349

[[ 422  169  144]
 [ 118  377  536]
 [  66  220 2297]]


In [16]:
y_test_binary = np.where(y_test==3, 2, 1)
max(y_test_binary.mean(), 1 - y_test_binary.mean())

1.5939296389974706

In [17]:
rf=RandomForestClassifier()
rf.fit(X_train_dtm, y_train)
y_pred_rf = rf.predict(X_test_dtm)

print metrics.accuracy_score(y_test, y_pred_rf)

0.638537594849


In [23]:
from sklearn import svm
svc=svm.SVC(kernel='linear')
svc.fit(X_train_dtm, y_train)
y_pred_svm = svc.predict(X_test_dtm)

print metrics.accuracy_score(y_test, y_pred_svm)

0.657162566107


In [35]:
from sklearn.grid_search import GridSearchCV
param_grid = {'alpha': [0,.5,1],
              'fit_prior': [True,False]}
nb_rf = GridSearchCV(estimator=nb, param_grid=param_grid, cv=5)
nb_rf.fit(X_train_dtm, y_train)
nb_rf.best_params_
print nb_rf.best_params_

{'alpha': 1, 'fit_prior': True}


In [None]:
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_nb = nb.predict(X_test_dtm)

print metrics.accuracy_score(y_test, y_pred_nb)

0.711887790297


In [None]:
param_grid = {'n_estimators': [10, 50, 100],
              'max_depth': [3, 5, 10, None],
              'min_samples_split': [2, 5],
              'criterion': ['gini', 'entropy'],
              'bootstrap': [True, False]}
gs_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
gs_rf.fit(X_train_dtm, y_train)
gs_rf.best_params_
print gs_rf.best_params_

In [None]:
rf=RandomForestClassifier(min_samples_split=2, n_estimators=50, bootstrap=False, criterion='entropy', max_depth=None)
rf.fit(X_train_dtm, y_train)
y_pred_class = rf.predict(X_test_dtm)

print metrics.accuracy_score(y_test, y_pred_class)

In [None]:
param_grid = {'C': [0.01, 0.1, 1, 10],
              'class_weight': [None,'balanced']}
gs_svc = GridSearchCV(svm.SVC(kernel='linear'), param_grid)
gs_svc.fit(X_train_dtm, y_train)
gs_svc.best_params_
print gs_svc.best_params_

In [None]:
svc=svm.SVC(kernel='linear',C= 0.01, class_weight= 'balanced')
svc.fit(X_train_dtm, y_train)
y_pred_class = svc.predict(X_test_dtm)

print metrics.accuracy_score(y_test, y_pred_class)

In [None]:
from textblob import TextBlob


In [21]:
tfi = TfidfVectorizer()
X_train_tfi = tfi.fit_transform(X_train.values.astype('U'))
X_test_tfi = tfi.transform(X_test.values.astype('U'))

In [22]:
nb = MultinomialNB()
nb.fit(X_train_tfi, y_train)
y_pred_nb_tfi = nb.predict(X_test_tfi)

print metrics.accuracy_score(y_test, y_pred_nb_tfi)

0.596229018165


In [24]:
svc=svm.SVC(kernel='linear')
svc.fit(X_train_tfi, y_train)
y_pred_svm_tfi = svc.predict(X_test_tfi)

print metrics.accuracy_score(y_test, y_pred_svm_tfi)

0.728213382387


In [None]:
lr=LogisticRegression()
lr.fit(X_train_dtm, y_train)
y_pred_lr = lr.predict(X_test_dtm)

print metrics.accuracy_score(y_test, y_pred_lr)

In [8]:
from sklearn.feature_extraction.text import HashingVectorizer
hv=HashingVectorizer()
X_train_hv = hv.fit_transform(X_train.values.astype('U'))
X_test_hv = hv.transform(X_test.values.astype('U'))

nb = MultinomialNB()
nb.fit(X_train_hv, y_train)
y_pred_nb_hv = nb.predict(X_test_hv)

print metrics.accuracy_score(y_test, y_pred_nb_hv)

ValueError: Input X must be non-negative