In [44]:
import pandas as pd, seaborn as sns, numpy as np, matplotlib.pyplot as plt
import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer 
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict, GridSearchCV, KFold, \
StratifiedKFold, learning_curve
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc, r2_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

from xgboost.sklearn import XGBClassifier

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

%matplotlib inline

sns.set_style("darkgrid")

In [None]:
# Compare various classifiers accuracy and classification report with Vader sentiment analyzer

In [45]:
df = pd.read_csv('amazon_food-tokenize.csv', index_col=0, encoding='utf-8')
df.head()

Unnamed: 0,reviewText,overall,reviewText_tokenize
0,just another flavor of kit kat but the taste i...,4.0,"[flavor, kit, kat, taste, unique, bit, differe..."
1,i bought this on impulse and it comes from jap...,3.0,"[bought, impulse, japan, amused, family, weird..."
2,really good great gift for any fan of green te...,4.0,"[good, great, gift, fan, green, tea, expensive..."
3,i had never had it before was curious to see w...,5.0,"[curious, like, smooth, great, subtle, good, f..."
4,ive been looking forward to trying these after...,4.0,"[ive, looking, forward, trying, hearing, popul..."


In [3]:
X = df.reviewText_tokenize
y = df.overall

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
y_train.value_counts()

In [28]:
# sm = SMOTEENN()

In [39]:
# Naive Bayes - MultinomialNB - GridSearchCV
start = time.time()

pipeline = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore', ngram_range=(1,2), strip_accents='unicode')),
    ('tfidf', TfidfTransformer()),
    ('cls', MultinomialNB())
]) 

params = dict(cls__alpha=np.linspace(0.1, 10, 10))
grid_search = GridSearchCV(pipeline, param_grid=params, cv=5)

grid_search.fit(X_train, y_train)
print 'best hyper-parameters =', grid_search.best_params_
print 'best score =', grid_search.best_score_
print ''
print 'best estimator =',grid_search.best_estimator_
print ''

end = time.time()
exe_time = end - start
print 'Time taken :',(exe_time),' seconds'


best hyper-parameters = {'cls__alpha': 0.10000000000000001}
best score = 0.592687741436

best estimator = Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error='ignore',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        str...use_idf=True)), ('cls', MultinomialNB(alpha=0.10000000000000001, class_prior=None, fit_prior=True))])

Time taken : 1013.68899703  seconds


In [16]:
# Naive Bayes - MultinomialNB
start = time.time()

pipeline = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore', ngram_range=(1,2), strip_accents='unicode')),
    ('tfidf', TfidfTransformer()),
    ('cls', MultinomialNB(alpha=0.1))
]) 
pipeline.fit(X_train, y_train)
nb_predicted = pipeline.predict(X_test)
print pipeline.score(X_test, y_test)

end = time.time()
exe_time = end - start
print 'Time taken :',(exe_time),' seconds'

0.594155629504
Time taken : 43.7946031094  seconds


In [17]:
print classification_report(y_test, nb_predicted)

             precision    recall  f1-score   support

        1.0       0.90      0.01      0.02      1743
        2.0       1.00      0.01      0.01      2341
        3.0       0.61      0.03      0.05      5287
        4.0       0.40      0.09      0.15      9825
        5.0       0.60      0.99      0.75     26181

avg / total       0.59      0.59      0.47     45377



In [9]:
# Logistic Regression - GridSearchCV
start = time.time()

pipeline = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore', ngram_range=(1,2), strip_accents='unicode')),
    ('tfidf', TfidfTransformer()),
    ('cls', LogisticRegression())
]) 

params = dict(cls__solver=['newton-cg', 'lbfgs', 'sag'])
grid_search = GridSearchCV(pipeline, param_grid=params)

grid_search.fit(X_train, y_train)
print 'best hyper-parameters =', grid_search.best_params_
print 'best score =', grid_search.best_score_
print ''
print 'best estimator =',grid_search.best_estimator_
print ''

end = time.time()
exe_time = (end - start)/60
print 'Time taken :',(exe_time),' minutes'

best hyper-parameters = {'cls__solver': 'newton-cg'}
best score = 0.652634660975

best estimator = Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error='ignore',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        str...ty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False))])

Time taken : 18.2112945318  minutes


In [24]:
# Logistic Regression
start = time.time()

pipeline = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore', ngram_range=(1,2), strip_accents='unicode')),
    ('tfidf', TfidfTransformer()),
    ('cls', LogisticRegression(solver='newton-cg'))
]) 
pipeline.fit(X_train, y_train)
lr_predicted = pipeline.predict(X_test)
print pipeline.score(X_test, y_test)

end = time.time()
exe_time = end - start
print 'Time taken :',(exe_time),' seconds'

0.662295876766
Time taken : 163.688863993  seconds


In [25]:
print classification_report(y_test, lr_predicted)

             precision    recall  f1-score   support

        1.0       0.69      0.22      0.33      1743
        2.0       0.42      0.08      0.14      2341
        3.0       0.47      0.30      0.37      5287
        4.0       0.49      0.36      0.41      9825
        5.0       0.72      0.93      0.81     26181

avg / total       0.62      0.66      0.62     45377



In [25]:
# Random Forest Classifier
start = time.time()

pipeline = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore', ngram_range=(1,2), strip_accents='unicode')),
    ('tfidf', TfidfTransformer()),
    ('cls', RandomForestClassifier())
]) 
pipeline.fit(X_train, y_train)
predicted = pipeline.predict(X_test)
print pipeline.score(X_test, y_test)

end = time.time()
exe_time = end - start
print 'Time taken :',(exe_time),' seconds'

0.596866253829
Time taken : 860.863065004  seconds


In [26]:
# SGDClassifier
start = time.time()

pipeline = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore', ngram_range=(1,2), strip_accents='unicode')),
    ('tfidf', TfidfTransformer()),
    ('cls', SGDClassifier())
]) 
pipeline.fit(X_train, y_train)
predicted = pipeline.predict(X_test)
print pipeline.score(X_test, y_test)

end = time.time()
exe_time = end - start
print 'Time taken :',(exe_time),' seconds'



0.625316790444
Time taken : 43.0113868713  seconds


In [27]:
# SVC
start = time.time()

pipeline = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore', ngram_range=(1,2), strip_accents='unicode')),
    ('tfidf', TfidfTransformer()),
    ('cls', SVC())
]) 
pipeline.fit(X_train, y_train)
predicted = pipeline.predict(X_test)
print pipeline.score(X_test, y_test)

end = time.time()
exe_time = end - start
print 'Time taken :',(exe_time),' seconds'

0.577583357207
Time taken : 7284.43519688  seconds


In [10]:
# XGBoost - GridSearch
start = time.time()

pipeline = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore', ngram_range=(1,2), strip_accents='unicode')),
    ('tfidf', TfidfTransformer()),
    ('cls', XGBClassifier(objective='multi:softmax'))
]) 
params = dict(cls__learning_rate=[0.2, 0.4, 0.6], cls__max_depth=[2,3,4])
grid_search = GridSearchCV(pipeline, param_grid=params)

grid_search.fit(X_train, y_train)
print 'best hyper-parameters =', grid_search.best_params_
print 'best score =', grid_search.best_score_
print ''
print 'best estimator =',grid_search.best_estimator_
print ''

end = time.time()
exe_time = (end - start)/60
print 'Time taken :',(exe_time),' minutes'

best hyper-parameters = {'cls__learning_rate': 0.6, 'cls__max_depth': 3}
best score = 0.64305751013

best estimator = Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error='ignore',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        str...n_estimators=100, nthread=-1, objective='multi:softprob', seed=0,
       silent=True, subsample=1))])

Time taken : 271.923445217  minutes


In [26]:
# XGBoost
start = time.time()

pipeline = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore', ngram_range=(1,2), strip_accents='unicode')),
    ('tfidf', TfidfTransformer()),
    ('cls', XGBClassifier(learning_rate=0.6, max_depth=3, objective='multi:softmax'))
]) 
pipeline.fit(X_train, y_train)
xgb_predicted = pipeline.predict(X_test)
print pipeline.score(X_test, y_test)

end = time.time()
exe_time = (end - start)/60
print 'Time taken :',(exe_time),' minutes'

0.648852943121
Time taken : 14.5472668171  minutes


In [27]:
print classification_report(y_test, xgb_predicted)

             precision    recall  f1-score   support

        1.0       0.52      0.26      0.35      1743
        2.0       0.37      0.13      0.19      2341
        3.0       0.47      0.27      0.34      5287
        4.0       0.49      0.29      0.36      9825
        5.0       0.70      0.93      0.80     26181

avg / total       0.60      0.65      0.60     45377



In [None]:
# Pipeline GridsearchCV 
start = time.time()

pipeline = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore', ngram_range=(1,2), strip_accents='unicode')),
    ('tfidf', TfidfTransformer()),
    ('cls', SGDClassifier())
]) 

params = dict(cls=[MultinomialNB(), SGDClassifier()],
              cls__alpha=[0.0001, 0.001, 0.01, 0.1, 0.5, 1.0])
grid_search = GridSearchCV(pipeline, param_grid=params)

grid_search.fit(X_train, y_train)
print ''
print 'best hyper-parameters =', grid_search.best_params_
print 'best score =', grid_search.best_score_
print ''
print 'best estimator =',grid_search.best_estimator_

end = time.time()
exe_time = (end - start)/60
print 'Time taken :',(exe_time),' minutes'

In [12]:
# Ensemble Voting Classifier
start = time.time()

clf1 = LogisticRegression(solver='newton-cg')
clf2 = RandomForestClassifier()
clf4 = MultinomialNB(alpha=0.1)
clf5 = SGDClassifier()
clf6 = XGBClassifier(learning_rate=0.6, max_depth=3, objective='multi:softmax')

pipeline = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore', ngram_range=(1,2), strip_accents='unicode')),
    ('tfidf', TfidfTransformer()),
    ('cls', VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('mnb', clf4), ('sgd', clf5),
                                        ('xgb', clf6)], voting='hard'))
]) 
pipeline.fit(X_train, y_train)
predicted = pipeline.predict(X_test)
print pipeline.score(X_test, y_test)

end = time.time()
exe_time = (end - start)/60
print 'Time taken :',(exe_time),' minutes'



0.635938911784
Time taken : 31.4764133811  minutes


In [46]:
# Using Vader sentiment analyzer with Logistic Regression

analyzer = SentimentIntensityAnalyzer()

In [47]:
df.head()

Unnamed: 0,reviewText,overall,reviewText_tokenize
0,just another flavor of kit kat but the taste i...,4.0,"[flavor, kit, kat, taste, unique, bit, differe..."
1,i bought this on impulse and it comes from jap...,3.0,"[bought, impulse, japan, amused, family, weird..."
2,really good great gift for any fan of green te...,4.0,"[good, great, gift, fan, green, tea, expensive..."
3,i had never had it before was curious to see w...,5.0,"[curious, like, smooth, great, subtle, good, f..."
4,ive been looking forward to trying these after...,4.0,"[ive, looking, forward, trying, hearing, popul..."


In [48]:
start = time.time()

df['vader_neg'] = 0
df['vader_pos'] = 0
df['vader_neu'] = 0
df['vader_compound'] = 0

for i, q in enumerate(df.reviewText_tokenize.values):
    vs = analyzer.polarity_scores(q)
    df.iloc[i, -4] = vs['neg']
    df.iloc[i, -3] = vs['pos']
    df.iloc[i, -2] = vs['neu']
    df.iloc[i, -1] = vs['compound']
    
end = time.time()
exe_time = (end - start)/60
print 'Time taken :',(exe_time),' minutes'

Time taken : 19.8124195496  minutes


In [49]:
df.tail()

Unnamed: 0,reviewText,overall,reviewText_tokenize,vader_neg,vader_pos,vader_neu,vader_compound
151249,delicious glutenfree oatmeal we tried both the...,4.0,"[delicious, glutenfree, oatmeal, tried, regula...",0.041,0.4,0.559,0.9713
151250,with the many selections of instant oatmeal ce...,4.0,"[selection, instant, oatmeal, cereal, produced...",0.0,0.111,0.889,0.8271
151251,while i usually review cds and dvds as well as...,5.0,"[usually, review, cd, dvd, entertainment, rela...",0.037,0.204,0.759,0.9485
151252,my son and i enjoyed these oatmeal packets he...,4.0,"[son, enjoyed, oatmeal, packet, fond, maple, b...",0.0,0.341,0.659,0.9595
151253,i like to eat oatmeal i the mornings i usually...,4.0,"[like, oatmeal, morning, usually, buy, quaker,...",0.078,0.183,0.739,0.7024


In [50]:
X = df[['vader_neg','vader_pos','vader_neu','vader_compound']]
y = df.overall.values

In [51]:
Xs = StandardScaler().fit_transform(X)

In [52]:
Xs_train, Xs_test, y_train, y_test = train_test_split(Xs, y, test_size=0.3)

In [53]:
lr = LogisticRegression(solver='newton-cg').fit(Xs_train, y_train)

In [54]:
lr_vader_predicted = lr.predict(Xs_test) 
print lr.score(Xs_test, y_test)

0.577429094034


In [55]:
print classification_report(y_test, lr_vader_predicted)

             precision    recall  f1-score   support

        1.0       0.23      0.06      0.10      1762
        2.0       0.00      0.00      0.00      2317
        3.0       0.21      0.00      0.00      5334
        4.0       1.00      0.00      0.00      9703
        5.0       0.58      0.99      0.73     26261

avg / total       0.58      0.58      0.43     45377



In [56]:
scores = cross_val_score(LogisticRegression(), Xs, y, cv=5)
print scores
print np.mean(scores)

[ 0.57819059  0.5773172   0.57803048  0.57666116  0.57739355]
0.577518595375
