# Product sentiment analysis

Kaggle competition: https://www.kaggle.com/c/product-reviews-sentiment-analysis-light

The goal is to predict whether a product review is positive or negative based on its text.

In [99]:
# import necessary libraries

import pandas as pd
import numpy as np
import nltk

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB



import warnings
# NOTE(review): with no category/module arguments this installs a blanket
# "ignore" filter, so every warning raised after this cell is hidden --
# presumably including any convergence or deprecation warnings from the
# libraries used below. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

Import train and test data:

In [10]:
# Both inputs are tab-separated. For the training file the column labels are
# supplied explicitly via `names`, so its first line is read as data.
train_data = pd.read_csv(
    'products_sentiment_train.tsv',
    sep='\t',
    names=['text', 'label'],
)

# The test file is read with pandas' default header handling.
test_data = pd.read_csv(
    'products_sentiment_test.tsv',
    sep='\t',
)

In [11]:
# Preview the first five training rows.
train_data.head(n=5)

Unnamed: 0,text,label
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail ... and it worked...,1


In [14]:
# (rows, columns) of the training frame.
train_shape = train_data.shape
print("Train data size: {}".format(train_shape))

Train data size: (2000, 2)


In [15]:
# Preview the first five test rows.
test_data.head(n=5)

Unnamed: 0,Id,text
0,0,"so , why the small digital elph , rather than ..."
1,1,3/4 way through the first disk we played on it...
2,2,better for the zen micro is outlook compatibil...
3,3,6 . play gameboy color games on it with goboy .
4,4,"likewise , i 've heard norton 2004 professiona..."


In [16]:
# Per-column count of non-null values among the rows whose label equals 1.
label_is_one = train_data['label'] == 1
rows_with_label_one = train_data[label_is_one]
print(rows_with_label_one.count())

text     1274
label    1274
dtype: int64


In [33]:
# Target: the 'label' column of the training frame.
y = train_data['label']

# Model inputs: the raw 'text' column of each frame.
X_train = train_data['text']
X_test = test_data['text']

## Building model 

Before building a model, we need some baseline predictors (all 0s, all 1s, and random guessing) to compare our accuracy score against.

In [32]:
# Trivial reference predictors: a useful model has to beat all of them.
size = len(y)

baseline_all_ones = [1] * size
baseline_all_zeroes = [0] * size

# Draw the random baseline from a seeded generator so that its predictions,
# and therefore the accuracy printed below, are identical on every run.
baseline_rng = np.random.RandomState(42)
baseline_random = baseline_rng.choice([0, 1], size=size)

# accuracy_score is documented as accuracy_score(y_true, y_pred): the ground
# truth goes first. Accuracy itself is symmetric, but keeping the documented
# order avoids a silent bug if the metric is later swapped for an asymmetric one.
print("Accuracy for all-1 baseline predictor: {}".format(accuracy_score(y, baseline_all_ones)))
print("Accuracy for all-0 baseline predictor: {}".format(accuracy_score(y, baseline_all_zeroes)))
print("Accuracy for random baseline predictor: {}".format(accuracy_score(y, baseline_random)))

Accuracy for all-1 baseline predictor: 0.637
Accuracy for all-0 baseline predictor: 0.363
Accuracy for random baseline predictor: 0.481


Let's try to build and estimate a simple linear model:

In [100]:
# Two-stage text classifier: TF-IDF features followed by a linear SVM.
# The step names ('vectorizer', 'estimator') are the prefixes used when
# addressing nested parameters, e.g. 'vectorizer__min_df'.
pipeline_steps = [
    ('vectorizer', TfidfVectorizer()),
    ('estimator', LinearSVC()),
]
clf_pipeline = Pipeline(steps=pipeline_steps)

In [71]:
# Mean of the 5-fold cross-validation scores of the pipeline on the training data.
fold_scores = cross_val_score(clf_pipeline, X_train, y, cv=5)
print("Score on training set: {}".format(fold_scores.mean()))

Score on training set: 0.7665031843949025


So far, our classifier is better than all of the baselines.

## Parameter tuning

First let's try to change different elements of our pipeline to see if we can improve accuracy.

In [79]:
# English stop-word list from NLTK, used as one of the stop_words candidates
# in the grid search. The corpus is not bundled with the nltk package: on a
# fresh environment the lookup raises LookupError, so fetch it once on demand
# and retry instead of failing the whole notebook run.
try:
    stop_words_nltk = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words_nltk = nltk.corpus.stopwords.words('english')

Let's search for the best model parameters using grid search with cross-validation.

In [77]:
# List every tunable parameter of the pipeline, including the nested
# '<step>__<param>' names that can be used as grid-search keys.
clf_pipeline.get_params(deep=True)

{'memory': None,
 'steps': [('vectorizer',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('estimator',
   LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
             intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
             penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
             verbose=0, warm_start=False))],
 'vectorizer': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0

In [116]:
# Candidate components for the two pipeline steps.
vectorizer_candidates = [CountVectorizer(), TfidfVectorizer()]
estimator_candidates = [LinearSVC(), LogisticRegression()]

# Search space: whole steps can be swapped ('vectorizer', 'estimator') and
# their nested parameters tuned via the '<step>__<param>' naming convention.
param_grid = dict(
    vectorizer=vectorizer_candidates,
    estimator=estimator_candidates,
    vectorizer__stop_words=[None, stop_words_nltk, 'english'],
    vectorizer__min_df=[1, 10, 50],
    vectorizer__ngram_range=[(1, 3), (1, 4)],
    vectorizer__analyzer=['word'],
    estimator__C=[0.01, 0.1, 1, 3, 20],
)

# Exhaustive search over the grid, scored by accuracy with 5-fold CV.
grid = GridSearchCV(
    estimator=clf_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

In [117]:
%%time
# Run the grid search on the training data; %%time reports how long the
# whole cell took so readers know the cost of re-running it.
grid.fit(X_train, y)
print("Done!")

Done!
CPU times: user 4min 6s, sys: 440 ms, total: 4min 6s
Wall time: 3min 54s


In [118]:
# Mean cross-validated accuracy of the best parameter combination found.
grid.best_score_

0.7935

## Results submission

In [119]:
# Pipeline configured with the best parameters found by the grid search.
# The search was created without a `refit` argument, so (per GridSearchCV's
# default refit=True) this estimator has been refit on the full training data.
final_pipeline = grid.best_estimator_

Making predictions for our test data:

In [120]:
# `final_pipeline` is GridSearchCV's best_estimator_. The search was built
# with the default refit=True, so it was already refit on the full training
# data with the winning parameters; fitting it again here would only repeat
# that work. Predict directly.
predictions = final_pipeline.predict(X_test)

In [121]:
# Attach the predicted labels to the test frame as column 'y'.
# NOTE(review): this mutates test_data in place, and the following cells
# read the 'y' column from it.
test_data['y'] = predictions

In [122]:
# Sanity-check the first five rows now that the 'y' column has been added.
test_data.head(n=5)

Unnamed: 0,Id,text,y
0,0,"so , why the small digital elph , rather than ...",1
1,1,3/4 way through the first disk we played on it...,0
2,2,better for the zen micro is outlook compatibil...,1
3,3,6 . play gameboy color games on it with goboy .,1
4,4,"likewise , i 've heard norton 2004 professiona...",0


In [123]:
# Write only the 'Id' and 'y' columns, in that order, without the index.
submission = test_data[['Id', 'y']]
submission.to_csv("submission.csv", index=False)