# Predicting sentiment from product reviews - Gaio Scikit-learn version

In [23]:
#Import libraries
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sframe
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [30]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Parameters
    ----------
    text : byte string or text string
        The string to clean. Both byte strings (Python 2 ``str``, Python 3
        ``bytes``) and text strings (Python 2 ``unicode``, Python 3 ``str``)
        are accepted.

    Returns
    -------
    Same type as ``text``
        A copy of ``text`` without any character of ``string.punctuation``.
    """
    if isinstance(text, bytes):
        # Byte strings support the two-argument form: no translation table,
        # just the set of bytes to delete.
        return text.translate(None, string.punctuation.encode('ascii'))
    # Text strings only accept a mapping; a code point mapped to None is
    # deleted from the result.
    return text.translate(dict.fromkeys(map(ord, string.punctuation)))

In [31]:
# Per-column converter functions, keyed by the CSV column they apply to.
dtype_dict = dict(name=str, review=str, rating=int)

In [43]:
# Load the reviews, running each column through its converter from dtype_dict.
products = pd.read_csv(filepath_or_buffer='amazon_baby.csv', converters=dtype_dict)

In [44]:
# Preview the first three rows of the loaded data.
products.head(3)

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5


In [45]:
# Inspect the column types that resulted from the converters given to read_csv.
products.dtypes

name      object
review    object
rating     int64
dtype: object

In [46]:
# Check the Python type of the review stored in the row labelled 0.
type(products.loc[0, 'review'])

str

In [47]:
# Remove the punctuation from every review using the remove_punctuation helper.
# Missing reviews are replaced by '' first, so that no non-string value
# (e.g. NaN) ever reaches the string-only helper.
products['review_clean'] = products['review'].fillna('').apply(remove_punctuation)

In [48]:
# Replace missing values in the review column with empty strings.
products = products.fillna(value={'review': ''})

In [50]:
# Reviews rated exactly 3 tend to carry a neutral sentiment, so leave them out.
is_neutral = products['rating'] == 3
products = products[~is_neutral]

In [51]:
# Label the reviews: a rating above 3 (i.e. >= 4) is positive (+1), anything
# else (<= 2 once the neutral reviews are gone) is negative (-1).
# assign() returns a new frame rather than writing a column into the filtered
# frame, which can trigger pandas' SettingWithCopyWarning; the comparison is
# vectorised instead of calling a Python lambda once per row.
products = products.assign(
    sentiment=products['rating'].gt(3).map({True: 1, False: -1})
)

In [53]:
# Split the data: 80% for training, 20% held out for testing. The random seed
# is fixed so the split is repeatable.
# NB: the resulting split (and therefore the results) will not match the
# graphlab version of this notebook.
train_data, test_data = train_test_split(products, test_size=0.2, random_state=1)

In [54]:
# Build the word-count vector for each review.
# This token pattern keeps single-letter words as tokens.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# First, learn the vocabulary from the training data (assigning a column to
# each word) and convert the training reviews into a sparse matrix.
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test reviews into a sparse matrix using the same
# word-to-column mapping learned above.
test_matrix = vectorizer.transform(test_data['review_clean'])

In [60]:
# Create and train the logistic regression sentiment classifier.
# Note: fitting is noticeably slower here than in the graphlab version.
train_labels = train_data['sentiment']
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [72]:
# Learned coefficients of the fitted model; show how many there are.
weights = sentiment_model.coef_
print(np.size(weights))

121505


In [69]:
# Count the coefficients that are strictly greater than zero.
positive_mask = weights > 0
num_positive_weights = np.sum(positive_mask)
print(num_positive_weights)

85751


In [73]:
# Make predictions on the test data: take three test rows (positions 10, 11
# and 12) as a small sample to walk through.
sample_test_data = test_data.iloc[10:13]
print(sample_test_data)

                                                     name  \
117165       Lassig Glam Small Messenger Diaper Bag ,navy   
30667   BOB Weather Shield for Single Revolution/Strol...   
60268                Tiny Love Sweet Island Dreams Mobile   

                                                   review  rating  \
117165  While I'm sure this bag is a wonderful diaper ...       5   
30667   This weather shield has been a great accessory...       5   
60268   And we managed to get it to attach to the Grac...       5   

                                             review_clean  sentiment  
117165  While Im sure this bag is a wonderful diaper b...          1  
30667   This weather shield has been a great accessory...          1  
60268   And we managed to get it to attach to the Grac...          1  


In [74]:
# Convert the sample reviews into the sparse word-count format first ...
sample_reviews = sample_test_data['review_clean']
sample_test_matrix = vectorizer.transform(sample_reviews)
# ... then compute the scores with scikit-learn's built-in decision function.
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)

[  7.95759432  11.18179808   1.16569787]


In [77]:
# Predicting sentiment: a positive score means class +1, anything else -1.
has_positive_score = scores > 0
predictions = np.where(has_positive_score, 1, -1)
predictions

array([1, 1, 1])

In [79]:
# Cross-check the hand-made predictions against scikit-learn's own predict().
print("Class predictions according to Scikit:")
sklearn_predictions = sentiment_model.predict(sample_test_matrix)
print(sklearn_predictions)

Class predictions according to Scikit:
[1 1 1]


In [81]:
#define link function
def sigmoid(x):
    """Logistic link function: map a score to 1 / (1 + exp(-x)).

    The value is computed without ever exponentiating a positive number, so
    very negative scores do not overflow ``np.exp``.

    Parameters
    ----------
    x : number or array-like of numbers
        Score(s) to convert.

    Returns
    -------
    numpy scalar or numpy.ndarray
        Element-wise result between 0 and 1: a scalar for scalar input,
        otherwise an array with the same shape as ``x``.
    """
    x = np.asarray(x)
    # exp() is only taken of a non-positive number, so it cannot overflow.
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x))        (the textbook form)
    # x <  0: exp(x) / (1 + exp(x))    (same quantity, no large exponential)
    result = np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
    # Indexing with () turns a 0-d result back into a scalar and returns
    # higher-dimensional arrays unchanged.
    return result[()]

In [82]:
# Turn the sample scores into probabilities with the sigmoid link function.
probabilities = sigmoid(scores)
print(probabilities)

[ 0.99965013  0.99998607  0.76236651]


In [84]:
# Compare with scikit-learn: take column 1 of its predicted class
# probabilities for the same sample.
class_probabilities = sentiment_model.predict_proba(sample_test_matrix)
print(class_probabilities[:, 1])

[ 0.99965013  0.99998607  0.76236651]


In [95]:
# Compute the accuracy of the classifier on the held-out test data.
# Predicted and true labels for every test review.
pred_class = sentiment_model.predict(test_matrix)
true_labels = test_data['sentiment']

# Keep the rows of the test set whose prediction matches the true label.
is_correct = (pred_class == true_labels)
correct_predictions = test_data[is_correct]

# Accuracy by hand: number of correct predictions over number of examples.
num_correct = len(correct_predictions)
num_examples = len(test_data)
print(num_correct)
print(num_examples)
accuracy = num_correct / float(num_examples)
print("My accuracy %.6f" % accuracy)

# Accuracy of the sentiment model as reported by scikit-learn, for comparison.
sentiment_model_accuracy = accuracy_score(y_true=true_labels, y_pred=pred_class)
print("Scikit accuracy %.6f" % sentiment_model_accuracy)

31071
33351
My accuracy 0.931636
Scikit accuracy 0.931636
