In [135]:
from __future__ import print_function, division, absolute_import

import string
import pdb

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [47]:
products = pd.read_csv('amazon_baby.csv', index_col=None)

In [48]:
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [45]:
## Average number of times a product has been rated
products.groupby('name')['name'].count().mean()

5.6439424415905926

Cleaning punctuations from the review, so that there is no effect of comma, full stop etc. on our model

In [34]:
def remove_punctuation(text):
    return text.translate(None, string.punctuation)

In [59]:
# Removing NA values in the review
products.fillna({'review':' '}, inplace=True)

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5
6,A Tale of Baby's Days with Peter Rabbit,"Lovely book, it's bound tightly so you may not...",4
7,"Baby Tracker&reg; - Daily Childcare Journal, S...",Perfect for new parents. We were able to keep ...,5
8,"Baby Tracker&reg; - Daily Childcare Journal, S...",A friend of mine pinned this product on Pinter...,5
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4


In [60]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [50]:
products.review.head(1).values

array([ 'These flannel wipes are OK, but in my opinion not worth keeping.  I also ordered someImse Vimse Cloth Wipes-Ocean Blue-12 countwhich are larger, had a nicer, softer texture and just seemed higher quality.  I use cloth wipes for hands and faces and have been usingThirsties 6 Pack Fab Wipes, Boyfor about 8 months now and need to replace them because they are starting to get rough and have had stink issues for a while that stripping no longer handles.'], dtype=object)

In [29]:
products.review_without_punctuation.head(1).values

array([ 'These flannel wipes are OK but in my opinion not worth keeping  I also ordered someImse Vimse Cloth WipesOcean Blue12 countwhich are larger had a nicer softer texture and just seemed higher quality  I use cloth wipes for hands and faces and have been usingThirsties 6 Pack Fab Wipes Boyfor about 8 months now and need to replace them because they are starting to get rough and have had stink issues for a while that stripping no longer handles'], dtype=object)

Review has been cleaned of the punctuations marks

Removing all 3 ratings as they are more neutral in nature

In [62]:
products = products[products.rating != 3]

In [65]:
products.ix[products.rating>=4,'sentiment'] = 1
products.ix[products.rating <= 2, 'sentiment'] = -1

In [70]:
train_index = pd.read_json('module-2-assignment-train-idx.json')
test_index = pd.read_json('module-2-assignment-test-idx.json')`

In [107]:
train_index.head().values

array([[0],
       [1],
       [2],
       [3],
       [4]])

In [125]:
train_data = products.iloc[train_index[0].values]
test_data  = products.iloc[test_index[0].values] 

In [200]:
train_data.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


Creating bag of words based feature

In [128]:
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
     # Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data.review_clean.tolist())
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'].values)

In [136]:
## Finding the column at which few of the features are
print(vectorizer.vocabulary_.get("documents"))
print(vectorizer.vocabulary_.get("awesome"))

35362
13287


In [137]:
## Checking the shape of the train matrix and test matrix, the number of column in the two should be same
print(train_matrix.shape)
print(test_matrix.shape)

(133416, 121712)
(33336, 121712)


In [138]:
lr = LogisticRegression()

In [139]:
lr.fit(train_matrix, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [236]:
## Number of co-efficients which are greater than 0
print("Number of co-efficients >= 0:: %i" %lr.coef_[lr.coef_ >= 0].shape)


Number of co-efficients >= 0:: 85752


85752 - features are in a way features which gives positive sentiment and reamining might be responsible for negative or neutral sentiments

In [147]:
test_data.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...,1
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1
16,Nature's Lullabies First Year Sticker Calendar,"I love this little calender, you can keep trac...",5,I love this little calender you can keep track...,1
20,Nature's Lullabies Second Year Sticker Calendar,I had a hard time finding a second year calend...,5,I had a hard time finding a second year calend...,1
28,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,One of babys first and favorite books and it i...,1


In [148]:
sample_test_data = test_data[10:13]

In [152]:
sample_test_data

Unnamed: 0,name,review,rating,review_clean,sentiment
59,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1
71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend The deca...,-1
91,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1


In [155]:
sample_test_data.iloc[0].review_clean

'Absolutely love it and all of the Scripture in it  I purchased the Baby Boy version for my grandson when he was born and my daughterinlaw was thrilled to receive the same book again'

In [156]:
sample_test_data.iloc[1].review_clean

'Would not purchase again or recommend The decals were thick almost plastic like and were coming off the wall as I was applying them The would NOT stick Literally stayed stuck for about 5 minutes then started peeling off'

In [157]:
sample_test_matrix = vectorizer.transform(sample_test_data.review_clean)

In [158]:
scores = lr.decision_function(sample_test_matrix)
print(scores)

[  5.59095054  -3.12647284 -10.42233483]


In [160]:
def sentiments(score):
    return 1 if score > 0 else -1

In [164]:
for s in scores:
    print("Sentiment is:: %i" %sentiments(s))

Sentiment is:: 1
Sentiment is:: -1
Sentiment is:: -1


In [165]:
lr.predict(sample_test_matrix)

array([ 1., -1., -1.])

In [172]:
def get_probability(score):
    return 1/(1 + np.exp(-score))

In [178]:
for s in scores:
    print("Probability of having positive sentiment is:: %f" % get_probability(s))

Probability of having positive sentiment is:: 0.996282
Probability of having positive sentiment is:: 0.042028
Probability of having positive sentiment is:: 0.000030


In [179]:
lr.predict_proba(sample_test_matrix)

array([[  3.71760709e-03,   9.96282393e-01],
       [  9.57971612e-01,   4.20283885e-02],
       [  9.99970241e-01,   2.97594272e-05]])

In [180]:
test_positive_prob = lr.predict_proba(test_matrix)[:,1]

In [189]:
test_positive_prob.argsort()[-20:][::-1]

array([18112, 30634, 17558, 24286, 32782, 15732, 24899,  9555, 11923,
        9125, 21531, 25554, 30535, 26830, 20743, 14482,  4140, 30076,
       31271, 26838])

In [237]:
test_data.iloc[test_positive_prob.argsort()[-20:][::-1]]

Unnamed: 0,name,review,rating,review_clean,sentiment,predicted_sentiment,l2_sentiment
100166,"Infantino Wrap and Tie Baby Carrier, Black Blu...",I bought this carrier when my daughter was abo...,5,I bought this carrier when my daughter was abo...,1,1,1
168697,Graco FastAction Fold Jogger Click Connect Str...,Graco's FastAction Jogging Stroller definitely...,5,Gracos FastAction Jogging Stroller definitely ...,1,1,1
97325,Freemie Hands-Free Concealable Breast Pump Col...,I absolutely love this product. I work as a C...,5,I absolutely love this product I work as a Cu...,1,1,1
133651,"Britax 2012 B-Agile Stroller, Red",[I got this stroller for my daughter prior to ...,4,I got this stroller for my daughter prior to t...,1,1,1
180646,Mamas &amp; Papas 2014 Urbo2 Stroller - Black,After much research I purchased an Urbo2. It's...,4,After much research I purchased an Urbo2 Its e...,1,1,1
87017,Baby Einstein Around The World Discovery Center,I am so HAPPY I brought this item for my 7 mon...,5,I am so HAPPY I brought this item for my 7 mon...,1,1,1
137034,Graco Pack 'n Play Element Playard - Flint,My husband and I assembled this Pack n' Play l...,4,My husband and I assembled this Pack n Play la...,1,1,1
52631,Evenflo X Sport Plus Convenience Stroller - Ch...,After seeing this in Parent's Magazine and rea...,5,After seeing this in Parents Magazine and read...,1,1,1
66059,"Evenflo 6 Pack Classic Glass Bottle, 4-Ounce",It's always fun to write a review on those pro...,5,Its always fun to write a review on those prod...,1,1,1
50315,"P'Kolino Silly Soft Seating in Tias, Green",I've purchased both the P'Kolino Little Reader...,4,Ive purchased both the PKolino Little Reader C...,1,1,1


In [191]:
arr = np.array([1, 3, 2, 4, 5])
arr.argsort()[:3]
# [::-1]

array([0, 2, 1])

In [238]:
test_data.iloc[test_positive_prob.argsort()[:20]]

Unnamed: 0,name,review,rating,review_clean,sentiment,predicted_sentiment,l2_sentiment
16042,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with FisherPrice prod...,-1,-1,-1
120209,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1,-1,-1
77072,Safety 1st Exchangeable Tip 3 in 1 Thermometer,I thought it sounded great to have different t...,1,I thought it sounded great to have different t...,-1,-1,1
48694,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1,-1,-1
155287,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,This is my second video monitoring system the ...,-1,-1,-1
94560,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note we never installed batteries in these uni...,-1,-1,-1
53207,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1,-1,-1
81332,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,I bought this sprayer out of desperation durin...,-1,-1,-1
113995,Motorola Digital Video Baby Monitor with Room ...,DO NOT BUY THIS BABY MONITOR!I purchased this ...,1,DO NOT BUY THIS BABY MONITORI purchased this m...,-1,-1,-1
10677,Philips AVENT Newborn Starter Set,"It's 3am in the morning and needless to say, t...",1,Its 3am in the morning and needless to say thi...,-1,-1,-1


In [193]:
test_data['predicted_sentiment'] = lr.predict(test_matrix)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [196]:
accuracy = (test_data[(test_data.sentiment == 1) & (test_data.predicted_sentiment == 1)].shape[0] +
            test_data[(test_data.sentiment == -1) & (test_data.predicted_sentiment == -1)].shape[0])/ test_data.shape[0]

In [197]:
print("Accuracy of the model is:: %f" %accuracy)

Accuracy of the model is:: 0.932445


Creating model with the smaller set of features

In [198]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [199]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

In [201]:
lr2 = LogisticRegression()
lr2.fit(train_matrix_word_subset, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [205]:
lr2_coeff = pd.DataFrame({'words': significant_words, 'coeff': lr2.coef_.flatten()})

In [207]:
lr2_coeff[['words', 'coeff']]

Unnamed: 0,words,coeff
0,love,1.36369
1,great,0.944
2,easy,1.192538
3,old,0.085513
4,little,0.520186
5,perfect,1.509812
6,loves,1.673074
7,well,0.50376
8,able,0.190909
9,car,0.058855


In [211]:
print("Positive coefficients in simple model:: %i" %len(lr2.coef_[lr2.coef_>0]))

Positive coefficients in simple model:: 10


In [218]:
train_data.loc[:,'lr_sentiment'] = lr.predict(train_matrix)

In [219]:
train_data.loc[:,'l2_sentiment'] = lr2.predict(train_matrix_word_subset)

In [223]:
print("Training accuracy for the first model %f", (train_data[(train_data.sentiment == 1) & (train_data.lr_sentiment ==1)].shape[0]+
                                                  train_data[(train_data.sentiment ==-1) & (train_data.lr_sentiment ==-1)].shape[0])/train_data.shape[0])

Training accuracy for the first model %f 0.967710019788


In [228]:
print("Training accuracy for the second model %f", (train_data[(train_data.sentiment == 1) & 
                                                               (train_data.l2_sentiment ==1)].shape[0]+
                                                  train_data[(train_data.sentiment ==-1) & 
                                                             (train_data.l2_sentiment ==-1)].shape[0])/
      train_data.shape[0])

Training accuracy for the second model %f 0.866822570007


In [230]:
test_data.loc[:,'l2_sentiment'] = lr2.predict(test_matrix_word_subset)

In [232]:
print((test_data[(test_data.sentiment == 1) & (test_data.l2_sentiment == 1)].shape[0] +
            test_data[(test_data.sentiment == -1) & (test_data.l2_sentiment == -1)].shape[0])/ test_data.shape[0])

0.869360451164


In [233]:
train_data[train_data.sentiment == 1].shape[0]/train_data.shape[0]

0.8407087605684476

In [234]:
test_data[test_data.sentiment == 1].shape[0]/test_data.shape[0]

0.8427825773938085