In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
import string
import json
import math

In [4]:
%matplotlib inline

In [5]:
products=pd.read_csv(r"D:\amazon_baby.csv")

In [6]:
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


# Text Cleaning

In [7]:
products = products.fillna({'review':''})  # fill in N/A's in the review column

In [8]:
trans_table=str.maketrans('','',string.punctuation)
products['review_clean']=products['review'].apply(lambda text: text.translate(trans_table))
#removing punctuation from the reviews

In [9]:
products.head()

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


# Extract Sentiment

### We will ignore all reviews with rating = 3, since they tend to have a neutral sentiment.

In [10]:
products=products[products['rating']!=3]

### we will assign reviews with a rating of 4 or higher to be positive reviews, while the ones with rating of 2 or lower are negative

In [11]:
products['sentiment']=products['rating'].apply(lambda rating: 1 if rating >3 else -1)

In [12]:
products.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


In [13]:
test_indices=json.load(open(r"D:\test-idx.json"))

In [14]:
train_indices=json.load(open(r"D:\train-idx.json"))

In [15]:
test_data=products.iloc[test_indices]

In [16]:
train_data=products.iloc[train_indices]

# Build the word count vector for each review

In [17]:
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# \b: word boundary (a zero-width match that matches a position character that has a word character on one side)
# \w: matches a word character
# \w matches a, b, c, d, e, and f in "abc def"
# \b matches the (zero-width) position before a, after c, before d, and after f in "abc def"
# '\b\w+\b': regex of selecting a word

In [18]:
train_matrix = vectorizer.fit_transform(train_data['review_clean'])

In [19]:
vocab=list(vectorizer.vocabulary_.keys())

In [20]:
test_matrix = vectorizer.transform(test_data['review_clean'])

# Train a sentiment classifier with logistic regression

In [21]:
sentiment_model=linear_model.LogisticRegression()

In [22]:
sentiment_model.fit(train_matrix, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
sentiment_model.coef_

array([[-1.23864786e+00,  1.63829757e-04,  2.63353308e-02, ...,
         1.17300933e-02,  3.11259447e-03, -6.45352102e-05]])

In [24]:
np.sum(sum(sentiment_model.coef_ >= 0))

85877

In [25]:
coeffs = pd.DataFrame(data=vocab,columns=['coef_name'])

In [26]:
coeffs['value']=sentiment_model.coef_[0]

In [27]:
coeffs.head(20)

Unnamed: 0,coef_name,value
0,it,-1.238648
1,came,0.0001638298
2,early,0.02633533
3,and,0.005493171
4,was,3.536742e-05
5,not,7.308269e-07
6,disappointed,0.002520997
7,i,0.2710052
8,love,0.2561037
9,planet,-0.001744698


# Making predictions with logistic regression

In [28]:
sample_test_data=test_data[10:13]

In [29]:
sample_test_data.iloc[0]['review']  #positive review

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [30]:
sample_test_data.iloc[1]['review'] #negative review

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

In [31]:
sample_test_matrix=vectorizer.transform(sample_test_data['review_clean'])

In [32]:
scores=sentiment_model.decision_function(sample_test_matrix)

In [33]:
print(scores)

[  5.60840687  -3.12665506 -10.42354879]


In [34]:
yhat=[1 if score>0 else -1 for score in scores]

In [35]:
yhat

[1, -1, -1]

In [36]:
sentiment_model.predict(sample_test_matrix)

array([ 1, -1, -1], dtype=int64)

In [37]:
sample_test_data['sentiment']

59    1
71   -1
91   -1
Name: sentiment, dtype: int64

# Probability Prediction
### $Pr(y_{i} = +1 |  x_{i}, w) = \frac{1}{1+exp(-w^{T}h(x_{i}))}$

In [38]:
pr=[1/(1+math.exp(-score)) for score in scores]

In [39]:
pr

[0.9963464905221259, 0.04202105249112379, 2.972332361958972e-05]

In [40]:
sentiment_model.predict_proba(sample_test_matrix)  #1st col is for y=0 and 2nd col is for y=1

array([[3.65350948e-03, 9.96346491e-01],
       [9.57978948e-01, 4.20210525e-02],
       [9.99970277e-01, 2.97233236e-05]])

# the most positive (and negative) review

In [41]:
test_data['probability']=sentiment_model.predict_proba(test_matrix)[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [42]:
test_data.head()

Unnamed: 0,name,review,rating,review_clean,sentiment,probability
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...,1,0.783055
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1,0.999999
16,Nature's Lullabies First Year Sticker Calendar,"I love this little calender, you can keep trac...",5,I love this little calender you can keep track...,1,0.934042
20,Nature's Lullabies Second Year Sticker Calendar,I had a hard time finding a second year calend...,5,I had a hard time finding a second year calend...,1,0.999979
28,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,One of babys first and favorite books and it i...,1,0.980318


In [43]:
test_data.sort_values('probability',ascending=False).head(20)

Unnamed: 0,name,review,rating,review_clean,sentiment,probability
114796,"Fisher-Price Cradle 'N Swing, My Little Snuga...",My husband and I cannot state enough how much ...,5,My husband and I cannot state enough how much ...,1,1.0
140816,"Diono RadianRXT Convertible Car Seat, Plum",I bought this seat for my tall (38in) and thin...,5,I bought this seat for my tall 38in and thin 2...,1,1.0
119182,Roan Rocco Classic Pram Stroller 2-in-1 with B...,Great Pram Rocco!!!!!!I bought this pram from ...,5,Great Pram RoccoI bought this pram from Europe...,1,1.0
100166,"Infantino Wrap and Tie Baby Carrier, Black Blu...",I bought this carrier when my daughter was abo...,5,I bought this carrier when my daughter was abo...,1,1.0
137034,Graco Pack 'n Play Element Playard - Flint,My husband and I assembled this Pack n' Play l...,4,My husband and I assembled this Pack n Play la...,1,1.0
80155,"Simple Wishes Hands-Free Breastpump Bra, Pink,...","I just tried this hands free breastpump bra, a...",5,I just tried this hands free breastpump bra an...,1,1.0
52631,Evenflo X Sport Plus Convenience Stroller - Ch...,After seeing this in Parent's Magazine and rea...,5,After seeing this in Parents Magazine and read...,1,1.0
168081,Buttons Cloth Diaper Cover - One Size - 8 Colo...,"We are big Best Bottoms fans here, but I wante...",4,We are big Best Bottoms fans here but I wanted...,1,1.0
147949,"Baby Jogger City Mini GT Single Stroller, Shad...","Amazing, Love, Love, Love it !!! All 5 STARS a...",5,Amazing Love Love Love it All 5 STARS all the...,1,1.0
87017,Baby Einstein Around The World Discovery Center,I am so HAPPY I brought this item for my 7 mon...,5,I am so HAPPY I brought this item for my 7 mon...,1,1.0


In [44]:
test_data.sort_values('probability').head(20)

Unnamed: 0,name,review,rating,review_clean,sentiment,probability
16042,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with FisherPrice prod...,-1,9.16432e-16
120209,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1,1.836912e-15
77072,Safety 1st Exchangeable Tip 3 in 1 Thermometer,I thought it sounded great to have different t...,1,I thought it sounded great to have different t...,-1,7.872963e-14
48694,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1,1.373138e-13
155287,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,This is my second video monitoring system the ...,-1,2.025236e-13
94560,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note we never installed batteries in these uni...,-1,4.385238e-13
53207,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1,3.503202e-11
81332,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,I bought this sprayer out of desperation durin...,-1,3.814091e-11
113995,Motorola Digital Video Baby Monitor with Room ...,DO NOT BUY THIS BABY MONITOR!I purchased this ...,1,DO NOT BUY THIS BABY MONITORI purchased this m...,-1,9.410738e-11
10677,Philips AVENT Newborn Starter Set,"It's 3am in the morning and needless to say, t...",1,Its 3am in the morning and needless to say thi...,-1,1.032308e-10


# accuracy of the classifier
$\Large accuracy = \frac{\#correctly\ classified\ examples(cle)}{\#total\ example(te)}$

In [45]:
def get_accuracy(matrix,df,model):
    predictions=model.predict(matrix)
    cle=sum(predictions==df['sentiment'])
    te=len(df)
    accuracy=cle/te
    return accuracy

In [46]:
accuracy=get_accuracy(test_matrix,test_data,sentiment_model)

In [47]:
print(accuracy)

0.9321154307655387


# Learn another classifier with fewer words

In [48]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [49]:
vectorizer_ws=CountVectorizer(vocabulary=significant_words)  # ws =word subset

In [50]:
train_matrix_ws=vectorizer_ws.fit_transform(train_data['review_clean'])

In [51]:
test_matrix_ws=vectorizer_ws.transform(test_data['review_clean'])

# Train a logistic regression model on a subset of data

In [52]:
simple_model=linear_model.LogisticRegression()

In [53]:
simple_model.fit(train_matrix_ws,train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [54]:
simple_model.coef_[0]

array([ 1.36368976,  0.94399959,  1.19253827,  0.08551278,  0.52018576,
        1.50981248,  1.67307389,  0.50376046,  0.19090857,  0.05885467,
       -1.65157634, -0.20956286, -0.51137963, -2.03369861, -2.34829822,
       -0.62116877, -0.32055624, -0.89803074, -0.36216674, -2.10933109])

In [55]:
simple_model_coef_table=pd.DataFrame(data=significant_words,columns=['coef_name'])

In [56]:
simple_model_coef_table['value']=simple_model.coef_[0]

In [57]:
simple_model_coef_table=simple_model_coef_table.sort_values('value',ascending=False)

In [58]:
print(simple_model_coef_table)

       coef_name     value
6          loves  1.673074
5        perfect  1.509812
0           love  1.363690
2           easy  1.192538
1          great  0.944000
4         little  0.520186
7           well  0.503760
8           able  0.190909
3            old  0.085513
9            car  0.058855
11          less -0.209563
16       product -0.320556
18         would -0.362167
12          even -0.511380
15          work -0.621169
17         money -0.898031
10         broke -1.651576
13         waste -2.033699
19        return -2.109331
14  disappointed -2.348298


In [59]:
print("# positive coef =",sum(simple_model_coef_table['value']>0))

# positive coef = 10


In [60]:
compare_coeff=pd.merge(simple_model_coef_table,coeffs,how='left',on='coef_name')

In [61]:
compare_coeff.sort_values('value_x',ascending=False)

Unnamed: 0,coef_name,value_x,value_y
0,loves,1.673074,0.009185
1,perfect,1.509812,-0.637099
2,love,1.36369,0.256104
3,easy,1.192538,-0.005086
4,great,0.944,0.068689
5,little,0.520186,-0.301196
6,well,0.50376,1.4e-05
7,able,0.190909,0.209084
8,old,0.085513,0.008192
9,car,0.058855,0.051406


### positive words in the simple_model does not mean positive words in the sentiment_model (similarly for the negative words)

# Comparing models

In [62]:
train_error_sentiment_model=get_accuracy(train_matrix,train_data,sentiment_model)

In [63]:
train_error_sentiment_model

0.967934880374168

In [64]:
train_error_simple_model=get_accuracy(train_matrix_ws,train_data,simple_model)

In [65]:
train_error_simple_model

0.8668225700065959

In [66]:
test_error_sentiment_model=get_accuracy(test_matrix,test_data,sentiment_model)

In [67]:
test_error_sentiment_model

0.9321154307655387

In [68]:
test_error_simple_model=get_accuracy(test_matrix_ws,test_data,simple_model)

In [69]:
test_error_simple_model

0.8693604511639069

### simple model is underfit

# Baseline: Majority class prediction

In [70]:
num_positive=sum(train_data['sentiment']==1)

In [71]:
num_negative=sum(train_data['sentiment']==-1)

In [72]:
num_positive,num_negative

(112164, 21252)

In [73]:
accuracy_baseline=sum(test_data['sentiment']==1)/len(test_data)

In [74]:
accuracy_baseline

0.8427825773938085

accuracy of simple model is close to majority class prediction model
accuracy of sentiment model is definitely better than the majority classifier