In [81]:
import pandas as pd
import numpy as np
import json
import string

In [31]:
# Load the review data set from the CSV file in the working directory.
products = pd.read_csv('amazon_baby.csv')
# Replace missing review text with the empty string. The filled column is
# assigned back explicitly: calling fillna(inplace=True) on the selected
# column is a chained in-place operation that pandas warns about and that
# no longer updates `products` when Copy-on-Write is enabled.
products['review'] = products['review'].fillna('')

In [32]:
# Built once rather than on every call: a translation table that deletes
# every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``str(text)`` with all ``string.punctuation`` characters removed.

    Parameters
    ----------
    text : object
        Value to clean. It is converted with ``str()`` first, so non-string
        inputs (e.g. numbers) are accepted.

    Returns
    -------
    str
        The string form of ``text`` without punctuation characters.
    """
    return str(text).translate(_PUNCTUATION_TABLE)

# Punctuation-free copy of every review, stored next to the original text.
products['review_clean'] = products['review'].map(remove_punctuation)
# Drop the reviews rated exactly 3 and renumber the remaining rows from 0.
is_not_three_star = products['rating'] != 3
products = products[is_not_three_star].reset_index(drop=True)
# Label each remaining review: +1 when the rating is above 3, otherwise -1.
products['sentiment'] = products['rating'].apply(lambda rating: 1 if rating > 3 else -1)

In [33]:
# Preview the first five rows, including the new review_clean and sentiment columns.
products.head(n=5)

Unnamed: 0,name,review,rating,review_clean,sentiment
0,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
1,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
2,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
3,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
4,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


In [34]:
def _read_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Row positions for the train and test splits, stored as JSON files.
train_idx = _read_json('week1_train_idx.json')
test_idx = _read_json('week1_test_idx.json')

In [42]:
# Select rows by position using the index lists loaded from the JSON files.
train_data = products.iloc[train_idx]
test_data = products.iloc[test_idx]

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
# Word-count vectorizer over the cleaned review text.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b') # Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
# (transform only: the vocabulary is not re-learned from the test reviews)
test_matrix = vectorizer.transform(test_data['review_clean'])

In [48]:
# train a logistic regression model
from sklearn.linear_model import LogisticRegression
# Constructed with default hyper-parameters.
sentiment_model = LogisticRegression()
# Features: the word counts in train_matrix; targets: the +1/-1 sentiment labels.
# fit() is the last expression, so the fitted estimator's repr is the cell output.
sentiment_model.fit(train_matrix, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Quiz 1: number of non-negative coefficients in the full model

In [59]:
# Count the learned coefficients that are >= 0. The array's own vectorized
# .sum() is used instead of the builtin sum(), which would iterate over the
# coefficient array one element at a time in Python.
(sentiment_model.coef_[0] >= 0).sum()

87194

In [65]:
# Renumber the test rows from 0. The result is rebound rather than mutating
# the frame in place, so re-running the cell is harmless and the lineage of
# test_data stays explicit.
test_data = test_data.reset_index(drop=True)
# Three sample reviews: the rows at positions 10, 11 and 12. .iloc makes the
# positional intent explicit instead of relying on [] slice semantics.
sample_test_data = test_data.iloc[10:13]

In [77]:
# Vectorize the sample reviews with the already-fitted vectorizer, then ask
# the model for the raw decision-function value of each one.
sample_reviews = sample_test_data['review_clean']
sample_test_matrix = vectorizer.transform(sample_reviews)
scores = sentiment_model.decision_function(sample_test_matrix)

### Quiz 2: decision-function scores for the three sample test reviews

In [78]:
# One decision-function value per sample review (three in total).
print(scores)

[  5.60129879  -3.17005306 -10.42408041]


### Quiz 3: training reviews ranked by predicted probability of positive sentiment

In [93]:
# Renumber the training rows from 0; rebound instead of mutated in place so
# the cell is idempotent and train_data's lineage stays explicit.
train_data = train_data.reset_index(drop=True)
# Second column of predict_proba for every training review.
# NOTE(review): column order follows sentiment_model.classes_; with -1/+1
# labels column 1 should be the +1 class — confirm.
train_proba = sentiment_model.predict_proba(train_matrix)[:, 1]
# Row positions ordered from lowest to highest probability.
top_idx = np.argsort(train_proba)

In [100]:
# Rows at the last 20 positions of top_idx (all columns).
train_data.iloc[top_idx[-20:]]

Unnamed: 0,name,review,rating,review_clean,sentiment
64509,"ERGObaby Original Baby Carrier, Galaxy Grey",We purchased this carrier after a recommendati...,5,We purchased this carrier after a recommendati...,1
79239,Chicco Keyfit 22 Pound Infant Car Seat And Bas...,I bought this right before the KeyFit 30 came ...,5,I bought this right before the KeyFit 30 came ...,1
79338,"Britax B-Ready Stroller, Black",Some differences with Uppababy Vs. Britax B-Re...,4,Some differences with Uppababy Vs Britax BRead...,1
73473,ERGO Sport Carrier - Black,I researched carriers for a long time. I am so...,5,I researched carriers for a long time I am so ...,1
67577,"Bright Starts Around We Go Activity Station, D...","When shopping for an exersaucer, I came across...",5,When shopping for an exersaucer I came across ...,1
75186,"Chicco Cortina Together Double Stroller, Fuego",I was very excited when I heard Chicco was fin...,5,I was very excited when I heard Chicco was fin...,1
34821,"Baby K'tan Baby Carrier, Black, X-Large",Check and recheck the K'Tan for size issues be...,5,Check and recheck the KTan for size issues bef...,1
66594,"Dream On Me / Mia Moda Atmosferra Stroller, Nero",I love this stroller SO much! I am not afraid ...,5,I love this stroller SO much I am not afraid t...,1
131546,"Baby Jogger City Lite Stroller, Black",AMAZING stroller! It took me about 2 minutes ...,5,AMAZING stroller It took me about 2 minutes t...,1
130793,Thirsties Duo Wrap Diaper Cover with Hook and ...,The Thirsties are really an awesome concept th...,5,The Thirsties are really an awesome concept th...,1


In [99]:
# Rows at the first 20 positions of top_idx (all columns).
train_data.iloc[top_idx[:20]]

Unnamed: 0,name,review,rating,review_clean,sentiment
63316,Baby Einstein Around The World Discovery Center,"First off, I did manage to find this product f...",1,First off I did manage to find this product fo...,-1
127391,"Zooper Twist Escape Stroller, Summer Day",I had to return this stroller for three reason...,1,I had to return this stroller for three reason...,-1
7460,Arms Reach Co-Sleeper brand Mini Co-Sleeper Ba...,"Please see my email to the company:Hello,I am ...",1,Please see my email to the companyHelloI am wr...,-1
107604,Graco Pack 'n Play Playard - Dempsey,My disappointment with this product prompted m...,1,My disappointment with this product prompted m...,-1
65419,"Peg-Perego Aria Twin Stroller, Java",I am so incredibly disappointed with the strol...,1,I am so incredibly disappointed with the strol...,-1
96951,Summer Infant Sleek and Secure Hand Held Video...,The first monitor broke within 1 month of use-...,1,The first monitor broke within 1 month of use ...,-1
1597,Philips Avent 3 Pack 9oz Bottles,"(This is a long review, but if you read the wh...",1,This is a long review but if you read the whol...,-1
87861,The European NANNY Baby Movement Monitor - EU ...,"The previous reviewers laud the ""piece of mind...",1,The previous reviewers laud the piece of mind ...,-1
41642,"Dream On Me Bassinet, Blue",My husband and I are VERY disappointed and sho...,1,My husband and I are VERY disappointed and sho...,-1
87522,Levana Safe N'See Digital Video Baby Monitor w...,I have NEVER written a review before for anyth...,1,I have NEVER written a review before for anyth...,-1


In [106]:
# Score of the full-vocabulary model on the training reviews.
train_score = sentiment_model.score(train_matrix, train_data['sentiment'])
train_score

0.96850452719314029

In [107]:
# Score of the full-vocabulary model on the test reviews.
test_score = sentiment_model.score(test_matrix, test_data['sentiment'])
test_score

0.93229541636669067

In [108]:
# The 20 words used as the entire vocabulary of the reduced model.
# Order matters: it fixes the column order of the word-subset matrices.
significant_words = [
    'love', 'great', 'easy', 'old', 'little',
    'perfect', 'loves', 'well', 'able', 'car',
    'broke', 'less', 'even', 'waste', 'disappointed',
    'work', 'product', 'money', 'would', 'return',
]

In [109]:
# Vectorizer whose vocabulary is fixed to the words in significant_words.
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
# Count matrices for the train and test reviews over that fixed vocabulary.
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

In [110]:
# Show the sparse matrix summary (shape, dtype and number of stored elements).
train_matrix_word_subset

<133416x20 sparse matrix of type '<class 'numpy.int64'>'
	with 292197 stored elements in Compressed Sparse Row format>

In [111]:
# Second logistic regression, trained only on the counts of the significant words.
simple_model = LogisticRegression()
# fit() is the last expression, so the fitted estimator's repr is the cell output.
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [112]:
# Pair every significant word with the matching coefficient of simple_model.
# coef_ is flattened to one dimension so it lines up with the word list.
coef_columns = {
    'word': significant_words,
    'coefficient': simple_model.coef_.flatten(),
}
simple_model_coef_table = pd.DataFrame(data=coef_columns)

In [125]:
# Order the rows from the largest to the smallest coefficient and renumber
# them from 0.
simple_model_coef_table = (
    simple_model_coef_table
    .sort_values('coefficient', ascending=False)
    .reset_index(drop=True)
)

In [126]:
# Display the coefficient table, sorted from the largest coefficient down.
simple_model_coef_table

Unnamed: 0,coefficient,word
0,1.673074,loves
1,1.509812,perfect
2,1.36369,love
3,1.192538,easy
4,0.944,great
5,0.520186,little
6,0.50376,well
7,0.190909,able
8,0.085513,old
9,0.058855,car


### Quiz: positive-coefficient words in the simple model vs. the full model

In [150]:
# For every word with a positive coefficient in the 20-word model, print the
# word, that coefficient, and the same word's coefficient in the full model.
# Flatten the full model's coefficients once, outside the loop: flatten()
# copies the whole array each time it is called.
full_model_coefs = sentiment_model.coef_.flatten()
# Iterate over the two columns directly instead of indexing with `.ix`,
# which was removed from pandas and raises AttributeError on current versions.
for w, coef in zip(simple_model_coef_table['word'],
                   simple_model_coef_table['coefficient']):
    if coef > 0:
        # vocabulary_ maps a word to its column in the full model's features.
        coef_full = full_model_coefs[vectorizer.vocabulary_[w]]
        print('{:>10}{:>6.2f}{:>6.2f}'.format(w, coef, coef_full))

     loves  1.67  1.52
   perfect  1.51  1.87
      love  1.36  1.58
      easy  1.19  1.36
     great  0.94  1.23
    little  0.52  0.64
      well  0.50  0.54
      able  0.19  0.39
       old  0.09  0.05
       car  0.06  0.12


In [151]:
# Score of the 20-word model on the training reviews.
simple_train_score = simple_model.score(train_matrix_word_subset, train_data['sentiment'])
simple_train_score

0.8668225700065959

In [152]:
# Score of the 20-word model on the test reviews.
simple_test_score = simple_model.score(test_matrix_word_subset, test_data['sentiment'])
simple_test_score

0.86936045116390692

In [155]:
# Fraction of test rows whose sentiment label is positive (> 0).
n_positive = (test_data['sentiment'] > 0).sum()
n_positive / len(test_data)

0.84278257739380846