# Import and Clean Data

In [1]:
from sframe import SFrame

In [3]:
products = SFrame('../data/amazon_baby.csv.gz')

------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str,str,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [280]:
products

name,review,rating
Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4


In [281]:
from string import punctuation

def remove_punctuation(text):
        return text.translate(None, punctuation)

In [282]:
products['review_clean'] = products['review'].apply(remove_punctuation)

# Extract Semtinment

In [283]:
products = products[products['rating'] != 3]

In [284]:
products['sentiment'] = products['rating'].apply(lambda r: +1 if r>3 else -1)

In [285]:
train_data, test_data = products.random_split(.8, seed=1)

# Count Words

In [286]:
from sklearn.feature_extraction.text import CountVectorizer

In [326]:
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])

In [288]:
train_matrix

<133416x121712 sparse matrix of type '<type 'numpy.int64'>'
	with 7326618 stored elements in Compressed Sparse Row format>

# Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

In [18]:
model = LogisticRegression()
model.fit(train_matrix, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [36]:
model.coef_

array([[ -1.23707677e+00,   1.96133895e-04,   2.59841044e-02, ...,
          1.14844613e-02,   3.17099575e-03,  -6.98805068e-05]])

In [37]:
print sum(model.coef_[0] >= 0)
print float(sum(model.coef_[0] >= 0))/len(model.coef_[0])

85752
0.704548442224


# Make Predictions

In [39]:
sample_test_data = test_data[10:13]
sample_test_data

name,review,rating,review_clean,sentiment
Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in ...,5,Absolutely love it and all of the Scripture in ...,1
Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The decals ...,2,Would not purchase again or recommend The decals ...,-1
New Style Trailing Cherry Blossom Tree Decal ...,Was so excited to get this product for my baby ...,1,Was so excited to get this product for my baby ...,-1


In [91]:
print sample_test_data[0]['review']
print
print sample_test_data[1]['review']
print
print sample_test_data[2]['review']

Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.

Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.

Was so excited to get this product for my baby girls bedroom!  When I got it the back is NOT STICKY at all!  Every time I walked into the bedroom I was picking up pieces off of the floor!  Very very frustrating!  Ended up having to super glue it to the wall...very disappointing.  I wouldn't waste the time or money on it.


In [45]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])

In [47]:
model.decision_function(sample_test_matrix)

array([  5.59095054,  -3.12647284, -10.42233483])

In [135]:
from sframe import SArray
def my_predictions(model, test_matrix):
    return SArray([+1 if s >= 0 else -1 for s in model.decision_function(test_matrix)])

In [139]:
print my_predictions(model, sample_test_matrix)
print SArray(model.predict(sample_test_matrix))

[1, -1, -1]
[1, -1, -1]


In [70]:
from numpy import exp
logit = lambda x: 1.0/(1+exp(-x))

In [82]:
def my_proba_predictions(model, test_matrix):
    return [logit(s) for s in model.decision_function(test_matrix)]

In [90]:
print [round(x, 3) for x in my_proba_predictions(model, sample_test_matrix)]
print [round(x[1], 3) for x in model.predict_proba(sample_test_matrix)]


[0.996, 0.042, 0.0]
[0.996, 0.042, 0.0]


# Evaluate Model

In [298]:
products_matrix = vectorizer.transform(products['review_clean'])
products['predictions'] = model.decision_function(products_matrix)
products = products.sort(['predictions'], ascending=False)

In [301]:
print max(products['predictions'])
print min(products['predictions'])

91.3910207484
-48.1228998054


In [305]:
for n in products[0:20]: print round(n['predictions']), n['name'], "\n"

91.0 Baby Jogger 2011 City Mini Double Stroller, Black/Black 

79.0 Joovy Scooter Single Stroller Greenie 

69.0 Joovy Ergo Caboose Tandem Stroller, Black 

69.0 Joovy Ergo Caboose Tandem Stroller Black 

61.0 Baby K'tan Baby Carrier, Black, X-Large 

60.0 Zooper 2011 Waltz Standard Stroller, Flax Brown 

58.0 Bumbleride 2011 Flite Lightweight Compact Travel Stroller, Jet 

57.0 Ubbi Cloth Diaper Pail Liner 

54.0 UPPAbaby PiggyBack Ride Along Board 

54.0 Infantino Wrap and Tie Baby Carrier, Black Blueberries 

52.0 Baby Einstein Around The World Discovery Center 

52.0 Covillow Breastfeeding Cover and Pillow-In-One, Peaceful Periwinkle 

51.0 Chicco Cortina Together Double Stroller, Fuego 

51.0 Joovy Zoom 360 Swivel Wheel Jogging Stroller, Blue 

49.0 Inglesina 2013 Trip Stroller, Lampone Purple 

49.0 HALO SleepSack Big Kids Micro-Fleece Wearable Blanket, Red, 2T- 3T 

48.0 Britax 2012 B-Agile Stroller, Red 

48.0 Ju-Ju-Be Be Right Back Backpack Diaper Bag, Black/Silver 

48.0 Baby

In [325]:
max(products[products['name'] == 'Snuza Portable Baby Movement Monitor']['predictions'])

21.57134407994315

In [306]:
for n in products[-21:-1]:  print round(n['predictions']), n['name'], "\n"

-29.0 Adiri BPA Free Natural Nurser Ultimate Bottle Stage 1 White, Slow Flow (0-3 months) 

-30.0 Evenflo Crosstown Soft Portable Travel Gate 

-30.0 Kids Line Cascade Bow Diaper Bag, Black 

-30.0 Safety 1st Exchangeable Tip 3 in 1 Thermometer 

-31.0 Kinderwagon BLACK Hop Double Child Stroller w/ Canopy 

-32.0 Munchkin Arm and Hammer Diaper Pail, White 

-33.0 iPad Travel Case 

-33.0 Ameda Purely Yours Breast Pump - Carry All 

-33.0 Wimmer-Ferguson Infant Stim-Mobile 

-34.0 Levana Safe N'See Digital Video Baby Monitor with Talk-to-Baby Intercom and Lullaby Control (LV-TW501) 

-34.0 Levana Safe N'See Digital Video Baby Monitor with Talk-to-Baby Intercom and Lullaby Control (LV-TW501) 

-35.0 Fisher-Price Ocean Wonders Aquarium Bouncer 

-36.0 Dream On Me Bassinet, Blue 

-37.0 Philips Avent 3 Pack 9oz Bottles 

-38.0 Summer Infant Sleek and Secure Hand Held Video Monitor, White/ Silver 

-38.0 The European NANNY Baby Movement Monitor - EU Medical device SIDS Certified - NOW ON US

In [142]:
def accuracy(model, test_matrix, sentiments):
    return float(sum(my_predictions(model, test_matrix) == sentiments))/len(sentiments)

In [327]:
round(accuracy(model, test_matrix, test_data['sentiment']), 2)

0.93

# Classifier with Less Words

In [173]:
significant_words = [
    'love', 'great', 'easy', 'old', 'little', 'perfect', 'loves',
    'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 
    'disappointed','work', 'product', 'money', 'would', 'return']
len(significant_words)

20

In [148]:
vectorizer_s = CountVectorizer(vocabulary=significant_words)
train_matrix_s = vectorizer_s.fit_transform(train_data['review_clean'])
test_matrix_s = vectorizer_s.transform(test_data['review_clean'])

In [149]:
model_s = LogisticRegression()
model_s.fit(train_matrix_s, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [174]:
model_s.coef_[0]

array([ 1.36368976,  0.94399959,  1.19253827,  0.08551278,  0.52018576,
        1.50981248,  1.67307389,  0.50376046,  0.19090857,  0.05885467,
       -1.65157634, -0.20956286, -0.51137963, -2.03369861, -2.34829822,
       -0.62116877, -0.32055624, -0.89803074, -0.36216674, -2.10933109])

In [205]:
coeffs_s = SFrame({'word': significant_words, 'coeff_s': model_s.coef_[0]})
coeffs_s = coeffs_s.sort('coeff_s', ascending=False)
coeffs_s

coeff_s,word
1.67307389259,loves
1.50981247669,perfect
1.36368975931,love
1.19253827349,easy
0.943999590572,great
0.520185762718,little
0.503760457768,well
0.190908572064,able
0.085512779463,old
0.0588546711527,car


In [203]:
sum(coeffs_s['coeff_s'] >= 0)

10

# Comparing Model Coefficients

In [197]:
vocab = vectorizer.vocabulary_.keys()
coeffs = {vocab[i]: c for i, c in enumerate(model.coef_[0])}

In [199]:
{k: v for k, v in coeffs.iteritems() if k in significant_words}

{u'able': -0.00014944254047634769,
 u'broke': 0.061562578920694153,
 u'car': -0.33749619234699524,
 u'disappointed': 0.0066939913303456196,
 u'easy': 0.00016412098062606868,
 u'even': 0.0016928801524062,
 u'great': 3.5024986862054141e-05,
 u'less': 0.18249244343277596,
 u'little': -0.005879794083691086,
 u'love': 0.18681615792618439,
 u'loves': 0.12383895600248326,
 u'money': 0.027982354342579806,
 u'old': 0.18497299231120351,
 u'perfect': -0.010272616905189295,
 u'product': 0.012491471505591097,
 u'return': -0.93451682340903619,
 u'waste': 0.0032965621258323821,
 u'well': 0.24184153063656716,
 u'work': 6.7656260414778423e-05,
 u'would': 0.0003163222287757264}

In [210]:
all_coeffs = SFrame({
    'word': significant_words, 
    'coeff': [coeffs[k] for k in significant_words], 
    'coeff_s': model_s.coef_[0]
})
all_coeffs.sort('coeff_s', ascending=False)
all_coeffs

coeff,coeff_s,word
0.186816157926,1.36368975931,love
3.50249868621e-05,0.943999590572,great
0.000164120980626,1.19253827349,easy
0.184972992311,0.085512779463,old
-0.00587979408369,0.520185762718,little
-0.0102726169052,1.50981247669,perfect
0.123838956002,1.67307389259,loves
0.241841530637,0.503760457768,well
-0.000149442540476,0.190908572064,able
-0.337496192347,0.0588546711527,car


In [243]:
from numpy import sign
sum(sign(all_coeffs['coeff']) != sign(all_coeffs['coeff_s']))

13

# Comparing Model Accuracy

In [328]:
print round(accuracy(model, train_matrix, train_data['sentiment']), 2)
print round(accuracy(model_s, train_matrix_s, train_data['sentiment']), 2)

0.97
0.8


In [329]:
print round(accuracy(model, test_matrix, test_data['sentiment']), 2)
print round(accuracy(model_s, test_matrix_s, test_data['sentiment']), 2)

0.93
0.81


In [265]:
# majority model
print round(float(sum(train_data['sentiment'] == 1))/len(train_data), 2)
print round(float(sum(test_data['sentiment'] == 1))/len(test_data), 2)

0.84
0.84
