In [1]:
import graphlab

In [2]:
products = graphlab.SFrame("amazon_baby.csv/amazon_baby.csv")

This non-commercial license of GraphLab Create for academic use is assigned to edward30@163.com and will expire on September 27, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\Edward\AppData\Local\Temp\graphlab_server_1503627234.log.0


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,long]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [3]:
products

name,review,rating
Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4


# Fill n/a value

In [4]:
products['review'] = products['review'].apply(lambda str : 'NaN' if str == '' else str)

In [5]:
products[products['review']=='']

name,review,rating


# Perform text cleaning

In [6]:
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation) 

products['review_clean'] = products['review'].apply(remove_punctuation)

In [7]:
len(products)

183531

# Extract sentiment

In [8]:
products = products[products['rating'] != 3]

In [9]:
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [10]:
products

name,review,rating,review_clean,sentiment
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5,it came early and was not disappointed i love ...,1
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5,Very soft and comfortable and warmer than it ...,1
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I ...,1
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5,All of my kids have cried nonstop when I tried to ...,1
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5,When the Binky Fairy came to our house we didnt ...,1
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4,Lovely book its bound tightly so you may no ...,1
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5,Perfect for new parents We were able to keep ...,1
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5,A friend of mine pinned this product on Pinte ...,1
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4,This has been an easy way for my nanny to record ...,1
"Baby Tracker&reg; - Daily Childcare Journal, ...",I love this journal and our nanny uses it ...,4,I love this journal and our nanny uses it ...,1


# Split into training and test sets

In [11]:
train_data, test_data = products.random_split(.8, seed=1)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
     # Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

In [13]:
import sklearn

In [14]:
sentiment_label = sklearn.linear_model.LogisticRegression()

In [15]:
len(train_data['sentiment'])

133416

In [16]:
len(train_data['review_clean'])

133416

In [17]:
sentiment_label.fit(train_matrix,train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
coef = sentiment_label.coef_

In [19]:
sum = 0
i=0
while i < coef[0].size:
    if(coef[0][i]>=0):
        sum+=1
    i+=1

In [20]:
print sum

86000


# Make prediction with logistic regression

In [21]:
sample_test_data = test_data[10:13]
print sample_test_data

+-------------------------------+-------------------------------+--------+
|              name             |             review            | rating |
+-------------------------------+-------------------------------+--------+
|   Our Baby Girl Memory Book   | Absolutely love it and all... |   5    |
| Wall Decor Removable Decal... | Would not purchase again o... |   2    |
| New Style Trailing Cherry ... | Was so excited to get this... |   1    |
+-------------------------------+-------------------------------+--------+
+-------------------------------+-----------+
|          review_clean         | sentiment |
+-------------------------------+-----------+
| Absolutely love it and all... |     1     |
| Would not purchase again o... |     -1    |
| Was so excited to get this... |     -1    |
+-------------------------------+-----------+
[3 rows x 5 columns]



In [22]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_label.decision_function(sample_test_matrix)
print scores

[  5.58246223  -3.16015917 -10.4308965 ]


In [23]:
scores = sentiment_label.predict(sample_test_matrix)
print scores

[ 1 -1 -1]


# Find the most positive review

In [24]:
test_data['scores'] = sentiment_label.decision_function(test_matrix)

In [25]:
test_data

name,review,rating,review_clean,sentiment
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4,This has been an easy way for my nanny to record ...,1
"Baby Tracker&reg; - Daily Childcare Journal, ...",I love this journal and our nanny uses it ...,4,I love this journal and our nanny uses it ...,1
Nature's Lullabies First Year Sticker Calendar ...,"I love this little calender, you can keep ...",5,I love this little calender you can keep ...,1
Nature's Lullabies Second Year Sticker Calendar ...,"I had a hard time finding a second year calendar, ...",5,I had a hard time finding a second year calendar ...,1
"Lamaze Peekaboo, I Love You ...","One of baby's first and favorite books, and i ...",4,One of babys first and favorite books and it is ...,1
"Lamaze Peekaboo, I Love You ...",My son loved this book as an infant. It was ...,5,My son loved this book as an infant It was per ...,1
"Lamaze Peekaboo, I Love You ...",Our baby loves this book & has loved it for a ...,5,Our baby loves this book has loved it for a while ...,1
"SoftPlay Giggle Jiggle Funbook, Happy Bear ...",This bear is absolutely adorable and I would ...,2,This bear is absolutely adorable and I would ...,-1
SoftPlay Peek-A-Boo Where's Elmo A Childr ...,I bought two for recent baby showers! The book ...,5,I bought two for recent baby showers The boo ...,1
Baby's First Year Undated Wall Calendar with ...,I searched high and low for a first year cale ...,5,I searched high and low for a first year cale ...,1

scores
1.25826564559
14.1652449926
2.6105215617
10.7733621112
3.86082631641
10.0630679588
6.67003574828
1.4277504869
6.4580107869
5.90218847207


In [26]:
print_rows(num_rows=20, num_columns=10)

NameError: name 'print_rows' is not defined

In [27]:
(test_data.sort('scores', ascending=False))['name','scores'][10:20]

name,scores
Buttons Cloth Diaper Cover - One Size - 8 ...,40.4490022995
Graco FastAction Fold Jogger Click Connect ...,39.8136930308
Freemie Hands-Free Concealable Breast Pump ...,39.0213161858
"Baby Jogger City Mini GT Single Stroller, ...",38.3277129336
"Evenflo 6 Pack Classic Glass Bottle, 4-Ounce ...",38.0637640157
"Fisher-Price Cradle 'N Swing, My Little ...",37.7603822305
"Britax Decathlon Convertible Car Seat, ...",36.5987964871
Ikea 36 Pcs Kalas Kids Plastic BPA Free ...,36.5352177184
Summer Infant Wide View Digital Color Video ...,35.8246870664
"Baby Jogger City Mini GT Double Stroller, ...",35.6080914826


In [28]:
(test_data.sort('scores', ascending=True))['name','scores'][10:20]

name,scores
Ellaroo Mei Tai Baby Carrier - Hershey ...,-21.5088632048
Cosco Alpha Omega Elite Convertible Car Seat ...,-21.3914584601
"Peg-Perego Tatamia High Chair, White Latte ...",-21.0886636989
Belkin WeMo Wi-Fi Baby Monitor for Apple iPh ...,-21.0641976322
Chicco Cortina KeyFit 30 Travel System in ...,-21.0485837828
NUK Cook-n-Blend Baby Food Maker ...,-20.9732407768
VTech Communications Safe &amp; Sound Digital A ...,-20.896225209
Safety 1st Deluxe 4-in-1 Bath Station ...,-20.606928155
"Thirsties Hemp Inserts 2 Pack, Small 6-18 Lbs ...",-20.3872163223
"Regalo My Cot Portable Bed, Royal Blue ...",-20.1581053045


In [29]:
print scores.sort(axis=0)

None


In [30]:
print scores[-10:-1]

[-1 -1]


In [31]:
test_data['result'] = sentiment_label.predict(test_matrix)

In [32]:
test_data

name,review,rating,review_clean,sentiment
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4,This has been an easy way for my nanny to record ...,1
"Baby Tracker&reg; - Daily Childcare Journal, ...",I love this journal and our nanny uses it ...,4,I love this journal and our nanny uses it ...,1
Nature's Lullabies First Year Sticker Calendar ...,"I love this little calender, you can keep ...",5,I love this little calender you can keep ...,1
Nature's Lullabies Second Year Sticker Calendar ...,"I had a hard time finding a second year calendar, ...",5,I had a hard time finding a second year calendar ...,1
"Lamaze Peekaboo, I Love You ...","One of baby's first and favorite books, and i ...",4,One of babys first and favorite books and it is ...,1
"Lamaze Peekaboo, I Love You ...",My son loved this book as an infant. It was ...,5,My son loved this book as an infant It was per ...,1
"Lamaze Peekaboo, I Love You ...",Our baby loves this book & has loved it for a ...,5,Our baby loves this book has loved it for a while ...,1
"SoftPlay Giggle Jiggle Funbook, Happy Bear ...",This bear is absolutely adorable and I would ...,2,This bear is absolutely adorable and I would ...,-1
SoftPlay Peek-A-Boo Where's Elmo A Childr ...,I bought two for recent baby showers! The book ...,5,I bought two for recent baby showers The boo ...,1
Baby's First Year Undated Wall Calendar with ...,I searched high and low for a first year cale ...,5,I searched high and low for a first year cale ...,1

scores,result
1.25826564559,1
14.1652449926,1
2.6105215617,1
10.7733621112,1
3.86082631641,1
10.0630679588,1
6.67003574828,1
1.4277504869,1
6.4580107869,1
5.90218847207,1


In [33]:
len(test_data)

33336

In [34]:
len(test_data[test_data['sentiment']==test_data['result']])

31082

In [35]:
r =31082/33336.0

In [36]:
print r

0.932385409167


# Learn another classifier with fewer words

In [37]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [38]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

In [39]:
simple_model = sklearn.linear_model.LogisticRegression()

In [40]:
simple_model.fit(train_matrix_word_subset,train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [41]:
simple_model_coef_table = graphlab.SFrame({'word':significant_words,
                                         'coefficient':simple_model.coef_.flatten()})

In [42]:
simple_model_coef_table.sort('coefficient',ascending=False)[10:20]

coefficient,word
-0.209562864534,less
-0.320556236735,product
-0.362166742274,would
-0.511379631799,even
-0.621168773641,work
-0.898030737715,money
-1.65157634496,broke
-2.03369861394,waste
-2.10933109032,return
-2.3482982195,disappointed


In [43]:
simple_model.coef_

array([[ 1.36368976,  0.94399959,  1.19253827,  0.08551278,  0.52018576,
         1.50981248,  1.67307389,  0.50376046,  0.19090857,  0.05885467,
        -1.65157634, -0.20956286, -0.51137963, -2.03369861, -2.34829822,
        -0.62116877, -0.32055624, -0.89803074, -0.36216674, -2.10933109]])

In [44]:
simple_model.intercept_

array([ 1.29937369])

In [45]:
train_data['result1'] = simple_model.predict(train_matrix_word_subset)

In [46]:
train_data['result'] = sentiment_label.predict(train_matrix)

In [47]:
len(train_data[train_data['sentiment']==train_data['result1']])

115648

In [48]:
len(train_data[train_data['sentiment']==train_data['result']])

129162

In [49]:
test_data['result1'] = simple_model.predict(test_matrix_word_subset)

In [50]:
len(test_data[test_data['sentiment']==test_data['result1']])

28981

In [112]:
print 28981/33336.0

0.869360451164
