# Week 1: Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
products = pd.read_csv('amazon_baby.csv')
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [3]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183531 entries, 0 to 183530
Data columns (total 3 columns):
name      183213 non-null object
review    182702 non-null object
rating    183531 non-null int64
dtypes: int64(1), object(2)
memory usage: 4.2+ MB


Perform text cleaning

In [4]:
products = products.fillna({'review':''})  # fill in N/A's in the review column

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

products['review_clean'] = products['review'].astype(str).apply(remove_punctuation)

Extract Sentiments

In [5]:
products = products[products['rating'] != 3]

In [6]:
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

Training and test sets

In [7]:
train_data_json = pd.read_json('module-2-assignment-train-idx.json')
test_data_json = pd.read_json('module-2-assignment-test-idx.json')

In [8]:
test_data_idx = list(test_data_json[0])
test_data = products.iloc[test_data_idx]
train_data_idx = list(train_data_json[0])
train_data = products.iloc[train_data_idx]

Build the word count vector for each review (Bag of Words)

In [9]:
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b') # Use this token pattern to keep single-letter words

# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

Train a sentiment classifier with logistic regression

In [10]:
sentiment_model = LogisticRegression().fit(train_matrix, train_data['sentiment'])



In [19]:
w = sentiment_model.coef_
np.sum(w[0] >=0)

85876

Making predictions with logistic regression

In [21]:
sample_test_data = test_data.iloc[10:13]
sample_test_data

Unnamed: 0,name,review,rating,review_clean,sentiment
59,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1
71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend The deca...,-1
91,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1


In [24]:
sample_test_data.iloc[0]['review'] #Positive review

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [25]:
sample_test_data.iloc[1]['review'] #Negative review

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

In [27]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)

[  5.61494955  -3.13452062 -10.41199092]


In [28]:
sentiment_model.predict(sample_test_matrix)

array([ 1, -1, -1], dtype=int64)

Probability that a sentiment is positive: $$ P(y_i = +1 | \mathbf{x}_i,\mathbf{w}) = \frac{1}{1 + \exp(-\mathbf{w}^T h(\mathbf{x}_i))}. $$

In [42]:
for i in scores:
    prob = 1 / (1 + np.exp(-i))
    print(prob)

0.9963702297569794
0.04170556019994307
3.0068844422809113e-05


In [40]:
sentiment_model.predict_proba(sample_test_matrix)

array([[3.62977024e-03, 9.96370230e-01],
       [9.58294440e-01, 4.17055602e-02],
       [9.99969931e-01, 3.00688444e-05]])

Now with full test dataset

In [74]:
test_prob = sentiment_model.predict_proba(test_matrix)

test_data['positive_prob'] = test_prob[:,1]
top20 = test_data.sort_values("positive_prob",ascending=False)[:20]['name']
top20

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


133651                    Britax 2012 B-Agile Stroller, Red
66059          Evenflo 6 Pack Classic Glass Bottle, 4-Ounce
114796    Fisher-Price Cradle 'N Swing,  My Little Snuga...
180646        Mamas &amp; Papas 2014 Urbo2 Stroller - Black
119182    Roan Rocco Classic Pram Stroller 2-in-1 with B...
87017       Baby Einstein Around The World Discovery Center
80155     Simple Wishes Hands-Free Breastpump Bra, Pink,...
97325     Freemie Hands-Free Concealable Breast Pump Col...
140816           Diono RadianRXT Convertible Car Seat, Plum
168697    Graco FastAction Fold Jogger Click Connect Str...
52631     Evenflo X Sport Plus Convenience Stroller - Ch...
168081    Buttons Cloth Diaper Cover - One Size - 8 Colo...
147949    Baby Jogger City Mini GT Single Stroller, Shad...
100166    Infantino Wrap and Tie Baby Carrier, Black Blu...
50315            P'Kolino Silly Soft Seating in Tias, Green
137034           Graco Pack 'n Play Element Playard - Flint
165593    Ikea 36 Pcs Kalas Kids Plastic

In [76]:
worst20 = test_data.sort_values("positive_prob",ascending=True)[:20]['name']
worst20

16042           Fisher-Price Ocean Wonders Aquarium Bouncer
120209    Levana Safe N'See Digital Video Baby Monitor w...
77072        Safety 1st Exchangeable Tip 3 in 1 Thermometer
48694     Adiri BPA Free Natural Nurser Ultimate Bottle ...
155287    VTech Communications Safe &amp; Sounds Full Co...
94560     The First Years True Choice P400 Premium Digit...
53207                   Safety 1st High-Def Digital Monitor
81332                 Cloth Diaper Sprayer--styles may vary
113995    Motorola Digital Video Baby Monitor with Room ...
10677                     Philips AVENT Newborn Starter Set
9915           Cosco Alpha Omega Elite Convertible Car Seat
59546                Ellaroo Mei Tai Baby Carrier - Hershey
75994            Peg-Perego Tatamia High Chair, White Latte
172090    Belkin WeMo Wi-Fi Baby Monitor for Apple iPhon...
40079     Chicco Cortina KeyFit 30 Travel System in Adve...
149987                     NUK Cook-n-Blend Baby Food Maker
154878    VTech Communications Safe &amp

Compute accuracy of the classifier

$$ \mbox{accuracy} = \dfrac{\mbox{# correctly classified examples}}{\mbox{# total examples}} $$