In [10]:
import pandas as pd
import numpy as np
import json, string

In [8]:
# Load the review subset from the working directory.
products = pd.read_csv('./amazon_baby_subset.csv')
# Replace missing reviews with empty strings. The result is assigned back
# rather than using fillna(inplace=True) on the selected column: the in-place
# form operates on a temporary under pandas Copy-on-Write and would leave the
# frame unchanged.
products['review'] = products['review'].fillna('')
# List of words that become the model's count features.
with open('important_words.json', 'r') as f:
    important_words = json.load(f)

In [6]:
# Show the first rows of the loaded frame to check the columns and values.
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [11]:
def remove_punctuation(text):
    """Return ``text`` as a string with all ``string.punctuation`` characters removed.

    The input is passed through ``str()`` first, so non-string values
    (numbers, NaN, None) are converted rather than rejected.
    """
    # The third argument of maketrans lists the characters to delete.
    deletions = str.maketrans('', '', string.punctuation)
    as_text = str(text)
    return as_text.translate(deletions)

# Punctuation-free copy of each review; the raw text stays in 'review'.
products['review_clean'] = products['review'].map(remove_punctuation)

In [17]:
# add the word count as features
# Tokenize every review once instead of re-splitting it for each word.
review_tokens = products['review_clean'].apply(lambda s: s.split())
# Build all count columns up front and attach them with a single concat;
# inserting columns one at a time fragments the frame and triggers pandas'
# PerformanceWarning.
word_counts = pd.DataFrame(
    {word: review_tokens.apply(lambda tokens, w=word: tokens.count(w))
     for word in important_words},
    index=products.index,
)
# Drop same-named columns first so that re-running this cell (or a word that
# collides with an existing column) replaces the column instead of duplicating it.
stale_columns = [col for col in word_counts.columns if col in products.columns]
products = pd.concat([products.drop(columns=stale_columns), word_counts], axis=1)

In [18]:
# Show the first rows again, now including 'review_clean' and the word-count columns.
products.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [19]:
## quiz 1
# Sums the per-review counts in the 'perfect' column, i.e. the total number of
# times the word appears across all reviews.
# NOTE(review): if the quiz asks how many reviews contain the word, that would
# be (products['perfect'] >= 1).sum() instead — confirm against the question.
products['perfect'].sum()

3207

In [39]:
def get_X_and_y(df, feature_list, label_col):
    """Extract a design matrix and label vector from ``df``.

    Parameters
    ----------
    df : DataFrame holding both the feature columns and the label column.
    feature_list : list of column names to use as features.
    label_col : name of the label column.

    Returns
    -------
    (X, y) : ``X`` is the feature values with a column of ones prepended as
        column 0 (the intercept term); ``y`` is the values of ``label_col``.
    """
    features = df[feature_list].values
    # Intercept column uses the feature dtype so the result dtype is unchanged.
    intercept = np.ones((features.shape[0], 1), dtype=features.dtype)
    design_matrix = np.hstack((intercept, features))
    labels = df[label_col].values
    return design_matrix, labels

# Design matrix (intercept + one column per important word) and sentiment labels.
X, y = get_X_and_y(df=products, feature_list=important_words, label_col='sentiment')

In [43]:
## quiz 2
# (number of reviews, 1 intercept column + number of word-count features)
X.shape

(53072, 194)

In [55]:
def predict_probability(feature_matrix, coefficients):
    """Apply the logistic (sigmoid) link to the linear scores.

    Parameters
    ----------
    feature_matrix : 2-D array, one row per example.
    coefficients : 1-D sequence with one weight per column of ``feature_matrix``.

    Returns
    -------
    Array of ``1 / (1 + exp(-score))`` values, one per row.

    Raises
    ------
    AssertionError
        If the number of coefficients differs from the number of columns.
    """
    n_columns = feature_matrix.shape[1]
    assert len(coefficients) == n_columns
    scores = np.dot(feature_matrix, coefficients)
    denominator = 1 + np.exp(-1. * scores)
    return 1. / denominator

In [58]:
def feature_derivative(errors, feature):
    """Return the dot product of ``errors`` with ``feature``.

    With a single feature column this is one partial derivative of the log
    likelihood; with a whole feature matrix (one row per example) it is the
    vector of partial derivatives, one per column.
    """
    gradient = np.dot(errors, feature)
    return gradient

In [59]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Log likelihood of the observed labels under the logistic model.

    Computes ``sum((1[y == +1] - 1) * score - log(1 + exp(-score)))`` where
    ``score = feature_matrix . coefficients``.

    Parameters
    ----------
    feature_matrix : 2-D array, one row per example.
    sentiment : array-like of labels; entries equal to +1 count as positive,
        everything else as negative.
    coefficients : 1-D array of weights, one per column of ``feature_matrix``.

    Returns
    -------
    The log likelihood as a scalar (always finite for finite scores).
    """
    # asarray lets a plain Python list of labels compare element-wise.
    indicator = (np.asarray(sentiment) == +1)
    scores = np.dot(feature_matrix, coefficients)
    # log(1 + exp(-s)) evaluated as logaddexp(0, -s): the naive form overflows
    # to inf for large negative scores and turns the whole sum into -inf.
    log_one_plus_exp = np.logaddexp(0., -scores)
    lp = np.sum((indicator - 1) * scores - log_one_plus_exp)
    return lp

In [60]:
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    """Fit logistic-regression coefficients by fixed-step gradient ascent.

    Parameters
    ----------
    feature_matrix : 2-D array, one row per example.
    sentiment : array-like of labels; entries equal to 1 are the positive class.
    initial_coefficients : starting weights, one per column of
        ``feature_matrix``. Not modified by this function.
    step_size : multiplier applied to the gradient on every update.
    max_iter : number of gradient-ascent iterations to run.

    Returns
    -------
    A new float array holding the coefficients after ``max_iter`` updates.

    The log likelihood is printed on a thinning schedule: every iteration up
    to 15, then every 10th up to 100, every 100th up to 1000, every 1000th up
    to 10000, and every 10000th after that.
    """
    # Work on a float copy: updating in place would overwrite the caller's
    # array and would fail outright for integer-typed starting weights.
    coefficients = np.array(initial_coefficients, dtype=float)
    # The labels do not change between iterations, so build the indicator once.
    # asarray makes a plain list of labels compare element-wise.
    indicator = (np.asarray(sentiment) == 1)
    for itr in range(max_iter):
        proba = predict_probability(feature_matrix, coefficients)
        errors = indicator - proba
        # Passing the whole matrix yields the gradient for all coefficients at once.
        derivative = feature_derivative(errors, feature_matrix)
        coefficients += step_size*derivative
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            # '%*d' pads the iteration number to the digit width of max_iter.
            print('iteration %*d: log likelihood of observed labels = %.8f' % 
                (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [62]:
# Train from all-zero coefficients: step size 1e-7, 301 iterations.
coef_ = logistic_regression(
    feature_matrix=X,
    sentiment=y,
    initial_coefficients=np.zeros(X.shape[1]),
    step_size=1e-7,
    max_iter=301,
)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13434712
iteration   2: log likelihood of observed labels = -36769.35713564
iteration   3: log likelihood of observed labels = -36763.58603240
iteration   4: log likelihood of observed labels = -36757.82101962
iteration   5: log likelihood of observed labels = -36752.06207964
iteration   6: log likelihood of observed labels = -36746.30919497
iteration   7: log likelihood of observed labels = -36740.56234821
iteration   8: log likelihood of observed labels = -36734.82152213
iteration   9: log likelihood of observed labels = -36729.08669961
iteration  10: log likelihood of observed labels = -36723.35786366
iteration  11: log likelihood of observed labels = -36717.63499744
iteration  12: log likelihood of observed labels = -36711.91808422
iteration  13: log likelihood of observed labels = -36706.20710739
iteration  14: log likelihood of observed labels = -36700.5020

In [69]:
# Score every review with the fitted coefficients, then threshold at 0.5:
# 1 where the predicted probability exceeds 0.5, otherwise 0.
proba = predict_probability(X, coef_)
pred = np.where(proba > 0.5, 1, 0)

In [72]:
## quiz
# Number of reviews predicted positive (pred is 1 where proba > 0.5, else 0).
pred.sum()

25126

In [83]:
## quiz
# Accuracy on the training data. pred is 0/1 while y is compared against 1 for
# the positive class, so build a 0/1 copy of the labels under a new name.
# Rebinding y itself to a Python list would discard the original label array
# and break any re-run of training, where `list == 1` is a single False.
y_binary = (np.asarray(y) == 1).astype(int)
(pred == y_binary).sum() / len(y_binary)

0.7518653904130238

In [76]:
# Pair each word with its learned weight. Column 0 of the design matrix is the
# intercept, so the word weights start at index 1. The slice goes into a new
# name instead of rebinding coef_: overwriting coef_ would drop one more
# element on every re-run of this cell and misalign words with weights.
word_coefs = coef_[1:]  # exclude intercept
word_coefficient_tuples = sorted(
    zip(important_words, word_coefs),
    key=lambda pair: pair[1],
    reverse=True,  # most positive weight first
)

In [79]:
# The list is sorted by coefficient, descending: these are the 10 largest weights.
word_coefficient_tuples[:10]

[('great', 0.066546084170457709),
 ('love', 0.065890762922123272),
 ('easy', 0.06479458680257838),
 ('little', 0.045435626308421372),
 ('loves', 0.044976401394906031),
 ('well', 0.03013500109210706),
 ('perfect', 0.029739937104968459),
 ('old', 0.020077541034775381),
 ('nice', 0.018408707995268989),
 ('daughter', 0.0177031999057017)]

In [80]:
# Tail of the descending-sorted list: the 10 smallest (most negative) weights.
word_coefficient_tuples[-10:]

[('monitor', -0.02448210054589172),
 ('return', -0.026592778462247283),
 ('back', -0.027742697230661331),
 ('get', -0.028711552980192578),
 ('disappointed', -0.028978976142317068),
 ('even', -0.030051249236035808),
 ('work', -0.033069515294752716),
 ('money', -0.038982037286487116),
 ('product', -0.04151103339210889),
 ('would', -0.053860148445203128)]

In [81]:
# Display the label vector. The original line was an unfinished assignment
# ("y = "), which is a SyntaxError.
y

array([ 1,  1,  1, ..., -1, -1, -1])