### This notebook for week 2 quiz
### Will be using numpy and pandas

In [191]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string

#### Load Amazon product data

In [195]:
products = pd.read_csv('amazon_baby_subset.csv')

## Print sample data
products.head()


Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [196]:
### Data exploration 
print 'Total number of samples: ', len(products)
print 'Number of positive sample: ', len(products[products['sentiment'] == 1])
print 'Number of negative sample: ', len(products[products['sentiment'] == -1])

Total number of samples:  53072
Number of positive sample:  26579
Number of negative sample:  26493


#### Load important words

In [197]:
important_words = pd.read_json('important_words.json')
important_words = important_words.values.flatten()

#### Perform data cleaning

In [198]:
def remove_punctuation(text):
    text = text.translate(None, string.punctuation)
    return text

In [199]:
sample_text = "This isn't an issue !!!"
print 'Converted text: ', remove_punctuation(sample_text)

Converted text:  This isnt an issue 


In [200]:
### Fill all n/a in review column with empty ("") string
products = products.fillna({'review':''})

### Remove punctuation
products['review_clean'] = products['review'].apply(remove_punctuation)
products.head()

Unnamed: 0,name,review,rating,sentiment,review_clean
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...


#### 'Bag of Words' text analysis on review_clean column

In [201]:
for word in important_words:
    #print word
    products[word] = products['review_clean'].apply(lambda s: s.split().count(word))

In [202]:
#### After bag of words analysis for important words
products.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


##### Quiz question:

In [203]:
print 'Number of prodict review contains word "perfect" :', len(products[products['perfect'] > 0])

Number of prodict review contains word "perfect" : 2955


##### Convert data into multidimensional array

In [278]:
def get_numpy_data(dataframe, features, label):
    # Initialize the np array with all ones, the number of columns are +1 
    # to accomodate the column for bias feature
    matrix_len = len(dataframe)
    matrix_width = len(features) + 1
    feature_matrix = np.ones((matrix_len, matrix_width))
    
    print 'Feature matrix size', feature_matrix.shape
    i = 1
    for feature in features:
        feature_matrix[:, i] = dataframe[feature]
        i = i +1
    return (feature_matrix, dataframe[label])

In [279]:
## Extract the feature matrix
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')


Feature matrix size (53072, 194)


##### Quiz question

In [280]:
print 'Number of feature in feature_matrix: ', feature_matrix.shape[1]

Number of feature in feature_matrix:  194


In [281]:
def predict_probability(feature_matrix, coefficients):
    return (1.0 / (1 + np.exp(-1 * np.dot(feature_matrix, coefficients))))

In [282]:
def feature_derivative(error, feature):
    #print feature
    return np.dot(feature, error)

The log-likelihood is computed using the following formula (see the advanced optional video if you are curious about the derivation of this equation):

ℓℓ(w)=∑i=1N((1[yi=+1]−1)w⊺h(wi)−ln(1+exp(−w⊺h(xi))))


In [283]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1).values.reshape(-1,1)
    scores = np.dot(feature_matrix, coefficients)
    lp = np.sum((indicator-1)*scores - np.log(1. + np.exp(-scores)))
    return lp

In [284]:
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients)
    itr = 0;
    while (itr < max_iter):
        predictions = predict_probability(feature_matrix, coefficients)
        indicator = (sentiment==+1).values.reshape(-1,1)
        
        #print 'indicator shape', indicator.shape
        #print 'predictions shape', predictions.shape
        
        error = indicator - predictions

        for i in xrange(len(coefficients)):
            derivative = feature_derivative(error, feature_matrix[:,i])
            coefficients[i] = coefficients[i] + step_size * derivative
    
        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print 'iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp)
       
        itr = itr +1
    return coefficients

In [297]:
initial_coefficients = np.zeros((feature_matrix.shape[1], 1))
step_size = 1e-7
max_iter = 301




In [298]:
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13434712
iteration   2: log likelihood of observed labels = -36769.35713564
iteration   3: log likelihood of observed labels = -36763.58603240
iteration   4: log likelihood of observed labels = -36757.82101962
iteration   5: log likelihood of observed labels = -36752.06207964
iteration   6: log likelihood of observed labels = -36746.30919497
iteration   7: log likelihood of observed labels = -36740.56234821
iteration   8: log likelihood of observed labels = -36734.82152213
iteration   9: log likelihood of observed labels = -36729.08669961
iteration  10: log likelihood of observed labels = -36723.35786366
iteration  11: log likelihood of observed labels = -36717.63499744
iteration  12: log likelihood of observed labels = -36711.91808422
iteration  13: log likelihood of observed labels = -36706.20710739
iteration  14: log likelihood of observed labels = -36700.5020

In [335]:
### Scores with estimated coff
scores = np.dot(feature_matrix, coefficients)
indicies = scores > 0
indicies_low = scores <= 0
scores[indicies] = 1
scores[indicies_low] = -1

#print scores.shape
#print sentiment.shape
#print indicies.shape
miss_classifications =  (abs(scores - sentiment.values.reshape(-1,1))).sum() /2

print 'Number of positive predictions: ',len(feature_matrix[indicies.flatten()])
print 'Number of miss_classifications: ', miss_classifications

Number of positive predictions:  25126
Number of miss_classifications:  13169.0


##### Quiz question

In [332]:
print 'Model accuracy: ', (len(feature_matrix) - miss_classifications) * 1.0/ len(feature_matrix)

Model accuracy:  0.751865390413


In [290]:
coefficients = list(coefficients[1:]) # exclude intercept
#print coefficients
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)

In [291]:
word_coefficient_tuples

[(u'great', array([ 0.06654608])),
 (u'love', array([ 0.06589076])),
 (u'easy', array([ 0.06479459])),
 (u'little', array([ 0.04543563])),
 (u'loves', array([ 0.0449764])),
 (u'well', array([ 0.030135])),
 (u'perfect', array([ 0.02973994])),
 (u'old', array([ 0.02007754])),
 (u'nice', array([ 0.01840871])),
 (u'daughter', array([ 0.0177032])),
 (u'soft', array([ 0.01757027])),
 (u'fits', array([ 0.01688247])),
 (u'happy', array([ 0.0168053])),
 (u'baby', array([ 0.0155657])),
 (u'recommend', array([ 0.01540845])),
 (u'also', array([ 0.0152162])),
 (u'best', array([ 0.01499179])),
 (u'comfortable', array([ 0.01325399])),
 (u'car', array([ 0.01268594])),
 (u'clean', array([ 0.01201817])),
 (u'son', array([ 0.01194482])),
 (u'bit', array([ 0.01170825])),
 (u'works', array([ 0.01170316])),
 (u'size', array([ 0.01071597])),
 (u'stroller', array([ 0.00990916])),
 (u'room', array([ 0.00978324])),
 (u'price', array([ 0.00957273])),
 (u'play', array([ 0.00917843])),
 (u'easily', array([ 0.00903

In [292]:
print 'Ten "most positive" words'
word_coefficient_tuples[0:10]

Ten "most positive" words


[(u'great', array([ 0.06654608])),
 (u'love', array([ 0.06589076])),
 (u'easy', array([ 0.06479459])),
 (u'little', array([ 0.04543563])),
 (u'loves', array([ 0.0449764])),
 (u'well', array([ 0.030135])),
 (u'perfect', array([ 0.02973994])),
 (u'old', array([ 0.02007754])),
 (u'nice', array([ 0.01840871])),
 (u'daughter', array([ 0.0177032]))]

In [293]:
print 'Ten "most negative" words'
word_coefficient_tuples[-11:-1]

Ten "most negative" words


[(u'waste', array([-0.02404275])),
 (u'monitor', array([-0.0244821])),
 (u'return', array([-0.02659278])),
 (u'back', array([-0.0277427])),
 (u'get', array([-0.02871155])),
 (u'disappointed', array([-0.02897898])),
 (u'even', array([-0.03005125])),
 (u'work', array([-0.03306952])),
 (u'money', array([-0.03898204])),
 (u'product', array([-0.04151103]))]