In [13]:
import pandas as pd
import numpy as np

In [14]:
products = pd.read_csv('amazon_baby_subset.csv')
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [15]:
print len(products[products['sentiment']==1])
print len(products[products['sentiment']==-1])

26579
26493


In [8]:
important_words = ["baby", "one", "great", "love", "use", "would", "like", "easy", "little", "seat", "old", "well", "get", "also", "really", "son", "time", "bought", "product", "good", "daughter", "much", "loves", "stroller", "put", "months", "car", "still", "back", "used", "recommend", "first", "even", "perfect", "nice", "bag", "two", "using", "got", "fit", "around", "diaper", "enough", "month", "price", "go", "could", "soft", "since", "buy", "room", "works", "made", "child", "keep", "size", "small", "need", "year", "big", "make", "take", "easily", "think", "crib", "clean", "way", "quality", "thing", "better", "without", "set", "new", "every", "cute", "best", "bottles", "work", "purchased", "right", "lot", "side", "happy", "comfortable", "toy", "able", "kids", "bit", "night", "long", "fits", "see", "us", "another", "play", "day", "money", "monitor", "tried", "thought", "never", "item", "hard", "plastic", "however", "disappointed", "reviews", "something", "going", "pump", "bottle", "cup", "waste", "return", "amazon", "different", "top", "want", "problem", "know", "water", "try", "received", "sure", "times", "chair", "find", "hold", "gate", "open", "bottom", "away", "actually", "cheap", "worked", "getting", "ordered", "came", "milk", "bad", "part", "worth", "found", "cover", "many", "design", "looking", "weeks", "say", "wanted", "look", "place", "purchase", "looks", "second", "piece", "box", "pretty", "trying", "difficult", "together", "though", "give", "started", "anything", "last", "company", "come", "returned", "maybe", "took", "broke", "makes", "stay", "instead", "idea", "head", "said", "less", "went", "working", "high", "unit", "seems", "picture", "completely", "wish", "buying", "babies", "won", "tub", "almost", "either"]

In [19]:
products['review'] = products['review'].fillna('')

In [20]:
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)

products['review_clean'] = products['review'].apply(remove_punctuation)

In [21]:
products.head()

Unnamed: 0,name,review,rating,sentiment,review_clean
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...


In [22]:
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

In [26]:
len(products[products['perfect']>0]) # number of reviews that contain the word 'perfect'

2955

In [27]:
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    feature_matrix = dataframe[['constant'] + features].as_matrix()
    label_matrix = dataframe[label].as_matrix()
    return (feature_matrix, label_matrix)

In [28]:
(feature_matrix, sentiment) = get_numpy_data(products,important_words,'sentiment')

In [30]:
len(feature_matrix[1,:]) # number of features

194

In [45]:
def predict_probability(feature_matrix,coefficients):
    score = np.dot(feature_matrix,coefficients)
    prob_prediction = 1/(1. + np.exp(-score))
    return prob_prediction

In [32]:
def feature_derivative(errors,feature):
    derivative = np.dot(feature,errors)
    return derivative

In [38]:
def compute_log_likelihood(feature_matrix,sentiment,coefficients):
    scores = np.dot(feature_matrix, coefficients)
    indicator = (sentiment == 1)
    llh = np.sum((indicator-1)*scores - np.log(1.+np.exp(-scores)))
    return llh

In [50]:
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    for itr in xrange(max_iter):
        # Predict P(y_i = +1|x_1,w) using your predict_probability() function
        # YOUR CODE HERE
        predictions = predict_probability(feature_matrix,initial_coefficients)

        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)

        # Compute the errors as indicator - predictions
        errors = indicator - predictions

        for j in xrange(len(coefficients)): # loop over each coefficient
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j]
            # compute the derivative for coefficients[j]. Save it in a variable called derivative
            # YOUR CODE HERE
            derivative = feature_derivative(errors,feature_matrix[:,j])

            # add the step size times the derivative to the current coefficient
            # YOUR CODE HERE
            coefficients[j] += step_size * derivative

        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print 'iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp)
    return coefficients

In [51]:
coefficients = logistic_regression(feature_matrix,sentiment,np.zeros(194),1e-7,301)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13127954
iteration   2: log likelihood of observed labels = -36769.34795095
iteration   3: log likelihood of observed labels = -36763.56769899
iteration   4: log likelihood of observed labels = -36757.79052366
iteration   5: log likelihood of observed labels = -36752.01642492
iteration   6: log likelihood of observed labels = -36746.24540276
iteration   7: log likelihood of observed labels = -36740.47745714
iteration   8: log likelihood of observed labels = -36734.71258803
iteration   9: log likelihood of observed labels = -36728.95079539
iteration  10: log likelihood of observed labels = -36723.19207918
iteration  11: log likelihood of observed labels = -36717.43643934
iteration  12: log likelihood of observed labels = -36711.68387583
iteration  13: log likelihood of observed labels = -36705.93438858
iteration  14: log likelihood of observed labels = -36700.1879

In [56]:
products['prediction'] = products.apply(lambda row: +1 if np.dot(get_numpy_data(row, important_words, sentiment)[0], coefficients) > 0 else -1, axis=1)

In [66]:
print len(products[products['prediction'] == +1])
print len(products[products['prediction'] == -1])

21348
31724


In [58]:
products['correctly_classified'] = products.apply(lambda row: +1 if row['sentiment'] == row['prediction'] else 0, axis=1)

In [60]:
len(products[products['correctly_classified'] == 1]) / float(len(products)) # accuracy

0.7409745251733494

In [61]:
coefficients = list(coefficients[1:]) # exclude intercept
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)

In [62]:
word_coefficient_tuples[:10]

[('great', 0.069275149999999647),
 ('love', 0.069004250000000072),
 ('easy', 0.06748420000000005),
 ('little', 0.046790450000000261),
 ('loves', 0.046414200000000044),
 ('well', 0.030355849999999917),
 ('perfect', 0.030355849999999917),
 ('old', 0.020272350000000126),
 ('nice', 0.018481399999999922),
 ('soft', 0.017954649999999985)]

In [63]:
word_coefficient_tuples[-10:]

[('return', -0.027977950000000137),
 ('monitor', -0.028324099999999935),
 ('disappointed', -0.030054849999999821),
 ('back', -0.031589949999999895),
 ('even', -0.03347119999999984),
 ('get', -0.03396785000000007),
 ('work', -0.036195249999999971),
 ('money', -0.04141760000000029),
 ('product', -0.047452650000000124),
 ('would', -0.063811999999999619)]