In [1]:
import pandas as pd
import numpy as np
import json
import string
import math

# Load review dataset

In [2]:
# Read the baby-product reviews into a data frame and preview the first rows.
reviews_csv_path = r"D:\New folder\amazon_baby.csv"
products = pd.read_csv(reviews_csv_path)
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [3]:
# Show the product names of the first ten rows.
first_ten_names = products['name'].iloc[:10].values
print("Name of the first 10 products:\n")
print(first_ten_names)

Name of the first 10 products:

["Stop Pacifier Sucking without tears with Thumbuddy To Love's Binky Fairy Puppet and Adorable Book"
 "Nature's Lullabies Second Year Sticker Calendar"
 "Nature's Lullabies Second Year Sticker Calendar"
 'Lamaze Peekaboo, I Love You'
 "SoftPlay Peek-A-Boo Where's Elmo A Children's Book"
 'Our Baby Girl Memory Book'
 'Hunnt&reg; Falling Flowers and Birds Kids Nursery Home Decor Vinyl Mural Art Wall Paper Stickers'
 'Blessed By Pope Benedict XVI Divine Mercy Full Color Medal'
 'Cloth Diaper Pins Stainless Steel Traditional Safety Pin (Black)'
 'Cloth Diaper Pins Stainless Steel Traditional Safety Pin (Black)']


In [4]:
# Count the rows whose sentiment label is +1 (vectorised sum of the boolean mask).
print("Number of positive reviews = ", (products['sentiment'] == 1).sum())

Number of positive reviws =  26579


In [5]:
# Count the rows whose sentiment label is -1; the label now says "negative"
# to match the condition being counted.
print("Number of negative reviews = ", (products['sentiment'] == -1).sum())

Number of positive reviws =  26493


# Apply text cleaning on the review data

In [6]:
# Read the list of words used as features from its JSON file.
with open(r"D:\New folder\important_words.json") as words_file:
    important_words = json.load(words_file)

In [7]:
# Check that the loaded JSON content is a Python list.
isinstance(important_words,list)

True

In [8]:
# Replace missing values in the 'review' column with the empty string so that
# every entry is a string before string methods are applied to it.
products = products.fillna({'review':''})  ## fill in N/A's in the review column

In [9]:
# Translation table that deletes every character in string.punctuation.
translation_table = str.maketrans("", "", string.punctuation)


def _strip_punctuation(text):
    """Return `text` with all punctuation characters removed."""
    return text.translate(translation_table)


products['review_clean'] = products['review'].apply(_strip_punctuation)

In [10]:
# For each word in important_words, count how many times the word occurs in
# the cleaned review and store that count in a column named after the word.
# Each review is tokenised once up front rather than being re-split for every
# word, which removes the repeated split work from the loop.
review_tokens = products['review_clean'].str.split()
for word in important_words:
    products[word] = review_tokens.apply(lambda tokens: tokens.count(word))

In [11]:
# Preview the frame again, now including 'review_clean' and the per-word count columns.
products.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [12]:
# Number of rows whose 'perfect' count is at least one.
contains_perfect = products['perfect'] >= 1
print("Number of reviews that contain the word perfect is", contains_perfect.sum())

Number of reviews that contain the word perfect is 2955


# Convert data frame to multi-dimensional array

In [13]:
def get_numpy_data(dataframe, features, label):
    '''
    Convert selected columns of a data frame into numpy arrays.

    dataframe: a data frame to be converted (left unmodified)
    features: a list of strings, containing the names of the columns that are used as features
    label: a string, containing the name of the single column that is used as the class label
    return: 2D array of features whose first column is a constant 1 (intercept),
            1D array of class labels
    '''
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
    feature_values = dataframe[list(features)].to_numpy()
    # Build the intercept column separately instead of writing a 'constant'
    # column into the caller's frame. result_type keeps an integer matrix for
    # integer features and promotes to float when the features are floats.
    intercept = np.ones(
        (feature_values.shape[0], 1),
        dtype=np.result_type(np.int64, feature_values.dtype),
    )
    features_matrix = np.hstack([intercept, feature_values])
    label_array = dataframe[label].to_numpy()
    return features_matrix, label_array

In [14]:
# Build the feature matrix (intercept column plus one column per important word)
# and the label array taken from the 'sentiment' column.
feature_matrix, sentiment=get_numpy_data(products, important_words, 'sentiment')

In [15]:
# Inspect the feature matrix.
print(feature_matrix)

[[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [1 0 1 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [16]:
# Inspect the label array.
print(sentiment)

[ 1  1  1 ... -1 -1 -1]


In [17]:
# The column count of feature_matrix includes the intercept column.
print("number of features in feature_matrix is",feature_matrix.shape[1])

number of features in feature_matrix is 194


# Estimating conditional probability

# $\textsf{Pr}(y_i\ =\ +1|\textbf{x}_i,\ \textbf{w})\ =\ \frac{1}{1+\exp(-\textbf{w}^Th(\textbf{x}_i))}$

In [18]:
def predict_probability(feature_matrix, coefficients):
    '''
    Probability of the +1 class for every row of the feature matrix.

    feature_matrix: 2D array of features, one row per data point
    coefficients: 1D array of coefficients, one per feature column
    return: 1D array with 1 / (1 + exp(-score)) for each row's score
    '''
    # Score of a row is its dot product with the coefficients; the sigmoid
    # maps that score into a probability.
    return 1 / (1 + np.exp(-feature_matrix.dot(coefficients)))

#### derivative of log likelihood with respect to a single coefficient
## $\frac{\partial ll(\textbf{w})}{\partial w_j} = \sum_{i=1}^{N}h_j(\textbf{x}_i)(I[y_i=+1]-Pr(y_i=+1|\textbf{x}_i,\textbf{w}))$
#### where,
#### $I[y_i=+1] = \begin{cases} 1, & \mbox{if } sentiment_i\mbox{ is +1} \\ 0, & \mbox{if } sentiment_i\mbox{ is -1} \end{cases}\\and\ sentiment_i\ is\ the\ sentiment\ value\ of\ i^{th}\ product$

In [19]:
def feature_derivative(errors,feature_matrix):
    '''
    Derivative of the log likelihood with respect to the coefficients.

    errors: 1D array, one entry per data point
    feature_matrix: 2D array of features, one row per data point
    return: dot product of errors with feature_matrix (one entry per feature column)
    '''
    return np.dot(errors, feature_matrix)

#### log likelihood function
## $ll(\textbf{w}) = \sum_{i=1}^{N}\left((I[y_i=+1]-1)\textbf{w}^Th(\textbf{x}_i)-\ln(1+\exp(-\textbf{w}^Th(\textbf{x}_i)))\right)$

In [20]:
def compute_log_liklihood(feature_matrix,indicator,coefficients):
    '''
    Log likelihood of the observed labels under the logistic model.

    feature_matrix: 2D array of features, one row per data point
    indicator: 1D array that is 1 (True) where the label is +1 and 0 (False) otherwise
    coefficients: 1D array of coefficients, one per feature column
    return: sum over data points of (indicator - 1) * score - ln(1 + exp(-score))
    '''
    scores = np.dot(feature_matrix, coefficients)
    # np.logaddexp(0, -s) equals ln(1 + exp(-s)) but does not overflow to inf
    # for strongly negative scores, which would turn the whole sum into -inf.
    return np.sum((indicator - 1) * scores - np.logaddexp(0, -scores))

# Taking gradient steps

In [21]:
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    '''
    Fit logistic-regression coefficients by gradient ascent on the log likelihood.

    feature_matrix: 2D array of features, one row per data point
    sentiment: 1D array of class labels; entries equal to 1 form the positive class
    initial_coefficients: starting values of the coefficients
    step_size: factor applied to the derivative at every update
    max_iter: number of update steps to perform
    return: coefficients after the last update
    '''
    def _is_report_step(step):
        # Report every step up to 15, then at progressively sparser intervals.
        if step <= 15:
            return True
        for limit, stride in ((100, 10), (1000, 100), (10000, 1000)):
            if step <= limit and step % stride == 0:
                return True
        return step % 10000 == 0

    weights = np.array(initial_coefficients)
    is_positive = (sentiment == 1)
    for step in range(max_iter):
        # Error = indicator of a positive label minus the predicted probability of +1.
        residuals = is_positive - predict_probability(feature_matrix, weights)
        weights = weights + (step_size * feature_derivative(residuals, feature_matrix))

        # Print the log likelihood on reporting steps so its progress can be checked.
        if _is_report_step(step):
            ll = compute_log_liklihood(feature_matrix, is_positive, weights)
            field_width = int(np.ceil(np.log10(max_iter)))
            print('iteration %*d: log likelihood of observed labels = %.8f'
                  % (field_width, step, ll))
    return weights

In [22]:
# Start from all-zero coefficients, one per column of feature_matrix (intercept
# included). The length is derived from the matrix so it cannot drift out of
# sync with the number of important words.
initial_coefficients = np.zeros(feature_matrix.shape[1])
step_size = 1e-7   # factor applied to the derivative at each update
max_iter = 301     # number of gradient steps

In [23]:
# Run gradient ascent from the initial coefficients; the log likelihood is printed on reporting steps.
coefficients=logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13434712
iteration   2: log likelihood of observed labels = -36769.35713564
iteration   3: log likelihood of observed labels = -36763.58603240
iteration   4: log likelihood of observed labels = -36757.82101962
iteration   5: log likelihood of observed labels = -36752.06207964
iteration   6: log likelihood of observed labels = -36746.30919497
iteration   7: log likelihood of observed labels = -36740.56234821
iteration   8: log likelihood of observed labels = -36734.82152213
iteration   9: log likelihood of observed labels = -36729.08669961
iteration  10: log likelihood of observed labels = -36723.35786366
iteration  11: log likelihood of observed labels = -36717.63499744
iteration  12: log likelihood of observed labels = -36711.91808422
iteration  13: log likelihood of observed labels = -36706.20710739
iteration  14: log likelihood of observed labels = -36700.5020

# Predicting sentiments

In [24]:
# Score of every data point: dot product of its feature row with the learned coefficients.
scores_hat = np.dot(feature_matrix, coefficients)

In [25]:
# Predict +1 where the score is strictly positive and -1 otherwise, using a
# vectorised selection instead of a Python-level loop over the array.
predicted_sentiment = np.where(scores_hat > 0, 1, -1)

In [26]:
# Inspect the predicted labels.
predicted_sentiment

array([ 1, -1,  1, ..., -1,  1, -1])

In [27]:
# Number of data points predicted as +1.
print("predicted reviews with positive sentiment =", np.count_nonzero(predicted_sentiment == 1))

predicted reviews with positive sentiment = 25126


# Measuring accuracy

In [28]:
# Accuracy = fraction of data points whose predicted label equals the true label.
is_correct = (predicted_sentiment == sentiment)
print("accuracy =", float(np.mean(is_correct)))

accuracy = 0.7518653904130238


In [29]:
# Pair every important word with its learned coefficient, largest first.
# Index 0 of `coefficients` is the intercept and has no word, so it is skipped.
# The slice is stored under its own name instead of overwriting `coefficients`:
# re-running this cell can then never drop a further entry and misalign the
# words with their coefficients.
word_coefficients = list(coefficients[1:])
word_coefficient_tuples = sorted(
    zip(important_words, word_coefficients),
    key=lambda pair: pair[1],
    reverse=True,
)

In [30]:
# The ten words with the largest coefficients.
word_coefficient_tuples[:10]

[('great', 0.06654608417045771),
 ('love', 0.06589076292212327),
 ('easy', 0.06479458680257838),
 ('little', 0.04543562630842137),
 ('loves', 0.04497640139490603),
 ('well', 0.03013500109210706),
 ('perfect', 0.02973993710496846),
 ('old', 0.02007754103477538),
 ('nice', 0.01840870799526899),
 ('daughter', 0.0177031999057017)]

In [31]:
# Same (word, coefficient) pairs, ordered from the smallest coefficient upward.
word_coefficient_tuples_negative = sorted(word_coefficient_tuples, key=lambda pair: pair[1])

In [32]:
# The ten words with the smallest (most negative) coefficients.
word_coefficient_tuples_negative[:10]

[('would', -0.05386014844520313),
 ('product', -0.04151103339210889),
 ('money', -0.038982037286487116),
 ('work', -0.033069515294752716),
 ('even', -0.030051249236035808),
 ('disappointed', -0.028978976142317068),
 ('get', -0.028711552980192578),
 ('back', -0.02774269723066133),
 ('return', -0.026592778462247283),
 ('monitor', -0.02448210054589172)]