In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import string

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


#Visualizations library
import seaborn as sns 
import matplotlib.pyplot as plt


#Stats library
from scipy import stats
from scipy.stats import norm, skew 

#Preprocessing Libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from scipy.special import boxcox1p
from sklearn.feature_extraction.text import CountVectorizer


# Modelling libraries
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, LassoCV, Ridge, LogisticRegression
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.model_selection import GridSearchCV 

import os
# print(os.listdir("../House Price Predictor"))

# Any results you write to the current directory are saved as output.

In [2]:
# Load the review table from a CSV file located in the working directory.
products = pd.read_csv('amazon_baby_subset.csv')

In [3]:
products

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1
5,Our Baby Girl Memory Book,"Beautiful book, I love it to record cherished ...",5,1
6,Hunnt&reg; Falling Flowers and Birds Kids Nurs...,"Try this out for a spring project !Easy ,fun a...",5,1
7,Blessed By Pope Benedict XVI Divine Mercy Full...,very nice Divine Mercy Pendant of Jesus now on...,5,1
8,Cloth Diaper Pins Stainless Steel Traditional ...,We bought the pins as my 6 year old Autistic s...,4,1
9,Cloth Diaper Pins Stainless Steel Traditional ...,It has been many years since we needed diaper ...,5,1


In [4]:
# Load the word list used as features; it is read into a frame whose single
# column is labelled 0 (accessed later as importantWords[0]).
importantWords = pd.read_json('important_words.json')

In [13]:
importantWords

Unnamed: 0,0
0,baby
1,one
2,great
3,love
4,use
5,would
6,like
7,easy
8,little
9,seat


In [6]:
products.isnull().sum()

name          90
review       241
rating         0
sentiment      0
dtype: int64

In [7]:
# Replace missing review text with an empty string so the string operations
# applied to the column later never receive NaN.
products = products.fillna({'review': ''})

In [8]:
def removePunctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        A copy of ``text`` without punctuation; all other characters
        (including whitespace) are kept unchanged.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

In [9]:
# Store a punctuation-free copy of every review in a new column.
products['reviewClean'] = products['review'].map(removePunctuation)

In [14]:
# Tokenise each cleaned review exactly once. Splitting is independent of the
# word being counted, so doing it inside the loop repeated the same work for
# every important word.
reviewTokens = products['reviewClean'].str.split()

# Add one column per important word holding the number of whole-token
# occurrences of that word in the review.
for word in importantWords[0]:
    products[word] = [tokens.count(word) for tokens in reviewTokens]

baby
one
great
love
use
would
like
easy
little
seat
old
well
get
also
really
son
time
bought
product
good
daughter
much
loves
stroller
put
months
car
still
back
used
recommend
first
even
perfect
nice
bag
two
using
got
fit
around
diaper
enough
month
price
go
could
soft
since
buy
room
works
made
child
keep
size
small
need
year
big
make
take
easily
think
crib
clean
way
quality
thing
better
without
set
new
every
cute
best
bottles
work
purchased
right
lot
side
happy
comfortable
toy
able
kids
bit
night
long
fits
see
us
another
play
day
money
monitor
tried
thought
never
item
hard
plastic
however
disappointed
reviews
something
going
pump
bottle
cup
waste
return
amazon
different
top
want
problem
know
water
try
received
sure
times
chair
find
hold
gate
open
bottom
away
actually
cheap
worked
getting
ordered
came
milk
bad
part
worth
found
cover
many
design
looking
weeks
say
wanted
look
place
purchase
looks
second
piece
box
pretty
trying
difficult
together
though
give
started
anything
last
company
c

In [20]:
products.head()

Unnamed: 0,name,review,rating,sentiment,reviewClean,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [24]:
# Count the reviews in which the word "perfect" occurs at least once.
hasPerfect = products['perfect'] > 0
products.loc[hasPerfect, 'review'].count()

2955

In [33]:
def convertToArray(df, feats, label):
    """Extract a feature matrix (with intercept column) and a label array.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. A column named ``'constant'`` filled with 1 is added to
        (or overwritten in) this frame as a side effect.
    feats : sequence of str
        Names of the feature columns, in the order they should appear after
        the intercept column.
    label : str
        Name of the column holding the target values.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``featMatrix`` with one row per row of ``df`` and columns
        ``['constant'] + feats``, and ``labelArray`` with the values of
        ``df[label]``.
    """
    df['constant'] = 1
    # list(...) lets callers pass a tuple or Index, not only a list.
    columns = ['constant'] + list(feats)
    # DataFrame.as_matrix() was deprecated and later removed from pandas;
    # to_numpy() is the supported way to obtain the underlying array.
    featMatrix = df[columns].to_numpy()
    labelArray = df[label].to_numpy()
    return featMatrix, labelArray

In [30]:
# Plain Python list of the feature words, in file order.
importantWordsList = importantWords[0].tolist()

In [34]:
# Build the model inputs: a matrix with an intercept column followed by one
# word-count column per important word, and the matching 'sentiment' labels.
featMatrix, labelArray = convertToArray(products, importantWordsList, 'sentiment')

  after removing the cwd from sys.path.
  """


In [38]:
featMatrix.shape

(53072, 194)

In [75]:
def predictProbability(featMatrix, coefs):
    """Apply the logistic (sigmoid) link to the linear scores of each row.

    Parameters
    ----------
    featMatrix : array-like
        Feature values, one row per example.
    coefs : array-like
        One coefficient per column of ``featMatrix``.

    Returns
    -------
    numpy.ndarray
        ``1 / (1 + exp(-featMatrix . coefs))`` for every row.
    """
    linear = np.dot(featMatrix, coefs)
    return 1 / (np.exp(-linear) + 1)
    

In [110]:
def featureDerivative(errors, feature):
    """Return the dot product of ``errors`` with ``feature``.

    In the gradient-ascent loop of this notebook, ``errors`` is the
    per-example difference between the positive-label indicator and the
    predicted probability, and the result is used as the derivative of the
    log likelihood with respect to the coefficient of ``feature``.

    Parameters
    ----------
    errors : array-like
        One error value per example.
    feature : array-like
        Feature values aligned with ``errors``.

    Returns
    -------
    The value of ``np.dot(errors, feature)``.
    """
    return np.dot(errors, feature)

In [111]:
def computeLogLikelihood(featMatrix, sentiment, coefs):
    """Log likelihood of the observed labels under a logistic model.

    Parameters
    ----------
    featMatrix : array-like
        Feature values, one row per example.
    sentiment : array-like
        Labels, one per row; entries equal to +1 are treated as the positive
        class, everything else as the negative class.
    coefs : array-like
        One coefficient per column of ``featMatrix``.

    Returns
    -------
    float
        ``sum((1[y == +1] - 1) * score - log(1 + exp(-score)))`` over all rows.
    """
    # np.asarray makes the comparison element-wise for lists as well as for
    # arrays / Series, and avoids any index alignment.
    indicator = (np.asarray(sentiment) == +1)
    scores = np.dot(featMatrix, coefs)
    # log(1 + exp(-s)) computed as logaddexp(0, -s): the naive form overflows
    # to +inf for very negative scores and turns the whole sum into -inf.
    logTerm = np.logaddexp(0., -scores)
    lp = np.sum((indicator - 1) * scores - logTerm)
    return lp

In [112]:
def logisticRegression(featMatrix, sentiment, initCoefs, stepSize, maxIter):
    """Fit logistic-regression coefficients by batch gradient ascent.

    Parameters
    ----------
    featMatrix : array-like
        Feature values, one row per example.
    sentiment : array-like
        Labels, one per row; entries equal to +1 form the positive class.
    initCoefs : array-like
        Starting coefficients, one per column of ``featMatrix``. Not modified.
    stepSize : float
        Multiplier applied to the gradient at every update.
    maxIter : int
        Number of gradient-ascent iterations to run.

    Returns
    -------
    numpy.ndarray
        The coefficients after ``maxIter`` updates.

    Progress (the log likelihood) is printed on a thinning schedule: every
    iteration up to 15, then every 10th up to 100, every 100th up to 1000,
    every 1000th up to 10000, and every 10000th afterwards.
    """
    # Force a float copy: with integer initial values np.array would pick an
    # integer dtype and every (tiny) update would be truncated back to 0.
    coefs = np.array(initCoefs, dtype=float)
    # The positive-label indicator does not change between iterations.
    indicators = (np.asarray(sentiment) == +1)

    for i in range(maxIter):
        predictions = predictProbability(featMatrix, coefs)
        errors = indicators - predictions
        # Dotting the errors with the whole matrix yields the derivative for
        # every coefficient at once; all of them use the same `errors`, as in
        # a coefficient-by-coefficient loop.
        gradient = featureDerivative(errors, featMatrix)
        coefs = coefs + stepSize * gradient

        shouldReport = (
            i <= 15
            or (i <= 100 and i % 10 == 0)
            or (i <= 1000 and i % 100 == 0)
            or (i <= 10000 and i % 1000 == 0)
            or i % 10000 == 0
        )
        if shouldReport:
            lp = computeLogLikelihood(featMatrix, sentiment, coefs)
            print('iteration %*d: log likelihood of observed labels = %.8f' % (int(np.ceil(np.log10(maxIter))), i, lp))

    return coefs

In [113]:
# Start from all-zero coefficients, one per column of featMatrix. The length
# is taken from the matrix itself so it stays correct if the word list changes.
initCoefs = [0.0] * featMatrix.shape[1]
stepSize = 1e-7
maxIter = 301
coefs = logisticRegression(featMatrix, products.sentiment, initCoefs, stepSize, maxIter)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13434712
iteration   2: log likelihood of observed labels = -36769.35713564
iteration   3: log likelihood of observed labels = -36763.58603240
iteration   4: log likelihood of observed labels = -36757.82101962
iteration   5: log likelihood of observed labels = -36752.06207964
iteration   6: log likelihood of observed labels = -36746.30919497
iteration   7: log likelihood of observed labels = -36740.56234821
iteration   8: log likelihood of observed labels = -36734.82152213
iteration   9: log likelihood of observed labels = -36729.08669961
iteration  10: log likelihood of observed labels = -36723.35786366
iteration  11: log likelihood of observed labels = -36717.63499744
iteration  12: log likelihood of observed labels = -36711.91808422
iteration  13: log likelihood of observed labels = -36706.20710739
iteration  14: log likelihood of observed labels = -36700.5020

In [152]:
# Linear score of every review under the fitted coefficients.
scores = np.dot(featMatrix, coefs)
# Predict +1 for a strictly positive score and -1 otherwise, in one
# vectorised step instead of a per-element Python callback.
prediction = pd.Series(np.where(scores > 0, 1, -1), dtype='int64')

In [157]:
# Boolean mask marking the reviews that were predicted as positive (+1).
posPredict = (prediction == 1)

In [158]:
np.sum(posPredict)

25126

In [154]:
# Element-wise comparison of predicted and true labels: True where they agree.
# NOTE(review): despite its name, `err` marks correct predictions, not errors.
err = (pd.Series(prediction) == products.sentiment)

In [155]:
# Number of reviews whose predicted label matches the true sentiment
# (True counts as 1 when a boolean Series is summed).
err.sum()

39903

In [156]:
# Accuracy = matching predictions / total reviews, computed from `err`
# instead of from numbers copied out of earlier cell outputs.
err.sum() / len(err)

0.7518653904130238

In [167]:
coefs

array([ 5.16220157e-03,  1.55656966e-02, -8.50204675e-03,  6.65460842e-02,
        6.58907629e-02,  5.01743882e-03, -5.38601484e-02, -3.50488413e-03,
        6.47945868e-02,  4.54356263e-02,  3.98353364e-03,  2.00775410e-02,
        3.01350011e-02, -2.87115530e-02,  1.52161964e-02,  2.72592062e-04,
        1.19448177e-02, -1.82461935e-02, -1.21706420e-02, -4.15110334e-02,
        2.76820391e-03,  1.77031999e-02, -4.39700067e-03,  4.49764014e-02,
        9.90916464e-03,  8.99239081e-04, -1.36219516e-03,  1.26859357e-02,
        8.26466695e-03, -2.77426972e-02,  6.10128809e-04,  1.54084501e-02,
       -1.32134753e-02, -3.00512492e-02,  2.97399371e-02,  1.84087080e-02,
        2.86178752e-03, -1.05768015e-02, -6.57350362e-04, -1.01476555e-02,
       -4.79579528e-03,  7.50891810e-03,  4.27938289e-03,  3.06785501e-03,
       -2.20317661e-03,  9.57273354e-03,  9.91666827e-05, -1.98462567e-02,
        1.75702722e-02,  1.55478612e-03, -1.77375440e-02,  9.78324102e-03,
        1.17031606e-02, -

In [168]:
# Drop the first entry, which belongs to the 'constant' (intercept) column, so
# the remaining weights line up one-to-one with the important words.
# NOTE(review): this rebinds `coefs`; re-running the cell removes one more
# entry each time, and the full coefficient array is no longer available.
coefs = list(coefs[1:])


In [183]:
# Pair every important word with its weight (assumes `coefs` no longer holds
# the intercept entry) and order the pairs from largest to smallest weight.
wordCoef = sorted(
    zip(importantWords[0], coefs),
    key=lambda pair: pair[1],
    reverse=True,
)

In [179]:
# First ten (word, weight) pairs of the current ordering; with the list sorted
# by descending weight these are the most positive words.
wordCoef[:10]

[('great', 0.0665460842),
 ('love', 0.0658907629),
 ('easy', 0.0647945868),
 ('little', 0.0454356263),
 ('loves', 0.0449764014),
 ('well', 0.0301350011),
 ('perfect', 0.0297399371),
 ('old', 0.020077541),
 ('nice', 0.018408708),
 ('daughter', 0.0177031999),
 ('soft', 0.0175702722),
 ('fits', 0.0168824711),
 ('happy', 0.0168052959),
 ('baby', 0.0155656966),
 ('recommend', 0.0154084501),
 ('also', 0.0152161964),
 ('best', 0.0149917916),
 ('comfortable', 0.0132539901),
 ('car', 0.0126859357),
 ('clean', 0.0120181744),
 ('son', 0.0119448177),
 ('bit', 0.0117082481),
 ('works', 0.0117031606),
 ('size', 0.0107159665),
 ('stroller', 0.00990916464),
 ('room', 0.00978324102),
 ('price', 0.00957273354),
 ('play', 0.00917842898),
 ('easily', 0.00903281814),
 ('kids', 0.008582843),
 ('still', 0.00826466695),
 ('lot', 0.00799938935),
 ('around', 0.0075089181),
 ('need', 0.00717190727),
 ('take', 0.00671012331),
 ('keep', 0.00643766808),
 ('crib', 0.00600279979),
 ('without', 0.00592353611),
 ('year

In [180]:
# Re-order the (word, weight) pairs from smallest to largest weight.
wordCoef = sorted(wordCoef, key=lambda pair: pair[1])

In [182]:
# First ten (word, weight) pairs of the current ordering; with the list sorted
# by ascending weight these are the most negative words.
wordCoef[:10]

[('would', -0.0538601484),
 ('product', -0.0415110334),
 ('money', -0.0389820373),
 ('work', -0.0330695153),
 ('even', -0.0300512492),
 ('disappointed', -0.0289789761),
 ('get', -0.028711553),
 ('back', -0.0277426972),
 ('return', -0.0265927785),
 ('monitor', -0.0244821005)]