In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import string

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


#Visualizations library
import seaborn as sns 
import matplotlib.pyplot as plt


#Stats library
from scipy import stats
from scipy.stats import norm, skew 

#Preprocessing Libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from scipy.special import boxcox1p
from sklearn.feature_extraction.text import CountVectorizer


# Modelling libraries
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, LassoCV, Ridge, LogisticRegression
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.model_selection import GridSearchCV 

import os
# print(os.listdir("../House Price Predictor"))

# Any results you write to the current directory are saved as output.

In [2]:
products = pd.read_csv('amazon_baby_subset.csv')

In [3]:
products

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1
5,Our Baby Girl Memory Book,"Beautiful book, I love it to record cherished ...",5,1
6,Hunnt&reg; Falling Flowers and Birds Kids Nurs...,"Try this out for a spring project !Easy ,fun a...",5,1
7,Blessed By Pope Benedict XVI Divine Mercy Full...,very nice Divine Mercy Pendant of Jesus now on...,5,1
8,Cloth Diaper Pins Stainless Steel Traditional ...,We bought the pins as my 6 year old Autistic s...,4,1
9,Cloth Diaper Pins Stainless Steel Traditional ...,It has been many years since we needed diaper ...,5,1


In [3]:
importantWords = pd.read_json('important_words.json')

In [13]:
importantWords

Unnamed: 0,0
0,baby
1,one
2,great
3,love
4,use
5,would
6,like
7,easy
8,little
9,seat


In [11]:
trainInd = pd.read_json('module-4-assignment-train-idx.json')
testInd = pd.read_json('module-4-assignment-validation-idx.json')

In [4]:
products.isnull().sum()

name          90
review       241
rating         0
sentiment      0
dtype: int64

In [6]:
products.fillna({'review': ''}, inplace=True)

In [7]:
def removePunctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table that maps a character to None deletes that character.
    deletionTable = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletionTable)

In [8]:
products['reviewClean'] = products.review.apply(removePunctuation)

In [9]:
# Tokenize each cleaned review once; the original re-split every review for
# every important word, which repeats identical work on each pass.
reviewTokens = products['reviewClean'].apply(str.split)
for word in importantWords[0]:
    # One count column per important word. `target=word` binds the current
    # word to the lambda explicitly instead of relying on the loop variable.
    products[word] = reviewTokens.apply(lambda tokens, target=word: tokens.count(target))

In [10]:
products.head()

Unnamed: 0,name,review,rating,sentiment,reviewClean,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [24]:
# Number of reviews that contain the word "perfect" at least once.
products[products.perfect > 0].review.count()

2955

In [16]:
def convertToArray(df, feats, label):
    """Build a NumPy feature matrix (with intercept column) and a label array.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. NOTE: a ``'constant'`` column filled with 1 is written
        onto this frame in place (kept from the original behaviour).
    feats : iterable of column names
        Feature columns to extract, in order.
    label : column name
        Column holding the target values.

    Returns
    -------
    (featMatrix, labelArray)
        ``featMatrix`` has ``'constant'`` as its first column followed by
        ``feats``; ``labelArray`` holds the values of ``df[label]``.
    """
    df['constant'] = 1
    # list(...) lets callers pass any iterable of names, not only a list.
    featureColumns = ['constant'] + list(feats)
    # `.as_matrix()` was deprecated and then removed in pandas 1.0;
    # `.values` returns the same ndarray on old and new pandas alike.
    featMatrix = df[featureColumns].values
    labelArray = df[label].values
    return (featMatrix, labelArray)

In [19]:
importantWordsList = list(importantWords[0])

In [34]:
featMatrix, labelArray = convertToArray(products, importantWordsList, 'sentiment')

  after removing the cwd from sys.path.
  """


In [38]:
featMatrix.shape

(53072, 194)

In [23]:
def predictProbability(featMatrix, coefs):
    """Apply the logistic (sigmoid) link to the linear score ``featMatrix . coefs``."""
    linearScore = np.dot(featMatrix, coefs)
    return 1 / (1 + np.exp(-linearScore))
    

In [110]:
def featureDerivative(errors, feature):
    """Return the dot product of ``errors`` with ``feature``."""
    return np.dot(errors, feature)

In [111]:
def computeLogLikelihood(featMatrix, sentiment, coefs):
    """Log likelihood of the observed labels under the logistic model.

    Parameters
    ----------
    featMatrix : array whose dot product with ``coefs`` gives one score per example
    sentiment : array-like of labels; entries equal to +1 are the positive class
    coefs : coefficient vector

    Returns
    -------
    The sum over examples of ``(indicator - 1) * score - log(1 + exp(-score))``.
    """
    indicator = (np.asarray(sentiment) == +1)
    scores = np.dot(featMatrix, coefs)
    # log(1 + exp(-s)) is evaluated as logaddexp(0, -s). The naive form
    # overflows exp() for very negative scores and turns the sum into -inf.
    lp = np.sum((indicator - 1) * scores - np.logaddexp(0, -scores))
    return lp

In [112]:
def logisticRegression(featMatrix, sentiment, initCoefs, stepSize, maxIter):
    """Fit logistic-regression coefficients by gradient ascent.

    Parameters
    ----------
    featMatrix : 2-D array, one column per coefficient
    sentiment : array-like of labels; entries equal to +1 are the positive class
    initCoefs : sequence of starting coefficients (copied, never modified)
    stepSize : multiplier applied to each derivative
    maxIter : number of passes over the coefficients

    Returns
    -------
    numpy.ndarray of fitted coefficients.

    The log likelihood is printed on a thinning schedule: every iteration up
    to 15, then every 10th up to 100, every 100th up to 1000, and so on.
    """
    # Force a float copy. Integer initCoefs (e.g. [0] * n) would otherwise
    # produce an integer array and truncate every fractional update below.
    coefs = np.array(initCoefs, dtype=float)
    # The labels never change, so the indicator is built once, not per pass.
    indicators = (np.asarray(sentiment) == +1)
    for i in range(maxIter):
        predictions = predictProbability(featMatrix, coefs)
        errors = indicators - predictions
        # `errors` is fixed for the whole pass, so all coefficients are
        # updated from the same predictions (a simultaneous update).
        for j in range(len(coefs)):
            derivative = featureDerivative(errors, featMatrix[:, j])
            coefs[j] = coefs[j] + (stepSize * derivative)
        if i <= 15 or (i <= 100 and i % 10 == 0) or (i <= 1000 and i % 100 == 0) or (i <= 10000 and i % 1000 == 0) or i % 10000 == 0:
            lp = computeLogLikelihood(featMatrix, sentiment, coefs)
            print('iteration %*d: log likelihood of observed labels = %.8f' % (int(np.ceil(np.log10(maxIter))), i, lp))

    return coefs

In [113]:
# One starting coefficient per feature-matrix column (intercept included),
# taken from the matrix itself rather than a hand-copied width.
initCoefs = [0.0] * featMatrix.shape[1]
stepSize = 1e-7
maxIter = 301
coefs = logisticRegression(featMatrix, products.sentiment, initCoefs, stepSize, maxIter)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13434712
iteration   2: log likelihood of observed labels = -36769.35713564
iteration   3: log likelihood of observed labels = -36763.58603240
iteration   4: log likelihood of observed labels = -36757.82101962
iteration   5: log likelihood of observed labels = -36752.06207964
iteration   6: log likelihood of observed labels = -36746.30919497
iteration   7: log likelihood of observed labels = -36740.56234821
iteration   8: log likelihood of observed labels = -36734.82152213
iteration   9: log likelihood of observed labels = -36729.08669961
iteration  10: log likelihood of observed labels = -36723.35786366
iteration  11: log likelihood of observed labels = -36717.63499744
iteration  12: log likelihood of observed labels = -36711.91808422
iteration  13: log likelihood of observed labels = -36706.20710739
iteration  14: log likelihood of observed labels = -36700.5020

In [152]:
scores = np.dot(featMatrix, coefs)
# Vectorized sign rule: a strictly positive score predicts +1, anything else -1.
# Kept as a Series because later cells compare it with `products.sentiment`.
prediction = pd.Series(np.where(scores > 0, 1, -1))

In [157]:
posPredict = (prediction == 1)

In [158]:
np.sum(posPredict)

25126

In [154]:
err = (pd.Series(prediction) == products.sentiment)

In [155]:
err[err==True].sum()

39903

In [156]:
# Accuracy = share of rows where the prediction matches the label. Taken from
# the boolean `err` mask so it cannot drift from hand-copied counts.
err.mean()

0.7518653904130238

In [167]:
coefs

array([ 5.16220157e-03,  1.55656966e-02, -8.50204675e-03,  6.65460842e-02,
        6.58907629e-02,  5.01743882e-03, -5.38601484e-02, -3.50488413e-03,
        6.47945868e-02,  4.54356263e-02,  3.98353364e-03,  2.00775410e-02,
        3.01350011e-02, -2.87115530e-02,  1.52161964e-02,  2.72592062e-04,
        1.19448177e-02, -1.82461935e-02, -1.21706420e-02, -4.15110334e-02,
        2.76820391e-03,  1.77031999e-02, -4.39700067e-03,  4.49764014e-02,
        9.90916464e-03,  8.99239081e-04, -1.36219516e-03,  1.26859357e-02,
        8.26466695e-03, -2.77426972e-02,  6.10128809e-04,  1.54084501e-02,
       -1.32134753e-02, -3.00512492e-02,  2.97399371e-02,  1.84087080e-02,
        2.86178752e-03, -1.05768015e-02, -6.57350362e-04, -1.01476555e-02,
       -4.79579528e-03,  7.50891810e-03,  4.27938289e-03,  3.06785501e-03,
       -2.20317661e-03,  9.57273354e-03,  9.91666827e-05, -1.98462567e-02,
        1.75702722e-02,  1.55478612e-03, -1.77375440e-02,  9.78324102e-03,
        1.17031606e-02, -

In [168]:
coefs = list(coefs[1:])


In [183]:
# Pair each important word with its coefficient, largest coefficient first.
wordCoef = list(zip(importantWords[0], coefs))
wordCoef.sort(key=lambda pair: pair[1], reverse=True)

In [179]:
# For most positive
wordCoef[:10]

[('great', 0.0665460842),
 ('love', 0.0658907629),
 ('easy', 0.0647945868),
 ('little', 0.0454356263),
 ('loves', 0.0449764014),
 ('well', 0.0301350011),
 ('perfect', 0.0297399371),
 ('old', 0.020077541),
 ('nice', 0.018408708),
 ('daughter', 0.0177031999),
 ('soft', 0.0175702722),
 ('fits', 0.0168824711),
 ('happy', 0.0168052959),
 ('baby', 0.0155656966),
 ('recommend', 0.0154084501),
 ('also', 0.0152161964),
 ('best', 0.0149917916),
 ('comfortable', 0.0132539901),
 ('car', 0.0126859357),
 ('clean', 0.0120181744),
 ('son', 0.0119448177),
 ('bit', 0.0117082481),
 ('works', 0.0117031606),
 ('size', 0.0107159665),
 ('stroller', 0.00990916464),
 ('room', 0.00978324102),
 ('price', 0.00957273354),
 ('play', 0.00917842898),
 ('easily', 0.00903281814),
 ('kids', 0.008582843),
 ('still', 0.00826466695),
 ('lot', 0.00799938935),
 ('around', 0.0075089181),
 ('need', 0.00717190727),
 ('take', 0.00671012331),
 ('keep', 0.00643766808),
 ('crib', 0.00600279979),
 ('without', 0.00592353611),
 ('year

In [180]:
wordCoef = sorted(wordCoef, key=lambda x:x[1], reverse=False)

In [182]:
# For most negative 
wordCoef[:10]

[('would', -0.0538601484),
 ('product', -0.0415110334),
 ('money', -0.0389820373),
 ('work', -0.0330695153),
 ('even', -0.0300512492),
 ('disappointed', -0.0289789761),
 ('get', -0.028711553),
 ('back', -0.0277426972),
 ('return', -0.0265927785),
 ('monitor', -0.0244821005)]

In [12]:
trInd = (trainInd[0].unique())
teInd = testInd[0].unique()

In [14]:
trainData = products.loc[trInd]

In [21]:
validData = products.loc[teInd]

In [20]:
featMatrixTrain, sentimentTrain = convertToArray(trainData, importantWordsList, 'sentiment')

  after removing the cwd from sys.path.
  """


In [22]:
featMatrixValid, sentimentValid = convertToArray(validData, importantWordsList, 'sentiment')

  after removing the cwd from sys.path.
  """


In [24]:
def featureDerivativeWithL2(errors, feature, coef, l2Penalty, isConstant):
    """Dot product of ``errors`` and ``feature``, minus the L2 term when applicable.

    When ``isConstant`` is falsy, ``2 * l2Penalty * coef`` is subtracted;
    otherwise the plain dot product is returned.
    """
    gradient = np.dot(errors, feature)
    if isConstant:
        return gradient
    return gradient - (2 * l2Penalty * coef)

In [25]:
def computeLogLikelihoodWithL2(featMatrix, sentiment, coefs, l2Penalty):
    """L2-penalized log likelihood of the observed labels.

    Parameters
    ----------
    featMatrix : array whose dot product with ``coefs`` gives one score per example
    sentiment : array-like of labels; entries equal to +1 are the positive class
    coefs : coefficient vector; element 0 is left out of the penalty
    l2Penalty : weight of the squared-coefficient penalty

    Returns
    -------
    The data log likelihood minus ``l2Penalty * sum(coefs[1:] ** 2)``.
    """
    # asarray lets plain lists through; `coefs[1:] ** 2` fails on a list.
    coefs = np.asarray(coefs)
    indicator = (np.asarray(sentiment) == +1)
    scores = np.dot(featMatrix, coefs)
    # log(1 + exp(-s)) is evaluated as logaddexp(0, -s). The naive form
    # overflows exp() for very negative scores and turns the sum into -inf.
    dataTerm = np.sum((indicator - 1) * scores - np.logaddexp(0, -scores))
    lp = dataTerm - (l2Penalty * np.sum(coefs[1:] ** 2))
    return lp

In [32]:
def logisticRegressionWithL2(featMatrix, sentiment, initCoefs, stepSize, maxIter, l2Penalty):
    """Fit L2-penalized logistic-regression coefficients by gradient ascent.

    Parameters
    ----------
    featMatrix : 2-D array, one column per coefficient; column 0 is treated
        as the intercept and is not penalized
    sentiment : array-like of labels; entries equal to +1 are the positive class
    initCoefs : sequence of starting coefficients (copied, never modified)
    stepSize : multiplier applied to each derivative
    maxIter : number of passes over the coefficients
    l2Penalty : L2 penalty weight passed on to the derivative and likelihood

    Returns
    -------
    numpy.ndarray of fitted coefficients.

    The penalized log likelihood is printed on a thinning schedule: every
    iteration up to 15, then every 10th up to 100, every 100th up to 1000, etc.
    """
    # Force a float copy. Integer initCoefs (e.g. [0] * n) would otherwise
    # produce an integer array and truncate every fractional update below.
    coefs = np.array(initCoefs, dtype=float)
    # The labels never change, so the indicator is built once, not per pass.
    indicators = (np.asarray(sentiment) == +1)
    for i in range(maxIter):
        predictions = predictProbability(featMatrix, coefs)
        errors = indicators - predictions
        for j in range(len(coefs)):
            isIntercept = (j == 0)
            derivative = featureDerivativeWithL2(errors, featMatrix[:, j], coefs[j], l2Penalty, isIntercept)
            coefs[j] = coefs[j] + (stepSize * derivative)
        if i <= 15 or (i <= 100 and i % 10 == 0) or (i <= 1000 and i % 100 == 0) or (i <= 10000 and i % 1000 == 0) or i % 10000 == 0:
            lp = computeLogLikelihoodWithL2(featMatrix, sentiment, coefs, l2Penalty)
            print('iteration %*d: log likelihood of observed labels = %.8f' % (int(np.ceil(np.log10(maxIter))), i, lp))

    return coefs

In [33]:
# One starting coefficient per training-matrix column (intercept included),
# taken from the matrix itself rather than a hand-copied width.
initCoefs = [0.0] * featMatrixTrain.shape[1]
stepSize = 5e-6
maxIter = 501
# Penalties to sweep; penalizedCoefs[k] is the fit for l2Penalties[k].
l2Penalties = [0, 4, 10, 1e2, 1e3, 1e5]
penalizedCoefs = []
for l2Penalty in l2Penalties:
    penalizedCoefs.append(logisticRegressionWithL2(featMatrixTrain, sentimentTrain, initCoefs, stepSize, maxIter, l2Penalty))

iteration   0: log likelihood of observed labels = -29179.39138303
iteration   1: log likelihood of observed labels = -29003.71259047
iteration   2: log likelihood of observed labels = -28834.66187288
iteration   3: log likelihood of observed labels = -28671.70781507
iteration   4: log likelihood of observed labels = -28514.43078198
iteration   5: log likelihood of observed labels = -28362.48344665
iteration   6: log likelihood of observed labels = -28215.56713122
iteration   7: log likelihood of observed labels = -28073.41743783
iteration   8: log likelihood of observed labels = -27935.79536396
iteration   9: log likelihood of observed labels = -27802.48168669
iteration  10: log likelihood of observed labels = -27673.27331484
iteration  11: log likelihood of observed labels = -27547.98083656
iteration  12: log likelihood of observed labels = -27426.42679977
iteration  13: log likelihood of observed labels = -27308.44444728
iteration  14: log likelihood of observed labels = -27193.8767

iteration   9: log likelihood of observed labels = -27935.93902900
iteration  10: log likelihood of observed labels = -27831.15045502
iteration  11: log likelihood of observed labels = -27731.59955260
iteration  12: log likelihood of observed labels = -27636.98108219
iteration  13: log likelihood of observed labels = -27547.01092670
iteration  14: log likelihood of observed labels = -27461.42422295
iteration  15: log likelihood of observed labels = -27379.97375625
iteration  20: log likelihood of observed labels = -27027.18208317
iteration  30: log likelihood of observed labels = -26527.22737267
iteration  40: log likelihood of observed labels = -26206.59048765
iteration  50: log likelihood of observed labels = -25995.96903148
iteration  60: log likelihood of observed labels = -25854.95710284
iteration  70: log likelihood of observed labels = -25759.08109950
iteration  80: log likelihood of observed labels = -25693.05688014
iteration  90: log likelihood of observed labels = -25647.0992

In [34]:
penalizedCoefs

[array([-6.37421352e-02,  7.40730059e-02,  1.27525058e-02,  8.01624990e-01,
         1.05855398e+00, -1.04152191e-04, -2.87021444e-01, -3.38447399e-03,
         9.84558820e-01,  5.24419456e-01, -8.69675407e-02,  2.08912434e-01,
         4.53866487e-01, -1.96835211e-01,  1.58163325e-01, -1.79058177e-02,
         1.28396325e-01, -7.24293854e-02, -1.51817046e-01, -2.63330304e-01,
         1.56507228e-01,  2.63417760e-01, -1.32474753e-02,  1.05248405e+00,
        -3.75326583e-02, -3.29713873e-04, -6.79948371e-02,  1.93363694e-01,
         1.88508247e-01, -2.68954361e-01,  9.62841996e-02,  3.58309842e-01,
        -4.63096879e-02, -3.68678195e-01,  8.35693208e-01,  4.29393687e-01,
        -6.53274724e-03, -1.18953292e-01,  4.52597544e-02, -1.05433750e-01,
        -1.40179533e-01,  1.15329513e-01,  4.70226839e-02,  2.77177536e-02,
        -1.93377062e-01,  2.65797426e-01,  7.83957751e-02, -1.76600523e-01,
         3.61782536e-01,  1.02765639e-01, -2.71592217e-01,  2.60319903e-01,
         3.3

In [36]:
penalizedWordCoef = pd.DataFrame()

In [37]:
penalizedWordCoef['word'] = importantWords[0].values

In [43]:
penalizedWordCoef['0Coefs'] = penalizedCoefs[0][1:]

In [45]:
# One column per remaining penalty; element 0 of each fit (the intercept) is dropped.
remainingColumns = ['4Coefs', '10Coefs', '100Coefs', '1000Coefs', '100000Coefs']
for columnName, fittedCoefs in zip(remainingColumns, penalizedCoefs[1:]):
    penalizedWordCoef[columnName] = fittedCoefs[1:]

In [59]:
penalizedWordCoef

Unnamed: 0,word,0Coefs,4Coefs,10Coefs,100Coefs,1000Coefs,100000Coefs
0,baby,0.074073,0.073994,0.073877,0.072360,0.059752,0.001784
1,one,0.012753,0.012495,0.012115,0.007247,-0.008761,-0.001827
2,great,0.801625,0.796897,0.789935,0.701425,0.376012,0.008950
3,love,1.058554,1.050856,1.039529,0.896644,0.418354,0.009042
4,use,-0.000104,0.000163,0.000556,0.005481,0.017326,0.000418
5,would,-0.287021,-0.286027,-0.284564,-0.265993,-0.188662,-0.008127
6,like,-0.003384,-0.003442,-0.003527,-0.004635,-0.007043,-0.000827
7,easy,0.984559,0.977600,0.967362,0.838245,0.401904,0.008808
8,little,0.524419,0.521385,0.516917,0.460235,0.251221,0.005941
9,seat,-0.086968,-0.086125,-0.084883,-0.069109,-0.017718,0.000611


In [57]:
negWords = penalizedWordCoef.sort_values(by='0Coefs', ascending=True).word[:5].values

In [56]:
posWords = penalizedWordCoef.sort_values(by='0Coefs', ascending=False).word[:5].values

In [58]:
negWords, posWords

(array(['disappointed', 'money', 'return', 'waste', 'returned'],
       dtype=object),
 array(['love', 'loves', 'easy', 'perfect', 'great'], dtype=object))

In [61]:
penalizedWordCoef.sort_values(by='0Coefs', ascending=True)[:5]

Unnamed: 0,word,0Coefs,4Coefs,10Coefs,100Coefs,1000Coefs,100000Coefs
105,disappointed,-0.955437,-0.94698,-0.934518,-0.775625,-0.266095,-0.004013
96,money,-0.768793,-0.762734,-0.753818,-0.641406,-0.275883,-0.005487
113,return,-0.742085,-0.735502,-0.725807,-0.602646,-0.215199,-0.00373
112,waste,-0.617809,-0.612475,-0.60462,-0.505189,-0.190631,-0.003345
168,returned,-0.572707,-0.567518,-0.55987,-0.462056,-0.150021,-0.002225


In [62]:
penalizedWordCoef.sort_values(by='0Coefs', ascending=False)[:5]

Unnamed: 0,word,0Coefs,4Coefs,10Coefs,100Coefs,1000Coefs,100000Coefs
3,love,1.058554,1.050856,1.039529,0.896644,0.418354,0.009042
22,loves,1.052484,1.043903,1.031265,0.870794,0.34587,0.00615
7,easy,0.984559,0.9776,0.967362,0.838245,0.401904,0.008808
33,perfect,0.835693,0.828555,0.818038,0.684143,0.250614,0.003989
2,great,0.801625,0.796897,0.789935,0.701425,0.376012,0.00895


In [65]:
len(sentimentTrain)

42361

In [68]:
len(sentimentValid)

10711

In [None]:
# (Unfinished expression `penal` removed: it is not a defined name and raised
# NameError under Restart & Run All.)

In [70]:
# Training accuracy of each fitted coefficient vector, in penalizedCoefs order.
# The denominator comes from the data rather than a hand-copied row count, and
# the loop uses its own names so it no longer overwrites the earlier `coefs`,
# `scores`, `prediction` and `err`.
accuracyTrain = []
for fittedCoefs in penalizedCoefs:
    trainScores = np.dot(featMatrixTrain, fittedCoefs)
    # A strictly positive score predicts +1, anything else -1.
    trainPredictions = np.where(trainScores > 0, 1, -1)
    accuracyTrain.append(float(np.mean(trainPredictions == sentimentTrain)))

In [71]:
# Validation accuracy of each fitted coefficient vector, in penalizedCoefs order.
# The denominator comes from the data rather than a hand-copied row count, and
# the loop uses its own names so it no longer overwrites the earlier `coefs`,
# `scores`, `prediction` and `err`.
accuracyValid = []
for fittedCoefs in penalizedCoefs:
    validScores = np.dot(featMatrixValid, fittedCoefs)
    # A strictly positive score predicts +1, anything else -1.
    validPredictions = np.where(validScores > 0, 1, -1)
    accuracyValid.append(float(np.mean(validPredictions == sentimentValid)))

In [79]:
accuracyTrain

[0.6803663747314747,
 0.7758551497839994,
 0.7839758268218409,
 0.7849909114515711,
 0.7851089445480512,
 0.7851561577866434]

In [80]:
accuracyValid

[0.667818130893474,
 0.7713565493417982,
 0.781066193632714,
 0.781439641490057,
 0.7815330034543927,
 0.7817197273830642]