In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.metrics import confusion_matrix

pd.options.display.max_rows = 3000
pd.options.display.max_colwidth = 300


def my_remove_highly_correlated(X,threshold):
    
    #filter out non numbers
    X = X.select_dtypes(include=[np.bool])  #.dropna()
    
    cols = X.columns
 
    #run correlation matrix
    df = X.corr()

    #put df into array
    a = df.values

    #label top half with -99999 
    #we want to ignore top half of matrix
    iu1 = np.triu_indices(len(df))
    a[iu1] = -99999

    #put data back into daraframe
    df = pd.DataFrame(a, columns=cols)
    df['var'] = cols

    #unstack to get a list of var1, var2, correlation
    df = pd.melt(df, id_vars='var')

    #remove those flagged with -99999
    df = df[df.value != -99999].sort_values(by='var', ascending=True)

    #flag remove vs keep based on corr threshold
    df_remove = df[df.value > threshold]
    keep_list = df.loc[df.value <= threshold,'var'].unique()
 
    print('{} out of {} vars removed due to corr greater than {}'.
          format(df_remove.shape[0],X.shape[1],threshold))
    
    print(df_remove)
    print('\nShape before: ' + str(X.shape))
    print('Shape after: ' + str(X[keep_list].shape))
    print('\n')
    return X[keep_list]

def my_confusion_matrix(array_Expected,array_Predicted,colName):
    a = np.array(confusion_matrix(array_Expected, array_Predicted ))
    totalExpectedFalse = a[0,0] + a[0,1]
    totalExpectedTrue = a[1,0] + a[1,1]
    correctFalse = a[0,0] 
    correctTrue = a[1,1] 
    correctTruePct = np.round(correctTrue / totalExpectedTrue,3)
    correctFalsePct = np.round(correctFalse / totalExpectedFalse,3)
    print('Regarding ' + colName + '...')
    print('The model correctly predicted {} Negatives out of {} expected Negatives: {}'.format(
        correctFalse,totalExpectedFalse,correctFalsePct))
    print('The model correctly predicted {} Positives out of {} expected Positives: {}'.format(
        correctTrue,totalExpectedTrue,correctTruePct))    
    print(a)


In [45]:
file = 'reviews_Toys_and_Games_5.json'
raw_df = pd.read_json(file, lines=True)

print(raw_df.shape)
raw_df.head()

(167597, 9)


Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,439893577,"[0, 0]",5,I like the item pricing. My granddaughter wanted to mark on it but I wanted it just for the letters.,"01 29, 2014",A1VXOAVRGKGEAK,Angie,Magnetic board,1390953600
1,439893577,"[1, 1]",4,Love the magnet easel... great for moving to different areas... Wish it had some sort of non skid pad on bottom though...,"03 28, 2014",A8R62G708TSCM,Candace,it works pretty good for moving to different areas,1395964800
2,439893577,"[1, 1]",5,"Both sides are magnetic. A real plus when you're entertaining more than one child. The four-year old can find the letters for the words, while the two-year old can find the pictures the words spell. (I bought letters and magnetic pictures to go with this board). Both grandkids liked it a lot...","01 28, 2013",A21KH420DK0ICA,capemaychristy,love this!,1359331200
3,439893577,"[0, 0]",5,"Bought one a few years ago for my daughter and she loves it, still using it today. For the holidays we bought one for our niece and she loved it too.","02 8, 2014",AR29QK6HPFYZ4,dcrm,Daughters love it,1391817600
4,439893577,"[1, 1]",4,I have a stainless steel refrigerator therefore there are not much space for my son to play with his magnet. Brought this for him to put his magnet on. He enjoys sticking his magnet on it. Great to have so he can play with his alphabet magnets.,"05 5, 2014",ACCH8EOML6FN5,DoyZ,Great to have so he can play with his alphabet ...,1399248000


In [57]:
df = raw_df.copy()

#Sample down the data to manageable size
df = df.sample(n=20000)
df.shape

(20000, 9)

In [58]:
def clean_series(df, col):
        df[col] = df[col].str.lower().str.strip()
        df[col] = df[col].str.replace('.','')
        df[col] = df[col].str.replace(",",'')
        df[col] = df[col].str.replace("!",'')
        df[col] = df[col].str.replace("?",'')
        df[col] = df[col].str.replace(";",'')
        df[col] = df[col].str.replace(":",'')
        df[col] = df[col].str.replace("'",'')
        df[col] = df[col].str.replace("-",'')
        df[col] = df[col].str.replace('"','')
        df[col] = df[col].str.replace('(','') 
        df[col] = df[col].str.replace(')','')
        return df
    

#Set Ratings of 4/5 as Positive
df['Positive'] = np.where(df.overall >= 4, 1, 0)

#Parse all reviews and get counts of each word by Postive vs Negative

#split words for positive reviews and place in list
list_positivewords = []
df.loc[df['Positive'] ==1, 'reviewText'].str.split(' ').apply(list_positivewords.extend)

#Load list into dataframe so that we can get counts in next step
df_positive = pd.DataFrame(list_positivewords, columns=['Words'])
df_positive['Words'] = clean_series(df_positive, 'Words')


#Load counts into new dataframe
array_positive = [df_positive['Words'].value_counts().index.tolist(), df_positive['Words'].value_counts().values.tolist()]  
df_freq_positive = pd.DataFrame(array_positive).transpose()
df_freq_positive.columns = ['Word','Positive_Freq']


list_negativewords = []
df.loc[df['Positive'] ==0, 'reviewText'].str.split(' ').apply(list_negativewords.extend)
df_negative = pd.DataFrame(list_negativewords, columns=['Words'])
df_negative = clean_series(df_negative, 'Words')
array_negative = [df_negative['Words'].value_counts().index.tolist(), df_negative['Words'].value_counts().values.tolist()]  
df_freq_negative = pd.DataFrame(array_negative).transpose()
df_freq_negative.columns = ['Word','Negative_Freq']

merged = pd.merge(df_freq_negative, df_freq_positive, how='outer', left_on='Word', right_on='Word')
print(merged.shape)

#export list to spreadsheet
##merged.to_csv('amazon_pos_neg.csv') 


(52236, 3)


In [64]:
negative_words = ['poor', 'bad','waste','worst','terrible',
                  'dissapo','horrible','broke','junk','didnt',
                  'useless','difficult','return','none','crap',
                  'weak','unreliable','mistake','unfortunate',
                  'nothing','hate','boring','bored','break',
                  'tinny','drain','refund','not good','slow'
                  'dead','fail','worthless','dying','lost',
                  'complain','essentially','unusable', 'no good',
                  'stupid','flimsy','beware','wrong','neither'
                  'forget','lose','warning','joke','bother',
                  '$$$','embarass','flaw','fail','defect','cumbersom',
                  'died','forced','toilet','wobbly','crack',
                  'shouldnt','disap','gimmi','wrong','ugly',
                  'loose','ripped','jerk','mislead','lousy',
                  'excess','stuck','counterf','fake','garbage',
                  'trash','zero','whine','pain','frustr','delay',
                  'unbear','dirty','isnt','disgust','forger',
                  'weird','unsatis','bland','restock','unaccept',
                  'noise','freeze','awkward','too expen','too price',
                  'constant','infuriat','trash','garble','muffle',
                  'suck','unaccept','regret','intermit',
                  'cheap','drawback','sorry','remorse','mess ',
                  'darn', 'damn','crawl','wish',
                  'yell','piti','refuse','fool','inexcus','phony',
                  'phoney','leak', 'unhappy', 'not ',
                  'not recommend', 'be careful', 'died', 'dies', 'barely'
                  'awful','overprice','pointless','shame','shoddy',
                  'mislead','faulty','mediocre','aggravat','return',
                  'dizzy','sick','dud','never','sad','decept','stink'
                 ]

positive_words = ['nice','love','perfect','definit','sturdy',
                 'value','charm','great','super','phenom','wonderf',
                 'outstand','awesom','stell','pleased','comfort',
                 'beaut','fantast','overall','glad',
                 'rocks', 'blast','cute',
                 'excite','amaz','exact','tremend',' slim ','wow',
                 'certain','incred','flawless','surpri',
                 'solid','pleasant','smooth','plenty',
                 'awsom','fabul','prompt','highly',
                 'effective',' mega','#1','best','convenient','sweet',
                 'clever','excel','magic','entertain','enjoy',
                 'exceptional','exceed','winner','hands down',
                 '100%','premium','psyched', 'seamless', 'favorite',
                 'cute','classy','pretty','neat', 'thumbs up',
                 'well', 'good', 'easy', 'on time',
                 'really', 'like', 'peachy', 'compliment', ' plus',
                 'five star', 'a+', '10', 'happy', 'much', 'impress',
                 'must have', 'absolut', 'brilliant', 'easy',
                 'timely', 'cool', 'complete package', 'total package',
                 'holding up', 'very', 'omg', 'fun', 'play'
                 'easy','kids','many','easily','pure','hours']

In [65]:
#Create a column for each word in our list
#Loop through all of our reviews
#   and label each word as True/False whether that word exists in the review
df2 = df.copy()

for word in negative_words:     
    
    df2[str(word)] = df2['reviewText'].str.contains(word, case=False)


for word in positive_words:    
    
    df2[str(word)] = df2['reviewText'].str.contains(word, case=False)   
    
df2.shape

(20000, 241)

In [66]:
#split data into X and y

y_train = df2.Positive
X_train = df2.drop(columns=['Positive'], axis=1)
X_train = my_remove_highly_correlated(df2, threshold=.85)

0 out of 232 vars removed due to corr greater than 0.85
Empty DataFrame
Columns: [var, variable, value]
Index: []

Shape before: (20000, 232)
Shape after: (20000, 220)




In [67]:
from sklearn.naive_bayes import BernoulliNB 

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(X_train, y_train)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(X_train)

mislabeled = (y_train != y_pred).sum()
total = X_train.shape[0]

# Display our results.
my_confusion_matrix(y_train, y_pred, 'Toy Reviews')

print("\nNumber of mislabeled points out of a total {} points : {} ({}%)".format(
    total,mislabeled,round(mislabeled/total*100, 2) 
))

#View the mismatches
results = df.copy()
results['y_pred'] = y_pred
results.loc[results.y_pred != results.Positive, ['reviewText','overall','y_pred','Positive']].sort_values('y_pred', ascending=False)


Regarding Toy Reviews...
The model correctly predicted 1228 Negatives out of 3193 expected Negatives: 0.385
The model correctly predicted 15862 Positives out of 16807 expected Positives: 0.944
[[ 1228  1965]
 [  945 15862]]

Number of mislabeled points out of a total 20000 points : 2910 (14.55%)


Unnamed: 0,reviewText,overall,y_pred,Positive
68707,"A very inexpensive Nerf gun. Works fine until it jams, which it does often. Worth the $10 but not more then that.",3,1,0
43502,"The tree is lousy to put together and the dinks don't hang on it well. I had to go buy my own mini tree else where. The only cool things are the the dinks, the star beads and pencils. I always have my own pack of colored pencils for more variety.",2,1,0
31319,This was a gift for my grandchildren. I thought it would be a great toy to play with but it didn't sound anything like I thought. It was more mechanical and didn't have the 8 different voices it claimed.,3,1,0
138382,"We bought this to add on to our Melissa and Doug Wooden Railway set. My son loves Thomas, and it's cute and the right size for the train set, but the magnet is not nearly as strong as his other train cars. We actually suspect that there's not a magnet in it at all because it doesn't repel othe...",3,1,0
39379,"My 4 year-old got this for Christmas. We followed the installation instructions and got NOTHING on the TV. After reviewing the terse trouble shooting instructions, we narrowed it down to reset for both the console and keyboard. However, there were no instructions on how to reset both the cons...",3,1,0
58854,"My son loves Caillou. When we saw this doll, we thought it would be a great gift for his 3rd Birthday. As we were dressing and undressing Caillou, his arm fell right off! I was even helping my son with this so it wasn't that the Caillou doll was forced awkwardly into positions in order to get...",3,1,0
129802,"If you can get past the point that this figure is intended to hold other figures, its a cool toy. It won't turn into the truck, it just folds in half and various parts of its body fold to form places to stick smaller transforms. It does light up and make sounds when you fold it or push its buttons.",3,1,0
63602,"My child at about 1 was so intrigued with this toy and wanted it at the store until I finally bought it for him on his second birthday, even though it was a little young for him he played with it for a full day at Grandma's and Grandma sent it home and said ""you listen to it"". Indeed, I didn't t...",2,1,0
87145,"I bought this for my 2 year old daughter. She loves cars and Mater is her favorite.The 2 ""sayings"" that come out of it aren't from the movie. Which is fine but she loves the movie enough that I think if they had been it would have increased her enjoyment. But she loves the flash light portion...",3,1,0
155598,"I got this for my nephew's son, who is the All American Boy. With the car/dino combo, it's hard to go wrong. Better yet, the car comes out of the dino's bottom---what five-year-old is not going to roll on the ground over that?With that in mind, it's too expensive to come with only one car. When ...",3,1,0
