In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sb

# This is the model we'll be using.
from sklearn import tree

# A convenience for displaying visualizations.
from IPython.display import Image

# Packages for rendering our tree.
import pydotplus
import graphviz

%matplotlib inline

import json

path = 'Musical_Instruments_5.json' 
data = pd.read_json(path, lines=True) 
df = pd.DataFrame(data)
df.head(10)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,1384719342,"[0, 0]",5,"Not much to write about here, but it does exac...","02 28, 2014",A2IBPI20UZIR0U,"cassandra tu ""Yeah, well, that's just like, u...",good,1393545600
1,1384719342,"[13, 14]",5,The product does exactly as it should and is q...,"03 16, 2013",A14VAT5EAX3D9S,Jake,Jake,1363392000
2,1384719342,"[1, 1]",5,The primary job of this device is to block the...,"08 28, 2013",A195EZSQDW3E21,"Rick Bennette ""Rick Bennette""",It Does The Job Well,1377648000
3,1384719342,"[0, 0]",5,Nice windscreen protects my MXL mic and preven...,"02 14, 2014",A2C00NNG1ZQQG2,"RustyBill ""Sunday Rocker""",GOOD WINDSCREEN FOR THE MONEY,1392336000
4,1384719342,"[0, 0]",5,This pop filter is great. It looks and perform...,"02 21, 2014",A94QU4C90B1AX,SEAN MASLANKA,No more pops when I record my vocals.,1392940800
5,B00004Y2UT,"[0, 0]",5,So good that I bought another one. Love the h...,"12 21, 2012",A2A039TZMZHH9Y,"Bill Lewey ""blewey""",The Best Cable,1356048000
6,B00004Y2UT,"[0, 0]",5,"I have used monster cables for years, and with...","01 19, 2014",A1UPZM995ZAH90,Brian,Monster Standard 100 - 21' Instrument Cable,1390089600
7,B00004Y2UT,"[0, 0]",3,I now use this cable to run from the output of...,"11 16, 2012",AJNFQI3YR6XJ5,"Fender Guy ""Rick""",Didn't fit my 1996 Fender Strat...,1353024000
8,B00004Y2UT,"[0, 0]",5,Perfect for my Epiphone Sheraton II. Monster ...,"07 6, 2008",A3M1PLEYNDEYO8,"G. Thomas ""Tom""",Great cable,1215302400
9,B00004Y2UT,"[0, 0]",5,Monster makes the best cables and a lifetime w...,"01 8, 2014",AMNTZU1YQN1TH,Kurt Robair,Best Instrument Cables On The Market,1389139200


In [2]:
data.isna().sum()

asin               0
helpful            0
overall            0
reviewText         0
reviewTime         0
reviewerID         0
reviewerName      27
summary            0
unixReviewTime     0
dtype: int64

In [3]:
data.dropna(inplace = True)

In [4]:
data.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,1384719342,"[0, 0]",5,"Not much to write about here, but it does exac...","02 28, 2014",A2IBPI20UZIR0U,"cassandra tu ""Yeah, well, that's just like, u...",good,1393545600
1,1384719342,"[13, 14]",5,The product does exactly as it should and is q...,"03 16, 2013",A14VAT5EAX3D9S,Jake,Jake,1363392000
2,1384719342,"[1, 1]",5,The primary job of this device is to block the...,"08 28, 2013",A195EZSQDW3E21,"Rick Bennette ""Rick Bennette""",It Does The Job Well,1377648000
3,1384719342,"[0, 0]",5,Nice windscreen protects my MXL mic and preven...,"02 14, 2014",A2C00NNG1ZQQG2,"RustyBill ""Sunday Rocker""",GOOD WINDSCREEN FOR THE MONEY,1392336000
4,1384719342,"[0, 0]",5,This pop filter is great. It looks and perform...,"02 21, 2014",A94QU4C90B1AX,SEAN MASLANKA,No more pops when I record my vocals.,1392940800


In [5]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10234 entries, 0 to 10260
Data columns (total 9 columns):
asin              10234 non-null object
helpful           10234 non-null object
overall           10234 non-null int64
reviewText        10234 non-null object
reviewTime        10234 non-null object
reviewerID        10234 non-null object
reviewerName      10234 non-null object
summary           10234 non-null object
unixReviewTime    10234 non-null int64
dtypes: int64(2), object(7)
memory usage: 799.5+ KB
None


In [6]:
target = data.overall.copy()
review_data = data.reviewText.copy()
summary_data = data.summary.copy()

# TFIDF on review_data

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
word_vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{2,}',  #vectorize 2-character words or more
    ngram_range=(1, 1),
    max_features=30000)

#Applying the vectorizer
tfidf_review_data =word_vectorizer.fit_transform(review_data)
print("Number of features: %d" % tfidf_review_data.get_shape()[1])

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 20241 to 450.
svd= TruncatedSVD(1000)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
truncated_tfidf_review_data = lsa.fit_transform(tfidf_review_data)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Create our testing and training data set, with spread of 70% in training and 30% in testing
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(truncated_tfidf_data, target, test_size=0.3, random_state=20)

# fit and transform on it the training features
#word_vectorizer.fit(X_train)
#X_train_word_features = word_vectorizer.transform(X_train)

#transform the test features to sparse matrix
#word_vectorizer.fit(X_test)
#test_features = word_vectorizer.transform(X_test)

import time
start_time = time.time() 
reset_time = time.time() - start_time

Number of features: 20241
Percent variance captured by all components: 60.83335565911644


# TFIDF on summary data

In [8]:
word_vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{2,}',  #vectorize 2-character words or more
    ngram_range=(1, 1),
    max_features=30000)

#Applying the vectorizer
tfidf_summary_data =word_vectorizer.fit_transform(summary_data)
print("Number of features: %d" % tfidf_summary_data.get_shape()[1])

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 20241 to 450.
svd= TruncatedSVD(500)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
truncated_tfidf_summary_data = lsa.fit_transform(tfidf_summary_data)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

Number of features: 3983
Percent variance captured by all components: 73.52250362295567


# Create train/test set

In [9]:
#Combine the two arrays into one data set
selected_data = np.concatenate((truncated_tfidf_summary_data, truncated_tfidf_review_data), axis=1)
#Create our testing and training data set, with spread of 70% in training and 30% in testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(selected_data, target, test_size=0.3, random_state=20)

# Logistic Regression

In [10]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
# Set up the regression model to predict defaults using all other
# variables as features.
regr = LogisticRegression(solver='lbfgs')
regrfit = regr.fit(X_train, y_train)

# Calculate accuracy. First, get probability that each row will be admitted.
pred_statsmod = regrfit.predict(X_train)

# Code admission as 1 if probability is greater than .5.
pred_y_statsmod = np.where(pred_statsmod < .5, 0, 1)

# Use score method to get accuracy of model
print('\n Model accuracy')
score = regr.score(X_test, y_test)
print(score)
print("--- %s seconds ---" % (time.time() - start_time - reset_time))
reset_time = time.time() - start_time




 Model accuracy
0.6991208075545425
--- 7.91845703125 seconds ---


# Random Decision Tree

In [11]:
from sklearn import ensemble
#Create a generic random forest tree
rfc = ensemble.RandomForestClassifier(
    criterion='entropy',
    max_features = 'auto',
    max_depth = 2 ,
    n_estimators = 2,
    n_jobs=-1
    )

from sklearn.model_selection import GridSearchCV
parameters = {
    'n_estimators': [10, 15, 20],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [1, 2, 3],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}
gd_sr = GridSearchCV(estimator=rfc,  
                     param_grid=parameters,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)

gd_sr.fit(X_train, y_train)  
best_result = gd_sr.best_score_  
print("Best Accuracy: ", best_result)  
print("--- %s seconds ---" % (time.time() - start_time - reset_time))
reset_time = time.time() - start_time

Best Accuracy:  0.6763925729442971
--- 109.30828547477722 seconds ---


# Gradient Boosting

In [12]:
params = {'n_estimators': 10,
          'max_depth': 2,
          'loss': 'deviance'}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=2,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

# SVM

In [13]:
from sklearn.svm import SVC  
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [14]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import cross_val_score 
from sklearn.metrics import mean_squared_error
from math import sqrt

model_names = [regrfit, gd_sr, clf, svclassifier]
for model_name in model_names:
    print('\n\nModel Type: ', model_name)
    print('\nWith 20% Holdout: ' + str(model_name.fit(X_train, y_train).score(X_test, y_test)))
    print('Testing on Sample: ' + str(model_name.fit(selected_data, target).score(selected_data, target)))

    print('\nCross Validation')
    print(cross_val_score(model_name, selected_data, target, cv=10))

    prediction = model_name.predict(X_test)
    from sklearn.metrics import confusion_matrix
    cnf_matrix = confusion_matrix(y_test, prediction)
    print('\n Confusion Matrix')
    print(cnf_matrix)

    from sklearn.metrics import classification_report
    print('\n Classification Report')
    print(classification_report(y_test, prediction))
    print("--- %s seconds ---" % (time.time() - start_time - reset_time))
    reset_time = time.time() - start_time



Model Type:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

With 20% Holdout: 0.6991208075545425
Testing on Sample: 0.7540551104162595

Cross Validation
[0.68780488 0.68261719 0.70019531 0.70410156 0.70996094 0.70898438
 0.70772239 0.68914956 0.68786693 0.71498531]

 Confusion Matrix
[[  16    2    2    9   30]
 [   0    2   19   14   50]
 [   1    0   79   52  114]
 [   0    0   12  196  397]
 [   0    0    3   51 2022]]

 Classification Report
              precision    recall  f1-score   support

           1       0.94      0.27      0.42        59
           2       0.50      0.02      0.04        85
           3       0.69      0.32      0.44       246
           4       0.61      0.32      0.42       605
           5       0.77      0.97      0.86      2076

   

Out of these four models, SVM shows the best accuracy in its model in terms of it's cross validation score as well as in it's classification report. However for it's runtime, Logistic Regression has the lowest, followed by SVM, Random Forest and Gradient Boosting.


The reason for why the random forest tree's poor performance is due to the low amount of n-estimators for modeling the data set, ideally a n-estimator value of 500 or more would be ideal, however it is not possible with my hardware to achieve in any reasonable amount of time. It's classification report score also leads me to believe that random forest does not perform very well in terms of modeling text data. This may be due to the low amount of n-estimators, but even when compared to gradient boosting atleast gradient boosting was able to correctly predict a few reviews. Random forests low score might be due to how exact the data set is, and that multiple approaches to the same data set actually induces more variance or wrong assumptions which causes the model to underperform.

Logistic regression works really well because of the nature of the problem being a classification problem and that values are either a hit or miss, but with the number of features that we have to experiement with (which captures 60-70% of the variance of summary and reviewText) we were able to capture alot of the data. Compared to SVM however, although it is more accurate in its cross validation, SVM is able to correctly guess the correct review more times then logistic regression.

# Conclusion

SVM has the best model for this dataset overall, it's modeling method of grouping support vectors and categorizing data into being "good" or "bad" fits very closely to how the rating is gauged by how many words with positive or negative conotations there are (assuming there are no large amount of misleading reviews).
Random Forest does not work as well in this scenerio due this kind of dataset looking for more depth instead of breadth (and that my computer cannot handle a random forest that is too large). In this case Gradient Boosting works much better as it is basically looking for more depth in the data, which allows it to pick up more on the small variances on the dataset more accurately. 