# Helpful Reviews Machine Learning Tests

This notebook outlines the functions and tests that were used to select a machine learning model for predicting helpful reviews. The number of rows has been limited to keep the run time short. Tests run with millions of rows gave similar results.

In [1]:
# Row cap handed to the data pipelines (as nrows) through split_data();
# kept small so the notebook runs quickly.
ROWS = 100000

In [2]:
# Import essentials
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

  from numpy.core.umath_tests import inner1d


In [3]:
def helpful_reviews_pipeline(nrows=500000, subset=True):
    """Load the review data and reduce it to the two modelling columns.

    Reads 'df_10.csv' (only the first ``nrows`` rows when ``subset`` is
    true, the whole file otherwise), renames the 'Helpful?' column to
    'Helpful', filters the rows with ``cut_middle_rows`` and returns a
    DataFrame containing just 'reviewText' and 'Helpful'.
    """
    # nrows=None tells read_csv to load every row
    row_limit = nrows if subset else None
    reviews = pd.read_csv('df_10.csv', nrows=row_limit)

    # The source file carries a stray '?' in the label column name
    reviews = reviews.rename(columns={'Helpful?': 'Helpful'})

    # Drop the rows whose Helpful_Rating falls in the middle band
    reviews = cut_middle_rows(reviews)

    return reviews[['reviewText', 'Helpful']]

In [4]:
# Define function to eliminate middle rows
def cut_middle_rows(df, high=0.90, low=0.5, hand_pick=True, middle_percentage=None):
    """Keep only rows whose 'Helpful_Rating' lies outside a middle band.

    Parameters
    ----------
    df : DataFrame with a 'Helpful_Rating' column.
    high, low : bounds of the band to remove when ``hand_pick`` is true.
        Rows with rating < ``low`` or rating > ``high`` are kept.
    hand_pick : when false, ``high`` and ``low`` are ignored and the band is
        derived from the column median instead:
        ``median + 0.25 * middle_percentage`` and
        ``median - 0.75 * middle_percentage``.
    middle_percentage : width used for the median-based band. When omitted,
        a module-level ``MIDDLEPERCENTAGE`` constant is used if one exists.

    Returns
    -------
    The filtered DataFrame. Its length is also printed.

    Raises
    ------
    ValueError
        If ``hand_pick`` is false and no band width is available. Previously
        this path failed with a NameError on the undefined global.
    """
    if not hand_pick:
        if middle_percentage is None:
            # Backward compatibility: honour the global the original code expected
            middle_percentage = globals().get('MIDDLEPERCENTAGE')
        if middle_percentage is None:
            raise ValueError(
                'hand_pick=False requires middle_percentage '
                '(or a module-level MIDDLEPERCENTAGE constant).'
            )
        median = df['Helpful_Rating'].median()
        high = median + middle_percentage*0.25
        low = median - middle_percentage*0.75

    df = df[(df['Helpful_Rating']<low) | (df['Helpful_Rating']>high)]

    print('Length of new dataframe: ', len(df), 'rows.')

    return df

## Make Corpus

In [5]:
def make_corpus(df):
    """Return a normalized copy of ``df['reviewText']`` as a NumPy array.

    Each review has every character other than ASCII letters and whitespace
    removed, is lower-cased and stripped, tokenized with NLTK's
    WordPunctTokenizer, has English stop words removed, and is re-joined
    with single spaces.

    Side effect: calls ``nltk.download('stopwords')`` on every invocation.
    """
    nltk.download('stopwords')

    wpt = nltk.WordPunctTokenizer()
    # A set gives O(1) membership tests in the per-token filter below
    stop_words = set(nltk.corpus.stopwords.words('english'))

    def normalize_document(doc):
        # Remove special characters. The flags MUST be passed by keyword:
        # the 4th positional argument of re.sub is `count`, so the original
        # call silently capped the number of removals at int(re.I|re.A).
        doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I|re.A)
        doc = doc.lower()
        doc = doc.strip()
        # tokenize document
        tokens = wpt.tokenize(doc)
        # filter stopwords out of document
        filtered_tokens = [token for token in tokens if token not in stop_words]
        # re-create document from filtered tokens
        return ' '.join(filtered_tokens)

    corpus = df['reviewText']
    normalize_corpus = np.vectorize(normalize_document)
    norm_corpus = normalize_corpus(corpus)

    return norm_corpus

## X,y Functions

In [6]:
def make_xy(df, vectorizer):
    """Vectorize the raw review text and pair it with the 'Helpful' labels.

    Fits ``vectorizer`` on ``df.reviewText`` and returns ``(X, y)``: X is the
    resulting matrix converted to CSC format, y is the 'Helpful' column.
    """
    # fit_transform may hand back COO on some sklearn versions; normalise to CSC
    features = vectorizer.fit_transform(df.reviewText).tocsc()
    labels = df['Helpful']
    return features, labels

def make_xy_norm(df, vectorizer, norm_corpus):
    """Vectorize a pre-normalized corpus and pair it with the 'Helpful' labels.

    Fits ``vectorizer`` on ``norm_corpus`` (not on ``df.reviewText``) and
    returns ``(X, y)``: X is the resulting matrix converted to CSC format,
    y is ``df['Helpful']``.
    """
    # fit_transform may hand back COO on some sklearn versions; normalise to CSC
    features = vectorizer.fit_transform(norm_corpus).tocsc()
    labels = df['Helpful']
    return features, labels

## Test Function

In [7]:
def run_tests(df, ml_test, vectorizer):
    """Build (X, y) from the raw review text and pass them to ``ml_test``."""
    features, labels = make_xy(df, vectorizer)
    ml_test(features, labels)
    
def run_norm_tests(df, ml_test, vectorizer, norm_corpus):
    """Build (X, y) from the normalized corpus and pass them to ``ml_test``."""
    features, labels = make_xy_norm(df, vectorizer, norm_corpus)
    ml_test(features, labels)

## Initial Test Function

In [8]:
# Create function to return the results of machine learning tests
def ml_classification_initial_tests(X, y, test_pct=0.15):
    """Print 5-fold cross-validation scores for four untuned classifiers.

    Multinomial Naive Bayes, Logistic Regression, Decision Tree and Random
    Forest are each built with default settings and scored with
    ``cross_val_score(..., cv=5)``. For every model the heading, the five
    fold scores and their mean are printed. Nothing is returned.

    ``test_pct`` is accepted but not used by this function.
    """
    # (heading, classifier class) in the order the results are reported
    candidates = (
        ('\nNAIVE BAYES', MultinomialNB),
        ('\nLOGISTIC REGRESSION', LogisticRegression),
        ('\nDECISION TREE', DecisionTreeClassifier),
        ('\nRANDOM FORESTS', RandomForestClassifier),
    )

    for heading, classifier_cls in candidates:
        print(heading)

        # Default-parameter model, scored with 5-fold cross-validation
        fold_scores = cross_val_score(classifier_cls(), X, y, cv=5)

        print(fold_scores)
        print("Average 5-Fold CV Score: {}".format(np.mean(fold_scores)))

## Initial Tests

In [9]:
def split_data(ROWS, pipeline):
    """Load data via ``pipeline(nrows=ROWS)`` and split it 90/10 by position.

    The first 90% of the rows (rounded down) become the training frame and
    the remainder the test frame; no shuffling is performed. Both lengths
    are printed. Returns ``(df_train, df_test)``.
    """
    frame = pipeline(nrows=ROWS)

    # Positional cut point: first 90% of the rows go to training
    cutoff = int(len(frame)*0.9)
    train_part, test_part = frame[:cutoff], frame[cutoff:]

    print('Length of df_train:', len(train_part))
    print('Length of df_test:', len(test_part))

    return train_part, test_part

In [10]:
# Load at most ROWS helpful-review rows and split them 90/10 by position
df_train, df_test = split_data(ROWS, helpful_reviews_pipeline)

Length of new dataframe:  42196 rows.
Length of df_train: 37976
Length of df_test: 4220


In [None]:
# Stop-word-filtered version of the training text; used later by choose_vectorizer
norm_corpus = make_corpus(df_train)
# Baseline: cross-validate the four default classifiers on raw token counts
run_tests(df_train, ml_classification_initial_tests, CountVectorizer())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/coreyjwade/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

NAIVE BAYES
[0.82622433 0.85215903 0.85174457 0.84094799 0.84685278]
Average 5-Fold CV Score: 0.8435857398135311

LOGISTIC REGRESSION


Naive Bayes, Logistic Regression and Random Forests are all worth pursuing going forward.

## HyperParameter Tests

In [None]:
# Create function to return the results of machine learning tests
def ml_classification_tests(X, y, test_pct=0.15):
    """Tune three classifiers by cross-validated search and report hold-out results.

    The data is split once into a training part and a hold-out part of
    fraction ``test_pct``. For Multinomial Naive Bayes, Logistic Regression
    and Random Forest in turn, a 5-fold search is fitted on the training
    part; the best parameters and best search score are printed, followed by
    the confusion matrix and classification report of that fitted search on
    the hold-out part.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn estimators.
    y : target labels aligned with the rows of ``X``.
    test_pct : fraction of the data held out for the final reports.

    Returns
    -------
    None. All results are printed.
    """
    # Split into training and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_pct)

    def report_holdout(fitted_search):
        # Predict on the hold-out part with the given fitted search and print
        # its confusion matrix and classification report.
        y_pred = fitted_search.predict(X_test)
        print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
        print('Classification Report:', classification_report(y_test, y_pred))

    #------------------------------------------------------------------------

    # NAIVE BAYES
    print('\nNAIVE BAYES')

    # Grid of smoothing values to search over
    param_grid = {'alpha': [0.001, 0.01, .1, 1, 5]}

    mnb_cv = GridSearchCV(MultinomialNB(), param_grid, cv=5)
    mnb_cv.fit(X_train, y_train)

    # Print the tuned parameters and score
    print("Best Naive Bayes alpha: {}".format(mnb_cv.best_params_))
    print("Best Naive Bayes score: {}".format(mnb_cv.best_score_))

    report_holdout(mnb_cv)

    #------------------------------------------------------------------------

    # LOGISTIC REGRESSION
    print('\nLOGISTIC REGRESSION')

    # Ten log-spaced regularisation strengths between 1e-5 and 1e8
    c_space = np.logspace(-5, 8, 10)
    param_grid = {'C': c_space}

    logreg_cv = GridSearchCV(LogisticRegression(), param_grid, cv=5)
    logreg_cv.fit(X_train,y_train)

    # Print the tuned parameters and score
    print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
    print("Best Logistic Regression score: {}".format(logreg_cv.best_score_))

    report_holdout(logreg_cv)

    #------------------------------------------------------------------------

    # RANDOM FORESTS
    print('\nRANDOM FORESTS')

    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
    # Number of features to consider at every split
    # NOTE(review): newer scikit-learn releases may reject 'auto' - confirm
    # against the installed version before upgrading.
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree (None = unlimited)
    max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

    # Randomized search: 5 sampled parameter settings, 5 folds each, all cores
    rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid, cv = 5, n_jobs = -1, n_iter=5)
    rf_random.fit(X_train, y_train)

    # Print the tuned parameters and score
    print("Tuned Random Forest Parameters: {}".format(rf_random.best_params_))
    print("Best Random Forest score: {}".format(rf_random.best_score_))

    # Fix: the original predicted with mnb_cv here, so the "Random Forest"
    # report actually showed the Naive Bayes results a second time.
    report_holdout(rf_random)

In [None]:
# Hyperparameter search for all three candidate models on raw token counts
run_tests(df_train, ml_classification_tests, CountVectorizer())

Logistic Regression is clearly outperforming Random Forests and Naive Bayes with C=0.007742636826811269.

The logistic regression function below can be used for subsequent tests. The Confusion Matrix is essential due to the class imbalance.

In [None]:
def logistic_regression(X, y, test_pct=0.15):
    """Fit the fixed-C logistic regression and report on a hold-out split.

    Splits ``(X, y)`` with ``train_test_split`` (hold-out fraction
    ``test_pct``), fits ``LogisticRegression(C=0.007742636826811269)`` on
    the training part, prints the confusion matrix and classification
    report for the hold-out part, and returns the fitted classifier.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(X, y, test_size = test_pct)

    print('\nLOGISTIC REGRESSION')

    # C is fixed at the value used throughout this notebook
    classifier = LogisticRegression(C=0.007742636826811269)
    classifier.fit(X_fit, y_fit)

    holdout_pred = classifier.predict(X_holdout)

    print('Confusion Matrix:', confusion_matrix(y_holdout, holdout_pred))
    print('Classification Report:', classification_report(y_holdout, holdout_pred))

    return classifier

## Choose Vectorizer

For each standard vectorizer, CountVectorizer, and TfidfVectorizer, there are various n_gram options.

In [None]:
def choose_vectorizer(df, ml_test, norm_corpus):
    """Run ``ml_test`` for six vectorizer configurations on both text variants.

    Count and TF-IDF vectorizers are each tried with n-gram ranges (1,1),
    (1,2) and (1,3). For every vectorizer its repr is printed, then
    ``ml_test`` is run on the raw review text (``run_tests``) and on
    ``norm_corpus`` (``run_norm_tests``).
    """
    candidates = [
        CountVectorizer(),
        CountVectorizer(ngram_range=(1,2)),
        CountVectorizer(ngram_range=(1,3)),
        TfidfVectorizer(min_df=0.),
        TfidfVectorizer(ngram_range=(1, 2), min_df=0.),
        TfidfVectorizer(ngram_range=(1, 3), min_df=0.),
    ]

    for candidate in candidates:
        print(str(candidate))

        # Raw review text
        print('\nmake_xy')
        run_tests(df, ml_test, candidate)

        # Normalized (stop-word-filtered) corpus
        print('\nmake_xy_norm')
        run_norm_tests(df, ml_test, candidate, norm_corpus)

        print('\n')

In [None]:
# Compare all six vectorizers, raw vs. normalized text, using the fixed-C logistic regression
choose_vectorizer(df_train, logistic_regression, norm_corpus)

make_xy is consistently outperforming make_xy_norm. CountVectorizer(ngram_range=(1,3)) is currently best, but CountVectorizer(ngram_range=(1,2)) is close and faster.

## Vectorizer Parameters

The vectorizer has a min_df parameter that can be tuned to obtain better results. When it is a float between 0 and 1, it is treated as a proportion of documents: terms that appear in a smaller share of documents are ignored. For instance, min_df = 0.01 would ignore words that appear in less than 1% of documents.

In [None]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from six.moves import range

# Seaborn styling applied to every plot below: white grid background, "poster" sizing context
sns.set_style("whitegrid")
sns.set_context("poster")

In [None]:
# Define Cumulative Frequency function
def ecdf(data):
    """Return the empirical CDF of a one-dimensional sequence as ``(x, y)``.

    ``x`` holds the values of ``data`` in ascending order; ``y`` holds the
    matching cumulative fractions 1/n, 2/n, ..., 1 where n = len(data).
    """
    count = len(data)

    # Sorted observations go on the x-axis
    ordered_values = np.sort(data)

    # Fraction of observations at or below each sorted value
    cumulative_fraction = np.arange(1, count+1) / count

    return ordered_values, cumulative_fraction

In [None]:
# Turn X (df.reviewText) into one dimensional array
def sort_reviews_into_1D_list(df, vectorizer):
    """Return the sorted per-term document counts for ``df.reviewText``.

    Fits ``vectorizer`` on the review text, counts for every feature column
    how many rows have a non-zero entry, and returns those counts as an
    ascending Python list.
    """
    term_matrix = vectorizer.fit_transform(df.reviewText)

    # Column sums of the boolean "term present" matrix = documents per term
    docs_per_term = (term_matrix > 0).sum(axis=0)

    # The sum is a 1 x n_terms matrix; take its single row as a flat list
    flat_counts = docs_per_term.reshape(-1).tolist()[0]

    return sorted(flat_counts)

In [None]:
# Sorted document counts for every unigram and bigram in the training text
review_list = sort_reviews_into_1D_list(df_train, CountVectorizer(ngram_range=(1,2)))

#### Graph Word Frequency

In [None]:
# Compute the ECDF of the per-term document counts
# (x1 = number of reviews a term appears in, y1 = cumulative fraction of terms)
x1, y1 = ecdf(review_list)

# Plot every point of the ECDF as a dot, with no connecting line
plt.plot(x1, y1, marker='.', linestyle='none')

# Add a 2% margin so points on the edges are not clipped
plt.margins(.02)

# Label the axes
plt.xlabel('Word Count')
plt.ylabel('ECDF')
plt.title('Word Frequency')

In [None]:
# Same ECDF, zoomed in on terms that appear in at most 80 reviews
plt.plot(x1, y1, marker='.', linestyle='none')

# Add a 2% margin so points on the edges are not clipped
plt.margins(.02)

# Label the axes
plt.xlabel('Word Count')
plt.ylabel('ECDF')

# Restrict the x-axis to document counts 0-80
plt.xlim(0,80)

In [None]:
# Same ECDF, zoomed in further on terms that appear in at most 20 reviews
plt.plot(x1, y1, marker='.', linestyle='none')

# Add a 2% margin so points on the edges are not clipped
plt.margins(.02)

# Label the axes
plt.xlabel('Word Count')
plt.ylabel('ECDF')

# Restrict the x-axis to document counts 0-20
plt.xlim(0,20)

The graph above indicates that 75% of the distinct one- and two-word combinations in the entire corpus appear only once. The graph suggests that we try min_df values of 2-10.

A min_df of 2 means that the vectorizer will ignore all words that appear in fewer than 2 reviews. There is also an option to use percentages.

#### Adjust min_df

In [None]:
def min_df(df, test):
    """Run ``test`` once per candidate ``min_df`` value.

    For each of 1, 2, 4, 6, 8 and 10 a ``CountVectorizer`` with
    ``ngram_range=(1,2)`` and that ``min_df`` is built, printed, and passed
    to ``run_tests`` together with ``df`` and ``test``.
    """
    for threshold in (1, 2, 4, 6, 8, 10):
        candidate = CountVectorizer(ngram_range=(1,2), min_df=threshold)
        print(candidate)
        run_tests(df, test, candidate)
        print('\n')

In [None]:
# Sweep min_df on the training data with the fixed-C logistic regression
min_df(df_train, logistic_regression)

The best result on a 250,000 subset comes from min_df = 2, which is very close to min_df=1.

#### Max Graphs

I will use the new dataset, df_overall_2, to determine a max_df since it's larger than the previous dataset. A max_df of 0.99 would discount words that appear in more than 99% of all documents, presumably words like 'a', 'the', and other common words.

In [None]:
# ECDF of per-term document counts again, this time looking at the top of
# the curve (the most widely used terms)
plt.plot(x1, y1, marker='.', linestyle='none')

# Add a 2% margin so points on the edges are not clipped
plt.margins(.02)

# Label the axes
plt.xlabel('Word Count')
plt.ylabel('ECDF')
plt.title('Word Frequency')

# Show only the top 5% of the cumulative distribution
plt.ylim(0.95, 1.001)
plt.show()

In [None]:
# Same ECDF, zoomed in on the very top of the curve
plt.plot(x1, y1, marker='.', linestyle='none')

# Add a 2% margin so points on the edges are not clipped
plt.margins(.02)

# Label the axes
plt.xlabel('Word Count')
plt.ylabel('ECDF')
plt.title('Word Frequency')

# Show only the top 0.5% of the cumulative distribution
plt.ylim(0.995, 1.001)
plt.show()

The graph really changes at around 0.999

#### Test Percentages

In [None]:
def max_df(df, test):
    """Run ``test`` once per candidate ``max_df`` value.

    For each of 0.9, 0.99, 0.999, 0.9999 and 1.0 a ``CountVectorizer`` with
    ``ngram_range=(1,2)`` and that ``max_df`` is built, printed, and passed
    to ``run_tests`` together with ``df`` and ``test``.
    """
    for ceiling in (0.9, 0.99, 0.999, 0.9999, 1.0):
        candidate = CountVectorizer(ngram_range=(1,2), max_df=ceiling)
        print(candidate)
        run_tests(df, test, candidate)
        print('\n')

In [None]:
# Sweep max_df on the training data with the fixed-C logistic regression
max_df(df_train, logistic_regression)

The best results for df_train are max_df=1.0. 

## Evaluate Combined Model

In [None]:
# `sklearn.externals.joblib` is gone from current scikit-learn releases;
# prefer the standalone package and fall back for old installations that
# only ship the bundled copy.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

def log_reg_fin(X, y):
    """Fit, cross-validate and persist the final logistic regression.

    Fits ``LogisticRegression(C=0.007742636826811269)`` on all of ``(X, y)``,
    prints the mean 5-fold cross-validation score (+/- two standard
    deviations) and a classification report built from 5-fold
    cross-validated predictions, then saves the fitted model.

    Side effect: writes (and overwrites) 'lr_model.pkl' in the working
    directory.

    Returns the fitted classifier.
    """
    lr = LogisticRegression(C=0.007742636826811269)

    # This full-data fit is what gets saved; the CV helpers below work on
    # their own clones and do not alter it.
    lr.fit(X, y)

    scores = cross_val_score(lr, X, y, cv=5)

    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    # Out-of-fold predictions for every row, used for the per-class report
    predicted = cross_val_predict(lr, X, y, cv=5)

    report = classification_report(y, predicted)

    print('\n')
    print(report)

    joblib.dump(lr, 'lr_model.pkl')
    print('Logistic Regression model saved as "lr_model.pkl"')

    return lr

In [None]:
# Fit, cross-validate and save the combined model on the training split
run_tests(df_train, log_reg_fin, CountVectorizer(ngram_range=(1,3), max_df=1.0, min_df=2))

In [None]:
# NOTE(review): this does not score the model fitted on df_train. run_tests fits a
# fresh vectorizer on df_test, and log_reg_fin fits a fresh LogisticRegression on it
# and overwrites 'lr_model.pkl' with that test-split model.
run_tests(df_test, log_reg_fin, CountVectorizer(ngram_range=(1,3), max_df=1.0, min_df=2))

## Star Tests

It's worth running the same tests to try to determine whether someone likes or dislikes a book based on the review. This can translate directly to the number of stars given.

#### Create Stars Column (binary)

In [None]:
# Load the full review file (no row cap) to derive the star-based label
df_10 = pd.read_csv('df_10.csv')

In [None]:
# Define function
def stars(row):
    """Map a review's 'overall' star rating to a binary label.

    Returns 0 when ``row['overall']`` equals 1, 2 or 3, and 1 for any
    other value (i.e. 4- and 5-star reviews).
    """
    low_ratings = (1, 2, 3)
    return 0 if row['overall'] in low_ratings else 1

# Create the binary 'Stars' label by applying stars() to every row
df_10['Stars'] = df_10.apply(stars, axis=1)

In [None]:
# Save the updated file in place. index=False keeps the row index out of
# the CSV; without it every load/save cycle of this file adds another
# unnamed index column.
df_10.to_csv('df_10.csv', index=False)

In [None]:
# Define a new dataframe that drops the neutral 3-star reviews
# (4-star reviews are kept and labelled 1 by stars())
df_stars = df_10[(df_10['overall']!=3)]

# Save the filtered file; index=False keeps the row index out of the CSV
# so that reading it back does not produce an extra unnamed column
df_stars.to_csv('df_stars.csv', index=False)

In [None]:
def star_reviews_pipeline(nrows=10000, subset=True):
    """Load the star-labelled reviews and keep only the modelling columns.

    Reads 'df_stars.csv' (only the first ``nrows`` rows when ``subset`` is
    true, the whole file otherwise) and returns a DataFrame containing just
    'reviewText' and 'Stars'.
    """
    # nrows=None tells read_csv to load every row
    row_limit = nrows if subset else None
    reviews = pd.read_csv('df_stars.csv', nrows=row_limit)

    return reviews[['reviewText', 'Stars']]

#### Adjust Functions for Initial Tests

In [None]:
def make_xy_stars(df, vectorizer):
    """Vectorize the raw review text and pair it with the 'Stars' labels.

    Fits ``vectorizer`` on ``df.reviewText`` and returns ``(X, y)``: X is the
    resulting matrix converted to CSC format, y is the 'Stars' column.
    """
    # fit_transform may hand back COO on some sklearn versions; normalise to CSC
    features = vectorizer.fit_transform(df.reviewText).tocsc()
    labels = df.Stars
    return features, labels

In [None]:
def run_star_tests(df, ml_test, vectorizer):
    """Build (X, y) with the 'Stars' label and pass them to ``ml_test``."""
    features, labels = make_xy_stars(df, vectorizer)
    ml_test(features, labels)

#### Initial Test

In [None]:
# Load at most ROWS star-labelled rows and split them 90/10 by position
df_train_stars, df_test_stars = split_data(ROWS, star_reviews_pipeline)

In [None]:
# Fit and evaluate the fixed-C logistic regression on the star training split
# (logistic_regression holds out part of it internally for the report)
run_star_tests(df_train_stars, logistic_regression, CountVectorizer(ngram_range=(1,2)))

In [None]:
# NOTE(review): this fits a new vectorizer and a new model on df_test_stars
# (with its own internal hold-out); it does not evaluate the model fitted on
# df_train_stars.
run_star_tests(df_test_stars, logistic_regression, CountVectorizer(ngram_range=(1,2)))

This is definitely worth pursuing going forward.

## Results

Best result:
LogisticRegression(C=0.007742636826811269)
CountVectorizer(ngram_range=(1,2))
Precision, Recall and F1 Scores: 91%, 91%, 91%

The results are very compelling. With some hyperparameter tweaking, logistic regression reaches 90% accuracy with CountVectorizer. Multiple tests revealed the same results. Across multiple test sizes, CountVectorizer(ngram_range=(1,2)) and CountVectorizer(ngram_range=(1,3)) with min_df = 1 or 2 and max_df = 0.9999 or 1.0 were best.

Tfidf has gone down in performance from a couple months ago, so further investigation is required. When performing optimally, however, it still did not outperform CountVectorizer. 

Another consistent test result is that make_xy outperforms make_xy_norm. This means that the general corpus generated by CountVectorizer does better than the normed corpus that I created.