# Random Forests

In [1]:
# See binary_classification.py
from binary_classification import *

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.rcParams.update({'font.size': 16})

In [3]:
# Load pickled data
df = pd.read_pickle("data/yelp_df_final.pkl")

# Split data
train_df = df[df["dataset"] == "train"]
valid_df = df[df["dataset"] == "val"].reset_index(drop = True)

In [4]:
# Parameters excluded from predictions
exclude = ["review_id", 
           "business_id", 
           "user_id", 
           "label", 
           "date", 
           "categories", 
           "is_open",
           "postal_code", 
           "dataset"]

# Predictors
predictors = list(train_df.columns)
for col in exclude:   
    predictors.remove(col)

# Label
outcome = "label"

## Cross-Validation

Tune Random Forest Classifier hyperparameters number of estimators and max depth of estimator via grid search over values with accuracy on validation data as criterion

In [5]:
def rf_grid_seach(train_df,
                  valid_df, 
                  predictors,
                  outcome,
                  n_estimators,
                  depths, 
                  measure = "Accuracy"):
    '''
    Execute grid search over random forest classifier hyperparameters
    max depth and number of estimators
    '''
    
    # Initialize results
    result = {}
    
    # Store best model
    best_model = None
    best_acc = 0
    
    # Grid search
    for d in depths:
        for n in n_estimators:
            
            # Initialize class instance
            rf = RandomForest(train_data = train_df,
                              valid_data = valid_df,
                              test_data = None,
                              predictors = predictors,
                              outcome = outcome)
            
            # Train model
            rf.train(criterion = "entropy",
                     n_estimators = 10,
                     max_depth = 4)
            
            # Save trained model object to pickle
            pickle_out = open("results/model_rf_{}_{}.pkl".format(n,d), "wb")
            pickle.dump(rf.model, pickle_out)
            pickle_out.close()
            
            # Compute accuracy 
            rf.compute_prob(prob_set = "Valid")
            acc = rf.performance_metric(prob_set = "Valid",
                                        measure = measure)
            
            # Update best model
            if acc > best_acc:
                best_acc = acc
                best_model = rf.model
                
            # Record result
            result[(n, d)] = acc
            print("d: {}, n: {}, Validation Accuracy: {:.2f}".format(d, n, acc))
            
    return best_model, result

In [6]:
# Depths to search over 
depths = list(range(1, 6))

# Number of estimators to search over 
n_estimators = [10, 20, 30, 40, 50]

In [7]:
# Execute grid search 
best_rf, accs = rf_grid_seach(train_df,
                              valid_df,
                              predictors,
                              outcome, 
                              n_estimators,
                              depths, 
                              "Accuracy")

Time to train: 114.66
d: 1, n: 10, Validation Accuracy: 0.68
Time to train: 99.79
d: 1, n: 20, Validation Accuracy: 0.68
Time to train: 87.04
d: 1, n: 30, Validation Accuracy: 0.68
Time to train: 114.17
d: 1, n: 40, Validation Accuracy: 0.68
Time to train: 100.91
d: 1, n: 50, Validation Accuracy: 0.68
Time to train: 127.36
d: 2, n: 10, Validation Accuracy: 0.68
Time to train: 105.59
d: 2, n: 20, Validation Accuracy: 0.68
Time to train: 111.70
d: 2, n: 30, Validation Accuracy: 0.68
Time to train: 93.59
d: 2, n: 40, Validation Accuracy: 0.68
Time to train: 120.46
d: 2, n: 50, Validation Accuracy: 0.68
Time to train: 104.47
d: 3, n: 10, Validation Accuracy: 0.68
Time to train: 92.02
d: 3, n: 20, Validation Accuracy: 0.68
Time to train: 112.08
d: 3, n: 30, Validation Accuracy: 0.68
Time to train: 98.03
d: 3, n: 40, Validation Accuracy: 0.68
Time to train: 87.95
d: 3, n: 50, Validation Accuracy: 0.68
Time to train: 113.39
d: 4, n: 10, Validation Accuracy: 0.68
Time to train: 108.51
d: 4, n:

In [8]:
# Save best model
pickle_out = open("results/model_rf_best.pkl", "wb")
pickle.dump(best_rf, pickle_out)
pickle_out.close()

In [11]:
best_n, best_d = max(accs, key=accs.get)
print("Best n: {}, Best d: {}".format(best_n, best_d))

Best n: 10, Best d: 1
