# AdaBoost

In [1]:
# See binary_classification.py
from binary_classification import *

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
# Plot appearance: ggplot theme with a larger base font
plt.style.use('ggplot')
plt.rcParams['font.size'] = 16

In [3]:
# Read the pickled dataframe from disk
df = pd.read_pickle("data/yelp_df_final.pkl")

# Row masks selecting each partition via the "dataset" column
is_train = df["dataset"] == "train"
is_valid = df["dataset"] == "val"

# Training rows keep their original index; validation rows are re-indexed from 0
train_df = df[is_train]
valid_df = df[is_valid].reset_index(drop = True)

In [4]:
# Columns that must not be used as model inputs
# (identifiers, the label itself, and the train/val split marker among them)
exclude = ["review_id", 
           "business_id", 
           "user_id", 
           "label", 
           "date", 
           "categories", 
           "is_open",
           "postal_code", 
           "dataset"]

# Fail early, naming every excluded column that is absent from the data,
# rather than stopping at the first one with an opaque "x not in list" error
missing = [col for col in exclude if col not in train_df.columns]
if missing:
    raise ValueError("Columns to exclude not found in training data: {}".format(missing))

# Predictors: every remaining column, in the dataframe's column order
predictors = [col for col in train_df.columns if col not in exclude]

# Label
outcome = "label"

## Cross-Validation

Tune the AdaBoost classifier's hyperparameters (the maximum tree depth of the base DecisionTreeClassifier and the number of estimators) by comparing accuracy on the validation data.

In [5]:
def ab_grid_seach(train_df,
                  valid_df, 
                  predictors,
                  outcome,
                  n_estimators,
                  depths, 
                  measure = "Accuracy"):
    '''
    Execute grid search over AdaBoost hyperparameters
    max depth and number of estimators

    For every (depth, number of estimators) pair an AdaBoost instance is
    trained, its fitted model is pickled to
    "results/model_ab_<n>_<d>.pkl", and it is scored on the validation
    data with the requested performance measure.

    Parameters
    ----------
    train_df : training data passed to AdaBoost as train_data
    valid_df : validation data passed to AdaBoost as valid_data
    predictors : predictor column names passed to AdaBoost
    outcome : label column name passed to AdaBoost
    n_estimators : iterable of estimator counts to try
    depths : iterable of base-estimator depths to try
    measure : name of the metric forwarded to
              AdaBoost.performance_metric; larger is treated as better

    Returns
    -------
    best_model : model with the highest validation score
                 (None if the grid is empty)
    best_params : (n, d) tuple of the best model (None if the grid is empty)
    result : dict mapping each (n, d) tuple to its validation score
    '''
    import os

    # Materialize so a one-shot iterable is not exhausted after the first depth
    n_estimators = list(n_estimators)

    # Initialize results
    result = {}

    # Store best model; start below any attainable score so the first
    # evaluated model is always recorded, even if its score is 0 or negative
    best_model = None
    best_params = None
    best_acc = float("-inf")

    # Make sure the output directory exists before any (slow) training starts
    os.makedirs("results", exist_ok = True)

    # Grid search
    for d in depths:
        for n in n_estimators:

            # Initialize class instance
            ab = AdaBoost(train_data = train_df,
                          valid_data = valid_df,
                          test_data = None,
                          predictors = predictors,
                          outcome = outcome)

            # Train model
            ab.train(base_depth = d,
                     n_estimators = n)

            # Save trained model object to pickle; the context manager
            # closes the file even if pickling fails
            with open("results/model_ab_{}_{}.pkl".format(n, d), "wb") as pickle_out:
                pickle.dump(ab.model, pickle_out)

            # Compute the validation score for the requested measure
            ab.compute_prob(prob_set = "Valid")
            acc = ab.performance_metric(prob_set = "Valid",
                                        measure = measure)

            # Update best model
            if acc > best_acc:
                best_acc = acc
                best_model = ab.model
                best_params = (n, d)

            # Record result
            result[(n, d)] = acc
            print("d: {}, n: {}, Validation {}: {:.2f}".format(d, n, measure, acc))

    return best_model, best_params, result

In [6]:
# Candidate base-tree depths: 1 through 5
depths = [1, 2, 3, 4, 5]

# Candidate ensemble sizes: 10 through 50 in steps of 10
n_estimators = list(range(10, 51, 10))

In [7]:
# Run the grid search over every (depth, number of estimators) pair,
# scoring each model by accuracy on the validation data
best_ab, best_params, accs = ab_grid_seach(train_df = train_df,
                                           valid_df = valid_df,
                                           predictors = predictors,
                                           outcome = outcome,
                                           n_estimators = n_estimators,
                                           depths = depths,
                                           measure = "Accuracy")

Time to train: 282.85
d: 1, n: 10, Validation Accuracy: 0.74
Time to train: 501.98
d: 1, n: 20, Validation Accuracy: 0.75
Time to train: 734.65
d: 1, n: 30, Validation Accuracy: 0.76
Time to train: 971.79
d: 1, n: 40, Validation Accuracy: 0.76
Time to train: 1201.65
d: 1, n: 50, Validation Accuracy: 0.76
Time to train: 477.75
d: 2, n: 10, Validation Accuracy: 0.75
Time to train: 945.81
d: 2, n: 20, Validation Accuracy: 0.75
Time to train: 1405.75
d: 2, n: 30, Validation Accuracy: 0.76
Time to train: 1869.15
d: 2, n: 40, Validation Accuracy: 0.76
Time to train: 2331.57
d: 2, n: 50, Validation Accuracy: 0.76
Time to train: 702.93
d: 3, n: 10, Validation Accuracy: 0.76
Time to train: 1389.02
d: 3, n: 20, Validation Accuracy: 0.76
Time to train: 2067.68
d: 3, n: 30, Validation Accuracy: 0.76
Time to train: 2760.20
d: 3, n: 40, Validation Accuracy: 0.76
Time to train: 3451.57
d: 3, n: 50, Validation Accuracy: 0.76
Time to train: 930.88
d: 4, n: 10, Validation Accuracy: 0.76
Time to train: 1

KeyboardInterrupt: 

In [None]:
# Save best model
# The grid search result is bound to best_ab (best_rf is never defined in this
# notebook); the context manager closes the file even if pickling fails
with open("results/model_ab_best.pkl", "wb") as pickle_out:
    pickle.dump(best_ab, pickle_out)

In [None]:
# best_params is an (n, d) tuple; unpack it straight into the message
print("Best n: {}, Best d: {}".format(*best_params))