# Model Selection

In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# See binary_classification.py
from binary_classification import *

## Load Data and Predictors

In [3]:
# Read the prepared review DataFrame from its pickle file
df = pd.read_pickle("data/yelp_df_final.pkl")

# Partition rows on the "dataset" column.
# The training rows keep their original index; only the validation
# frame is re-indexed from zero.
train_df = df.loc[df["dataset"].eq("train")]
valid_df = df.loc[df["dataset"].eq("val")].reset_index(drop=True)

In [4]:
# Columns excluded from the predictor set
exclude = ["review_id", 
           "business_id", 
           "user_id", 
           "label", 
           "date", 
           "categories", 
           "is_open",
           "postal_code", 
           "dataset"]

# Fail early, naming the offending columns, if the frame does not contain
# a column we expect to drop (a bare list.remove() would raise a ValueError
# that does not say which column was missing).
missing = [col for col in exclude if col not in train_df.columns]
if missing:
    raise ValueError(
        "Columns to exclude not found in train_df: {}".format(missing)
    )

# Predictors: every remaining column, in the frame's original column order
predictors = [col for col in train_df.columns if col not in exclude]

# Label
outcome = "label"

## Load Models

In [5]:
def _load_pickle(path):
    """Return the object unpickled from ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon
    as loading finishes, instead of being left open for the kernel's lifetime.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Logistic regression with L1 and L2 penalization
l1_model = _load_pickle("results/model_l1.pkl")
l2_model = _load_pickle("results/model_l2.pkl")

# GDA
gda_model = _load_pickle("results/model_gda.pkl")

# Decision tree
dt_model = _load_pickle("results/model_dt_best.pkl")

# Random Forest
rf_model = _load_pickle("results/model_rf_best.pkl")

# AdaBoost
ab_model = _load_pickle("results/model_ab_40_4.pkl")

## Accuracy on Train and Valid

In [6]:
# Build a classifier wrapper with no training data attached: the training
# frame is supplied in the "valid" slot and the validation frame in the
# "test" slot.
bc = BinaryClassifier(
    train_data=None,
    valid_data=train_df,
    test_data=valid_df,
    predictors=predictors,
    outcome=outcome,
)

In [7]:
def select_model(train_df, 
                 valid_df, 
                 predictors, 
                 outcome, 
                 model):
    """Print the accuracy of ``model`` on the training and validation frames.

    A fresh ``BinaryClassifier`` is created with ``train_data=None``; the
    training frame is passed as its ``valid_data`` and the validation frame
    as its ``test_data``. The "Valid" slot therefore yields the training
    accuracy and the "Test" slot yields the validation accuracy.
    (Presumably no training data is attached because ``model`` is already
    fitted -- confirm against ``binary_classification.py``.)

    Parameters
    ----------
    train_df : DataFrame holding the training rows.
    valid_df : DataFrame holding the validation rows.
    predictors : list of predictor column names.
    outcome : name of the label column.
    model : model object handed to ``BinaryClassifier.set_model``.

    Returns
    -------
    None. The two accuracies are printed, training first, then validation.
    """
    
    # Initialize class instance
    bc = BinaryClassifier(train_data = None,
                        valid_data = train_df,
                        test_data = valid_df,
                        predictors = predictors,
                        outcome = outcome)
    
    # Set model
    bc.set_model(model)
    
    # Training set accuracy (training frame occupies the "Valid" slot)
    bc.compute_prob(prob_set = "Valid")
    train_acc = bc.performance_metric(prob_set = "Valid", measure = "Accuracy")
    
    # Validation set accuracy (validation frame occupies the "Test" slot)
    bc.compute_prob(prob_set = "Test")
    valid_acc = bc.performance_metric(prob_set = "Test", measure = "Accuracy")
    
    print("Train Accuracy: {:.4f}".format(train_acc))
    # Label the second figure correctly: it is the validation accuracy.
    print("Valid Accuracy: {:.4f}".format(valid_acc))
    

## L2-Regularized Logistic Regression

In [8]:
# Report training and validation accuracy for the L2-penalized logistic regression
select_model(
    train_df=train_df,
    valid_df=valid_df,
    predictors=predictors,
    outcome=outcome,
    model=l2_model,
)

Train Accuracy: 0.6373
Train Accuracy: 0.6053


## L1-Regularized Logistic Regression

In [9]:
# Report training and validation accuracy for the L1-penalized logistic regression
select_model(
    train_df=train_df,
    valid_df=valid_df,
    predictors=predictors,
    outcome=outcome,
    model=l1_model,
)

Train Accuracy: 0.7415
Train Accuracy: 0.7487


## GDA

In [10]:
# Report training and validation accuracy for the GDA model
select_model(
    train_df=train_df,
    valid_df=valid_df,
    predictors=predictors,
    outcome=outcome,
    model=gda_model,
)

Train Accuracy: 0.7407
Train Accuracy: 0.7469


## Decision Tree

In [11]:
# Report training and validation accuracy for the decision tree
select_model(
    train_df=train_df,
    valid_df=valid_df,
    predictors=predictors,
    outcome=outcome,
    model=dt_model,
)

Train Accuracy: 0.7537
Train Accuracy: 0.7597


## Random Forest

In [12]:
# Report training and validation accuracy for the random forest
select_model(
    train_df=train_df,
    valid_df=valid_df,
    predictors=predictors,
    outcome=outcome,
    model=rf_model,
)

Train Accuracy: 0.7168
Train Accuracy: 0.7156


## AdaBoost

In [13]:
# Report training and validation accuracy for the AdaBoost model
select_model(
    train_df=train_df,
    valid_df=valid_df,
    predictors=predictors,
    outcome=outcome,
    model=ab_model,
)

Train Accuracy: 0.7533
Train Accuracy: 0.7613
