# Model Comparison

This notebook examplify how to use the functions in the py file.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, mean_squared_error, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
import model_comp as mc 

folder = '/Users/danielferreira/Documents/git/pySETTV/06 - Utility & References/Data'
file = 'player_batting_enriched.csv'
index = 'player_id'
bat = pd.read_csv(folder+'/'+file, index_col=index)
train = bat[bat['year']==2021]
test = bat[bat['year']==2022]
y_train = train['hr_10'].copy()
y_test = test['hr_10'].copy()

In [2]:
def outputs(model):
    """ Quick function to get outputs in one 2x2 list"""
    return [[model.predict(X_train), model.predict(X_test)], [model.predict_proba(X_train)[:,1], model.predict_proba(X_test)[:,1]]]

# Model 1 - Logistic Regression without the variable that contains missing
X_train = train[['ab', 'batting_avg','r_total_stolen_base']].copy()
X_test = test[['ab', 'batting_avg','r_total_stolen_base']].copy()
model1 = LogisticRegression()
model1.fit(X_train, y_train)
model1_outputs = outputs(model1)

# Model 2 - KNN
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
model2 = KNeighborsClassifier()
model2.fit(X_train, y_train)
model2_outputs = outputs(model2)

# Model 3 - Logistic Regression with missing imputation
X_train = train[['ab','exit_velocity_avg', 'batting_avg','r_total_stolen_base']].copy()
X_train['exit_velocity_avg'] = X_train['exit_velocity_avg'].fillna(X_train['exit_velocity_avg'].median())
X_test = test[['ab','exit_velocity_avg', 'batting_avg','r_total_stolen_base']].copy()
X_test['exit_velocity_avg'] = X_test['exit_velocity_avg'].fillna(X_test['exit_velocity_avg'].median())
model3 = LogisticRegression()
model3.fit(X_train, y_train)
model3_outputs = outputs(model3)

# Model 4 - Decision Tree
model4 = DecisionTreeClassifier(max_depth=3, min_samples_split=30, min_samples_leaf = 20, max_features=3)
model4.fit(X_train, y_train)
model4_outputs = outputs(model4)

# Model 5 - Random Forest
model5 = RandomForestClassifier(random_state=42)
model5.fit(X_train,y_train)
model5_outputs = outputs(model5)

model_dict = {'Logistic Regression No Imp': model1_outputs, 'KNN': model2_outputs, 'Logistic Regression Imp': model3_outputs,
              'Decision Tree': model4_outputs, 'Random Forest': model5_outputs }
y_actual = [y_train, y_test]

In [3]:
import model_comparison as mc 
problem = mc.problem("2022 Home Runs", model_dict, y_actual)

In [5]:
problem.stat_table()

Unnamed: 0,model_name,ds,acc,miss,precision_1,precision_0,recall_1,recall_0,f1_1,f1_0,roc_auc,ase,log_loss_value
0,Logistic Regression No Imp,Train,0.881148,0.118852,0.830918,0.900952,0.767857,0.931102,0.798144,0.915779,0.948239,0.085915,0.276115
1,Logistic Regression No Imp,Test,0.854037,0.145963,0.741228,0.915865,0.828431,0.865909,0.782407,0.890187,0.933946,0.102356,0.318521
2,KNN,Train,0.900273,0.099727,0.835556,0.928994,0.839286,0.927165,0.837416,0.928079,0.965248,0.067923,0.204159
3,KNN,Test,0.838509,0.161491,0.692308,0.9375,0.882353,0.818182,0.775862,0.873786,0.900847,0.122174,1.899749
4,Logistic Regression Imp,Train,0.887978,0.112022,0.838095,0.908046,0.785714,0.933071,0.81106,0.920388,0.957563,0.077813,0.243255
5,Logistic Regression Imp,Test,0.877329,0.122671,0.77533,0.932854,0.862745,0.884091,0.816705,0.907818,0.950758,0.084601,0.270096
6,Decision Tree,Train,0.898907,0.101093,0.916667,0.893116,0.736607,0.970472,0.816832,0.930189,0.960428,0.068555,0.217802
7,Decision Tree,Test,0.878882,0.121118,0.831579,0.898678,0.77451,0.927273,0.80203,0.912752,0.932509,0.095516,0.307881
8,Random Forest,Train,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.01172,0.064243
9,Random Forest,Test,0.863354,0.136646,0.754386,0.923077,0.843137,0.872727,0.796296,0.897196,0.942781,0.091149,0.33486


quick check if random forest got 100% in train indeed

In [12]:
validation = pd.DataFrame()
validation['actual'] = y_train
validation['pred'] = model5.predict(X_train)
validation['right'] = validation['pred'] == validation['actual']
validation['right'].sum()/len(validation)

np.float64(1.0)

In [19]:
df = problem.stat_table().sort_values('ds')
df

Unnamed: 0,model_name,ds,acc,miss,precision_1,precision_0,recall_1,recall_0,f1_1,f1_0,roc_auc,ase,log_loss_value
1,Logistic Regression No Imp,Test,0.854037,0.145963,0.741228,0.915865,0.828431,0.865909,0.782407,0.890187,0.933946,0.102356,0.318521
3,KNN,Test,0.838509,0.161491,0.692308,0.9375,0.882353,0.818182,0.775862,0.873786,0.900847,0.122174,1.899749
5,Logistic Regression Imp,Test,0.877329,0.122671,0.77533,0.932854,0.862745,0.884091,0.816705,0.907818,0.950758,0.084601,0.270096
7,Decision Tree,Test,0.878882,0.121118,0.831579,0.898678,0.77451,0.927273,0.80203,0.912752,0.932509,0.095516,0.307881
9,Random Forest,Test,0.863354,0.136646,0.754386,0.923077,0.843137,0.872727,0.796296,0.897196,0.942781,0.091149,0.33486
0,Logistic Regression No Imp,Train,0.881148,0.118852,0.830918,0.900952,0.767857,0.931102,0.798144,0.915779,0.948239,0.085915,0.276115
2,KNN,Train,0.900273,0.099727,0.835556,0.928994,0.839286,0.927165,0.837416,0.928079,0.965248,0.067923,0.204159
4,Logistic Regression Imp,Train,0.887978,0.112022,0.838095,0.908046,0.785714,0.933071,0.81106,0.920388,0.957563,0.077813,0.243255
6,Decision Tree,Train,0.898907,0.101093,0.916667,0.893116,0.736607,0.970472,0.816832,0.930189,0.960428,0.068555,0.217802
8,Random Forest,Train,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.01172,0.064243


In [22]:
df = problem.stat_table().sort_values('ds')
pivoted_df = df.pivot(index='model_name', columns='ds')
pivoted_df.columns = [f"{col[1]}_{col[0]}" for col in pivoted_df.columns]
pivoted_df = pivoted_df.reset_index()
cols = ['model_name'] + sorted([col for col in pivoted_df.columns if col != 'model_name'])
pivoted_df = pivoted_df[cols]
pivoted_df

Unnamed: 0,model_name,Test_acc,Test_ase,Test_f1_0,Test_f1_1,Test_log_loss_value,Test_miss,Test_precision_0,Test_precision_1,Test_recall_0,...,Train_ase,Train_f1_0,Train_f1_1,Train_log_loss_value,Train_miss,Train_precision_0,Train_precision_1,Train_recall_0,Train_recall_1,Train_roc_auc
0,Decision Tree,0.878882,0.095516,0.912752,0.80203,0.307881,0.121118,0.898678,0.831579,0.927273,...,0.068555,0.930189,0.816832,0.217802,0.101093,0.893116,0.916667,0.970472,0.736607,0.960428
1,KNN,0.838509,0.122174,0.873786,0.775862,1.899749,0.161491,0.9375,0.692308,0.818182,...,0.067923,0.928079,0.837416,0.204159,0.099727,0.928994,0.835556,0.927165,0.839286,0.965248
2,Logistic Regression Imp,0.877329,0.084601,0.907818,0.816705,0.270096,0.122671,0.932854,0.77533,0.884091,...,0.077813,0.920388,0.81106,0.243255,0.112022,0.908046,0.838095,0.933071,0.785714,0.957563
3,Logistic Regression No Imp,0.854037,0.102356,0.890187,0.782407,0.318521,0.145963,0.915865,0.741228,0.865909,...,0.085915,0.915779,0.798144,0.276115,0.118852,0.900952,0.830918,0.931102,0.767857,0.948239
4,Random Forest,0.863354,0.091149,0.897196,0.796296,0.33486,0.136646,0.923077,0.754386,0.872727,...,0.01172,1.0,1.0,0.064243,0.0,1.0,1.0,1.0,1.0,1.0
