# Model Comparison

This notebook demonstrates how to use the functions in the accompanying `.py` module.

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, mean_squared_error, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
import model_comp as mc 

# Load the enriched player batting table, indexed by player id.
# NOTE(review): `folder` is an absolute, machine-specific path; point it at your
# own copy of the data before running this notebook elsewhere.
folder = '/Users/danielferreira/Documents/git/pySETTV/06 - Utility & References/Data'
file = 'player_batting_enriched.csv'
index = 'player_id'
bat = pd.read_csv(f'{folder}/{file}', index_col=index)

# The 2021 rows are used to fit the models, the 2022 rows to evaluate them.
train = bat.loc[bat['year'].eq(2021)]
test = bat.loc[bat['year'].eq(2022)]

# Target column for each split, copied so later edits cannot touch `bat`.
y_train, y_test = (split['hr_10'].copy() for split in (train, test))

In [3]:
# Model 1 - Logistic Regression on three columns only; exit_velocity_avg
# (the column that needs imputation further down) is left out here.
model1_features = ['ab', 'batting_avg', 'r_total_stolen_base']
X_train = train[model1_features].copy()
X_test = test[model1_features].copy()

# fit() returns the fitted estimator, so construction and training can be chained.
model1 = LogisticRegression().fit(X_train, y_train)

# Hard class predictions for each split.
model1_pred_train = model1.predict(X_train)
model1_pred_test = model1.predict(X_test)

# Column 1 of predict_proba: the probability of the second entry in model1.classes_.
model1_prob_train = model1.predict_proba(X_train)[:, 1]
model1_prob_test = model1.predict_proba(X_test)[:, 1]


# Model 2 - KNN
# KNN compares rows by distance, so the features are min-max scaled first.
# The scaler is fitted on the training rows only and then applied unchanged to
# the test rows: refitting it on the test set would map test features with a
# different min/max than the one the model was trained on.
# Note: this replaces X_train / X_test (the Model 1 frames) with scaled arrays.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model2 = KNeighborsClassifier()
model2.fit(X_train, y_train)

# Hard class predictions, then column 1 of predict_proba, for each split.
model2_pred_train = model2.predict(X_train)
model2_prob_train = model2.predict_proba(X_train)[:,1]
model2_pred_test = model2.predict(X_test)
model2_prob_test = model2.predict_proba(X_test)[:,1]

# One entry per model, laid out as
#   name -> [[pred_train, pred_test], [prob_train, prob_test]]
model_dict = {
    'Logistic Regression': [
        [model1_pred_train, model1_pred_test],
        [model1_prob_train, model1_prob_test],
    ],
    'KNN': [
        [model2_pred_train, model2_pred_test],
        [model2_prob_train, model2_prob_test],
    ],
}

# Actual targets, in the same [train, test] order as the lists above.
y_actual = [y_train, y_test]

In [4]:
# NOTE(review): the imports cell at the top binds `mc` to `model_comp`; this line
# rebinds `mc` to `model_comparison`. Confirm which module name is current and
# keep a single import in the imports cell.
import model_comparison as mc 
# Create the comparison object from a title, the per-model predictions and
# probabilities in `model_dict`, and the actual targets in `y_actual`.
problem = mc.problem("2022 Home Runs", model_dict, y_actual)

In [5]:
# Metrics for a single model on a single dataset: actual values, predicted
# classes, predicted probabilities, then the labels echoed back in the result
# as 'model_name' and 'ds'.
problem.stats_1_model_ds(
    y_test,
    model1_pred_test,
    model1_prob_test,
    'Logistic Regression no imputation',
    'Test',
)

{'model_name': 'Logistic Regression no imputation',
 'ds': 'Test',
 'acc': np.float64(0.8540372670807453),
 'miss': np.float64(0.14596273291925466),
 'precision_1': np.float64(0.7412280701754386),
 'precision_0': np.float64(0.9158653846153846),
 'recall_1': np.float64(0.8284313725490197),
 'recall_0': np.float64(0.865909090909091),
 'f1_1': np.float64(0.7824074074074074),
 'f1_0': np.float64(0.8901869158878505),
 'roc_auc': np.float64(0.9339460784313725),
 'ase': np.float64(0.10235604669628749),
 'log_loss_value': np.float64(0.3185214076545152)}

In [6]:
# Run the comparison over what was registered in `problem`. The recorded output
# lists one line per model/dataset pair (Logistic Regression and KNN, each on
# Train and Test) — presumably progress output; verify against the module.
problem.stat_table()

0 Logistic Regression 0 Train
0 Logistic Regression 1 Test
1 KNN 0 Train
1 KNN 1 Test


In [6]:

# Model 3 - Logistic Regression with missing imputation
# Adds exit_velocity_avg, whose missing values are filled with a median.
X_train = train[['ab','exit_velocity_avg', 'batting_avg','r_total_stolen_base']].copy()
X_test = test[['ab','exit_velocity_avg', 'batting_avg','r_total_stolen_base']].copy()

# The fill value is learned from the training rows only and reused for the test
# rows, so both splits are imputed identically and nothing computed from the
# test set feeds into preprocessing.
exit_velocity_median = X_train['exit_velocity_avg'].median()
X_train['exit_velocity_avg'] = X_train['exit_velocity_avg'].fillna(exit_velocity_median)
X_test['exit_velocity_avg'] = X_test['exit_velocity_avg'].fillna(exit_velocity_median)

model3 = LogisticRegression()
model3.fit(X_train, y_train)
model3_pred = model3.predict(X_test)
# Unlike Models 1 and 2, the full predict_proba matrix (one column per class)
# is kept here rather than only column 1.
model3_prob = model3.predict_proba(X_test)

# Model 4 - Decision Tree
# Trained on the same X_train / X_test frames as Model 3.
# max_features=3 is below the four available columns, so the tree samples which
# features to consider at each split; random_state pins that sampling so the
# notebook gives the same tree on every run (same seed as the Random Forest).
model4 = DecisionTreeClassifier(max_depth=3, min_samples_split=30, min_samples_leaf = 20, max_features=3, random_state=42)
model4.fit(X_train, y_train)
model4_pred = model4.predict(X_test)
# Full predict_proba matrix, one column per class.
model4_prob = model4.predict_proba(X_test)

# Model 5 - Random Forest
# Seeded for repeatable results; trained on the same X_train / X_test frames as
# Models 3 and 4. fit() returns the fitted estimator, so it is chained here.
model5 = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Full predict_proba matrix (one column per class), then the hard predictions.
model5_prob = model5.predict_proba(X_test)
model5_pred = model5.predict(X_test)