In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
import model_comp as mc 

folder = '/Users/danielferreira/Documents/git/pySETTV/06 - Utility & References/Data'
file = 'player_batting_enriched.csv'
index = 'player_id'
bat = pd.read_csv(folder+'/'+file, index_col=index)
train = bat[bat['year']==2021]
test = bat[bat['year']==2022]
y_train = train['hr_10'].copy()
y_test = test['hr_10'].copy()

In [2]:
# Model 1 - Logistic Regression without the variable that contains missing
X_train = train[['ab', 'batting_avg','r_total_stolen_base']].copy()
X_test = test[['ab', 'batting_avg','r_total_stolen_base']].copy()

model1 = LogisticRegression()
model1.fit(X_train, y_train)
model1_pred = model1.predict(X_test)

# Model 2 - KNN
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

model2 = KNeighborsClassifier()
model2.fit(X_train, y_train)
model2_pred = model2.predict(X_test)

# Model 3 - Logistic Regression with missing imputation
X_train = train[['ab','exit_velocity_avg', 'batting_avg','r_total_stolen_base']].copy()
X_train['exit_velocity_avg'] = X_train['exit_velocity_avg'].fillna(X_train['exit_velocity_avg'].median())
X_test = test[['ab','exit_velocity_avg', 'batting_avg','r_total_stolen_base']].copy()
X_test['exit_velocity_avg'] = X_test['exit_velocity_avg'].fillna(X_test['exit_velocity_avg'].median())

model3 = LogisticRegression()
model3.fit(X_train, y_train)
model3_pred = model3.predict(X_test)


# Model 4 - Decision Tree
model4 = DecisionTreeClassifier(max_depth=3, min_samples_split=30, min_samples_leaf = 20, max_features=3)
model4.fit(X_train, y_train)
model4_pred = model4.predict(X_test)

# Model 5 - Random Forest
model5 = RandomForestClassifier(random_state=42)
model5.fit(X_train,y_train)
model5_pred = model5.predict(X_test)

In [3]:
import model_comp as mc 
problem = mc.model_comparison(y_test, "2022 Home Runs")


In [4]:
models = {
    "Logistic Regression without imputation": model1_pred,
    "KNN": model2_pred,
    "Logistic Regression with imputation": model3_pred,
    "Decision Tree": model4_pred,
    "Random Forest": model5_pred
}

# Iterate over the dictionary
for name, pred in models.items():
    print(f'\n{name}\n')
    problem.stats_v1(pred)



Logistic Regression without imputation

Accuracy: 0.854
Miss Rate: 0.146
Precision (Class 1): 0.741
Precision (Class 0): 0.916
Recall (Class 1): 0.828
Recall (Class 0): 0.866

KNN

Accuracy: 0.839
Miss Rate: 0.161
Precision (Class 1): 0.692
Precision (Class 0): 0.938
Recall (Class 1): 0.882
Recall (Class 0): 0.818

Logistic Regression with imputation

Accuracy: 0.877
Miss Rate: 0.123
Precision (Class 1): 0.775
Precision (Class 0): 0.933
Recall (Class 1): 0.863
Recall (Class 0): 0.884

Decision Tree

Accuracy: 0.839
Miss Rate: 0.161
Precision (Class 1): 0.692
Precision (Class 0): 0.938
Recall (Class 1): 0.882
Recall (Class 0): 0.818

Random Forest

Accuracy: 0.863
Miss Rate: 0.137
Precision (Class 1): 0.754
Precision (Class 0): 0.923
Recall (Class 1): 0.843
Recall (Class 0): 0.873
