In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
import os

folder = '/Users/danielferreira/Documents/git/pySETTV/06 - Utility & References/Data'
file = 'player_batting_enriched.csv'
index = 'player_id'
os.chdir(folder)
bat = pd.read_csv(file, index_col=index)
train = bat[bat['year']==2021]
test = bat[bat['year']==2022]
y_train = train['hr_10'].copy()
y_test = test['hr_10'].copy()

In [None]:
# Model 1 - Logistic Regression with missing imputation
X_train = train[['ab','exit_velocity_avg', 'batting_avg','r_total_stolen_base']].copy()
X_train['exit_velocity_avg'] = X_train['exit_velocity_avg'].fillna(X_train['exit_velocity_avg'].median())
X_test = test[['ab','exit_velocity_avg', 'batting_avg','r_total_stolen_base']].copy()
X_test['exit_velocity_avg'] = X_test['exit_velocity_avg'].fillna(X_test['exit_velocity_avg'].median())

model1 = LogisticRegression()
model1.fit(X_train, y_train)
model1_pred = model1.predict(X_test)

# Model 2 - Logistic Regression without the variable that contains missing
X_train = train[['ab', 'batting_avg','r_total_stolen_base']].copy()
X_test = test[['ab', 'batting_avg','r_total_stolen_base']].copy()

model2 = LogisticRegression()
model2.fit(X_train, y_train)
model2_pred = model2.predict(X_test)

# Model 3 - KNN
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

model3 = KNeighborsClassifier()
model3.fit(X_train, y_train)
model3_pred = model3.predict(X_test)

# Model 4 


In [None]:

def stats_v1(y, pred):
    cm = confusion_matrix(y, pred)
    acc = (cm[0,0]+cm[1,1])/cm.sum()
    miss = 1-acc
    TP = cm[1,1] 
    FP = cm[0,1]
    TN = cm[0,0]
    FN = cm[1,0]
    precision_1 = TP / (TP+FP)
    precision_0 = TN / (TN+FN)
    recall_1 = TP / (TP+FN)
    recall_0 = TN / (TN+FP)
    metrics = [acc,miss,precision_1,precision_0,recall_1,recall_0]
    for i, metric_name in zip(metrics, ['Accuracy', 'Miss Rate', 'Precision (Class 1)', 'Precision (Class 0)', 'Recall (Class 1)', 'Recall (Class 0)']):
        print(f'{metric_name}: {round(i,3)}')

In [10]:
stats_v1(y_test,model1_pred)

Accuracy: 0.877
Miss Rate: 0.123
Precision (Class 1): 0.775
Precision (Class 0): 0.933
Recall (Class 1): 0.863
Recall (Class 0): 0.884


In [14]:
stats_v1(y_test,model2_pred)

Accuracy: 0.854
Miss Rate: 0.146
Precision (Class 1): 0.741
Precision (Class 0): 0.916
Recall (Class 1): 0.828
Recall (Class 0): 0.866


In [18]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

model3 = KNeighborsClassifier()
model3.fit(X_train, y_train)
model3_pred = model3.predict(X_test)

In [19]:
stats_v1(y_test,model3_pred)

Accuracy: 0.839
Miss Rate: 0.161
Precision (Class 1): 0.692
Precision (Class 0): 0.938
Recall (Class 1): 0.882
Recall (Class 0): 0.818


In [None]:
# Simple Decision Tree 
model1 = DecisionTreeClassifier(max_depth=3, min_samples_split=30, min_samples_leaf = 20, max_features=3)
model1.fit(X_train, y_train)
pred_test = model1.predict(X_test)
print(accuracy_score(pred_test, y_test))
plt.figure(figsize=(12, 8))
plot_tree(model1, feature_names=list(X_train.columns), class_names=['<10','>=10'])
plt.show()

#%%
# Default Random Forest
model2 = RandomForestClassifier(random_state=42)
model2.fit(X_train,y_train)
pred_test = model2.predict(X_test)
accuracy_score(pred_test, y_test)