##  Predicting Baseball Hall of Fame Induction

###  Analysis 2 of 4 -- Catchers

####  Eryk Wdowiak and Ken Hoffman

Data from the Lahman Baseball Database.

In [1]:
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
import pickle

import numpy as np
import pandas as pd

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.linear_model import LogisticRegression 
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import VotingClassifier ##, BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

from imblearn.over_sampling import SMOTE

# import matplotlib.pyplot as plt
# %matplotlib inline

In [2]:
##  date tag embedded in the pickle file names
dt_str = '2020-08-19a'

##  pick a dataframe from pickle files
##  NOTE(review): pickle.load can execute arbitrary code stored in the file --
##  only load pickles that were produced by a trusted pipeline.
# pitchers_df = pickle.load(open('pitchers-df_'+ dt_str +'.p','rb'))
##  the context manager closes the file handle as soon as the frame is loaded
with open('catchers-df_'+ dt_str +'.p','rb') as pickle_file:
    catchers_df = pickle.load(pickle_file)
# infielders_df = pickle.load(open('infielders-df_'+ dt_str +'.p','rb'))
# outfielders_df = pickle.load(open('outfielders-df_'+ dt_str +'.p','rb'))

##  give that dataframe an alias
##  (plain assignment: df and catchers_df are the same object, so columns
##   added to df later also appear on catchers_df)
df = catchers_df

In [3]:
def bball_log(x):
    """Natural log of ``x`` with every value below 1 mapped to 0.

    Parameters
    ----------
    x : array-like
        Numeric values that support ``x < 1`` and ``np.log(x)``
        (for example a NumPy array or a pandas Series).

    Returns
    -------
    numpy.ndarray
        ``0`` where ``x < 1``, ``np.log(x)`` elsewhere.  NaN inputs fail the
        ``x < 1`` comparison and therefore come back as NaN.
    """
    # np.log(x) is evaluated for every element before np.where selects, so
    # zeros and negatives trigger floating-point warnings even though their
    # results are discarded.  Silence those only for this computation rather
    # than installing a process-wide warnings filter on every call.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.where(x < 1, 0, np.log(x))

In [4]:
def dummies(x, prefix):
    """One-hot encode ``x`` via ``pd.get_dummies``, keeping every level.

    ``prefix`` is forwarded to ``pd.get_dummies`` to label the generated
    columns; no category is dropped (``drop_first=False``).
    """
    indicator_frame = pd.get_dummies(data=x, prefix=prefix, drop_first=False)
    return indicator_frame

### improve features

In [5]:
##  columns that get a log-transformed companion
log_list = [
    'bG', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'bSB', 'bCS', 'BB', 'SO', 'HBP', 'SH',
    'bG_ps', 'AB_ps', 'R_ps', 'H_ps', '2B_ps', '3B_ps', 'HR_ps',
    'RBI_ps', 'bSB_ps', 'bCS_ps', 'BB_ps', 'SO_ps', 'HBP_ps', 'SH_ps',
    'fG', 'PO', 'A', 'E', 'DP', 'fSB', 'fCS', 'fG_ps',
    'PO_ps', 'A_ps', 'E_ps', 'DP_ps', 'fSB_ps', 'fCS_ps', 'nu_sns',
]

##  add an 'ln_<name>' column for each entry, using the guarded bball_log
##  (values below 1 become 0 instead of -inf / NaN)
for stat in log_list:
    df['ln_' + stat] = bball_log(df[stat])

##  years since retirement, counted back from 2018
df['since_lst'] = 2018 - df['lst_sn']
##  NOTE(review): this is a plain np.log, not bball_log -- a 'since_lst' of 0
##  or less would yield -inf / NaN here; confirm the data excludes such rows
df['ln_since'] = np.log(df['since_lst'])

###  prepare training and test sets

In [6]:
##  when splitting, how many in test?
##  (an integer test_size is an absolute number of rows, not a fraction)
tts_test_size = 100

##  what should be SMOTE's ratio of minority to majority?
smote_ratio = 0.125

##  random states
tts_randm = 19
smote_randm = 42

##  train_test_split()
##  'induct' is the target; 'position' is dropped from the features
XX = df.drop(columns=['induct','position'])
yy = df['induct']
X_train, X_test, y_train, y_test = train_test_split(XX, yy, 
                                                    random_state = tts_randm,
                                                    test_size = tts_test_size)

##  Fit SMOTE to training data
##  'playerID' and 'teamID' are removed first -- presumably because they are
##  identifiers that SMOTE cannot interpolate (TODO confirm)
Xs_train = X_train.drop(columns=['playerID','teamID'])
##  NOTE(review): Xs_test is not used anywhere in the visible notebook
Xs_test  = X_test.drop(columns=['playerID','teamID'])
##  fit_resample() replaces the deprecated fit_sample() alias, which newer
##  imbalanced-learn releases no longer provide
X_smote, y_smote = SMOTE(sampling_strategy = smote_ratio,
                         random_state = smote_randm).fit_resample(Xs_train, y_train)

##  recreate the old layout (features and target side by side);
##  join() aligns on the index, so each pair must share one
df_train = X_train.join(y_train)
df_test = X_test.join(y_test)
df_smote = X_smote.join(y_smote)

##  clean up!
del XX, yy

###  estimation and prediction

In [7]:
##  keep a list of models
# models = {}

In [8]:
##  exogenous variables for the regression model
##  (un-comment an entry to add it to the specification)
exog = [
    # 'ln_bG',
    # 'ln_fG',
    # 'ln_R',
    # 'ln_H',
    # 'ln_2B',
    # 'ln_3B',
    'ln_HR',
    # 'ln_RBI',
    # 'ln_bSB',
    'ln_PO',
    # 'ln_A',
    # 'ln_DP',
    'ln_fCS',
    # 'ln_nu_sns',
    'ln_since',
]

##  regression formula:  induct~x1+x2+...
m01a_fmla = 'induct~' + '+'.join(exog)

##  statsmodels logit on the full dataframe (not the train split)
print('', 'logit results below on fit to WHOLE dataset', '', sep='\n')
m01a_lgt = smf.logit(m01a_fmla, data=df).fit()
print(m01a_lgt.summary())

##  scikit-learn logistic regression with the penalty switched off
##  NOTE(review): newer scikit-learn releases deprecate the string 'none' in
##  favour of penalty=None -- confirm against the installed version
logit_mdl = LogisticRegression(max_iter=500, penalty='none')

##  add to list of models
# models['logit_mdl'] = logit_mdl

##  fit on the SMOTE-resampled training data (un-resampled alternative kept below)
# logit_mdl.fit(X_train[exog], y_train)
logit_mdl.fit(X_smote[exog], y_smote)

##  predict on the original training rows and on the held-out test rows
pred_train = logit_mdl.predict(X_train[exog])
pred_test  = logit_mdl.predict(X_test[exog])

##  F1 scores, as percentages
f1_pct_train = 100 * f1_score(y_train, pred_train)
f1_pct_test  = 100 * f1_score(y_test, pred_test)
print()
print('F1 on train data: {:.2f}'.format(f1_pct_train))
print('F1 on test data:  {:.2f}'.format(f1_pct_test))

##  accuracy scores, as percentages (computed but currently not printed)
acc_pct_train = 100 * accuracy_score(y_train, pred_train)
acc_pct_test  = 100 * accuracy_score(y_test, pred_test)
# print()
# print('Acc on train data: {:.2f}'.format(acc_pct_train))
# print('Acc on test data:  {:.2f}'.format(acc_pct_test))

##  clean up the temporaries so they do not linger in the kernel namespace
del pred_train, pred_test, f1_pct_train, f1_pct_test, acc_pct_train, acc_pct_test


logit results below on fit to WHOLE dataset

Optimization terminated successfully.
         Current function value: 0.034410
         Iterations 13
                           Logit Regression Results                           
Dep. Variable:                 induct   No. Observations:                  237
Model:                          Logit   Df Residuals:                      232
Method:                           MLE   Df Model:                            4
Date:                Fri, 21 Aug 2020   Pseudo R-squ.:                  0.8032
Time:                        08:37:06   Log-Likelihood:                -8.1553
converged:                       True   LL-Null:                       -41.441
                                        LLR p-value:                 1.201e-13
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   -154.6839     56.435     -2.741      0.006    -26

### Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier

##  Seed the tree so repeated runs give the same fit: scikit-learn permutes the
##  candidate features at every split, so an unseeded tree can differ from run
##  to run.  The value 1 matches the seed used for the random forest.
clf = DecisionTreeClassifier(random_state = 1)

In [10]:
# Fit the decision tree on the SMOTE-resampled training data
# (fit() returns the estimator itself, so no re-assignment is needed)
clf.fit(X_smote[exog], y_smote)

# Predict on the original training rows and on the held-out test rows
y_pred_train = clf.predict(X_train[exog])
y_pred_test = clf.predict(X_test[exog])

# F1 scores, as percentages
fone_train_clf = 100 * f1_score(y_train, y_pred_train)
fone_test_clf = 100 * f1_score(y_test, y_pred_test)

# Report F1 (not accuracy) for both splits
print('F1 on train data: {:.2f}'.format(fone_train_clf))
print('F1 on test data:  {:.2f}'.format(fone_test_clf))

F1 on train data: 100.00
F1 on test data:  72.73


### Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 100 trees, each capped at depth 7, seeded for repeatable fits
rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=7,
    random_state=1,
)

In [12]:
# Fit the random forest on the SMOTE-resampled training data
# (fit() returns the estimator itself, so no re-assignment is needed)
rfc.fit(X_smote[exog], y_smote)

# Predict on the original training rows and on the held-out test rows
y_pred_train = rfc.predict(X_train[exog])
y_pred_test = rfc.predict(X_test[exog])

# F1 scores, as percentages
fone_train_rfc = 100 * f1_score(y_train, y_pred_train)
fone_test_rfc = 100 * f1_score(y_test, y_pred_test)

# Report F1 (not accuracy) for both splits
print('F1 on train data: {:.2f}'.format(fone_train_rfc))
print('F1 on test data:  {:.2f}'.format(fone_test_rfc))

F1 on train data: 100.00
F1 on test data:  72.73
