##  Predicting Baseball Hall of Fame Induction

###  Analysis 1 of 4 -- Pitchers

####  Eryk Wdowiak and Ken Hoffman

data from Lahman Baseball Database

In [1]:
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
import pickle

import numpy as np
import pandas as pd

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import metrics
from sklearn.linear_model import LogisticRegression 
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import VotingClassifier ##, BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import SMOTE

# import matplotlib.pyplot as plt
# %matplotlib inline

In [2]:
##  the little date string
dt_str = '2020-08-19a'

##  pick a dataframe from pickle files
pitchers_df = pickle.load(open('pitchers-df_'+ dt_str +'.p','rb'))
# catchers_df = pickle.load(open('catchers-df_'+ dt_str +'.p','rb'))
# infielders_df = pickle.load(open('infielders-df_'+ dt_str +'.p','rb'))
# outfielders_df = pickle.load(open('outfielders-df_'+ dt_str +'.p','rb'))

##  give that dataframe an alias
df = pitchers_df

In [3]:
def bball_log(x):
    warnings.filterwarnings('ignore',category=RuntimeWarning)
    return np.where(x<1,0,np.log(x))

In [4]:
def dummies(x, prefix):
    return pd.get_dummies(x, prefix = prefix, drop_first = False)

### improve features

In [5]:
##  list to take logs of
log_list = ['G','G_ps','W','W_ps','SO','SO_ps',
            'ER','ER_ps','IPouts','IPouts_ps','nu_sns']

##  take logs
for vbl in log_list:
    new = 'ln_'+vbl
    df[new] = bball_log(df[vbl])

##  years since retirement
df['since_lst'] = 2018 - df['lst_sn']
df['ln_since'] = np.log(df['since_lst'])

In [6]:
##  I do NOT think it's a good idea to use team dummies.
##  The estimated regression coefficients for a team with one (or few) 
##  inductees could be large, while the true values are zero.
##  
##  Nonetheless, to include team dummies, you would add the line:
##  ##  exog = exog + tm_incl
##  to the model formulation below.

##  create dummies for teams
team_dums = dummies(df['teamID'],'tm')
# df = df.join(team_dums)

##  just need the list
team_cols = team_dums.columns

##  exclude teams without pitcher inductees
tm_excl = ['tm_ANA', 'tm_ARI', 'tm_BRO', 'tm_BSN', 'tm_CIN', 'tm_COL', 
           'tm_FLO', 'tm_HOU', 'tm_KC1', 'tm_KCA', 'tm_LAA', 'tm_MIL', 
           'tm_ML4', 'tm_MON', 'tm_NY1', 'tm_PHA', 'tm_PIT', 'tm_SLA', 
           'tm_TBA', 'tm_TEX', 'tm_TOR', 'tm_WAS', 'tm_WS1', 'tm_WS2']

##  make everything relative to the New York Yankees
tm_excl = tm_excl + ['tm_NYA']

##  columns to include
tm_incl = [dum for dum in team_cols if dum not in tm_excl]

###  prepare training and test sets

In [7]:
##  when splitting, how many in test?
tts_test_size = 400

##  what should be SMOTE's ratio of minority to majority?
smote_ratio = 0.125

##  random states
tts_randm = 19
smote_randm = 42

##  train_test_split()
XX = df.drop(columns=['induct'])
yy = df['induct']
X_train, X_test, y_train, y_test = train_test_split(XX, yy, 
                                                    random_state = tts_randm,
                                                    test_size = tts_test_size)

##  Fit SMOTE to training data
Xs_train = X_train.drop(columns=['playerID','teamID'])
Xs_test  = X_test.drop(columns=['playerID','teamID'])
X_smote, y_smote = SMOTE(sampling_strategy = smote_ratio,
                         random_state = smote_randm).fit_sample(Xs_train, y_train)

##  recreate the old layout
df_train = X_train.join(y_train)
df_test = X_test.join(y_test)
df_smote = X_smote.join(y_smote)

##  clean up!
del XX, yy

###  estimation and prediction

In [8]:
##  keep a list of models
# models = {}

In [9]:
##  list of exogenous variables for regression model
exog = ['ln_G']
# exog = exog + ['ln_G_ps']
exog = exog + ['ln_W']
# exog = exog + ['ln_W_ps']
exog = exog + ['ln_SO']
# exog = exog + ['ln_SO_ps']
exog = exog + ['ln_ER']
# exog = exog + ['ln_ER_ps']
exog = exog + ['ln_IPouts']
# exog = exog + ['ln_IPouts_ps']
# exog = exog + ['ln_nu_sns']
exog = exog + ['ln_since']

##  regression formula
m01a_fmla = 'induct~'
m01a_fmla = m01a_fmla + '+'.join(exog)

##  run logit
print()
print('logit results below on fit to WHOLE dataset')
print()
m01a_lgt = smf.logit(m01a_fmla,data=df).fit()
print(m01a_lgt.summary())

## instantiate standard logit model
logit_mdl = LogisticRegression(penalty='none',max_iter=500) 

##  add to list of models
# models['logit_mdl'] = logit_mdl

## fit the model
#logit_mdl.fit(X_train[exog], y_train)
logit_mdl.fit(X_smote[exog], y_smote)

## generate predictions
y_hat_train = logit_mdl.predict(X_train[exog])
y_hat_pred  = logit_mdl.predict(X_test[exog])

## calculate F1 scores
fone_train = f1_score(y_train,y_hat_train) * 100
fone_test  = f1_score(y_test, y_hat_pred)  * 100
print()
print('F1 on train data: {:.2f}'.format(fone_train))
print('F1 on test data:  {:.2f}'.format(fone_test))

## calculate F1 scores
acc_train = accuracy_score(y_train,y_hat_train) * 100
acc_test  = accuracy_score(y_test, y_hat_pred)  * 100
# print()
# print('Acc on train data: {:.2f}'.format(acc_train))
# print('Acc on test data:  {:.2f}'.format(acc_test))

## clean up
del y_hat_train, y_hat_pred, fone_train, fone_test, acc_train, acc_test


logit results below on fit to WHOLE dataset

Optimization terminated successfully.
         Current function value: 0.049565
         Iterations 11
                           Logit Regression Results                           
Dep. Variable:                 induct   No. Observations:                  900
Model:                          Logit   Df Residuals:                      893
Method:                           MLE   Df Model:                            6
Date:                Thu, 20 Aug 2020   Pseudo R-squ.:                  0.7109
Time:                        12:07:43   Log-Likelihood:                -44.609
converged:                       True   LL-Null:                       -154.31
Covariance Type:            nonrobust   LLR p-value:                 1.390e-44
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -60.0081     18.796     -3.193      0.001     -9

### Decision Tree

In [10]:
# Create matrix of features


X = df[['ln_W', 'ln_G', 'ln_IPouts', 'ln_SO', 'ln_ER', 'ln_since']]

# Create target variable
y = df['induct']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,test_size=0.20)

In [12]:
clf = DecisionTreeClassifier()

In [13]:
X_train[exog].head()

Unnamed: 0,ln_G,ln_W,ln_SO,ln_ER,ln_IPouts,ln_since
666,6.725034,3.89182,6.44572,5.860786,7.867871,2.564949
347,6.322565,3.555348,6.364751,5.771441,7.620705,2.70805
784,6.12905,4.672829,6.799056,6.820016,8.658866,4.043051
592,5.332719,3.526361,5.945421,5.598422,7.593374,2.890372
582,6.056784,3.332205,6.230481,5.717028,7.579168,3.178054


In [14]:
# Train Decision Tree Classifer
clf = clf.fit(X_smote[exog], y_smote)

#predict the training set
y_pred_train = clf.predict(X_train[exog]) 
y_pred_test = clf.predict(X_test[exog]) 

fone_train_clf = f1_score(y_train, y_pred_train) * 100
fone_test_clf = f1_score(y_test, y_pred_test) * 100

# Model Accuracy, how often is the classifier correct?
print('F1 on train data: {:.2f}'.format(fone_train_clf))
print('F1 on test data:  {:.2f}'.format(fone_test_clf))

F1 on train data: 80.00
F1 on test data:  85.71


### Random Forest

In [15]:
# Instantiate the classifier using 100 trees
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state = 1, n_estimators=100, max_depth=7)

In [16]:
#fit the model to the training data
rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=7, random_state=1)

In [17]:
# Train Decision Tree Classifer
rfc = rfc.fit(X_smote[exog], y_smote)

#predict the training set
y_pred_train = rfc.predict(X_train[exog])
y_pred_test = rfc.predict(X_test[exog])

fone_train_rfc = f1_score(y_train, y_pred_train) * 100
fone_test_rfc = f1_score(y_test, y_pred_test) * 100

# Model Accuracy, how often is the classifier correct?
print('F1 on train data: {:.2f}'.format(fone_train_rfc))
print('F1 on test data:  {:.2f}'.format(fone_test_rfc))

F1 on train data: 76.36
F1 on test data:  90.91
