##  Predicting Baseball Hall of Fame Induction

###  Data Analysis

####  Eryk Wdowiak and Ken Hoffman

Data from the Lahman Baseball Database.

In [1]:
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
import pickle

import numpy as np
import pandas as pd

import statsmodels.api as sm
import statsmodels.formula.api as smf

# from sklearn.linear_model import LogisticRegression 
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import VotingClassifier ##, BaggingClassifier, RandomForestClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import f1_score, accuracy_score

# from imblearn.over_sampling import SMOTE

# import matplotlib.pyplot as plt
# %matplotlib inline

In [2]:
##  the little date string that tags this batch of pickle files
dt_str = '2020-08-19a'

def _load_position_df(position):
    """Load the pickled dataframe for one position group.

    Reads the file named '<position>-df_<dt_str>.p' (relative to the
    working directory) and returns the unpickled object.  The file is
    opened in a ``with`` block so the handle is closed even if
    unpickling fails.
    """
    # NOTE(review): pickle.load can execute arbitrary code -- only point
    # this at pickle files you produced yourself or otherwise trust.
    with open(position + '-df_' + dt_str + '.p', 'rb') as pkl_file:
        return pickle.load(pkl_file)

# get the dataframes from pickle files
pitchers_df = _load_position_df('pitchers')
catchers_df = _load_position_df('catchers')
infielders_df = _load_position_df('infielders')
outfielders_df = _load_position_df('outfielders')

In [3]:
def bball_log(x):
    """Natural log that maps every value below 1 to 0.

    Parameters
    ----------
    x : array-like or scalar
        Values to transform.

    Returns
    -------
    numpy.ndarray
        0 wherever ``x < 1`` is true, ``np.log(x)`` everywhere else
        (so NaN inputs stay NaN, because ``NaN < 1`` is false).
    """
    # np.log is evaluated on every element, including zeros and negatives
    # that np.where discards afterwards.  Silence the resulting
    # floating-point warnings only for this call, instead of installing a
    # process-wide warnings filter that would hide unrelated RuntimeWarnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        logged = np.log(x)
    return np.where(x < 1, 0, logged)

In [4]:
def dummies(x, prefix):
    """Build indicator columns for the categories in ``x``.

    Every category gets its own column (no category is dropped), and
    each column name is built by pandas from ``prefix`` and the
    category value.
    """
    indicator_df = pd.get_dummies(x, prefix=prefix, drop_first=False)
    return indicator_df

### pitchers

In [5]:
##  take logs of each counting stat and of its '_ps' counterpart,
##  storing the result in a new column named 'ln_<column>'
for stat in ['G', 'W', 'SO', 'ER', 'IPouts']:
    for col in [stat, stat + '_ps']:
        pitchers_df['ln_' + col] = bball_log(pitchers_df[col])

##  not sure about this one
pitchers_df['ln_nu_sns'] = bball_log(pitchers_df['nu_sns'])

##  years since retirement, measured against 2018
##  NOTE(review): this uses plain np.log, so a player whose lst_sn is 2018
##  would get -inf here -- confirm no such rows exist in the data
pitchers_df['since_lst'] = 2018 - pitchers_df['lst_sn']
pitchers_df['ln_since'] = np.log(pitchers_df['since_lst'])

##  create dummies for teams
team_dums = dummies(pitchers_df['teamID'],'tm')

##  drop any team dummies left over from an earlier run of this cell, so
##  that re-running it does not fail on overlapping column names
pitchers_df = pitchers_df.drop(columns=team_dums.columns, errors='ignore').join(team_dums)

##  just need the list
team_cols = team_dums.columns

In [6]:
##  team dummies left out of the model: teams without inductees, plus
##  the Yankees ('tm_NYA') so that everything is relative to the Yankees
tm_excl = [
    'tm_ANA', 'tm_ARI', 'tm_BRO', 'tm_BSN', 'tm_CIN', 'tm_COL',
    'tm_FLO', 'tm_HOU', 'tm_KC1', 'tm_KCA', 'tm_LAA', 'tm_MIL',
    'tm_ML4', 'tm_MON', 'tm_NY1', 'tm_PHA', 'tm_PIT', 'tm_SLA',
    'tm_TBA', 'tm_TEX', 'tm_TOR', 'tm_WAS', 'tm_WS1', 'tm_WS2',
    'tm_NYA',
]

##  team dummies that stay in, kept in their original column order
tm_incl = [tm_col for tm_col in team_cols if tm_col not in tm_excl]

##  list of exogenous variables for regression model:
##  the logged stats first, then the included team dummies
exog = [
    'ln_G', 'ln_G_ps',
    'ln_W', 'ln_W_ps',
    'ln_SO', 'ln_SO_ps',
    'ln_ER', 'ln_ER_ps',
    'ln_IPouts', 'ln_IPouts_ps',
    'ln_nu_sns',
    'ln_since',
] + tm_incl

##  regression formula: induct on the left, all of exog joined by '+'
m01a_fmla = 'induct~' + '+'.join(exog)

##  fit the logit and show its summary table
m01a_lgt = smf.logit(m01a_fmla, data=pitchers_df).fit()
m01a_lgt.summary()

Optimization terminated successfully.
         Current function value: 0.024227
         Iterations 16


0,1,2,3
Dep. Variable:,induct,No. Observations:,900.0
Model:,Logit,Df Residuals:,869.0
Method:,MLE,Df Model:,30.0
Date:,"Wed, 19 Aug 2020",Pseudo R-squ.:,0.8587
Time:,12:58:14,Log-Likelihood:,-21.804
converged:,True,LL-Null:,-154.31
,,LLR p-value:,1.866e-39

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-29.1007,30.663,-0.949,0.343,-89.199,30.998
ln_G,7.3710,4.546,1.621,0.105,-1.539,16.281
ln_G_ps,-2.7617,1.867,-1.480,0.139,-6.420,0.897
ln_W,53.3876,20.081,2.659,0.008,14.030,92.745
ln_W_ps,1.4232,1.254,1.135,0.256,-1.035,3.881
ln_SO,3.0498,3.484,0.875,0.381,-3.778,9.877
ln_SO_ps,4.8338,2.843,1.700,0.089,-0.738,10.406
ln_ER,-32.6699,10.680,-3.059,0.002,-53.603,-11.737
ln_ER_ps,-2.1209,1.668,-1.271,0.204,-5.390,1.149
