# I. Comparing Event Run Values

In [1]:
#Load packages
import pandas as pd
import numpy as np

In [2]:
def Run_Expectancy(path):
    """Compute a run value for every play in one play-by-play CSV.

    Each play is tagged with a start and an end base-out state, written as
    three 0/1 flags for first, second and third base followed by the number
    of outs (for example ``'101 2'``).  The run expectancy of a state is the
    mean of ``runsFuture`` over all retained plays that start in that state,
    and the run value of a play is::

        Run_Value = runsOnPlay + End_RE - Start_RE

    Parameters
    ----------
    path : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.  The file must contain the
        twenty play-by-play columns selected below; any other column (such
        as a leftover ``'Unnamed: 0'`` index column) is ignored.

    Returns
    -------
    pandas.DataFrame
        The retained plays with the added columns ``Start1``, ``Start2``,
        ``Start3``, ``Start_State``, ``End1``, ``End2``, ``End3``,
        ``End_State``, ``Start_RE``, ``End_RE`` and ``Run_Value``.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the file.
    """
    wanted_columns = [
        'home_team', 'away_team', 'half', 'gameId', 'batterName', 'batterId',
        'event', 'start1B', 'start2B', 'start3B', 'end1B', 'end2B', 'end3B',
        'startOuts', 'endOuts', 'runsFuture', 'runsOnPlay', 'outsInInning',
        'venueId', 'batterPos',
    ]

    # Selecting the wanted columns already discards any leftover index column,
    # so no separate drop is needed.  The explicit copy makes the column
    # assignments below operate on an independent frame rather than a slice.
    RE = pd.read_csv(path)[wanted_columns].copy()

    # A base counts as occupied when its runner column is non-null.
    for side, outs_column in (('Start', 'startOuts'), ('End', 'endOuts')):
        for base in ('1', '2', '3'):
            runner_column = side.lower() + base + 'B'
            RE[side + base] = np.where(RE[runner_column].isnull(), 0, 1)
        RE[side + '_State'] = (
            RE[side + '1'].astype(str)
            + RE[side + '2'].astype(str)
            + RE[side + '3'].astype(str)
            + " "
            + RE[outs_column].astype(str)
        )

    # Keep plays that changed the base-out state or scored a run, restricted
    # to rows where outsInInning equals 3 (presumably half-innings that were
    # played to completion -- confirm against the data dictionary).
    state_changed_or_scored = (RE['Start_State'] != RE['End_State']) | (RE['runsOnPlay'] > 0)
    RE = RE[state_changed_or_scored & (RE['outsInInning'] == 3)]

    # Run expectancy of each starting state = mean runsFuture of its plays.
    Start_RunExp = (
        RE.groupby('Start_State')['runsFuture']
        .mean()
        .rename('Start_RE')
        .reset_index()
    )
    RE = pd.merge(RE, Start_RunExp, on='Start_State', how='left')

    # Three-out states never start a play, so they are absent from the table
    # above; give each of them a run expectancy of zero so plays that end the
    # inning still find an End_RE.  pd.concat replaces DataFrame.append, which
    # no longer exists in pandas 2.x.
    three_out_states = pd.DataFrame({
        'Start_State': ['000 3', '001 3', '010 3', '011 3',
                        '100 3', '101 3', '110 3', '111 3'],
        'Start_RE': 0.0,
    })
    End_RunExp = pd.concat([Start_RunExp, three_out_states], ignore_index=True).rename(
        columns={'Start_State': 'End_State', 'Start_RE': 'End_RE'}
    )
    RE = pd.merge(RE, End_RunExp, on='End_State', how='left')

    RE['Run_Value'] = RE['runsOnPlay'] + RE['End_RE'] - RE['Start_RE']

    return RE

In [3]:
# Build the per-play run-value table for each season, 2014 through 2017
RE_14, RE_15, RE_16, RE_17 = [
    Run_Expectancy(season_csv)
    for season_csv in (
        "../MLBAM14.csv",
        "../MLBAM15.csv",
        "../MLBAM16.csv",
        "../MLBAM17.csv",
    )
]

# III. Comparing Team Run Values

In [14]:
# Tag every play with the batting team: the away side bats in the top half
# of an inning, the home side otherwise.
for season_plays in (RE_14, RE_15, RE_16, RE_17):
    season_plays['team'] = np.where(
        season_plays['half'] == 'top',
        season_plays['away_team'],
        season_plays['home_team'],
    )

In [15]:
# Sum run value per team within each season; the season is encoded in the
# name of the value column (RV14 .. RV17).
REteam_14 = RE_14.groupby('team')['Run_Value'].sum().rename('RV14').reset_index()
REteam_15 = RE_15.groupby('team')['Run_Value'].sum().rename('RV15').reset_index()
REteam_16 = RE_16.groupby('team')['Run_Value'].sum().rename('RV16').reset_index()
REteam_17 = RE_17.groupby('team')['Run_Value'].sum().rename('RV17').reset_index()

In [16]:
# Join the four season tables on team so each row carries one team's
# RV14, RV15, RV16 and RV17 (default inner join, as with pd.merge).
REteam = (
    REteam_14
    .merge(REteam_15, on=['team'])
    .merge(REteam_16, on=['team'])
    .merge(REteam_17, on=['team'])
)
display(REteam)

Unnamed: 0,team,RV14,RV15,RV16,RV17
0,ana,104.641532,-25.056067,-3.184209,-47.579782
1,ari,-48.253012,31.924551,15.870968,62.616716
2,atl,-81.885752,-108.464134,-84.633901,-28.259905
3,bal,46.586965,29.882644,29.29794,-23.678031
4,bos,-37.383914,65.372953,160.812546,9.445347
5,cha,-1.307578,-79.136738,-41.643638,-47.416033
6,chn,-54.987532,-3.627047,85.327151,71.002092
7,cin,-57.227567,-61.401274,-17.643638,5.420218
8,cle,-3.725729,-4.178644,48.815791,73.780465
9,col,92.456063,47.127799,122.825528,78.616716


In [17]:
# Compute correlation matrix of the season run-value columns.
# The string 'team' column is moved to the index first: since pandas 2.0,
# DataFrame.corr() no longer silently skips non-numeric columns and would
# raise on it.
REteam.set_index('team').corr()

Unnamed: 0,RV14,RV15,RV16,RV17
RV14,1.0,0.363681,0.261814,0.065225
RV15,0.363681,1.0,0.43761,0.193061
RV16,0.261814,0.43761,1.0,0.351708
RV17,0.065225,0.193061,0.351708,1.0


In [18]:
# Ordinary least squares: 2017 team run value regressed on the 2014-2016 values.
# NOTE(review): smf is not imported in the cells visible here -- presumably
# statsmodels.formula.api from an earlier cell; confirm.
TeamRV_Reg = smf.ols(
    formula='RV17 ~ RV14 + RV15 + RV16',
    data=REteam,
).fit()
TeamRV_Reg.summary()

0,1,2,3
Dep. Variable:,RV17,R-squared:,0.127
Model:,OLS,Adj. R-squared:,0.027
Method:,Least Squares,F-statistic:,1.265
Date:,"Fri, 17 Jul 2020",Prob (F-statistic):,0.307
Time:,11:12:38,Log-Likelihood:,-166.93
No. Observations:,30,AIC:,341.9
Df Residuals:,26,BIC:,347.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.4437,12.406,-0.036,0.972,-25.944,25.056
RV14,-0.0553,0.241,-0.230,0.820,-0.550,0.440
RV15,0.0706,0.240,0.294,0.771,-0.424,0.565
RV16,0.3788,0.231,1.638,0.113,-0.097,0.854

0,1,2,3
Omnibus:,0.584,Durbin-Watson:,1.712
Prob(Omnibus):,0.747,Jarque-Bera (JB):,0.061
Skew:,0.065,Prob(JB):,0.97
Kurtosis:,3.179,Cond. No.,77.2
