# I. Comparing Event Run Values

In [1]:
#Load packages
import pandas as pd
import numpy as np

In [2]:
def Run_Expectancy(path):
    """Compute a run value for every play in one season's play-by-play CSV.

    Each play gets a ``Start_State`` and an ``End_State`` string of the form
    ``"<1B><2B><3B> <outs>"`` (for example ``"101 2"``), where a base digit is
    1 when the corresponding ``start*B`` / ``end*B`` column is non-null.
    The run expectancy of a state is the mean ``runsFuture`` over all retained
    plays that start in that state; states with three outs are assigned 0.

    Only plays that changed the state or scored at least one run, and that
    belong to a half-inning with ``outsInInning == 3``, are retained.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file. It must contain the columns listed in
        ``keep_cols`` below. The outs columns are assumed to be read as
        integers so that they render as ``"0"``..``"3"`` -- TODO confirm for
        files with missing outs values.

    Returns
    -------
    pandas.DataFrame
        The retained plays with the added columns ``Start1``..``Start3``,
        ``Start_State``, ``End1``..``End3``, ``End_State``, ``Start_RE``,
        ``End_RE`` and ``Run_Value``, where
        ``Run_Value = runsOnPlay + End_RE - Start_RE``.

    Raises
    ------
    KeyError
        If any required column is missing from the file.
    """
    keep_cols = ['home_team', 'away_team', 'half', 'gameId', 'batterName', 'batterId', 'event',
                 'start1B', 'start2B', 'start3B', 'end1B', 'end2B', 'end3B',
                 'startOuts', 'endOuts', 'runsFuture', 'runsOnPlay', 'outsInInning',
                 'venueId', 'batterPos']

    # Selecting the needed columns also discards a saved index column
    # ('Unnamed: 0') when present, so no explicit drop is required and files
    # written without an index load too. The copy keeps the assignments below
    # from targeting a view of the raw frame.
    RE = pd.read_csv(path)[keep_cols].copy()

    # Encode base occupancy as 0/1 flags and build the "bases outs" state key
    # for the beginning and the end of every play.
    for prefix, outs_col in (('start', 'startOuts'), ('end', 'endOuts')):
        label = prefix.capitalize()
        for base in ('1', '2', '3'):
            RE[label + base] = np.where(RE[prefix + base + 'B'].isnull(), 0, 1)
        RE[label + '_State'] = (RE[label + '1'].astype(str) + RE[label + '2'].astype(str)
                                + RE[label + '3'].astype(str) + " " + RE[outs_col].astype(str))

    state_changed_or_scored = (RE['Start_State'] != RE['End_State']) | (RE['runsOnPlay'] > 0)
    RE = RE[state_changed_or_scored & (RE['outsInInning'] == 3)]

    Start_RunExp = (RE.groupby('Start_State')['runsFuture'].mean()
                      .reset_index()
                      .rename(columns={'runsFuture': 'Start_RE'}))
    RE = pd.merge(RE, Start_RunExp, on='Start_State', how='left')

    # No play starts with three outs, so those end states never appear in
    # Start_RunExp; add them explicitly with a run expectancy of zero.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    terminal_states = pd.DataFrame({
        'Start_State': [format(bases, '03b') + ' 3' for bases in range(8)],
        'Start_RE': 0.0,
    })
    End_RunExp = (pd.concat([Start_RunExp, terminal_states], ignore_index=True)
                    .rename(columns={'Start_State': 'End_State', 'Start_RE': 'End_RE'}))
    RE = pd.merge(RE, End_RunExp, on='End_State', how='left')

    RE['Run_Value'] = RE['runsOnPlay'] + RE['End_RE'] - RE['Start_RE']

    return RE

In [3]:
# Calculate run value for every event in seasons 2014-2017.
# One CSV per season; each is run through Run_Expectancy in chronological order.
season_paths = ("../MLBAM14.csv", "../MLBAM15.csv", "../MLBAM16.csv", "../MLBAM17.csv")
RE_14, RE_15, RE_16, RE_17 = map(Run_Expectancy, season_paths)

# II. Comparing Player Run Values

In [10]:
# Aggregate player level run values for each season:
# sum Run_Value per (batterId, batterName) and label the total with the season.
Player_Value14 = (
    RE_14.groupby(['batterId', 'batterName'], as_index=False)['Run_Value']
    .sum()
    .rename(columns={'Run_Value': 'RV14'})
)
Player_Value15 = (
    RE_15.groupby(['batterId', 'batterName'], as_index=False)['Run_Value']
    .sum()
    .rename(columns={'Run_Value': 'RV15'})
)
Player_Value16 = (
    RE_16.groupby(['batterId', 'batterName'], as_index=False)['Run_Value']
    .sum()
    .rename(columns={'Run_Value': 'RV16'})
)
Player_Value17 = (
    RE_17.groupby(['batterId', 'batterName'], as_index=False)['Run_Value']
    .sum()
    .rename(columns={'Run_Value': 'RV17'})
)

In [11]:
# Merge player run values into one data frame.
# Inner joins: only batters present in all four seasons are kept.
Player_Value = Player_Value14
for season_totals in (Player_Value15, Player_Value16, Player_Value17):
    Player_Value = pd.merge(Player_Value, season_totals, on=['batterId', 'batterName'])
display(Player_Value)

Unnamed: 0,batterId,batterName,RV14,RV15,RV16,RV17
0,112526,Colon,-0.709186,-8.838568,-15.380841,-6.053060
1,134181,Beltre,29.004125,11.918638,29.745115,27.205050
2,136860,Beltran,-6.208773,2.355753,21.279393,-16.883042
3,150029,Werth,37.882267,-3.377653,6.368448,-7.246145
4,282332,Sabathia,-1.118087,-0.472414,-0.944580,-0.292495
...,...,...,...,...,...,...
367,608379,Wacha,-5.461352,-6.021507,-10.598557,-12.735364
368,621035,"Taylor, C",-1.686367,-8.587988,-2.211697,20.588698
369,622072,"Wood, A",-5.490464,-10.162674,-2.623027,-11.017261
370,624577,Puig,31.184967,1.223632,1.871468,5.385486


In [12]:
# Compute correlation matrix.
# Restrict to numeric columns first: since pandas 2.0, DataFrame.corr() no
# longer silently skips non-numeric columns such as batterName and raises.
Player_Value.select_dtypes(include='number').corr()

Unnamed: 0,batterId,RV14,RV15,RV16,RV17
batterId,1.0,-0.14421,-0.003344,-0.056668,0.091117
RV14,-0.14421,1.0,0.466299,0.426629,0.322764
RV15,-0.003344,0.466299,1.0,0.546136,0.510132
RV16,-0.056668,0.426629,0.546136,1.0,0.457391
RV17,0.091117,0.322764,0.510132,0.457391,1.0


In [13]:
# Regression model: ordinary least squares of the 2017 run value on the
# three preceding seasons' run values.
import statsmodels.formula.api as smf
rv_model = smf.ols('RV17 ~ RV14 + RV15 + RV16', data=Player_Value)
RV_Reg = rv_model.fit()
RV_Reg.summary()

0,1,2,3
Dep. Variable:,RV17,R-squared:,0.308
Model:,OLS,Adj. R-squared:,0.302
Method:,Least Squares,F-statistic:,54.61
Date:,"Fri, 17 Jul 2020",Prob (F-statistic):,3.2e-29
Time:,11:12:30,Log-Likelihood:,-1458.4
No. Observations:,372,AIC:,2925.0
Df Residuals:,368,BIC:,2941.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0472,0.650,0.073,0.942,-1.231,1.326
RV14,0.0620,0.056,1.101,0.272,-0.049,0.173
RV15,0.3509,0.054,6.480,0.000,0.244,0.457
RV16,0.2673,0.059,4.532,0.000,0.151,0.383

0,1,2,3
Omnibus:,39.399,Durbin-Watson:,1.924
Prob(Omnibus):,0.0,Jarque-Bera (JB):,89.833
Skew:,0.548,Prob(JB):,3.11e-20
Kurtosis:,5.144,Cond. No.,20.1
