In [1]:
#Load packages
import pandas as pd
import numpy as np

# II. Coding Run Expectancy Dataset (2016)

In [2]:
# ---------------------------------------------------------------------------
# 2016 season: load the MLBAM play-by-play data and keep the working columns
# ---------------------------------------------------------------------------

# Read in MLBAM data for 2016
MLBAM16 = pd.read_csv("MLBAM16.csv")

# The 'Unnamed: 0' column holds no relevant information, so it is removed
MLBAM16 = MLBAM16.drop(columns=['Unnamed: 0'])

# Columns kept for the run-expectancy work
relevant_columns = [
    'batterName', 'batterId', 'event',
    'start1B', 'start2B', 'start3B',
    'end1B', 'end2B', 'end3B',
    'startOuts', 'endOuts',
    'runsFuture', 'runsOnPlay', 'outsInInning',
    'stand', 'throws', 'venueId', 'stadium', 'batterPos',
]

# .copy() gives RE16 its own data, so the columns added to it later never touch MLBAM16
RE16 = MLBAM16[relevant_columns].copy()

# Build the base-out state before ("Start") and after ("End") every plate appearance.
# For each side this creates:
#   * <Side>1, <Side>2, <Side>3 : 1 when the matching runner column (e.g. start1B) is
#                                 non-null, otherwise 0
#   * <Side>_State              : the three indicators, a space, then the outs column,
#                                 e.g. "100 1"
for side, outs_column in (('Start', 'startOuts'), ('End', 'endOuts')):
    source_prefix = side.lower()  # runner columns are named start1B..start3B / end1B..end3B

    for base in ('1', '2', '3'):
        runner_present = ~pd.isnull(RE16[source_prefix + base + 'B'])
        RE16[side + base] = runner_present.astype(int)

    occupancy = (RE16[side + '1'].astype(str)
                 + RE16[side + '2'].astype(str)
                 + RE16[side + '3'].astype(str))
    RE16[side + '_State'] = occupancy + " " + RE16[outs_column].astype(str)

# Keep a play only when
#   (a) its base-out state changed or at least one run scored on it, and
#   (b) outsInInning equals 3 (innings that had exactly 3 outs)
state_changed = RE16['Start_State'] != RE16['End_State']
run_scored = RE16['runsOnPlay'] > 0
complete_inning = RE16['outsInInning'] == 3
RE16 = RE16[(state_changed | run_scored) & complete_inning]

# Run expectancy of a starting state = mean runsFuture over the plays that began in it
mean_future_runs = RE16.groupby(['Start_State'])['runsFuture'].mean()
Start_RunExp = mean_future_runs.reset_index().rename(columns={'runsFuture': 'Start_RE'})

# Attach each play's starting run expectancy (left join keeps every play)
RE16 = RE16.merge(Start_RunExp, how='left', on=['Start_State'])

# Build the end-state run-expectancy lookup from the start-state values.
# The eight 3-out states ("000 3" ... "111 3") are added with a run expectancy of 0 so
# that plays whose End_State has 3 outs receive End_RE = 0 when merged.
# Zeros are written as floats so the Start_RE column stays float64 after concatenation.
Base_State_3 = pd.DataFrame(
    [(bases + ' 3', 0.0)
     for bases in ('000', '001', '010', '011', '100', '101', '110', '111')],
    columns=Start_RunExp.columns,
)

# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0; ignore_index=True renumbers the rows exactly as append(..., ignore_index=True) did
Start_RunExp = pd.concat([Start_RunExp, Base_State_3], ignore_index=True)

# Same table, re-keyed so it can be merged on End_State and supplies End_RE
End_RunExp = Start_RunExp.rename(columns={'Start_State': 'End_State', 'Start_RE': 'End_RE'})

# Attach the run expectancy of the state each play ended in (left join keeps every play)
RE16 = RE16.merge(End_RunExp, how='left', on=['End_State'])

# Run value of each event: runs scored on the play plus the change in run expectancy
# (End_RE - Start_RE)
runs_plus_end_re = RE16['runsOnPlay'] + RE16['End_RE']
RE16['Run_Value'] = runs_plus_end_re - RE16['Start_RE']

display(RE16)

Unnamed: 0,batterName,batterId,event,start1B,start2B,start3B,end1B,end2B,end3B,startOuts,...,Start2,Start3,Start_State,End1,End2,End3,End_State,Start_RE,End_RE,Run_Value
0,"Carpenter, M",572761,Groundout,,,,,,,0,...,0,0,000 0,0,0,0,000 1,0.498377,0.268678,-0.229699
1,Pham,502054,Groundout,,,,,,,1,...,0,0,000 1,0,0,0,000 2,0.268678,0.106305,-0.162373
2,Holliday,407812,Strikeout,,,,,,,2,...,0,0,000 2,0,0,0,000 3,0.106305,0.000000,-0.106305
3,Jaso,444379,Groundout,,,,,,,0,...,0,0,000 0,0,0,0,000 1,0.498377,0.268678,-0.229699
4,McCutchen,457705,Hit By Pitch,,,,457705.0,,,1,...,0,0,000 1,1,0,0,100 1,0.268678,0.512225,0.243547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184147,Moss,461235,Strikeout,594824.0,,,,,,2,...,0,0,100 2,0,0,0,000 3,0.220539,0.000000,-0.220539
184148,"Rogers, J",595386,Groundout,,,,,,,0,...,0,0,000 0,0,0,0,000 1,0.498377,0.268678,-0.229699
184149,Jaso,444379,Walk,,,,444379.0,,,1,...,0,0,000 1,1,0,0,100 1,0.268678,0.512225,0.243547
184150,"Polanco, G",570256,Strikeout,444379.0,,,444379.0,,,1,...,0,0,100 1,1,0,0,100 2,0.512225,0.220539,-0.291686


In [3]:
# Share of plate appearances in RE16 whose event is a groundout.
# NOTE: the result is a proportion between 0 and 1, not a percentage
# (multiply by 100 to read it as a percent).

# 'Count' is a column of ones: summing it over any subset gives that subset's row count
RE16['Count'] = 1
Groundout = RE16[RE16['event'] == 'Groundout']

# Series.sum() is vectorized; the builtin sum() would loop over every row in Python
Groundout['Count'].sum() / RE16['Count'].sum()

0.18445088839654197

In [4]:
# Keep only plate appearances that started with at least one runner on base,
# i.e. drop the three bases-empty starting states (0, 1 and 2 outs)
bases_empty_states = ["000 0", "000 1", "000 2"]
started_bases_empty = RE16['Start_State'].isin(bases_empty_states)
RE16_BOcc = RE16[~started_bases_empty]
RE16_BOcc

Unnamed: 0,batterName,batterId,event,start1B,start2B,start3B,end1B,end2B,end3B,startOuts,...,Start3,Start_State,End1,End2,End3,End_State,Start_RE,End_RE,Run_Value,Count
5,Freese,501896,Single,457705.0,,,501896.0,,457705.0,1,...,0,100 1,1,0,1,101 1,0.512225,1.196777,0.684552,1
6,"Marte, S",516782,Lineout,501896.0,,457705.0,501896.0,,457705.0,1,...,1,101 1,1,0,1,101 2,1.196777,0.480175,-0.716603,1
7,Cervelli,465041,Pop Out,501896.0,,457705.0,,,,2,...,1,101 2,0,0,0,000 3,0.480175,0.000000,-0.480175,1
12,Harrison,543281,Groundout,,570256.0,,,,570256.0,0,...,0,010 0,0,0,1,001 1,1.132468,0.948684,-0.183783,1
13,Mercer,474568,Walk,,,570256.0,474568.0,,570256.0,1,...,1,001 1,1,0,1,101 1,0.948684,1.196777,0.248093,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184138,"Carpenter, M",572761,Sac Fly,,,545341.0,,,,1,...,1,001 1,0,0,0,000 2,0.948684,0.106305,0.157621,1
184143,Florimon,465753,Groundout,518700.0,,,,,,2,...,0,100 2,0,0,0,000 3,0.220539,0.000000,-0.220539,1
184147,Moss,461235,Strikeout,594824.0,,,,,,2,...,0,100 2,0,0,0,000 3,0.220539,0.000000,-0.220539,1
184150,"Polanco, G",570256,Strikeout,444379.0,,,444379.0,,,1,...,0,100 1,1,0,0,100 2,0.512225,0.220539,-0.291686,1


In [5]:
# Home runs by position: keep the home-run plays, then total the per-row 'Count'
# column within each batterPos
HR = RE16.loc[RE16['event'] == "Home Run"]
HR.groupby('batterPos', as_index=False)['Count'].sum()

Unnamed: 0,batterPos,Count
0,1B,797
1,2B,576
2,3B,710
3,C,541
4,CF,532
5,DH,424
6,LF,576
7,P,22
8,RF,694
9,SS,486
