## Combines 2019 and 2015-2018 Pitch Data

Requirements - 2 CSV Files (in the same working directory):
1. 2019_pitches.csv
2. pitches.csv

Output: A master CSV in the same directory

Notes: We are missing Base Runner data for 2019, which we would need when we expand the state

In [1]:
import sys
path = '/Users/Everwitt/Documents/Pitch-Prediction/pitcherprediction'
sys.path.append(path)

In [2]:
import pandas as pd
import numpy as np
from pitch_zone_config import generate_pitches

In [3]:
path = '../../raw-data/pitches.csv'
path_2019 = '../../raw-data/2019_pitches.csv'

In [4]:
p = pd.read_csv(path)
p_2019 = pd.read_csv(path_2019)

In [5]:
p['pitch_type'].value_counts()

FF    1014877
SL     450578
FT     337983
CH     292789
SI     242504
CU     234391
FC     149756
KC      66484
FS      43705
KN      11260
IN       6197
EP        815
FO        810
PO        628
SC        113
UN         57
AB          9
FA          9
Name: pitch_type, dtype: int64

In [6]:
#rename COLS pz to py 
p_2019.rename(columns={'pz':'py'}, inplace=True)
p.rename(columns={'pz':'py'}, inplace=True)

In [7]:
#rename SI to FT
p['pitch_type'].replace({"SI":"FT"}, inplace=True)
p_2019['pitch_type'].replace({"SI":"FT"}, inplace=True)

# replacing missing data with NaN
p_2019['on_1b'] = np.nan
p_2019['on_2b'] = np.nan
p_2019['on_3b'] = np.nan

In [8]:
print(p_2019.info())
print(p.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 728790 entries, 0 to 728789
Data columns (total 40 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   px               722161 non-null  float64
 1   py               722161 non-null  float64
 2   start_speed      722161 non-null  float64
 3   end_speed        722161 non-null  float64
 4   spin_rate        722161 non-null  object 
 5   spin_dir         722161 non-null  object 
 6   break_angle      722161 non-null  float64
 7   break_length     722161 non-null  float64
 8   break_y          722161 non-null  float64
 9   ax               722161 non-null  float64
 10  ay               722161 non-null  float64
 11  az               722161 non-null  float64
 12  sz_bot           728790 non-null  float64
 13  sz_top           728790 non-null  float64
 14  type_confidence  722161 non-null  object 
 15  vx0              722161 non-null  float64
 16  vy0              722161 non-null  floa

In [9]:
keep_cols = ['ab_id','px','py','code','pitch_type','batter_id', 'type']
state_cols = ['outs', 'pitch_num', 'on_1b', 'on_2b', 'on_3b', 's_count', 'b_count']
all_cols = keep_cols + state_cols

In [10]:
p.drop(columns=[col for col in p if col not in all_cols], inplace=True)
p_2019.drop(columns=[col for col in p_2019 if col not in all_cols], inplace=True)

In [11]:
p = p.append(p_2019, ignore_index=True)

In [12]:
p.shape

(3595944, 13)

In [13]:
p.head()

Unnamed: 0,px,py,code,type,pitch_type,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b
0,0.416,2.963,C,S,FF,2015000000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.191,2.347,S,S,FF,2015000000.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0
2,-0.518,3.284,F,S,FF,2015000000.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0
3,-0.641,1.221,B,B,FF,2015000000.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0
4,-1.821,2.083,B,B,CU,2015000000.0,1.0,2.0,0.0,5.0,0.0,0.0,0.0


In [14]:
# delete where px does not exist
# delete where b_count is 4
p.dropna(subset=['px'], inplace=True)
p.dropna(subset=['py'], inplace=True)
p.dropna(subset=['code'], inplace=True)
p.dropna(subset=['pitch_type'], inplace=True)
p.dropna(subset=['ab_id'], inplace=True)

In [15]:
#filter only the pitches that we want
pitches = ["FF", "FT", "CU", "CH", "FC", "SL"]
filt = p['pitch_type'].isin(pitches)
p = p.loc[filt]

In [16]:
p.head()

Unnamed: 0,px,py,code,type,pitch_type,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b
0,0.416,2.963,C,S,FF,2015000000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.191,2.347,S,S,FF,2015000000.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0
2,-0.518,3.284,F,S,FF,2015000000.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0
3,-0.641,1.221,B,B,FF,2015000000.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0
4,-1.821,2.083,B,B,CU,2015000000.0,1.0,2.0,0.0,5.0,0.0,0.0,0.0


In [17]:
#updating ab_id dtype
p['ab_id'] = p['ab_id'].astype(np.int64)

#updating b_count dtype
p['b_count'] = p['b_count'].astype(np.int64)

#updating s_count dtype
p['s_count'] = p['s_count'].astype(np.int64)

#updating outs dtype
p['outs'] = p['outs'].astype(np.int64)

#updating pitch_num dtype
p['pitch_num'] = p['pitch_num'].astype(np.int64)

In [18]:
p.shape

(3413438, 13)

In [19]:
#making sure we show that we dont have on_base data for 2019
p.tail()

Unnamed: 0,px,py,code,type,pitch_type,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b
3595939,0.3,1.99,X,X,FF,2019185244,2,0,1,4,,,
3595940,1.0,-0.38,B,B,SL,2019185245,0,0,2,1,,,
3595941,0.36,2.02,C,C,FF,2019185245,1,0,2,2,,,
3595942,-0.26,2.6,C,C,SL,2019185245,1,0,2,3,,,
3595943,0.22,1.06,S,S,SL,2019185245,1,0,2,4,,,


### Adding Result and Swing Columns to Pitches using AtBat Data

In [20]:
path_ab = './all_atbats.csv'

In [21]:
ab = pd.read_csv(path_ab)

In [22]:
ab.head()

Unnamed: 0,ab_id,batter_id,event,g_id,inning,o,p_throws,pitcher_id,stand
0,2015000001,572761,Groundout,201500001,1,1,L,452657,L
1,2015000002,518792,Double,201500001,1,1,L,452657,L
2,2015000003,407812,Single,201500001,1,1,L,452657,R
3,2015000004,425509,Strikeout,201500001,1,2,L,452657,R
4,2015000005,571431,Strikeout,201500001,1,3,L,452657,L


In [23]:
# adding columns
p['swing'] = -1
p['res'] = ''

In [24]:
s, o, h, f,  = "strike", "out", "hit", "foul"
b, ts = "ball", "take strike"

#pitch keys
swing_strike_not_in_play = ['S', 'T', 'L', 'W', 'M', 'Q']
swing_foul_not_in_play = ['F', 'R']
take_ball = ['B', '*B']
intentional_ball = ['P', 'I']

#event keys
gets_out = ['Groundout', 'Flyout', 'Lineout', 'Forceout', 'Pop Out', 'Grounded Into DP', 'Double Play', 
            'Fielders Choice Out', 'Bunt Pop Out', 'Bunt Lineout', 'Triple Play', 'Bunt Groundout', 'Runner Out',
            'Fielders Choice', 'Strikeout - DP', 'Batter Interference']
gets_hit = ['Single', 'Double', 'Triple', 'Home Run']
sac_hit = ['Sac Fly', 'Sac Bunt', 'Sac Fly DP', 'Sacrifice Bunt DP', 'Sac Bunt Double Play']
error = ['Field Error', 'Catcher Interference', 'Fan Interference']
        
for i, row in p.iterrows():
    # batter swings and ball is NOT in play
    if row.code in swing_strike_not_in_play:
        p.at[i, 'swing'] = 1
        p.at[i, 'res'] = s
        
    # batter swings and ball is NOT in play
    elif row.code in swing_foul_not_in_play:
        p.at[i, 'swing'] = 1
        p.at[i, 'res'] = f
        
    # batter takes and its a ball
    elif row.code in take_ball:
        p.at[i, 'swing'] = 0
        p.at[i, 'res'] = b
        
    # pitcher intentionally throws a ball to the batter
    elif row.code in intentional_ball: continue

    # takes a strike
    elif row.code == 'C':
        p.at[i, 'swing'] = 0
        p.at[i, 'res'] = s
        
    # batter takes but gets on base (hit by pitch)
    elif row.code == 'H': continue
        
    # this contains all rows in which code does not exist and is NaN
    elif row.code != row.code: continue
        
    elif row.type == 'X':
        filt = ab['ab_id'] == row['ab_id']
        res = ab.loc[filt, 'event'].values[0]

        # gets out swinging
        if res in gets_out:
            p.at[i, 'swing'] = 1
            p.at[i, 'res'] = o
        
        # gets on base swinging
        elif res in gets_hit:
            p.at[i, 'swing'] = 1
            p.at[i, 'res'] = h
            
        # batter is not trying to get on base
        elif res in sac_hit: continue
            
        # fielding team makes an error
        elif res in error: continue
           
        # result is unkown
        else:
            print(res)

In [25]:
p.head()

Unnamed: 0,px,py,code,type,pitch_type,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b,swing,res
0,0.416,2.963,C,S,FF,2015000001,0,0,0,1,0.0,0.0,0.0,0,strike
1,-0.191,2.347,S,S,FF,2015000001,0,1,0,2,0.0,0.0,0.0,1,strike
2,-0.518,3.284,F,S,FF,2015000001,0,2,0,3,0.0,0.0,0.0,1,foul
3,-0.641,1.221,B,B,FF,2015000001,0,2,0,4,0.0,0.0,0.0,0,ball
4,-1.821,2.083,B,B,CU,2015000001,1,2,0,5,0.0,0.0,0.0,0,ball


In [26]:
#filtering/dropping columns with swing = -1, that means there was a continue triggered from before
filt = p['swing'] != -1
p = p.loc[filt]

In [27]:
p['swing'].value_counts(normalize=True)

0    0.537535
1    0.462465
Name: swing, dtype: float64

In [28]:
print(p['res'].value_counts(normalize=True))

ball      0.365378
strike    0.289854
foul      0.179948
out       0.116813
hit       0.048006
Name: res, dtype: float64


In [29]:
print(p.shape)

(3346063, 15)


In [30]:
filt = p['swing'] == 1

p_swing = p.loc[filt]
p_take = p.loc[~filt]

In [31]:
print(p_swing.shape)
print(p_take.shape)

(1547437, 15)
(1798626, 15)


In [32]:
p_swing['res'].value_counts(normalize=True)

foul      0.389107
strike    0.254500
out       0.252589
hit       0.103804
Name: res, dtype: float64

In [33]:
p_take['res'].value_counts(normalize=True)

ball      0.679729
strike    0.320271
Name: res, dtype: float64

In [34]:
pitches = generate_pitches()

In [35]:
#adding the correct zone to our data
p['zone'] = ''
rescale_y = 2.599
p['py'] = p['py'] - rescale_y

In [36]:
for i, row in p.iterrows():
    p.at[i, 'zone'] = pitches[row['pitch_type']].zones.return_zone(row['px'], row['py'])

In [37]:
p.head()

Unnamed: 0,px,py,code,type,pitch_type,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b,swing,res,zone
0,0.416,0.364,C,S,FF,2015000001,0,0,0,1,0.0,0.0,0.0,0,strike,2a
1,-0.191,-0.252,S,S,FF,2015000001,0,1,0,2,0.0,0.0,0.0,1,strike,4a
2,-0.518,0.685,F,S,FF,2015000001,0,2,0,3,0.0,0.0,0.0,1,foul,0a
3,-0.641,-1.378,B,B,FF,2015000001,0,2,0,4,0.0,0.0,0.0,0,ball,15b
4,-1.821,-0.516,B,B,CU,2015000001,1,2,0,5,0.0,0.0,0.0,0,ball,12b


In [None]:
#we have some pitches with zone -1 if px was on a line defining the strike zones (did not know which to put it in)
filt = testdf['zone'] != '-1'
p = p.loc[filt]

In [38]:
p.to_csv('./all_pitches.csv', index=False)

In [39]:
testdf = pd.read_csv('./all_pitches.csv')

In [40]:
testdf.head()

Unnamed: 0,px,py,code,type,pitch_type,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b,swing,res,zone
0,0.416,0.364,C,S,FF,2015000001,0,0,0,1,0.0,0.0,0.0,0,strike,2a
1,-0.191,-0.252,S,S,FF,2015000001,0,1,0,2,0.0,0.0,0.0,1,strike,4a
2,-0.518,0.685,F,S,FF,2015000001,0,2,0,3,0.0,0.0,0.0,1,foul,0a
3,-0.641,-1.378,B,B,FF,2015000001,0,2,0,4,0.0,0.0,0.0,0,ball,15b
4,-1.821,-0.516,B,B,CU,2015000001,1,2,0,5,0.0,0.0,0.0,0,ball,12b


In [41]:
#this just cant be correct...too few strikes
#-1 implies that the pitch is on the edge/line
testdf['zone'].value_counts()

12a    311062
15a    288733
13a    279727
4a     239050
7a     229461
8a     210612
3a     206808
5a     204035
6a     187835
1a     134066
15b    127702
0a     117847
16b    111816
13b    109396
12b    109055
2a     104105
10b     75499
16a     67266
10a     64783
14b     56338
9b      39108
14a     38355
11b     24377
9a       8575
-1        452
Name: zone, dtype: int64