In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import csv data
df = pd.read_csv('..\\Data\\all_pitches.csv')

In [3]:
df.head()

Unnamed: 0,game_id,batSide_code,batSide_des,batter,batter_id,call_des,inning_top_bot,pitchHand_code,pitchHand_des,pitch_type,...,strike_left,strike_down_right,strike_down,strike_down_left,call_B,call_C,call_F,call_H,call_S,is_out
0,413661,L,Left,Matt Carpenter,572761,Called Strike,top,L,Left,FF,...,0,0,0,0,0,1,0,0,0,0
1,413661,L,Left,Matt Carpenter,572761,Swinging Strike,top,L,Left,FF,...,0,0,0,0,0,0,0,0,1,0
2,413661,L,Left,Matt Carpenter,572761,Foul,top,L,Left,FF,...,0,0,0,0,0,0,1,0,0,0
3,413661,L,Left,Matt Carpenter,572761,Ball,top,L,Left,FF,...,0,0,0,0,1,0,0,0,0,0
4,413661,L,Left,Matt Carpenter,572761,Ball,top,L,Left,CU,...,0,0,0,0,1,0,0,0,0,0


# Exploratory Data Analysis

For this project, we want to find out if we can optimize a pitcher's pitch count to get more outs in less pitches. More specifically, which pitches tend to be most effective against which batters and in what location? It's a common expression to pitch "hard stuff in, soft stuff away"; basically to pitch fastballs closer to a batter in the strikezone and off-speed pitches futher away from the batter. This analysis will look into that idiom and determine which pitches, and where, tend to lead to outs.

In [4]:
# Separate target variables
target_cols = ['call_B', 'call_C', 'call_F', 'call_H', 'call_S', 'is_out']

In [5]:
targets = df[target_cols]

In [6]:
targets.head()

Unnamed: 0,call_B,call_C,call_F,call_H,call_S,is_out
0,0,1,0,0,0,0
1,0,0,0,0,1,0
2,0,0,1,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0


First we need to look at how often an outcome occurred when a pitch was thrown throughout the 2015-2017 MLB seasons.

In [7]:
# Print percentage of pitches produced which outcomes
print("%i pitches from the 2015-2017 seasons are being analyzed." % len(targets))
targets.mean()

2129403 pitches from the 2015-2017 seasons are being analyzed.


call_B    0.360146
call_C    0.168416
call_F    0.173895
call_H    0.064159
call_S    0.114559
is_out    0.173634
dtype: float64

Note that the percentage of outcomes are near each other with ``call_B``, balls, and ``call_H``, hits, being the biggest disparity. It's also worth mentioning that the percentages add up to more than 1 and that's due to the ``is_out`` column. There are a number of ways a batter could be out, one of which would be to strikeout which would result in ``call_C``, or ``call_S``, being ``1``as well as ``is_out``. This leads to double counting which isn't a bad thing because I want the model to learn that getting 3 strikes is an out.

### Pitch Type & Location

When looking at the data, it's obvious that the different pitch types and locations are broken down into dummy variables. Although this will be useful when modeling the data, it proves a problem when doing exploratory data analysis so the first thing to do is to switch them back into categorical columns.

In [9]:
# Separate pitch types and pitch locations
pitch_types_dummies = df.filter(regex="pitch_type_*")
pitch_locs_dummies = df[['ball_up_right', 'ball_up', 'ball_up_left', 'ball_right', 'ball_left', 'ball_down_right',
                         'ball_down', 'ball_down_left', 'strike_up_right', 'strike_up', 'strike_up_left', 'strike_right',
                         'strike_mid', 'strike_left', 'strike_down_right', 'strike_down', 'strike_down_left']]

In [10]:
pitch_types_dummies.head()

Unnamed: 0,pitch_type,pitch_type_Changeup,pitch_type_Curveball,pitch_type_Cutter,pitch_type_Eephus,pitch_type_Fastball,pitch_type_Forkball,pitch_type_Four-Seam Fastball,pitch_type_Int. Ball,pitch_type_Knuckle Curve,pitch_type_Knuckleball,pitch_type_Pitchout,pitch_type_Screwball,pitch_type_Sinker,pitch_type_Slider,pitch_type_Splitter,pitch_type_Two-Seam Fastball,pitch_type_Unknown
0,FF,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,FF,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,FF,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,FF,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,CU,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
pitch_locs_dummies.head()

Unnamed: 0,ball_up_right,ball_up,ball_up_left,ball_right,ball_left,ball_down_right,ball_down,ball_down_left,strike_up_right,strike_up,strike_up_left,strike_right,strike_mid,strike_left,strike_down_right,strike_down,strike_down_left
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# Use stack to collapse the columns into 1
pitch_types = pitch_types_dummies[pitch_types_dummies==1].stack().reset_index().drop(['level_0', 0], 1)

In [13]:
pitch_types.columns = ['pitch_type']
pitch_types.head()

Unnamed: 0,pitch_type
0,pitch_type_Four-Seam Fastball
1,pitch_type_Four-Seam Fastball
2,pitch_type_Four-Seam Fastball
3,pitch_type_Four-Seam Fastball
4,pitch_type_Curveball


In [14]:
# Do the same for pitch_locs_dummies
pitch_locs = pitch_locs_dummies[pitch_locs_dummies==1].stack().reset_index().drop(['level_0', 0], 1)

In [15]:
pitch_locs.columns = ['pitch_locs']
pitch_locs.head()

Unnamed: 0,pitch_locs
0,strike_up_left
1,strike_mid
2,strike_up_right
3,ball_down
4,ball_right


Now that the dummy variables have been condensed, a new data frame can be created to do some exploratory data analysis.

In [16]:
# Filter out relevant columns to keep from df
eda_df = df[['batSide_des', 'pitchHand_des', 'count_balls', 'count_strikes', 'inning_num', 'pitch_speed']]

In [17]:
# Join targets, pitch_types, pitch_locs
eda_df = eda_df.join([pitch_types, pitch_locs, targets])
eda_df.head()

Unnamed: 0,batSide_des,pitchHand_des,count_balls,count_strikes,inning_num,pitch_speed,pitch_type,pitch_locs,call_B,call_C,call_F,call_H,call_S,is_out
0,Left,Left,0,0,1,92.9,pitch_type_Four-Seam Fastball,strike_up_left,0,1,0,0,0,0
1,Left,Left,0,1,1,92.8,pitch_type_Four-Seam Fastball,strike_mid,0,0,0,0,1,0
2,Left,Left,0,2,1,94.1,pitch_type_Four-Seam Fastball,strike_up_right,0,0,1,0,0,0
3,Left,Left,0,2,1,91.0,pitch_type_Four-Seam Fastball,ball_down,1,0,0,0,0,0
4,Left,Left,1,2,1,75.4,pitch_type_Curveball,ball_right,1,0,0,0,0,0


## Outs vs. Hits

This premise behind this project is to determine if there's a pattern to getting outs or not getting outs. Do high curveballs lead to more hits? Does the idiom "hard stuff in, soft stuff away" hold true? These are a couple of patterns to look into in this analysis.

In [18]:
outs_df = eda_df[eda_df['is_out'] == 1]

In [30]:
outs_grouped = outs_df.groupby(['pitch_type', 'pitch_locs']).count()
outs_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,batSide_des,pitchHand_des,count_balls,count_strikes,inning_num,pitch_speed,call_B,call_C,call_F,call_H,call_S,is_out
pitch_type,pitch_locs,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
pitch_type_Changeup,ball_down,6664,6664,6664,6664,6664,6664,6664,6664,6664,6664,6664,6664
pitch_type_Changeup,ball_down_left,199,199,199,199,199,199,199,199,199,199,199,199
pitch_type_Changeup,ball_down_right,242,242,242,242,242,242,242,242,242,242,242,242
pitch_type_Changeup,ball_left,1295,1295,1295,1295,1295,1295,1295,1295,1295,1295,1295,1295
pitch_type_Changeup,ball_right,1696,1696,1696,1696,1696,1696,1696,1696,1696,1696,1696,1696
pitch_type_Changeup,ball_up,444,444,444,444,444,444,444,444,444,444,444,444
pitch_type_Changeup,ball_up_left,30,30,30,30,30,30,30,30,30,30,30,30
pitch_type_Changeup,ball_up_right,52,52,52,52,52,52,52,52,52,52,52,52
pitch_type_Changeup,strike_down,6274,6274,6274,6274,6274,6274,6274,6274,6274,6274,6274,6274
pitch_type_Changeup,strike_down_left,3388,3388,3388,3388,3388,3388,3388,3388,3388,3388,3388,3388
