In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from bokeh.io import output_notebook, show, curdoc
from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource, Axis
from bokeh.models.widgets import Panel, Tabs
from bokeh.layouts import widgetbox, row
from bokeh.models import Slider

First things first: bring in the wrangled data sets and give each one its own DataFrame.

In [2]:
# Load the four wrangled season files; the first CSV column is the saved row index.
df_2015, df_2016, df_2017, df_2018 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'wrangled_2015_4_classes.csv',
        'wrangled_2016_4_classes.csv',
        'wrangled_2017_4_classes.csv',
        'wrangled_2018_4_classes.csv',
    )
)

Let's take a look at the contents of one of the data frames.

In [3]:
df_2015.head()

Unnamed: 0,ab_id,b_count,s_count,b_score,batter_id,batter_side,code,g_id,inning,on1b,...,KN,PO,SC,SI,SL,UN,id,first_name,last_name,p_in_ab
0,2015000001,0,0,0,444876,1.0,C,201500001,1,0.0,...,0.0,0.0,0.0,0.0,0.149969,0.0,489119,Wade,Miley,1
1,2015000001,0,1,0,444876,1.0,F,201500001,1,0.0,...,0.0,0.0,0.0,0.0,0.149969,0.0,489119,Wade,Miley,2
2,2015000001,0,2,0,444876,1.0,B,201500001,1,0.0,...,0.0,0.0,0.0,0.0,0.149969,0.0,489119,Wade,Miley,3
3,2015000001,1,2,0,444876,1.0,D,201500001,1,0.0,...,0.0,0.0,0.0,0.0,0.149969,0.0,489119,Wade,Miley,4
4,2015000002,0,0,0,450314,1.0,F,201500001,1,1.0,...,0.0,0.0,0.0,0.0,0.149969,0.0,489119,Wade,Miley,1


In [4]:
df_2015.describe()

Unnamed: 0,ab_id,b_count,s_count,b_score,batter_id,batter_side,g_id,inning,on1b,on2b,...,IN,KC,KN,PO,SC,SI,SL,UN,id,p_in_ab
count,701882.0,701882.0,701882.0,701882.0,701882.0,701882.0,701882.0,701882.0,701882.0,701882.0,...,701882.0,701882.0,701882.0,701882.0,701882.0,701882.0,701882.0,701882.0,701882.0,701882.0
mean,2015092000.0,0.864952,0.873006,2.262835,491943.435576,0.570689,201501200.0,5.010882,0.296996,0.185112,...,0.004501,0.022475,0.00526,0.000342,1e-05,0.097706,0.145368,3.1e-05,499175.203845,2.866101
std,53153.3,0.958384,0.823956,2.532728,86005.234882,0.494978,707.6745,2.67484,0.456935,0.388389,...,0.006072,0.0694,0.066079,0.00084,0.000148,0.197778,0.133723,0.000238,78021.089195,1.713197
min,2015000000.0,0.0,0.0,0.0,112526.0,0.0,201500000.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,112526.0,1.0
25%,2015046000.0,0.0,0.0,0.0,451532.0,0.0,201500600.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,453172.0,1.0
50%,2015092000.0,1.0,1.0,2.0,492841.0,1.0,201501200.0,5.0,0.0,0.0,...,0.003056,0.0,0.0,0.0,0.0,0.0,0.142462,0.0,502188.0,3.0
75%,2015138000.0,1.0,2.0,3.0,544369.0,1.0,201501800.0,7.0,1.0,0.0,...,0.005766,0.0,0.0,0.00036,0.0,0.0,0.231045,0.0,544727.0,4.0
max,2015184000.0,3.0,2.0,22.0,656941.0,1.0,201502500.0,19.0,1.0,1.0,...,0.127854,0.509705,0.864765,0.033898,0.002214,0.883442,0.75,0.005076,648737.0,15.0


Let's put all of these data sets into one big DataFrame to make some of the analysis easier.

In [5]:
# Stack the four seasons into one frame with a fresh 0..n-1 index.
# `ignore_index=True` replaces the former `reset_index().drop('index', axis=1)`,
# which only worked while the per-season index was unnamed and no real column
# was called 'index'.
# The season is taken from the first four digits of `ab_id` (e.g. 2015000001 -> 2015).
# Building everything in one expression keeps the cell safe to re-run.
combined_df = (
    pd.concat([df_2015, df_2016, df_2017, df_2018], sort=False, ignore_index=True)
    .assign(year=lambda frame: frame['ab_id'].astype(str).str[:4].astype(int))
)

In [6]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2855212 entries, 0 to 2855211
Data columns (total 45 columns):
ab_id             int64
b_count           int64
s_count           int64
b_score           int64
batter_id         int64
batter_side       float64
code              object
g_id              int64
inning            int64
on1b              float64
on2b              float64
on3b              float64
p_score           int64
pitch_type        object
pitcher_id        int64
pitcher_side      float64
ptype             object
px                float64
pz                float64
top               float64
target            int64
pitcher_ahead     float64
pitcher_behind    float64
CH                float64
CU                float64
EP                float64
FA                float64
FC                float64
FF                float64
FO                float64
FS                float64
FT                float64
IN                float64
KC                float64
KN                float64


In [7]:
combined_df.describe()

Unnamed: 0,ab_id,b_count,s_count,b_score,batter_id,batter_side,g_id,inning,on1b,on2b,...,KC,KN,PO,SC,SI,SL,UN,id,p_in_ab,year
count,2855212.0,2855212.0,2855212.0,2855212.0,2855212.0,2855212.0,2855212.0,2855212.0,2855212.0,2855212.0,...,2855212.0,2855212.0,2855212.0,2855212.0,2855212.0,2855212.0,2136454.0,2855212.0,2855212.0,2855212.0
mean,2016602000.0,0.8797221,0.8824798,2.355403,520196.4,0.5812955,201652200.0,5.001988,0.3025008,0.1845737,...,0.023305,0.003943665,0.0002199484,3.953706e-05,0.08496594,0.1579628,2.667972e-05,527416.2,2.893235,2016.51
std,1117336.0,0.9649233,0.8249371,2.591648,81225.02,0.4933469,111582.8,2.66701,0.459341,0.3879514,...,0.07339946,0.05641274,0.0006238878,0.0008824301,0.1869596,0.1372082,0.0002544124,77524.13,1.726823,1.115809
min,2015000000.0,0.0,0.0,0.0,112526.0,0.0,201500000.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,112526.0,1.0,2015.0
25%,2016003000.0,0.0,0.0,0.0,457727.0,0.0,201600000.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,462136.0,1.0,2016.0
50%,2017003000.0,1.0,1.0,2.0,519317.0,1.0,201700000.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.1544656,0.0,541640.0,3.0,2017.0
75%,2018001000.0,1.0,2.0,4.0,592261.0,1.0,201800000.0,7.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.2504368,0.0,592836.0,4.0,2018.0
max,2018186000.0,3.0,2.0,25.0,673633.0,1.0,201802500.0,19.0,1.0,1.0,...,0.559322,0.8725728,0.0338983,0.03157895,0.9142857,1.0,0.0625,673633.0,21.0,2018.0


Let's put some functions together for data analysis and visualization. This first one will give the number of fastballs, breaking balls, offspeed, and other pitches for each pitch in an at-bat for each year. Unfortunately I wasn't able to make it work to break it down by pitcher, so this will give us an overview of all pitchers in this dataset.

In [8]:
def get_df_p_in_ab(target_variable, frames=None):
    """Count pitch types per distinct value of ``target_variable``, season by season.

    Parameters
    ----------
    target_variable : str
        Column whose distinct values become the rows of the result
        (for example ``'p_in_ab'`` or ``'inning'``).
    frames : dict, optional
        Mapping ``year -> DataFrame``. Every frame needs a ``pitch_type``
        column and a ``target_variable`` column. When omitted, the notebook's
        ``df_2015`` ... ``df_2018`` frames are used.

    Returns
    -------
    pandas.DataFrame
        One row per (year, distinct value), in order of first appearance
        within each season. Columns, in this order:

        * ``count`` - the value of ``target_variable``
        * ``number_of_<pitch type>`` - pitches of that type (alphabetical)
        * ``year``
        * ``percent_<pitch type>`` - share of that row's pitches, 0-100

        A pitch type that never occurs for a row is reported as 0. A row with
        no pitches at all gets NaN percentages (0 / 0).
    """
    if frames is None:
        # Default to the per-season frames loaded at the top of the notebook.
        frames = {2015: df_2015, 2016: df_2016, 2017: df_2017, 2018: df_2018}

    # One shared, sorted list of pitch types so every season yields the same
    # columns; the column names are then selected by name, never by position.
    pitch_types = sorted(
        set().union(*(season['pitch_type'].dropna().unique() for season in frames.values()))
    )
    count_columns = ['number_of_{}'.format(pitch) for pitch in pitch_types]

    yearly_tables = []
    for year, season in frames.items():
        levels = season[target_variable].unique()
        table = pd.DataFrame({'count': levels})
        for pitch, column in zip(pitch_types, count_columns):
            thrown = season.loc[season['pitch_type'] == pitch, target_variable]
            # value_counts tallies every level in one pass; reindex restores the
            # row order of `levels` and fills levels without this pitch with 0.
            table[column] = thrown.value_counts().reindex(levels, fill_value=0).to_numpy()
        table['year'] = year
        yearly_tables.append(table)

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    df = pd.concat(yearly_tables, ignore_index=True)

    totals = df[count_columns].sum(axis=1)
    for pitch, column in zip(pitch_types, count_columns):
        df['percent_{}'.format(pitch)] = df[column] / totals * 100
    return df

This function gives us the number of pitches for basically any other variable we want to look at, and it can filter by pitcher: you just need to provide the first_name and last_name keyword arguments. It can give us the number of pitches for each ball or strike count in an at-bat, for each side of the plate the batter hits from, or for any other variable we want to look at.

In [9]:
def get_df(target_df, target_variable, **kwargs):
    """Count pitch types per distinct value of ``target_variable`` for every year.

    Parameters
    ----------
    target_df : pandas.DataFrame
        Pitch-level data with ``year`` and ``pitch_type`` columns, the
        ``target_variable`` column, and any column named in ``kwargs``.
    target_variable : str
        Column whose distinct values become the rows of the result.
    **kwargs
        Optional equality filters, ``column=value`` (for example
        ``first_name='Jake', last_name='Arrieta'``). All filters are applied;
        any number of them is accepted.

    Returns
    -------
    pandas.DataFrame
        One row per (year, distinct value). Columns: ``count`` (the value),
        ``year``, ``number_of_<pitch type>`` and ``percent_<pitch type>``.
        Rows and pitch-type columns are taken from the *unfiltered*
        ``target_df``, so a filtered result always has the same shape as the
        league-wide one. Combinations the selection never produced are 0, and
        rows with no pitches have NaN percentages (0 / 0).
    """
    # Apply every keyword as an equality filter on the column of the same name.
    selected = target_df
    for column, wanted in kwargs.items():
        selected = selected[selected[column] == wanted]

    # Pitch types come from the full data so that a pitcher who never threw
    # one of them still gets that column (filled with zeros).
    pitch_types = list(target_df['pitch_type'].dropna().unique())
    count_columns = ['number_of_{}'.format(pitch) for pitch in pitch_types]

    yearly_tables = []
    for year in target_df['year'].unique():
        # The same `levels` array labels the rows and orders the counts, so
        # labels and counts cannot drift apart or differ in length.
        levels = target_df.loc[target_df['year'] == year, target_variable].unique()
        in_year = selected[selected['year'] == year]
        table = pd.DataFrame({'count': levels, 'year': [year] * len(levels)})
        for pitch, column in zip(pitch_types, count_columns):
            thrown = in_year.loc[in_year['pitch_type'] == pitch, target_variable]
            table[column] = thrown.value_counts().reindex(levels, fill_value=0).to_numpy()
        yearly_tables.append(table)

    df = pd.concat(yearly_tables, ignore_index=True)

    # Select the denominator by column name; a positional slice would pick up
    # the percent columns once they start being appended.
    totals = df[count_columns].sum(axis=1)
    for pitch, column in zip(pitch_types, count_columns):
        df['percent_{}'.format(pitch)] = df[column] / totals * 100

    return df

This function uses the bokeh library and will give us an interactive plot of all the data we pulled from either function above, and will break it up by year for us by giving us different tabs. It will also give us the percentage of types of pitches thrown.

In [10]:
def _build_year_tabs(df, target_variable, prefix, hover_prefix, y_label, legend_labels, title):
    """Return a ``Tabs`` layout with one stacked bar chart per year found in ``df``.

    ``prefix`` selects the stacked columns (``'number_of'`` or ``'percent'``),
    ``hover_prefix`` is the leading word of each hover label, and
    ``legend_labels`` holds the four legend entries in FB, BB, OS, OT order.
    """
    pitch_codes = ('FB', 'BB', 'OS', 'OT')
    pitch_names = ('Fastballs', 'Breaking Balls', 'Offspeed', 'Other')
    stackers = ['{}_{}'.format(prefix, code) for code in pitch_codes]
    tooltips = [(target_variable, '@count')] + [
        ('{} {}'.format(hover_prefix, name), '@{}'.format(column))
        for name, column in zip(pitch_names, stackers)
    ]

    panels = []
    for year in df['year'].unique():
        season = df[df['year'] == year].copy()
        # The x axis is categorical (string factors). The x values must be the
        # same strings; numbers would be treated as raw axis coordinates and
        # the bars would not sit on their category.
        season['count'] = [str(value) for value in season['count']]
        fig = figure(x_range=list(dict.fromkeys(season['count'])))
        fig.vbar_stack(stackers,
                       x='count',
                       width=0.8,
                       color=('red', 'blue', 'green', 'yellow'),
                       source=ColumnDataSource(season),
                       legend_label=legend_labels)
        # Show plain numbers on the y axis instead of scientific notation.
        yaxis = fig.select(dict(type=Axis, layout="left"))[0]
        yaxis.formatter.use_scientific = False
        fig.add_tools(HoverTool(tooltips=tooltips))
        fig.xaxis.axis_label = target_variable
        fig.yaxis.axis_label = y_label
        fig.title.text = title
        # Tab titles follow the data, so any number of seasons works.
        panels.append(Panel(child=fig, title=str(year)))
    return Tabs(tabs=panels)


def get_plots(target_df, target_variable, **kwargs):
    """Show pitch-type counts and percentages by ``target_variable``, one tab per year.

    Two tabbed Bokeh layouts are displayed side by side in the notebook:
    stacked pitch counts on the left and stacked percentages on the right.

    Parameters
    ----------
    target_df : pandas.DataFrame
        Combined pitch-level data, passed on to ``get_df``.
    target_variable : str
        Column to break the pitches down by.
    **kwargs
        Equality filters forwarded to ``get_df`` (for example ``first_name``
        and ``last_name``). They are not applied for ``'p_in_ab'`` and
        ``'inning'``, which are computed by ``get_df_p_in_ab`` over all
        pitchers.

    Returns
    -------
    None
        The plots are rendered with ``show``.
    """
    season_overview = target_variable in ('p_in_ab', 'inning')
    if season_overview:
        df = get_df_p_in_ab(target_variable)
    else:
        df = get_df(target_df, target_variable, **kwargs)

    # Only name the pitcher in the title when the filter was actually applied.
    if kwargs and not season_overview:
        title = '{} for {}'.format(target_variable,
                                   ' '.join(str(value) for value in kwargs.values()))
    else:
        title = '{} for all pitchers'.format(target_variable)

    layout1 = _build_year_tabs(df, target_variable,
                               prefix='number_of',
                               hover_prefix='Number of',
                               y_label='Number of pitches',
                               legend_labels=['Fastballs', 'Breaking balls', 'Offspeed', 'Other'],
                               title=title)
    layout2 = _build_year_tabs(df, target_variable,
                               prefix='percent',
                               hover_prefix='Percent',
                               y_label='Percent of pitches',
                               legend_labels=['Fastballs', 'Breaking Balls', 'Offspeed', 'Other'],
                               title=title)

    output_notebook()
    show(row(layout1, layout2))

So first let's take a look at all pitchers and what pitches they threw on different pitches in the at-bat (p_in_ab).

In [11]:
get_plots(combined_df, 'p_in_ab')

Looking at the number of pitches thrown, it certainly makes sense that it decreases with every pitch, so more information can be gathered from the percentage graph on the right. First-pitch fastballs occur 62% of the time in 2015, and that share actually decreases over time, down to 60% in 2018. It's not a huge variation, but it is certainly notable.

Breaking balls seem to have become more prevalent as first pitches to batters as time has gone on, rising from 28% in 2015 to 32% in 2018.

Pitchers seem to agree that the best time for an off-speed pitch, most commonly a changeup, is the 2nd pitch of an at bat, and has stayed at around 14% of all pitches on the 2nd pitch of an at-bat.

As at-bats drag on past 4 pitches, fastballs seem to increase in percentage, which makes sense. The first 4 pitches are where pitchers are looking to get batters to strike out or put the ball in play. So they throw them junk pitches (breaking balls and offspeed) to get them to either swing and miss or hit a ground ball or pop-up for an easy out. As at-bats go past 4 pitches, pitchers are looking less to strike the batter out, and more to get him to put the ball in play, so they need to be more accurate with their pitches, and throwing a fastball is the best way to do that.

Now what about on specific ball or strike counts? How do the percentage of pitches thrown compare across all pitchers?

In [12]:
get_plots(combined_df, 'b_count')

The thing that sticks out right away is the percentage of fastballs thrown with 3 balls in the count. It's around 70% every year, which makes total sense. Pitchers need to be accurate with their pitches with 3 balls so as to not walk the batter, and the best way to do that is to throw a fastball.

Another notable thing is how breaking ball percentage drops with the number of balls in the count. With 0 or 1 ball, pitchers feel pretty safe in throwing breaking balls, trying to get the batter to chase or hit a ground ball or pop-up and get themselves out. But with 2 and 3 balls, that percentage drops off drastically, with only a 30% usage with 2 balls, and down to around 20% with 3 balls. Pitchers have gotten more daring as the years go by, though. In 2015, breaking balls with 2 balls in the count were used about 27% of the time, and in 2018 that number was up close to 31%; on 3 balls the change was about the same, 19% in 2015 up to 22% in 2018.

In [13]:
get_plots(combined_df, 's_count')

For strike counts, we see a lot of the same patterns throughout the years. Fastballs until a pitcher can get a strike, then open it up with breaking balls and offspeed pitches. For the first time we see fastballs account for less than half the pitches thrown, with 2 strikes, no matter the year, fastballs accounted for only 49% of pitches thrown. 

Now what about for righty vs. lefty batters? How do the number of pitches given to them differ?

In [14]:
get_plots(combined_df, 'batter_side')

Overall, righty batters saw about 25% more pitches than lefty batters, which makes sense considering there are a lot more righties than lefties. In terms of pitches thrown, about the same percentage of fastballs was thrown to both sides, about 56% across all years, but there was a larger percentage of offspeed pitches thrown to lefties than righties. Lefties consistently saw about 4% more offspeed pitches than righties did, with righties always seeing a larger percentage of breaking balls.


This could be because of the way the ball breaks out of right-handed pitchers' hands. Balls going away from batters are generally harder to hit than balls coming towards batters. And since right-handed breaking balls tend to break away from right-handed hitters, those hitters will see a larger portion of breaking balls than left-handed hitters will. It's an interesting chess match, that's for sure.

Now let's look at specific pitchers, namely some of the bigger names in baseball and those who won the Cy Young award across each year.

Starting with 2015: Jake Arrieta of the Chicago Cubs won the National League Cy Young award, and Dallas Keuchel of the Houston Astros won the American League Cy Young award. We'll look at ball count first.

In [15]:
get_plots(combined_df, 'b_count', first_name='Jake', last_name='Arrieta')

Just looking at Jake Arrieta's Cy Young year, in 2015 he apparently relied a lot on his breaking balls, especially with a 3-ball count. About 51% of the pitches he threw with 3 balls in the count were breaking balls, which is noticeably above the MLB average of 19%. Overall he seemed to rely a lot on his breaking balls, no matter how many balls were in the count. What about strike count?

In [16]:
get_plots(combined_df, 's_count', first_name='Jake', last_name='Arrieta')

His breaking ball usage was even more pronounced here, with him throwing many more breaking balls than fastballs with 2 strikes in the count. 58% breaking ball compared to 35% fastball.

This strategy must have worked, since he had the 6th most strikeouts in the MLB that year, as well as the most wins and came away with the NL Cy Young.

Now for the AL Cy Young winner Dallas Keuchel, starting with ball count

In [17]:
get_plots(combined_df, 'b_count', first_name='Dallas', last_name='Keuchel')

Keuchel seems to rely much more heavily on his fastball than Arrieta, or even the rest of the MLB, does, throwing a well-above-average share of fastballs at any number of balls in the count, especially with 3 balls. Now what about strike count?

In [18]:
get_plots(combined_df, 's_count', first_name='Dallas', last_name='Keuchel')

Still heavily relied on his fastball, but certainly switched things up with 1 and 2 strikes, throwing 23% breaking balls and 15% offspeed with 1 strike, and 37% breaking ball and 16% offspeed with 2 strikes. All about average for MLB pitchers.

Moving on to the 2018 Cy Young award winners so this document doesn't get too long. Jacob deGrom of the New York Mets won the Cy Young for the NL, and Blake Snell of the Tampa Bay Rays won it for the AL, let's look at their stats in 2018.

In [19]:
get_plots(combined_df, 'b_count', first_name='Jacob', last_name='deGrom')

deGrom seems more like Dallas Keuchel from 2015, not relying much on his offspeed stuff and throwing a lot of fastballs. deGrom seems about on par with the 2018 MLB averages, throwing about 70% fastballs with 3 balls in the count. What about strikes? Does he differ there from the MLB much?

In [20]:
get_plots(combined_df, 's_count', first_name='Jacob', last_name='deGrom')

He definitely relies more heavily on his breaking balls and offspeed pitches with more strikes in the count, especially with 2 strikes, which makes sense. However compared to the rest of the MLB, he still throws a higher percentage of fastballs and a lower percentage of breaking balls.

Now for Blake Snell, let's see how he compares to the rest of the MLB.

In [21]:
get_plots(combined_df, 'b_count', first_name='Blake', last_name='Snell')

Similar story as the ones told above, Snell hasn't really broken away from the MLB average like 2015 Jake Arrieta did. It's at this point where Cy Young award voters look more toward the results of these pitches to determine winners. Finally let's take a look at his strike count pitches.

In [22]:
get_plots(combined_df, 's_count', first_name='Blake', last_name='Snell')

Here we see some deviation from the norm. He likes using a lot of fastballs and offspeed pitches to get that first strike, then when he has 2 strikes in the count he relies heavily on his breaking ball to get the out. In 2018 he used his breaking ball 46% of the time with 2 strikes, compared to a fastball usage of only 44%.

I wanted to look at all of these players because it's something I will be doing come the machine learning portion of this project. I want to do two things: to make predictions on all pitchers over all years, then make predictions on specific pitchers and compare them to their non-Cy Young winning peers and see if these Cy Young winning pitchers are more or less predictable. 

Other determining factors of what pitch a pitcher throws could be if the pitcher is ahead or behind in the count, which essentially summarizes everything seen up to this point. Let's take a look at when pitchers are ahead in the count.

In [23]:
get_plots(combined_df, 'pitcher_ahead')

Pitchers seem to be falling behind in the count a lot, which means they're throwing more balls than strikes. What they're doing in these situations is interesting, though. Overall, pitchers throw more fastballs than anything else, though that trend seems to be changing as time goes on. From 2015-2018, the percentage of breaking balls thrown when behind in the count increases from 27% in 2015 to 30% in 2018. When pitchers are ahead in the count, that percentage increases from 38% to 40%. A not insignificant increase in percentage of breaking balls thrown. It makes sense that when pitchers are ahead, they throw a higher percentage of breaking balls and offspeed pitches since they want to get the batter out.

So that's for all pitchers, let's take a look at a specific pitcher, namely the most recent AL Cy Young award winner for 2019, Justin Verlander.

In [24]:
get_plots(combined_df, 'pitcher_ahead', first_name='Justin', last_name='Verlander')

A similar story. Though it seems Verlander is a bit better than the average pitcher at getting ahead in the count. And also an interesting observation, his offspeed usage has plummeted from 2015-2018. When behind in the count, his offspeed usage in 2015 was 11%, and fell to 1% by 2018

Another interesting stat to look at would be runners on base. We can create a column in our dataset that counts the total number of runners on base, and look at that and see how pitchers change their strategy.

In [25]:
# Runners on base for each pitch: row-wise sum of the three base columns, stored as an integer.
combined_df['runners_on'] = (
    combined_df[['on1b', 'on2b', 'on3b']]
    .sum(axis=1)
    .astype(int)
)

In [26]:
get_plots(combined_df, 'runners_on')

Interesting. Having runners doesn't seem to shake up a pitcher's strategy a whole lot. Pitch usage is nearly constant even as more runners get on base. This is true for all years. Though with the bases loaded, it consistently has the highest fastball usage between 56% and 58% depending on the year. Interestingly, with 2 runners on, breaking ball percentage is at its highest. Between 32% and 36% from 2015-2018. This could be because with a runner on first, pitchers are looking to get batters to hit a ground ball and ground themselves into a double play. Smart.

Another interesting stat to look at would be what inning the game is in and how that affects pitch choice, let's take a look.

In [27]:
get_plots(combined_df, 'inning')

About what I expected. Pitchers like to use a high percentage of fastballs in the early game to get ahead in the count and start the game off right, then when the batting lineup comes around to the top of the order again by the 3rd and 4th inning, batters will see more offspeed and breaking ball pitches. This is also the first time we see a significant amount of pitches in the "other" category. Intentional balls, intentional walks, pitchouts, and unknown pitches. As far up as 7% in the 18th inning in 2016. Very few games actually made it to 18 innings in 2016, and these games feature a lot of position players as pitchers since managers don't want to tire out their actual pitchers. A lot of those pitches could be labeled as unknown since position players rarely pitch.

In [28]:
# Save the combined frame, including the derived 'year' and 'runners_on' columns.
# The row index is written as the first CSV column, matching the
# `index_col=0` convention used when the season files were loaded.
combined_df.to_csv('combined_df.csv')

## Final thought

All of this is to say that there could be a lot of variables that go into predicting what pitch is going to be thrown next. Ball count, strike count, what pitch in the at-bat it is, what inning they're in, how many runners are on base and where they are. All of these make predicting pitches a complex science, but if we're able to do it with any accuracy, it could be a powerful tool for teams who want to win more baseball games.