# Homeruns by Ball-Strike Counts

An analysis of the percentage of homeruns hit on certain ball-strike counts.  Background for this analysis can be found in the Baseball Prospectus article by Jeff Wiser at https://www.baseballprospectus.com/news/article/52470/from-the-outfield-grass-getting-ahead-and-paying-the-price/

In [1]:
import pandas as pd
from pybaseball import statcast
import warnings
warnings.filterwarnings('ignore')

### Function to build the Dataset

In [2]:
def get_HR_ball_strike_counts(year_list):
    
    data_list = []
    for year in year_list:
        
        # import the data
        data = pd.read_csv(f"/Users/chrisjackson/sports/baseball/data/pitch_data_{year}.csv")

        # format some columns as strings
        for col in ['game_pk', 'pitcher', 'batter', 'inning', 'outs_when_up', 'batter', 'balls', 'strikes']:
            data[col] = data[col].astype(int).astype(str)
        
        # select the columns we need
        cols_to_keep = ['game_pk', 'pitcher', 'batter', 'inning', 
                        'outs_when_up', 'at_bat_number', 'pitch_number', 'balls', 'strikes', 'events']
        data = data[cols_to_keep]

        # sort the data 
        data.sort_values(by=['game_pk', 'pitcher', 'inning', 'at_bat_number', 'pitch_number', 'outs_when_up'], inplace=True)

        # build an at-bat index
        data['ab_index'] = data['game_pk'] + "_" + data['pitcher'] + "_" +\
                           data['batter'] + "_" + data['inning'] + "_" +\
                           data['outs_when_up']

        # combine balls and strikes into a "count" column
        data['count'] = '(' + data['balls'] + ',' + data['strikes'] + ')'

        # select the columns we need
        data = data[['ab_index', 'count', 'events']]

        # groupby at-bat and aggregate counts and events into lists
        data = pd.DataFrame(data.groupby('ab_index').agg(lambda x: list(x)))

        # keep the last item in the event list (the thing that actually happened)
        data['events'] = data['events'].apply(lambda x: x[-1])

        # select the homerun events
        data = data[data['events'] == 'home_run']
        
        data.drop('events', axis=1, inplace=True)
        
        data_list.append(data)
    
    data = pd.concat(data_list)
        
    return data

In [3]:
data2015_18 = get_HR_ball_strike_counts(['2015', '2016', '2017', '2018'])
print(f"Total number of homeruns in 2015-18: {len(data2015_18)}")

Total number of homeruns in 2015-18: 22209


In [4]:
data2019 = get_HR_ball_strike_counts(['2019'])
print(f"Total number of homeruns in 2019: {len(data2019)}")

Total number of homeruns in 2019: 5109


### Percentage of HRs Hit Thru 3-1 Counts

In [5]:
data2015_18['HR_thru_3-1'] = data2015_18['count'].apply(lambda x: '(3,1)' in x)
data2019['HR_thru_3-1'] = data2019['count'].apply(lambda x: '(3,1)' in x)

print(f"2015-18: \t{round(len(data2015_18[data2015_18['HR_thru_3-1']]) / len(data2015_18) * 100., 2)}%")
print(f"2019: \t\t{round(len(data2019[data2019['HR_thru_3-1']]) / len(data2019) * 100., 2)}%")

2015-18: 	7.48%
2019: 		7.01%


### Percentage of HRs Hit Thru 2-0 Counts

In [6]:
data2015_18['HR_thru_2-0'] = data2015_18['count'].apply(lambda x: '(2,0)' in x)
data2019['HR_thru_2-0'] = data2019['count'].apply(lambda x: '(2,0)' in x)

print(f"2015-18: \t{round(len(data2015_18[data2015_18['HR_thru_2-0']]) / len(data2015_18) * 100., 2)}%")
print(f"2019: \t\t{round(len(data2019[data2019['HR_thru_2-0']]) / len(data2019) * 100., 2)}%")

2015-18: 	14.87%
2019: 		13.86%


### Percentage of HRs Hit Thru 0-2 Counts

In [7]:
data2015_18['HR_thru_0-2'] = data2015_18['count'].apply(lambda x: '(0,2)' in x)
data2019['HR_thru_0-2'] = data2019['count'].apply(lambda x: '(0,2)' in x)

print(f"2015-18: \t{round(len(data2015_18[data2015_18['HR_thru_0-2']]) / len(data2015_18) * 100., 2)}%")
print(f"2019: \t\t{round(len(data2019[data2019['HR_thru_0-2']]) / len(data2019) * 100., 2)}%")

2015-18: 	11.18%
2019: 		12.29%


### Percentage of HRs Hit Thru 1-2 Counts

In [8]:
data2015_18['HR_thru_1-2'] = data2015_18['count'].apply(lambda x: '(1,2)' in x and '(0,2)' not in x)
data2019['HR_thru_1-2'] = data2019['count'].apply(lambda x: '(1,2)' in x and '(0,2)' not in x)

print(f"2015-18: \t{round(len(data2015_18[data2015_18['HR_thru_1-2']]) / len(data2015_18) * 100., 2)}%")
print(f"2019: \t\t{round(len(data2019[data2019['HR_thru_1-2']]) / len(data2019) * 100., 2)}%")

2015-18: 	10.64%
2019: 		11.16%


### Percentage of HRs Hit Thru 2-2 Counts

In [9]:
data2015_18['HR_thru_2-2'] = data2015_18['count'].apply(lambda x: '(2,2)' in x and '(0,2)' not in x and '(1,2)' not in x)
data2019['HR_thru_2-2'] = data2019['count'].apply(lambda x: '(2,2)' in x and '(0,2)' not in x and '(1,2)' not in x)

print(f"2015-18: \t{round(len(data2015_18[data2015_18['HR_thru_2-2']]) / len(data2015_18) * 100., 2)}%")
print(f"2019: \t\t{round(len(data2019[data2019['HR_thru_2-2']]) / len(data2019) * 100., 2)}%")

2015-18: 	6.52%
2019: 		6.52%


## HR by Ball-Strike Counts for Individual Pitchers

In [None]:
pitcher_data_2019 = pd.read_csv("/Users/chrisjackson/sports/baseball/data/pitch_data_2019.csv")

pitcher_data_2019 = pitcher_data_2019[pitcher_data_2019['events'] == 'home_run']

pitcher_data_2019 = pitcher_data_2019[['player_name', 'balls', 'strikes']]

pitcher_data_2019.head()

In [None]:
hr_0_2_counts = pitcher_data_2019[(pitcher_data_2019['balls'] == 0.0) & (pitcher_data_2019['strikes'] == 2.0)]

pd.DataFrame(hr_0_2_counts.groupby('player_name').size()).sort_values(by=0, ascending=False)