# Catcher Pitch Framing

See this paper for a "hierarchical bayesian" approach:https://arxiv.org/abs/1704.00823

See this DataRobot blog post for a ML approach (really just logistic regression): https://blog.datarobot.com/catcher-pitch-framing-using-machine-learning-part-1

Or, the improved DataRobot model: https://blog.datarobot.com/catcher-pitch-framing-using-machine-learning-part-two



In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

### import the pitch data and only keep "taken" pitches

In [2]:
# import the data for calendar year
def import_data(year):
    
    print("===================================================")
    print(f"Gathering data for {year} season")
    print("===================================================")
    print()
    
    pitch_data_df = pd.read_csv(f"../data/pitch_data_{year}.csv")
    total_num_pitches = len(pitch_data_df)
    
    print(f"Number of total pitches in {year} data: {total_num_pitches}")
    print()

    pitch_taken = ['ball', 'called_strike']#, 'blocked_ball', 'pitchout', 'intent_ball']
    pitches_taken_df = pitch_data_df[pitch_data_df['description'].isin(pitch_taken)]
    num_pitches_taken = len(pitches_taken_df)

    print(f"Number of pitches taken: {num_pitches_taken}")
    print(f"Percentage of pitches taken: {round((num_pitches_taken / total_num_pitches) * 100., 2)}")
    print()

    strikes_taken_df = pitches_taken_df[pitches_taken_df['description'] == 'called_strike']
    num_strikes_taken = len(strikes_taken_df)

    print(f"Number of strikes taken: {num_strikes_taken}")
    print(f"Percentage of pitches taken that were called strikes: {round((num_strikes_taken / num_pitches_taken) * 100., 2)}")
    print()
    
    left_strike_boundary = -0.71
    right_strike_boundary = -left_strike_boundary

    pitches_taken_df['low_strike_boundary'] = pitches_taken_df['sz_bot']
    pitches_taken_df['high_strike_boundary'] = pitches_taken_df['sz_bot'] + pitches_taken_df['sz_top']

    bottom_strike_boundary = pitches_taken_df['low_strike_boundary'].mean()
    top_strike_boundary = pitches_taken_df['high_strike_boundary'].mean()

    print("===================================================")
    print(f"Top boundary of approximate strike zone: {round(top_strike_boundary, 2)}")
    print(f"Bottom boundary of approximate strike zone: {round(bottom_strike_boundary, 2)}")
    print(f"Right boundary of approximate strike zone: {right_strike_boundary}")
    print(f"Left boundary of approximate strike zone: {left_strike_boundary}")
    print("===================================================")
    print()
    
    pitches_taken_df = pitches_taken_df[
        (pitches_taken_df['plate_x'] >= left_strike_boundary - 1) &\
        (pitches_taken_df['plate_x'] <= right_strike_boundary + 1) &\
        (pitches_taken_df['plate_z'] <= top_strike_boundary + 1) &\
        (pitches_taken_df['plate_z'] >= bottom_strike_boundary - 1)
    ]
    num_pitches_in_zone = len(pitches_taken_df)
    
    print(f"Number of pitches within 1 foot of approximate stike zone: {num_pitches_in_zone}")
    print(f"Percentage of pitches within 1 foot of approximate strike zone: {round((num_pitches_in_zone / total_num_pitches) * 100., 2)}")
    print()
    
    team_rename_dict = {'NYY': 'NYA',
                        'NYM': 'NYN', 
                        'TB': 'TBA', 
                        'CHC': 'CHN',
                        'CWS': 'CHA',
                        'KC': 'KCA',
                        'SF': 'SFN',
                        'SD': 'SDN',
                        'LAD': 'LAN',
                        'LAA': 'ANA',
                        'STL': 'SLN',
                        'WSH': 'WAS'}

    def rename_team(x):
        if x in list(team_rename_dict.keys()):
            return team_rename_dict[x]
        else:
            return x
    pitches_taken_df['home_team'] = pitches_taken_df['home_team'].apply(rename_team)

    pitches_taken_df['game_team_id'] = pitches_taken_df['home_team'] + pitches_taken_df['game_date'].str.replace('-', '')
    
    hp_umps = pd.read_csv(f"../data/hp_umpires_{year}.csv", header=None)
    hp_umps.columns = ['game_team_id', 'hp_umpire']

    pitches_taken_df = pd.merge(pitches_taken_df, hp_umps, how='inner', on='game_team_id')
    
    num_unique_umpires = len(pitches_taken_df['hp_umpire'].unique())
    num_unique_batters = len(pitches_taken_df['batter'].unique())
    num_unique_catchers = len(pitches_taken_df['fielder_2'].unique())
    num_unique_pitchers = len(pitches_taken_df['pitcher'].unique())

    print(f"Number of unique umpires: {num_unique_umpires}")
    print(f"Number of unique batters: {num_unique_batters}")
    print(f"Number of unique catchers: {num_unique_catchers}")
    print(f"Number of unique pitchers: {num_unique_pitchers}")
    print()
    print()
    
    return pitches_taken_df

### only keep pitches that were "taken"

In [4]:
taken_pitches_2015_df = import_data(2015)
taken_pitches_2016_df = import_data(2016)
# taken_pitches_2017_df = import_data(2017)

Gathering data for 2015 season

Number of total pitches in 2015 data: 702302

Number of pitches taken: 351121
Percentage of pitches taken: 50.0

Number of strikes taken: 118692
Percentage of pitches taken that were called strikes: 33.8

Top boundary of approximate strike zone: 5.04
Bottom boundary of approximate strike zone: 1.58
Right boundary of approximate strike zone: 0.71
Left boundary of approximate strike zone: -0.71

Number of pitches within 1 foot of approximate stike zone: 309768
Percentage of pitches within 1 foot of approximate strike zone: 44.11

Number of unique umpires: 92
Number of unique batters: 952
Number of unique catchers: 110
Number of unique pitchers: 734


Gathering data for 2016 season

Number of total pitches in 2016 data: 715823

Number of pitches taken: 360212
Percentage of pitches taken: 50.32

Number of strikes taken: 120771
Percentage of pitches taken that were called strikes: 33.53

Top boundary of approximate strike zone: 5.07
Bottom boundary of approxi