In [6]:
from pybaseball import statcast
from pybaseball import cache
cache.enable()

import pandas as pd
import numpy as np
from enum import Enum
from tqdm import tqdm, trange

# Pitch-type codes that get their own per-type statistics. The order matters:
# Enum numbers its members from 1 in this order (CH=1 ... SL=12).
PITCHES_LIST = ["CH", "CU", "EP", "FC", "FF", "FO", "FS", "KC", "KN", "SC", "SI", "SL"]
# The same codes plus "PO", as a set for membership tests.
PITCHES_W_PO = set(PITCHES_LIST) | {"PO"}
PitchType = Enum('PitchType', names=PITCHES_LIST)

# Event names that count as hits; the Enum value (single=1 ... home_run=4) is
# derived from the position in this list.
possible_hits = ['single', 'double', 'triple', 'home_run']
HitType = Enum('HitType', names=possible_hits)

In [2]:
# Raw event names grouped by the coarse outcome label each one is mapped to.
_EVENTS_BY_OUTCOME = {
    'out': (
        'field_out', 'grounded_into_double_play', 'force_out', 'sac_fly',
        'double_play', 'sac_bunt', 'fielders_choice_out', 'sac_fly_double_play',
        'sac_bunt_double_play', 'triple_play',
    ),
    'strikeout': ('strikeout', 'strikeout_double_play'),
    'walk': ('walk', 'hit_by_pitch', 'intent_walk'),
    'error': ('field_error',),
    # Hits keep their own name as the label.
    'single': ('single',),
    'double': ('double',),
    'triple': ('triple',),
    'home_run': ('home_run',),
    'other': (
        'caught_stealing_2b', 'fielders_choice', 'other_out', 'caught_stealing_home',
        'pickoff_1b', 'caught_stealing_3b', 'wild_pitch', 'stolen_base_3b',
        'stolen_base_2b', 'pickoff_2b', 'pickoff_caught_stealing_3b',
        'pickoff_caught_stealing_2b', 'pickoff_3b', 'runner_double_play',
        'catcher_interf', 'pickoff_caught_stealing_home', 'ejection', 'passed_ball',
        'other_advance', 'pickoff_error_2b', 'game_advisory', 'stolen_base_home',
    ),
}

# Flat lookup table: raw event name -> coarse outcome label.
at_bat_events = {
    event: outcome
    for outcome, events in _EVENTS_BY_OUTCOME.items()
    for event in events
}
# A missing event (None) maps to None.
at_bat_events[None] = None

In [3]:
class PlayerData:
    """Running pitch tallies for one player, plus a history of rates derived from them.

    ``update`` folds a single pitch into the counters; ``compute_stats`` appends
    a snapshot of the derived rates to ``all_player_data``.  The history starts
    with one all-zero row, so when ``compute_stats`` is called after every
    ``update``, entry ``k`` holds the rates after ``k`` recorded pitches.
    """

    # Zone codes 1..9 and 11..14 (there is no 10) are packed into 13 consecutive slots.
    N_ZONES = 13

    def __init__(self):
        n_types = len(PITCHES_LIST)

        # Pitches seen, per pitch type and overall.
        self.n_pitch_type = np.zeros(n_types)
        self.n_total_pitches = 0

        # Pitches flagged as strikes, per pitch type and overall.
        self.n_pitch_type_strikes = np.zeros(n_types)
        self.n_total_strikes = 0

        # Sum of hit weights (hit_type.value), per pitch type and overall.
        self.n_weighted_hits_pitch_type = np.zeros(n_types)
        self.n_weighted_hits_total = 0
        self.n_at_bats = 0

        # Pitches seen / strikes, per zone slot.
        self.zone_thrown = np.zeros(self.N_ZONES)
        self.zone_strikes = np.zeros(self.N_ZONES)

        # One row per snapshot, laid out as:
        # [strike rate per type, overall strike rate,
        #  weighted hits per pitch per type, overall slugging average,
        #  strike rate per zone slot]
        self.all_player_data = [np.zeros(2 * (n_types + 1) + self.N_ZONES)]

    @staticmethod
    def _zone_slot(zone):
        """Return the 0-based slot for a zone code, or None if the code is unusable.

        Codes 1..9 map to slots 0..8 and codes 11..14 map to slots 9..12.
        Missing values (None, NaN, pd.NA) and codes outside those ranges
        yield None instead of raising or wrapping around to another slot.
        """
        try:
            code = int(zone)
        except (TypeError, ValueError, OverflowError):
            return None
        if 1 <= code <= 9:
            return code - 1
        if 11 <= code <= 14:
            return code - 2
        return None

    def update(self, pitch_type, strike, hit_type, new_at_bat, zone):
        """Fold one pitch into the running counters.

        Args:
            pitch_type: enum member whose ``.value`` is the 1-based pitch-type
                index, or None when the pitch type is not tracked per type.
            strike: truthy if the pitch counts as a strike.
            hit_type: enum member whose ``.value`` is the weight added for a
                hit, or None when the pitch did not produce a hit.
            new_at_bat: truthy if this pitch starts a new at-bat.
            zone: zone code, 1..9 or 11..14.  A missing or out-of-range code
                is left out of the per-zone tallies; the pitch still counts
                toward every other total.
        """
        type_slot = None if pitch_type is None else pitch_type.value - 1
        zone_slot = self._zone_slot(zone)

        self.n_total_pitches += 1
        if type_slot is not None:
            self.n_pitch_type[type_slot] += 1
        if zone_slot is not None:
            self.zone_thrown[zone_slot] += 1

        if strike:
            self.n_total_strikes += 1
            if type_slot is not None:
                self.n_pitch_type_strikes[type_slot] += 1
            if zone_slot is not None:
                self.zone_strikes[zone_slot] += 1

        if hit_type is not None:
            self.n_weighted_hits_total += hit_type.value
            if type_slot is not None:
                self.n_weighted_hits_pitch_type[type_slot] += hit_type.value

        if new_at_bat:
            self.n_at_bats += 1

    @staticmethod
    def _safe_ratio(numerators, denominators):
        """Element-wise numerators / denominators, with 0 wherever the denominator is 0."""
        return np.divide(numerators, denominators,
                         out=np.zeros_like(numerators), where=denominators != 0)

    def compute_strike_tendencies(self):
        """Return the fraction of pitches of each type that were strikes (0 for unseen types)."""
        return self._safe_ratio(self.n_pitch_type_strikes, self.n_pitch_type)

    def compute_overall_strike_tendency(self):
        """Return strikes / pitches over all pitches, or 0 before any pitch is recorded."""
        if self.n_total_pitches == 0:
            return 0
        return self.n_total_strikes / self.n_total_pitches

    def compute_slugging_averages(self):
        """Return weighted hits per pitch for each pitch type (0 for unseen types).

        Note the denominator is the number of pitches of that type, not at-bats.
        """
        return self._safe_ratio(self.n_weighted_hits_pitch_type, self.n_pitch_type)

    def compute_overall_slugging_average(self):
        """Return weighted hits / at-bats, or 0 before any at-bat is recorded."""
        if self.n_at_bats == 0:
            return 0
        return self.n_weighted_hits_total / self.n_at_bats

    def compute_heatmap(self):
        """Return the strike fraction for each zone slot (0 for slots with no pitches)."""
        return self._safe_ratio(self.zone_strikes, self.zone_thrown)

    def compute_stats(self):
        """Append a snapshot of all current rates to ``all_player_data``."""
        snapshot = np.concatenate([
            self.compute_strike_tendencies(),
            [self.compute_overall_strike_tendency()],
            self.compute_slugging_averages(),
            [self.compute_overall_slugging_average()],
            self.compute_heatmap(),
        ])
        self.all_player_data.append(snapshot)

In [4]:
all_data = statcast(start_dt='2008-01-01', end_dt='2019-12-31')
# Reverse the row order of the query result.
pitch_data = all_data.iloc[::-1]

# Clean the data
# Fold the alias codes into the codes tracked in PITCHES_LIST. Only the
# pitch_type column is rewritten: a frame-wide replace would also change any
# other cell that happened to equal 'FA' or 'CS', and scans every column.
pitch_type_aliases = {'FA': 'FF', 'CS': 'CU'}
pitch_data = pitch_data.assign(pitch_type=pitch_data['pitch_type'].replace(pitch_type_aliases))
# Keep only rows whose pitch type is a known code, then renumber rows 0..n-1.
pitch_data = pitch_data[pitch_data['pitch_type'].isin(PITCHES_W_PO)]
pitch_data = pitch_data.reset_index(drop=True)

# index=False: after reset_index the index is just 0..n-1, and writing it makes
# a plain pd.read_csv return an extra "Unnamed: 0" column.
pitch_data.to_csv("statcast_08_to_19.csv", index=False)

This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates


100%|█████████████████████████████████████████████████| 2584/2584 [05:44<00:00,  7.49it/s]


In [7]:
batters = dict()

# Walk the needed columns in lockstep instead of building a row Series with
# .iloc[i] (twice) for every pitch, which is very slow over millions of rows.
pitch_fields = zip(
    pitch_data['batter'],
    pitch_data['pitch_type'],
    pitch_data['type'],
    pitch_data['events'],
    pitch_data['game_pk'],
    pitch_data['at_bat_number'],
    pitch_data['zone'],
)
progress = tqdm(pitch_fields, total=len(pitch_data))

# (game_pk, at_bat_number) of the previous row; None before the first row so
# the first pitch always opens an at-bat.
previous_at_bat = None
for batter_id, pitch_code, pitch_result, event, game_pk, at_bat_number, zone in progress:
    if batter_id not in batters:
        batters[batter_id] = PlayerData()
    curr_batter = batters[batter_id]

    # Codes outside PITCHES_LIST (e.g. "PO") and non-hit events become None.
    pitch_type = PitchType[pitch_code] if pitch_code in PITCHES_LIST else None
    strike = pitch_result == 'S'
    hit_type = HitType[event] if event in possible_hits else None

    # at_bat_number alone is compared against the previous row regardless of
    # game; pairing it with game_pk keeps two adjacent games from being merged
    # into one at-bat when their numbers happen to match.
    current_at_bat = (game_pk, at_bat_number)
    new_at_bat = current_at_bat != previous_at_bat
    previous_at_bat = current_at_bat

    curr_batter.update(pitch_type, strike, hit_type, new_at_bat, zone)
    # Snapshot after every pitch so the history lines up one-to-one with rows.
    curr_batter.compute_stats()

100%|████████████████████████████████████████| 8617940/8617940 [2:29:04<00:00, 963.48it/s]


In [8]:
import pickle

# Persist the per-batter PlayerData objects so the per-pitch pass does not have
# to be repeated to get them back.
pickle_path = 'batters_dict.pickle'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(batters, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
batter_ids = list(batters.keys())

# Column names are generated from PITCHES_LIST so they follow the same order as
# the stats vector and cannot drift apart (the hand-written list had
# 'Slug_Tend_EP' / 'Slug_Tend_FC' among the 'Slug_Avg_*' names).
zone_codes = list(range(1, 10)) + list(range(11, 15))  # there is no zone 10
new_columns = (
    ['Strike_Tend_' + code for code in PITCHES_LIST]
    + ['Overall_Strike_Tend']
    + ['Slug_Avg_' + code for code in PITCHES_LIST]
    + ['Overall_Slug_Avg']
    + ['Zone_' + str(code) + '_Strike_Pcnt' for code in zone_codes]
)

for i in trange(len(batter_ids)):
    batter_id = batter_ids[i]
    batter = batters[batter_id]

    # Add statistics to the pitch row.
    # all_player_data starts with an all-zero row and has one more entry than
    # the batter has pitches; dropping the last entry gives each pitch the
    # stats accumulated *before* it, not including the pitch itself.
    pitch_data.loc[pitch_data['batter'].eq(batter_id), new_columns] = batter.all_player_data[:-1] # Changes happen here
    
#     # Add Result Last at Bat
#     batter_df = pitch_data[pitch_data['batter'] == batter_id]
    
#     row_ids = batter_df.index
#     last_at_bat = None
#     pitch_data.loc[row_ids[0], 'Result_Last_At_Bat'] = None # CHANGES HAPPEN HERE
#     for i in range(1, len(row_ids)):
#         row_id = row_ids[i]
#         if batter_df.iloc[i]['game_pk'] != batter_df.iloc[i - 1]['game_pk']:
#             last_at_bat = None
#         elif batter_df.iloc[i]['at_bat_number'] != batter_df.iloc[i - 1]['at_bat_number']:
#             last_at_bat = at_bat_events[batter_df.iloc[i - 1]['events']]
#         pitch_data.loc[row_ids[i],'Result_Last_At_Bat'] = last_at_bat # CHANGES HAPPEN HERE

100%|█████████████████████████████████████████████████| 3422/3422 [25:09<00:00,  2.27it/s]


In [None]:
# Completion marker, presumably to make it obvious when the cells above have finished.
print("Done")

In [15]:
# Write the pitch rows together with the per-batter stat columns.
# index=False: the row index carries no information, and writing it adds an
# extra "Unnamed: 0" column every time the file is read back with pd.read_csv.
pitch_data.to_csv("batter_stats.csv", index=False)

In [17]:
# Display the last 20 rows of pitch_data as this cell's output.
pitch_data.tail(20)

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,Zone_5_Strike_Pcnt,Zone_6_Strike_Pcnt,Zone_7_Strike_Pcnt,Zone_8_Strike_Pcnt,Zone_9_Strike_Pcnt,Zone_11_Strike_Pcnt,Zone_12_Strike_Pcnt,Zone_13_Strike_Pcnt,Zone_14_Strike_Pcnt,Result_Last_At_Bat
8617920,FF,2019-10-30,90.9,-0.82,6.75,"Urquidy, José",543685,664353,,ball,...,0.644765,0.762739,0.583333,0.579213,0.625427,0.186676,0.241651,0.29557,0.174297,
8617921,SL,2019-10-30,80.4,-1.64,6.41,"Urquidy, José",543685,664353,,foul,...,0.644765,0.762739,0.583333,0.579213,0.625427,0.186676,0.241505,0.29557,0.174297,
8617922,SL,2019-10-30,81.3,-1.1,6.63,"Urquidy, José",543685,664353,,ball,...,0.644765,0.762739,0.583333,0.579213,0.625427,0.186676,0.241505,0.29557,0.174297,
8617923,FF,2019-10-30,93.3,-0.81,6.79,"Urquidy, José",543685,664353,,ball,...,0.644765,0.762739,0.583333,0.579213,0.625427,0.186676,0.241505,0.29557,0.174251,
8617924,FF,2019-10-30,92.8,-0.61,6.78,"Urquidy, José",543685,664353,,foul,...,0.644765,0.762739,0.583333,0.579213,0.625427,0.186676,0.241358,0.29557,0.174251,
8617925,FF,2019-10-30,92.9,-0.46,6.78,"Urquidy, José",543685,664353,field_out,hit_into_play,...,0.644765,0.762739,0.583333,0.579213,0.625427,0.186676,0.241818,0.29557,0.174251,
8617926,CU,2019-10-30,81.7,-0.98,6.65,"Urquidy, José",665742,664353,,ball,...,0.642857,0.626866,0.703833,0.679878,0.674208,0.219911,0.272921,0.220917,0.154044,
8617927,CH,2019-10-30,84.6,-1.36,6.15,"Urquidy, José",665742,664353,field_out,hit_into_play,...,0.642857,0.626866,0.703833,0.679878,0.674208,0.219911,0.27234,0.220917,0.154044,
8617928,FF,2019-10-30,95.8,-2.85,5.63,"Hudson, Daniel",543807,543339,,called_strike,...,0.631011,0.70098,0.706577,0.616749,0.666063,0.134821,0.220532,0.214749,0.257266,
8617929,FF,2019-10-30,94.7,-2.76,5.65,"Hudson, Daniel",543807,543339,field_out,hit_into_play,...,0.631373,0.70098,0.706577,0.616749,0.666063,0.134821,0.220532,0.214749,0.257266,


In [None]:
# # TESTING

# # batter_data[new_columns] = batters[434578].all_player_data[:-1]
# # batters = dict()

# # for i in range(len(batter_data)):
# #     curr_pitch = batter_data.iloc[i]
# #     batter_id = curr_pitch['batter']
# #     if batter_id not in batters:
# #         batters[batter_id] = PlayerData()
    
# #     curr_batter = batters[batter_id]
    
# #     pitch_type = curr_pitch['pitch_type']
# #     pitch_type = PitchType[pitch_type] if pitch_type in PITCHES_LIST else None
# #     strike = curr_pitch['type'] == 'S'
# #     hit_type = curr_pitch['events']
# #     hit_type = HitType[hit_type] if hit_type in possible_hits else None
# #     new_at_bat = (i == 0) or (curr_pitch['at_bat_number'] != batter_data.iloc[i - 1]['at_bat_number'])
# #     zone = curr_pitch['zone']
    
# #     curr_batter.update(pitch_type, strike, hit_type, new_at_bat, zone)
# #     curr_batter.compute_stats()

# # # last_at_bat = None
# # # batter_data.iloc[0]['Result_Last_At_Bat'] = None
# # # for i in range(1, len(batter_data)):
# # #     if batter_data.iloc[i]['game_pk'] != batter_data.iloc[i - 1]['game_pk']:
# # #         last_at_bat = None
# # #     elif batter_data.iloc[i]['at_bat_number'] != batter_data.iloc[i - 1]['at_bat_number']:
# # #         last_at_bat = at_bat_events[batter_data.iloc[i - 1]['events']]
# # #     batter_data.iloc[i]['Result_Last_At_Bat'] = last_at_bat

# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     print(batter_data[['game_pk','events', 'at_bat_number', 'Result_Last_At_Bat']])