In [69]:
import pandas as pd
import numpy as np
from enum import Enum
from tqdm import trange

In [65]:
# Pitch-type codes that are kept; the order fixes the column order of every per-pitch-type array below.
VERLANDER_PITCHES_LIST = ["FF", "CU", "CH", "SL", "SI"]
# Columns dropped from the raw data (per-pitch-type stats for pitch types outside VERLANDER_PITCHES_LIST).
# NOTE(review): 'Slug_Tend_EP' and 'Slug_Tend_FC' use a different prefix than the other 'Slug_Avg_*'
# entries — confirm these match the CSV column names (DataFrame.drop raises on unknown labels by default).
DROP_PITCHES_LIST = ['Strike_Tend_EP', 'Strike_Tend_FC', 'Strike_Tend_FO', 
                     'Strike_Tend_FS', 'Strike_Tend_KC', 'Strike_Tend_KN', 'Strike_Tend_SC',
                     'Slug_Tend_EP', 'Slug_Tend_FC', 'Slug_Avg_FO',
                     'Slug_Avg_FS', 'Slug_Avg_KC', 'Slug_Avg_KN', 'Slug_Avg_SC']
# Functional Enum API numbers members from 1 in list order, so `member.value - 1` is the
# member's position in VERLANDER_PITCHES_LIST.
VerlanderPitchType = Enum('VerlanderPitchType', VERLANDER_PITCHES_LIST)

# Output column names for the cumulative pitch-usage and strike-rate features,
# listed in the same pitch-type order as VERLANDER_PITCHES_LIST.
PITCHER_PITCH_TEND = ['Pitcher_Tend_FF', 'Pitcher_Tend_CU', 'Pitcher_Tend_CH', 'Pitcher_Tend_SL', 'Pitcher_Tend_SI']
PITCHER_STRIKE_TEND = ['Pitcher_Strike_Tend_FF', 'Pitcher_Strike_Tend_CU', 'Pitcher_Strike_Tend_CH', 'Pitcher_Strike_Tend_SL', 'Pitcher_Strike_Tend_SI']

# Player ids compared against the 'pitcher.1' and 'fielder_2.1' columns respectively.
VERLANDER_ID = 434378
AVILA_ID = 488671

In [None]:
class PitcherData:
    """Running per-pitch-type counters and the history of rates derived from them.

    Arrays are indexed by ``pitch_type.value - 1``, i.e. by position in
    VERLANDER_PITCHES_LIST.
    """

    def __init__(self):
        n_types = len(VERLANDER_PITCHES_LIST)

        # Number of pitches seen so far, per pitch type and overall.
        self.n_pitch_type = np.zeros(n_types)
        self.n_total_pitches = 0

        # Number of pitches recorded as strikes, per pitch type.
        self.n_pitch_type_strikes = np.zeros(n_types)

        # History of [pitch tendencies..., strike tendencies...] snapshots.
        # Seeded with an all-zero row that represents "no pitches seen yet".
        self.all_pitcher_data = [np.zeros(2 * n_types)]

    def update(self, pitch_type, strike):
        """Record one pitch.

        pitch_type: enum member whose ``value`` is the 1-based pitch-type index.
        strike: truthy if the pitch counts as a strike.
        """
        idx = pitch_type.value - 1
        self.n_pitch_type[idx] += 1
        self.n_total_pitches += 1

        if strike:
            self.n_pitch_type_strikes[idx] += 1

    def compute_pitch_tendencies(self):
        """Return the share of all pitches thrown so far that belong to each pitch type.

        Returns zeros before any pitch has been recorded (instead of dividing
        by zero), matching how compute_strike_tendencies treats empty counts.
        """
        if self.n_total_pitches == 0:
            return np.zeros_like(self.n_pitch_type)
        return self.n_pitch_type / self.n_total_pitches

    def compute_strike_tendencies(self):
        """Return strikes / pitches per pitch type, with 0 for types not thrown yet."""
        return np.divide(
            self.n_pitch_type_strikes,
            self.n_pitch_type,
            out=np.zeros_like(self.n_pitch_type_strikes),
            where=self.n_pitch_type != 0,
        )

    def compute_stats(self):
        """Append the current pitch and strike tendencies as one row of all_pitcher_data."""
        pitch_tendencies = self.compute_pitch_tendencies()
        strike_tendencies = self.compute_strike_tendencies()
        self.all_pitcher_data.append(np.concatenate([pitch_tendencies, strike_tendencies]))
        
def compute_tendencies(pitch_data):
    """Compute cumulative pitch-usage and strike rates after every pitch.

    pitch_data: DataFrame with a 'pitch_type' column (codes that are member
        names of VerlanderPitchType) and a 'type' column ('S' marks a strike).

    Returns a 2-D array with ``len(pitch_data) + 1`` rows: row 0 is all zeros
    (no pitches seen yet) and row k holds the tendencies after the first k
    pitches. Each row is [pitch tendencies..., strike tendencies...].
    """
    computed_data = PitcherData()

    # Extract the two columns once; pitch_data.iloc[i] would materialise a
    # full row Series for every pitch just to read two fields.
    pitch_codes = pitch_data['pitch_type'].to_numpy()
    is_strike = (pitch_data['type'] == 'S').to_numpy()

    for i in trange(len(pitch_data)):
        computed_data.update(VerlanderPitchType[pitch_codes[i]], is_strike[i])
        computed_data.compute_stats()

    return np.array(computed_data.all_pitcher_data)

In [3]:
# Load the raw pitch-level CSV; the first CSV column is used as the DataFrame index.
# NOTE(review): the recorded cell output echoes this line, which looks like a pandas warning
# (possibly a DtypeWarning about mixed-type columns) — confirm and consider passing explicit dtypes.
verlander_games = pd.read_csv('verlander_games_to_2017.csv', index_col=0)

  verlander_games = pd.read_csv('verlander_games_to_2017.csv', index_col=0)


In [59]:
# Keep only rows whose 'pitcher.1' id is Verlander's.
verlander_pitches = verlander_games[verlander_games['pitcher.1'] == VERLANDER_ID]

# Drop PO pitches (and any other pitch type outside VERLANDER_PITCHES_LIST),
# then drop the per-pitch-type columns listed in DROP_PITCHES_LIST.
verlander_pitches = verlander_pitches[verlander_pitches['pitch_type'].isin(VERLANDER_PITCHES_LIST)]
verlander_pitches = verlander_pitches.drop(labels=DROP_PITCHES_LIST, axis=1)

# Explicit copies: both frames get new columns assigned below, and assigning onto a
# boolean-mask slice is chained assignment (SettingWithCopyWarning, ambiguous view/copy).
verlander_and_avila = verlander_pitches[verlander_pitches['fielder_2.1'] == AVILA_ID].copy()
# Restrict the Verlander-only data to dates up to the last Verlander/Avila row.
# NOTE(review): assumes rows are in chronological order, so iloc[-1] is the latest date — confirm.
verlander_pitches = verlander_pitches[verlander_pitches['game_date'] <= verlander_and_avila.iloc[-1]['game_date']].copy()

# compute_tendencies returns one more row than it has pitches (a leading all-zero row);
# dropping the final row aligns each pitch with the tendencies from the pitches before it.
verlander_pitches[PITCHER_PITCH_TEND + PITCHER_STRIKE_TEND] = compute_tendencies(verlander_pitches)[:-1]
verlander_and_avila[PITCHER_PITCH_TEND + PITCHER_STRIKE_TEND] = compute_tendencies(verlander_and_avila)[:-1]

In [None]:
# % Changeup / Total Pitches Thrown
# % Curveball / Total Pitches Thrown
# % 4-Seam Fastball / Total Pitches Thrown
# % Sinker / Total Pitches Thrown
# % Slider / Total Pitches Thrown
# % Changeup Strikes / Changeup Pitches
# % Curveball Strikes / Curveball Pitches
# % 4-Seam Fastball Strikes / 4-Seam Fastball Pitches
# % Sinker Strikes / Sinker Pitches
# % Slider Strikes / Slider Pitches

In [72]:
# Previous Pitch Type
# Previous Pitch Result

# Previous n pitches: (n = 5, 10, 20)
    
# - n_pitch_type / n

# Previous n pitch_types: (n = 5, 10, 20)

# - n_strikes / n

# Previous Pitch Result:
# Previous 5 Pitches: % Changeup
# Previous 5 Pitches: % Curveball
# Previous 5 Pitches: % Four-Seam Fastball
# Previous 5 Pitches: % Slider
# Previous 5 Pitches: % Sinker
# Previous 10 Pitches: % Changeup
# Previous 10 Pitches: % Curveball
# Previous 10 Pitches: % Four-Seam Fastball
# Previous 10 Pitches: % Slider
# Previous 10 Pitches: % Sinker
# Previous 20 Pitches: % Changeup
# Previous 20 Pitches: % Curveball
# Previous 20 Pitches: % Four-Seam Fastball
# Previous 20 Pitches: % Slider
# Previous 20 Pitches: % Sinker
# Previous 5 Changeups: % Strikes
# Previous 5 Curveball: % Strikes
# Previous 5 Four-Seam Fastball: % Strikes
# Previous 5 Slider: % Strikes
# Previous 5 Sinker: % Strikes

In [159]:
def compute_recursive_stats(pitch_data):
    """Build "previous pitch" and rolling-window features for every pitch.

    pitch_data: DataFrame with 'game_pk', 'pitch_type' (member names of
        VerlanderPitchType) and 'type' (pitch result code; 'S' is counted as
        a strike) columns, ordered so that consecutive rows of one game are
        adjacent.

    Returns ``(rows, prev_pitches, prev_pitch_results)``:
      * rows: list with one object-dtype array per pitch. Row k describes the
        pitch before pitch k:
        [prev type name, prev result, then for each window size in (5, 10, 20)
        and each pitch type: share of that type among the last <window>
        pitches, strike share among the last <window> pitches of that type].
        The first pitch of the data and the first pitch of each game get a
        blank row (None, None, zeros). An empty input yields an empty list.
      * prev_pitches / prev_pitch_results: the rolling-window state left over
        after the final iteration.
    """
    n = [5, 10, 20]
    n_types = len(VERLANDER_PITCHES_LIST)
    blank_row = np.concatenate([[None, None], np.zeros((2 * len(n)) * n_types)])

    def fresh_state():
        # Empty windows: last-j pitch types, and last-j results per pitch type.
        return ({j: [] for j in n},
                {j: [[] for _ in range(n_types)] for j in n})

    prev_pitches, prev_pitch_results = fresh_state()

    # No pitches means no rows; returning the seed row would not match a zero-row frame.
    if len(pitch_data) == 0:
        return [], prev_pitches, prev_pitch_results

    computed_data = [blank_row.copy()]

    # Pull the needed columns out once instead of building two row Series per iteration.
    game_ids = pitch_data['game_pk'].to_numpy()
    pitch_codes = pitch_data['pitch_type'].to_numpy()
    pitch_results = pitch_data['type'].to_numpy()

    for i in trange(len(pitch_data) - 1):
        if not (game_ids[i] == game_ids[i + 1]):
            # Pitch i is the last pitch of its game: the next pitch starts a new
            # game, so it gets a blank row and the windows start over.
            prev_pitches, prev_pitch_results = fresh_state()
            computed_data.append(blank_row.copy())
            continue

        pitch_type = VerlanderPitchType[pitch_codes[i]]
        pitch_result = pitch_results[i]
        row_data = [pitch_type.name, pitch_result]

        for j in n:
            window = prev_pitches[j]
            window.append(pitch_type)
            if len(window) > j:
                window.pop(0)

            type_results = prev_pitch_results[j][pitch_type.value - 1]
            type_results.append(pitch_result)
            if len(type_results) > j:
                type_results.pop(0)

            for pitch in VerlanderPitchType:
                # `window` is never empty here: the current pitch was just appended.
                pitch_pcnt = window.count(pitch) / len(window)
                outcomes = prev_pitch_results[j][pitch.value - 1]
                strike_pcnt = outcomes.count('S') / len(outcomes) if outcomes else 0.0
                row_data.append(pitch_pcnt)
                row_data.append(strike_pcnt)

        # dtype=object keeps the percentages as floats; a plain np.array() over
        # mixed str/float values would coerce every entry to a string.
        computed_data.append(np.array(row_data, dtype=object))

    return computed_data, prev_pitches, prev_pitch_results

In [144]:
# Window sizes of the rolling features. Defined here because the original loop
# iterated a global `n` that is not defined at module scope in any visible cell
# (NameError on a fresh kernel). Must match the window sizes used inside
# compute_recursive_stats.
RECURSIVE_WINDOWS = [5, 10, 20]

# Column names for the rows returned by compute_recursive_stats, in the same order
# the function emits values: previous type, previous result, then per window and
# per pitch type a (usage share, strike share) pair.
recursive_cols = ["PrevPitchType", "PrevPitchResult"]

for j in RECURSIVE_WINDOWS:
    for pitch in VerlanderPitchType:
        recursive_cols.append("Prev" + str(j) + "_Pcnt_" + pitch.name)
        recursive_cols.append("Prev" + str(j) + "_" + pitch.name + "_Strike")

In [170]:
# compute_recursive_stats returns the per-pitch feature rows plus the leftover
# rolling-window state; only the rows are used here.
v_and_a_recursive, prev_pitches, prev_pitch_results = compute_recursive_stats(verlander_and_avila)
# Attach the rows as new columns, one row per pitch, named by recursive_cols.
verlander_and_avila[recursive_cols] = v_and_a_recursive

100%|█████████████████████████████████████████████| 15044/15044 [00:05<00:00, 2728.82it/s]


In [173]:
# Same recursive features for the Verlander-only data; this rebinds the
# prev_pitches / prev_pitch_results names set by the previous cell.
v_only_recursive, prev_pitches, prev_pitch_results = compute_recursive_stats(verlander_pitches)
# Attach the rows as new columns, one row per pitch, named by recursive_cols.
verlander_pitches[recursive_cols] = v_only_recursive

100%|█████████████████████████████████████████████| 35152/35152 [00:16<00:00, 2192.62it/s]


In [180]:
# Earliest season in the Verlander-only data. Series.min() is vectorised and skips
# missing values, unlike the Python-level min() over a numpy array.
verlander_pitches['game_year'].min()

2008

In [174]:
# List every column of the Verlander/Avila frame, one name per line.
for column in verlander_and_avila.columns:
    print(column)

pitch_type
game_date
release_speed
release_pos_x
release_pos_z
player_name
batter
pitcher
events
description
spin_dir
spin_rate_deprecated
break_angle_deprecated
break_length_deprecated
zone
des
game_type
stand
p_throws
home_team
away_team
type
hit_location
bb_type
balls
strikes
game_year
pfx_x
pfx_z
plate_x
plate_z
on_3b
on_2b
on_1b
outs_when_up
inning
inning_topbot
hc_x
hc_y
tfs_deprecated
tfs_zulu_deprecated
fielder_2
umpire
sv_id
vx0
vy0
vz0
ax
ay
az
sz_top
sz_bot
hit_distance_sc
launch_speed
launch_angle
effective_speed
release_spin_rate
release_extension
game_pk
pitcher.1
fielder_2.1
fielder_3
fielder_4
fielder_5
fielder_6
fielder_7
fielder_8
fielder_9
release_pos_y
estimated_ba_using_speedangle
estimated_woba_using_speedangle
woba_value
woba_denom
babip_value
iso_value
launch_speed_angle
at_bat_number
pitch_number
pitch_name
home_score
away_score
bat_score
fld_score
post_away_score
post_home_score
post_bat_score
post_fld_score
if_fielding_alignment
of_fielding_alignment
spin_axi

In [186]:
# Inspect the distinct infield-alignment labels; clean_data one-hot encodes their lowercase forms.
verlander_pitches['if_fielding_alignment'].unique()

array([nan, 'Standard', 'Infield shift', 'Strategic'], dtype=object)

In [193]:
# Pitch-type codes, in the order their per-type columns appear in needed_columns.
_PITCH_CODES = ("FF", "CU", "CH", "SL", "SI")

# Ordered list of the columns kept in the final datasets.
needed_columns = (
    # Game situational data
    [
        "pitch_type", "month", "year", "inning", "inning_topbot", "outs",
        "strikes", "balls", "pitch_number", "on_1b", "on_2b", "on_3b",
        "score_diff", "of_std", "of_strat", "if_std", "if_strat", "if_shift",
    ]
    # Pitcher statistics
    + ["Pitcher_Tend_" + code for code in _PITCH_CODES]
    + ["Pitcher_Strike_Tend_" + code for code in _PITCH_CODES]
    # Recursive statistics: previous pitch, then rolling windows of 5, 10 and 20
    + ["PrevPitch_" + code for code in _PITCH_CODES]
    + ["PrevPitch_Strike", "PrevPitch_Ball", "PrevPitch_InPlay"]
    + [
        template.format(window, code)
        for window in (5, 10, 20)
        for code in _PITCH_CODES
        for template in ("Prev{0}_Pcnt_{1}", "Prev{0}_{1}_Strike")
    ]
    # Batter statistics
    + ["batter_stance"]
    + [
        "Last_At_Bat_" + outcome
        for outcome in ("Strikeout", "Out", "Homerun", "Triple", "Double",
                        "Single", "Walk", "Error", "Other")
    ]
    + ["Strike_Tend_" + code for code in _PITCH_CODES]
    + ["Overall_Strike_Tend"]
    + ["Slug_Avg_" + code for code in _PITCH_CODES]
    + ["Overall_Slug_Avg"]
    # Zones 1-9 and 11-14 (there is no zone 10)
    + ["Zone_{}_Strike_Pcnt".format(zone) for zone in list(range(1, 10)) + list(range(11, 15))]
)

In [195]:
# Lowercased 'inning_topbot' value -> integer code.
TOP_BOT_LOWERCASE_MAP = {'top': 0, 'bot': 1}

# Character offsets used to slice the month out of 'game_date' strings ([5:7]).
MONTH_START = 5
MONTH_STOP = 7
# Day offsets; not used by clean_data.
DAY_START = 8
DAY_STOP = 10

def clean_data(pitch_data, min_year, columns=None):
    """Convert raw pitch rows into the numeric feature table.

    pitch_data: DataFrame holding the raw columns read below (game_year, inning,
        balls, strikes, outs_when_up, bat_score, fld_score, inning_topbot,
        game_date, on_1b/on_2b/on_3b, stand, of/if_fielding_alignment,
        Result_Last_At_Bat, PrevPitchType, PrevPitchResult) plus every column
        named in `columns` that is not derived here. The input frame is not
        modified.
    min_year: value subtracted from game_year so the 'year' feature starts at 0.
    columns: ordered column names to return; defaults to the module-level
        `needed_columns`.

    Rows with balls >= 4 or strikes >= 3 are discarded.

    Returns a new DataFrame containing exactly `columns`, in that order.
    """
    if columns is None:
        columns = needed_columns

    # Work on a private copy so the caller's frame is never partially mutated.
    pitch_data = pitch_data.copy()

    # Convert game year to integer, and make the year 0-indexed starting at min_year.
    pitch_data['game_year'] = pitch_data['game_year'].astype("int") - min_year

    # Convert inning to integer.
    pitch_data['inning'] = pitch_data['inning'].astype("int")

    # Convert balls to integer, and only keep rows with a valid number of balls.
    # .copy() after each filter: later column assignments must not target a slice.
    pitch_data['balls'] = pitch_data['balls'].astype("int")
    pitch_data = pitch_data[pitch_data['balls'] < 4].copy()

    # Convert strikes to integer, and only keep rows with a valid number of strikes.
    pitch_data['strikes'] = pitch_data['strikes'].astype("int")
    pitch_data = pitch_data[pitch_data['strikes'] < 3].copy()

    # Convert number of outs to integer.
    pitch_data['outs_when_up'] = pitch_data['outs_when_up'].astype("int")

    # Convert score of each team to integer, and store fielding minus batting score.
    pitch_data['bat_score'] = pitch_data['bat_score'].astype("int")
    pitch_data['fld_score'] = pitch_data['fld_score'].astype("int")
    pitch_data['score_diff'] = pitch_data['fld_score'] - pitch_data['bat_score']

    # Lowercase top and bottom of inning, then map top and bottom to 0 and 1.
    pitch_data['inning_topbot'] = pitch_data['inning_topbot'].str.lower().map(TOP_BOT_LOWERCASE_MAP)

    # Store month from the game date.
    # NOTE(review): assumes game_date strings look like 'YYYY-MM-DD' — confirm.
    pitch_data['month'] = pitch_data['game_date'].str.slice(MONTH_START, MONTH_STOP).astype("int")

    # Each base becomes 0 if the source value is null (empty), 1 otherwise.
    for base in ('on_1b', 'on_2b', 'on_3b'):
        pitch_data[base] = (~pitch_data[base].isnull()).astype("int")

    # Batter's stance: 1 if 'R', 0 for anything else.
    pitch_data['stand'] = (pitch_data['stand'] == 'R').astype("int")

    print("Making alignments")
    # One hot vector for out-fielding alignments.
    of_alignment = pitch_data['of_fielding_alignment'].str.lower()
    for flag, label in (('of_std', 'standard'), ('of_strat', 'strategic')):
        pitch_data[flag] = (of_alignment == label).astype("int")

    # One hot vector for in-fielding alignments.
    if_alignment = pitch_data['if_fielding_alignment'].str.lower()
    for flag, label in (('if_std', 'standard'), ('if_strat', 'strategic'), ('if_shift', 'infield shift')):
        pitch_data[flag] = (if_alignment == label).astype("int")

    # One hot vector for the result of the last at-bat.
    last_at_bat_flags = (
        ("Last_At_Bat_Strikeout", 'strikeout'),
        ("Last_At_Bat_Out", 'out'),
        ("Last_At_Bat_Homerun", 'home_run'),
        ("Last_At_Bat_Triple", 'triple'),
        ("Last_At_Bat_Double", 'double'),
        ("Last_At_Bat_Single", 'single'),
        ("Last_At_Bat_Walk", 'walk'),
        ("Last_At_Bat_Error", 'error'),
        ("Last_At_Bat_Other", 'other'),
    )
    for flag, label in last_at_bat_flags:
        pitch_data[flag] = (pitch_data["Result_Last_At_Bat"] == label).astype("int")

    # One hot vector for "PrevPitchType".
    for code in ("FF", "CU", "CH", "SI", "SL"):
        pitch_data["PrevPitch_" + code] = (pitch_data['PrevPitchType'] == code).astype("int")

    # One hot vector for "PrevPitchResult".
    for flag, label in (("PrevPitch_Strike", "S"), ("PrevPitch_Ball", "B"), ("PrevPitch_InPlay", "X")):
        pitch_data[flag] = (pitch_data["PrevPitchResult"] == label).astype("int")

    print("Dropping columns")
    # Rename to the final feature names, then keep only the requested columns in order.
    pitch_data = pitch_data.rename(columns={"game_year": "year", "outs_when_up": "outs", "stand": "batter_stance"})
    pitch_data = pitch_data[list(columns)]

    return pitch_data

In [206]:
# Build the final feature tables from copies of the working frames.
# Each dataset passes its own earliest game_year as min_year, so 'year' is 0-indexed per dataset.
verlander_and_avila_final = clean_data(verlander_and_avila.copy(), min_year=min(verlander_and_avila['game_year']))
verlander_pitches_final = clean_data(verlander_pitches.copy(), min_year=min(verlander_pitches['game_year']))

Making alignments
Dropping columns
Making alignments
Dropping columns


In [211]:
# Persist both cleaned datasets to CSV in the working directory (the DataFrame index is written too).
verlander_and_avila_final.to_csv("VerlanderAndAvilaDataset.csv")
verlander_pitches_final.to_csv("VerlanderOnlyDataset.csv")

In [None]:
def prep_train_test_data(cleaned_data, dataset_name):
    """Split a cleaned dataset into train/test parts and write each part to CSV.

    cleaned_data: DataFrame whose 'pitch_type' column is the label; all other
        columns are used as features.
    dataset_name: prefix of the four output files written to the current
        directory (<name>_X_train.csv, <name>_X_test.csv, <name>_y_train.csv,
        <name>_y_test.csv).

    Uses a shuffled 80/20 split stratified on the label with a fixed
    random_state, prints the four shapes, and returns nothing.
    """
    feature_mask = cleaned_data.columns != 'pitch_type'
    y = cleaned_data["pitch_type"].to_numpy()
    X = cleaned_data.loc[:, feature_mask].to_numpy()

    # NOTE(review): train_test_split is not imported in the visible cells —
    # presumably sklearn.model_selection.train_test_split; confirm.
    splits = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

    print(*(part.shape for part in splits))

    # splits is ordered X_train, X_test, y_train, y_test; the suffixes follow that order.
    suffixes = ("_X_train.csv", "_X_test.csv", "_y_train.csv", "_y_test.csv")
    for part, suffix in zip(splits, suffixes):
        pd.DataFrame(part).to_csv("./" + dataset_name + suffix, index=False)

In [212]:
"./" + "VerlanderAndAvila" + "_X_train.csv"

'./VerlanderAndAvila_X_train.csv'