## Model Training

#### Load Packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import dirichlet
import plotly.express as px

#### Load Training Data

In [2]:
# Read the training set from disk. Judging by the preview, it appears to
# hold one row per pitch, keyed by batter, season and ball-strike count.
training_csv_path = "Data/full_training_data.csv"
train_data = pd.read_csv(training_csv_path)
# Show the first rows as a quick check of the columns that were loaded.
train_data.head()

Unnamed: 0,PITCH_CLASS,PITCH_CLASS_NAME,PITCH_TYPE,PITCH_NAME,PLAYER_NAME,BATTER_ID,BAT_SIDE,THROW_SIDE,GAME_YEAR,BALLS,STRIKES,PLATE_X,PLATE_Z
0,FB,Fastball,FF,4-Seam Fastball,"Betts, Mookie",605141,R,R,2021,0,0,0.07,4.16
1,FB,Fastball,FF,4-Seam Fastball,"Betts, Mookie",605141,R,R,2021,1,0,0.19,2.6
2,FB,Fastball,FF,4-Seam Fastball,"Betts, Mookie",605141,R,R,2021,1,1,-0.61,3.65
3,FB,Fastball,FF,4-Seam Fastball,"Betts, Mookie",605141,R,R,2021,2,1,0.52,3.95
4,FB,Fastball,FF,4-Seam Fastball,"Betts, Mookie",605141,R,R,2021,3,1,0.07,3.09


#### Get Counts Per Hierarchy

`prior_pitches_per_year_count` - the prior distribution (vector of alphas) that will be used as a Dirichlet prior for every year + count combination

`pitch_count_dist_by_player` - the observed proportion of pitches seen for each year + count + player combination

`dist_pitches_per_year` - the observed proportion of pitches in each year for every player

`dist_count_per_year` - the prior distribution (vector of alphas) that will be used as a Dirichlet prior for the proportion of pitches seen in each count for every year


In [3]:
def _count_rows(group_keys, count_column):
    """Count `train_data` rows for every combination of `group_keys`.

    Returns a flat frame with one row per key combination and the row
    count stored under `count_column`.
    """
    return train_data.groupby(group_keys).size().reset_index(name=count_column)


# Pitch-class frequency within each season + count (all batters pooled).
prior_pitches_per_year_count = _count_rows(
    ['GAME_YEAR', 'STRIKES', 'BALLS', 'PITCH_CLASS_NAME'], 'freq_pitch_class'
)
# Number of pitches each batter saw in each count, per season.
pitch_count_dist_by_player = _count_rows(
    ['GAME_YEAR', 'BATTER_ID', 'BALLS', 'STRIKES'], 'freq_count_seen'
)
# Number of pitches each batter saw per season.
dist_pitches_per_year = _count_rows(["BATTER_ID", "GAME_YEAR"], "freq")
# Number of pitches thrown in each count per season (all batters pooled).
dist_count_per_year = _count_rows(["GAME_YEAR", "BALLS", "STRIKES"], 'freq')

In [4]:
# Each season + count pitch-class prior is rescaled to this many
# pseudo-observations before rounding.
prior_scaling_factor = 100

# Share of a batter's pitches in a season that came in each count.
player_year_totals = (
    pitch_count_dist_by_player
    .groupby(['GAME_YEAR', 'BATTER_ID'])['freq_count_seen']
    .transform('sum')
)
pitch_count_dist_by_player['prop_count_seen'] = (
    pitch_count_dist_by_player['freq_count_seen'] / player_year_totals
)

# Pitch-class shares within each season + count, scaled and rounded to
# whole pseudo-counts.
year_count_totals = (
    prior_pitches_per_year_count
    .groupby(['GAME_YEAR', 'STRIKES', 'BALLS'])['freq_pitch_class']
    .transform('sum')
)
prior_pitches_per_year_count['prior_pitch_class_count'] = (
    prior_pitches_per_year_count['freq_pitch_class'] * prior_scaling_factor / year_count_totals
).round()

# Share of each batter's pitches that fall in each season.
batter_totals = dist_pitches_per_year.groupby("BATTER_ID")['freq'].transform('sum')
dist_pitches_per_year['normalized_freq'] = dist_pitches_per_year['freq'] / batter_totals

# Share of each season's pitches thrown in each count.
season_totals = dist_count_per_year.groupby(["GAME_YEAR"])['freq'].transform('sum')
dist_count_per_year['normalized_freq'] = dist_count_per_year['freq'] / season_totals

# Reshape to one row per season + count with one prior column per pitch class.
# NOTE(review): a pitch class missing from a season + count would come out of
# the pivot as NaN - confirm that cannot happen in the training data.
prior_pitches_per_year_count = (
    prior_pitches_per_year_count
    .pivot(
        index=['GAME_YEAR', 'BALLS', 'STRIKES'],
        columns='PITCH_CLASS_NAME',
        values='prior_pitch_class_count',
    )
    .reset_index()
    .rename(columns={
        'Breaking Ball': 'prior_breaking_ball',
        'Fastball': 'prior_fastball',
        'Off-speed': 'prior_off-speed',
    })
)

In [5]:
# Observed pitch-class counts for every batter + season + count, reshaped to
# one column per pitch class. A class that never occurs for a given row is
# recorded as 0 rather than NaN.
observed_column_names = {
    'Breaking Ball': 'obs_breaking_ball',
    'Fastball': 'obs_fastball',
    'Off-speed': 'obs_off-speed',
}
observed_pitches_per_year_count_player = (
    train_data
    .groupby(['BATTER_ID', 'GAME_YEAR', 'STRIKES', 'BALLS', 'PITCH_CLASS_NAME'])
    .size()
    .reset_index(name='count_pitch_class')
    .pivot(
        index=['BATTER_ID', 'GAME_YEAR', 'BALLS', 'STRIKES'],
        columns='PITCH_CLASS_NAME',
        values='count_pitch_class',
    )
    .reset_index()
    .rename(columns=observed_column_names)
    .fillna(0)
)

Ensure that we have 0 frequencies if a player never saw a pitch in a certain count in a given year

In [6]:
def _distinct_values(column):
    """One-column frame listing the distinct `column` values in the observed table."""
    distinct = observed_pitches_per_year_count_player[column].unique().tolist()
    return pd.DataFrame({column: distinct})


years = _distinct_values("GAME_YEAR")
balls = _distinct_values("BALLS")
strikes = _distinct_values("STRIKES")
players = _distinct_values("BATTER_ID")

# Build the full season x balls x strikes x batter grid through successive
# cross joins.
balls_strikes = pd.merge(balls, strikes, how="cross")
balls_strikes_years = pd.merge(years, balls_strikes, how="cross")
balls_strikes_years_players = pd.merge(balls_strikes_years, players, how="cross")

In [7]:
# Put the observed counts onto the full season x count x batter grid. After
# the left join, a grid row with no observed data has NaN in the obs_* columns.
player_count_grid = pd.merge(
    balls_strikes_years_players,
    observed_pitches_per_year_count_player,
    how="left",
    on=["GAME_YEAR", "BATTER_ID", "BALLS", "STRIKES"],
)

# Keep a batter-season only if at least one of its grid rows was actually
# observed; a batter-season that is NaN in every count is dropped. This is the
# same test as the previous "fewer than 12 NaN rows" rule when the grid holds
# 12 counts, but it does not hard-code the grid size. The counts a retained
# batter never saw are then set to 0.
observed_pitches_per_year_count_player = (
    player_count_grid
    .groupby(['BATTER_ID', 'GAME_YEAR'])
    .filter(lambda grp: bool(grp['obs_fastball'].notna().any()))
    .fillna(0)
)

In [8]:
# Attach the season + count prior pseudo-counts to every batter row. The
# right join keeps the row order of the prior table.
prior_observed_merged = pd.merge(
    observed_pitches_per_year_count_player,
    prior_pitches_per_year_count,
    on=['GAME_YEAR', 'BALLS', 'STRIKES'],
    how='right',
)

# Posterior value per pitch class = observed count + prior pseudo-count.
posterior_year_count = prior_observed_merged[['BATTER_ID', 'GAME_YEAR', 'BALLS', 'STRIKES']].copy()
for pitch_class in ('fastball', 'breaking_ball', 'off-speed'):
    posterior_year_count['posterior_' + pitch_class] = (
        prior_observed_merged['obs_' + pitch_class] + prior_observed_merged['prior_' + pitch_class]
    )

In [9]:
# Label every row with its count as a "balls-strikes" string, e.g. "3-2".
balls_label = pitch_count_dist_by_player['BALLS'].astype(str)
strikes_label = pitch_count_dist_by_player['STRIKES'].astype(str)
pitch_count_dist_by_player['batting_count'] = balls_label + "-" + strikes_label

In [10]:
# Reshape to one row per batter-season with one column per count label,
# holding the raw number of pitches seen in that count. A count the batter
# never saw becomes 0.
count_frequencies_wide = pitch_count_dist_by_player.pivot(
    index=["BATTER_ID", "GAME_YEAR"],
    columns="batting_count",
    values="freq_count_seen",
)
pitch_count_dist_by_player = count_frequencies_wide.reset_index().fillna(0)

In [11]:
def _count_columns_as_array(grp):
    """Return the per-count columns (names containing '-') of `grp` as a NumPy array."""
    count_columns = [col for col in grp.columns if '-' in col]
    return grp[count_columns].to_numpy()


# Series indexed by (GAME_YEAR, BATTER_ID); each value is that group's array
# of observed pitch counts per ball-strike count.
alphas_pitch_count = (
    pitch_count_dist_by_player
    .groupby(["GAME_YEAR", "BATTER_ID"])
    .apply(_count_columns_as_array, include_groups=False)
)

In [12]:
# Draw Monte Carlo samples of each batter-season's overall pitch mix.
#
# For every (season, batter):
#   1. build the Dirichlet parameters over ball-strike counts from the
#      league-wide count prior plus the batter's observed count frequencies,
#   2. sample the share of pitches seen in each count,
#   3. sample the pitch-class mix within each count from that count's
#      posterior parameters,
#   4. combine them: overall mix = sum over counts of
#      (share of count) * (pitch-class mix in that count).
#
# The per-batter frames are collected in a list and concatenated once.
# Growing a DataFrame with pd.concat inside the loop is quadratic, and seeding
# it with an empty frame relies on concat behaviour that pandas 2.1+ deprecates.
np.random.seed(2024)

n_draws = 1000                    # Monte Carlo draws per batter-season
n_counts = 12                     # ball-strike counts expected per batter-season
count_prior_scaling_factor = 100  # pseudo-observations in the count prior

# Loop-invariant column selections, hoisted out of the loop.
posterior_columns = [col for col in posterior_year_count.columns if "posterior" in col]
pct_columns = ["fastball_pct", "breaking_ball_pct", "offspeed_pct"]
output_columns = ['BATTER_ID', 'GAME_YEAR'] + pct_columns

sampled_frames = []
for index in alphas_pitch_count.index:
    year, player = index

    # Count distribution: league prior for the season + this batter's observations.
    prior_alphas = (
        dist_count_per_year.query("GAME_YEAR == @year")['normalized_freq'].to_numpy()
        * count_prior_scaling_factor
    ).round()
    posterior_count_alphas = prior_alphas + alphas_pitch_count[index]

    # One row of pitch-class posterior parameters per count for this batter-season.
    posterior_pitch_class = posterior_year_count.query(
        "GAME_YEAR == @year & BATTER_ID == @player"
    )[posterior_columns].to_numpy()

    # Stop here rather than print and continue: the mixture below pairs
    # count i with row i, so a size mismatch would give wrong numbers.
    if posterior_count_alphas.size != n_counts or posterior_pitch_class.shape[0] != n_counts:
        raise ValueError(
            f"Expected {n_counts} counts for (GAME_YEAR, BATTER_ID)={index}, got "
            f"{posterior_count_alphas.size} count alphas and "
            f"{posterior_pitch_class.shape[0]} pitch-class rows"
        )

    # Shape (n_draws, n_counts): sampled share of pitches seen in each count.
    sample_batters_counts = dirichlet.rvs(posterior_count_alphas.flatten(), n_draws)
    # Shape (n_counts, n_draws, n_classes): sampled pitch mix within each count.
    sample_pitch_class_given_count = np.apply_along_axis(
        lambda row: dirichlet.rvs(row, n_draws), 1, posterior_pitch_class
    )

    # Weight each count's pitch mix by that count's share, draw by draw.
    final = np.zeros([n_draws, len(pct_columns)])
    for i in range(n_counts):
        final += sample_batters_counts[:, i][:, np.newaxis] * sample_pitch_class_given_count[i]

    final_df = pd.DataFrame(final, columns=pct_columns)
    final_df['BATTER_ID'] = player
    final_df['GAME_YEAR'] = year
    sampled_frames.append(final_df[output_columns])

if sampled_frames:
    bootstrapped_posterior = pd.concat(sampled_frames, axis=0)
else:
    # No batter-seasons to sample: keep an empty frame with the expected columns.
    bootstrapped_posterior = pd.DataFrame(columns=output_columns)

Multiply each value from the bootstrapped distributions of pitch type by the proportion of pitches that a player saw for that year. Then sum all the values for a player corresponding to the index of the group. That is, sum the first row of the 2021 bootstrapped distribution for a player with the first row from the 2022 and 2023 bootstrapped distributions. We can do this because the bootstrapped distributions are approximately normal, and the sum of scaled normal random variables results in a normal distribution.

In [13]:
# Attach each batter-season's weight: the share of the batter's pitches that
# fall in that season.
season_weights = dist_pitches_per_year.drop(columns='freq')
bootstrapped_posterior_merged = pd.merge(
    bootstrapped_posterior, season_weights, on=['BATTER_ID', 'GAME_YEAR']
)

# Scale every draw by its season weight.
pitch_pct_columns = ['fastball_pct', 'breaking_ball_pct', 'offspeed_pct']
bootstrapped_posterior_combined = bootstrapped_posterior_merged.copy()
for pct_column in pitch_pct_columns:
    bootstrapped_posterior_combined[pct_column] = (
        bootstrapped_posterior_combined[pct_column]
        * bootstrapped_posterior_combined['normalized_freq']
    )

# Number the draws within each batter-season, then add draw k of every season
# together so each batter ends up with one combined set of draws.
bootstrapped_posterior_combined['row_index_in_group'] = (
    bootstrapped_posterior_combined.groupby(["BATTER_ID", "GAME_YEAR"]).cumcount()
)
bootstrapped_posterior_combined = (
    bootstrapped_posterior_combined
    .groupby(["BATTER_ID", "row_index_in_group"])[pitch_pct_columns]
    .sum()
    .reset_index()
    .drop(columns='row_index_in_group')
)

Get the mean and standard deviation of each batter's combined bootstrap distribution. The means are our predicted values for the proportion of each pitch type that a batter will see in 2024.

In [39]:
# Mean and standard deviation of each batter's combined draws, written
# straight into flat column names through named aggregation.
bootstrapped_posterior_summarized = (
    bootstrapped_posterior_combined
    .groupby(['BATTER_ID'])
    .agg(
        mean_fastball=('fastball_pct', 'mean'),
        sd_fastball=('fastball_pct', 'std'),
        mean_breaking_ball=('breaking_ball_pct', 'mean'),
        sd_breaking_ball=('breaking_ball_pct', 'std'),
        mean_offspeed=('offspeed_pct', 'mean'),
        sd_offspeed=('offspeed_pct', 'std'),
    )
    .reset_index()
)
# Store batter ids as integers.
bootstrapped_posterior_summarized['BATTER_ID'] = bootstrapped_posterior_summarized['BATTER_ID'].astype(int)

Write the predictions to the `Data/predictions.csv` file.

In [40]:
# Load the predictions file; a later cell overwrites this same path with the
# filled-in predictions.
predictions_csv_path = "Data/predictions.csv"
predictions = pd.read_csv(predictions_csv_path)

In [41]:
# Join the posterior means onto the predictions file, keep only the id columns
# plus the means, and give the means their output column names.
# NOTE(review): this is an inner join, so any batter in the predictions file
# without a row in the summary is dropped from the output - confirm intended.
mean_columns = [col for col in bootstrapped_posterior_summarized.columns if "mean" in col]
predictions_with_means = pd.merge(predictions, bootstrapped_posterior_summarized, on='BATTER_ID')
predictions = predictions_with_means[['BATTER_ID', 'PLAYER_NAME', 'GAME_YEAR'] + mean_columns]
predictions = predictions.rename(columns={
    'mean_fastball': 'PITCH_TYPE_FB',
    'mean_breaking_ball': 'PITCH_TYPE_BB',
    'mean_offspeed': 'PITCH_TYPE_OS',
})
# Overwrites the file that `predictions` was read from.
predictions.to_csv("Data/predictions.csv", index=False)

Also write the predictions with their uncertainty (standard-deviation).

In [42]:
# Save the per-batter means together with their standard deviations.
bootstrapped_posterior_summarized.to_csv(
    path_or_buf="Data/predictions_with_std.csv",
    index=False,
)

Replicate bootstrapping for each player + count combination. This will be used for the interactive dashboard

In [29]:
# Draw Monte Carlo samples of the pitch-class mix for every
# batter + season + count row of `posterior_year_count`.
#
# The per-row frames are collected in a list and concatenated once. The empty
# placeholder frame that used to be created before the loop was never read
# (it was overwritten by the final concat), so it has been removed.
np.random.seed(2024)

n_draws = 1000  # Monte Carlo draws per batter + season + count
pct_columns = ["fastball_pct", 'breaking_ball_pct', 'offspeed_pct']

dataframes = []
for index, row in posterior_year_count.iterrows():
    # Dirichlet parameters in the same order as `pct_columns`.
    sample = dirichlet.rvs(
        [row['posterior_fastball'], row['posterior_breaking_ball'], row['posterior_off-speed']],
        n_draws,
    )
    final_df = pd.DataFrame(sample, columns=pct_columns)
    # Tag every draw with the keys of the row it was sampled from.
    final_df['BATTER_ID'] = row['BATTER_ID']
    final_df['GAME_YEAR'] = row['GAME_YEAR']
    final_df['BALLS'] = row['BALLS']
    final_df['STRIKES'] = row['STRIKES']
    dataframes.append(final_df)
bootstrapped_posterior_by_count = pd.concat(dataframes, axis=0)

In [34]:
# Attach each batter-season's weight (the share of the batter's pitches that
# fall in that season) to the per-count draws.
bootstrapped_posterior_by_count_merged = pd.merge(
    bootstrapped_posterior_by_count,
    dist_pitches_per_year.drop(columns='freq'),
    on=['BATTER_ID', 'GAME_YEAR'],
)

# Scale every draw by its season weight.
bootstrapped_posterior_by_count_combined = bootstrapped_posterior_by_count_merged.copy()
by_count_weight = bootstrapped_posterior_by_count_combined['normalized_freq']
for by_count_pct_column in ('fastball_pct', 'breaking_ball_pct', 'offspeed_pct'):
    bootstrapped_posterior_by_count_combined[by_count_pct_column] = (
        bootstrapped_posterior_by_count_combined[by_count_pct_column] * by_count_weight
    )

# Number the draws within each batter + season + count, then add draw k of
# every season together for each batter + count.
bootstrapped_posterior_by_count_combined['row_index_in_group'] = (
    bootstrapped_posterior_by_count_combined
    .groupby(["BATTER_ID", "GAME_YEAR", "BALLS", "STRIKES"])
    .cumcount()
)
bootstrapped_posterior_by_count_combined = (
    bootstrapped_posterior_by_count_combined
    .groupby(["BATTER_ID", "BALLS", "STRIKES", "row_index_in_group"])[['fastball_pct', 'breaking_ball_pct', 'offspeed_pct']]
    .sum()
    .reset_index()
    .drop(columns='row_index_in_group')
)

In [61]:
# Mean and standard deviation of the combined draws for every batter + count,
# written straight into flat column names through named aggregation.
bootstrapped_posterior_by_count_summarized = (
    bootstrapped_posterior_by_count_combined
    .groupby(['BATTER_ID', 'BALLS', 'STRIKES'])
    .agg(
        mean_fastball=('fastball_pct', 'mean'),
        sd_fastball=('fastball_pct', 'std'),
        mean_breaking_ball=('breaking_ball_pct', 'mean'),
        sd_breaking_ball=('breaking_ball_pct', 'std'),
        mean_offspeed=('offspeed_pct', 'mean'),
        sd_offspeed=('offspeed_pct', 'std'),
    )
    .reset_index()
)
# Store batter ids as integers.
bootstrapped_posterior_by_count_summarized['BATTER_ID'] = bootstrapped_posterior_by_count_summarized['BATTER_ID'].astype(int)

In [62]:
# Add player names from the predictions table to the per-count summary, fix
# the output column order, and save it.
by_count_output_columns = [
    'BATTER_ID', 'PLAYER_NAME', 'BALLS', 'STRIKES',
    'mean_fastball', 'sd_fastball',
    'mean_breaking_ball', 'sd_breaking_ball',
    'mean_offspeed', 'sd_offspeed',
]
player_names = predictions[['BATTER_ID', 'PLAYER_NAME']]
bootstrapped_posterior_by_count_summarized = pd.merge(
    player_names, bootstrapped_posterior_by_count_summarized, on='BATTER_ID'
)[by_count_output_columns]
bootstrapped_posterior_by_count_summarized.to_csv("Data/predictions_by_count.csv", index=False)