In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
#Let's get started by reading in the data

pitches = pd.read_csv('savant_pitch_level.csv')
pitch_sample = pitches.head()

In [19]:
#First I want to know each pitcher's mean fastball velocity and mean spin rate across all fastballs.
g = pitches.groupby('pitcher')

fb_velo_agg =  pd.DataFrame(g.apply(lambda x: x[x['pitch_type'] == "FF"]["release_speed"].mean())).reset_index().rename(columns={0:'mean_fb_velo'})
fb_spin_agg = pd.DataFrame(g.apply(lambda x: x[x['pitch_type'] == "FF"]["release_spin_rate"].mean())).reset_index().rename(columns={0:'mean_fb_spin'})

In [20]:
fb_velo_agg

Unnamed: 0,pitcher,mean_fb_velo
0,405395,
1,424144,88.780000
2,425794,87.850264
3,425844,89.157304
4,425877,81.333333
...,...,...
1378,695243,98.310651
1379,696136,92.133939
1380,696147,
1381,700363,94.460408


In [21]:
# Now I'm going to select the most thrown pitch that isn't a fastball to determine a pitcher's "secondary". 
# Yes some pitchers use an offspeed as their primary, but let's move past that for now.
pitch_freq = pitches.groupby(['player_name', 'pitch_type']).size().unstack(fill_value = 0)
secondaries = pitch_freq.drop('FF', axis = 1).reset_index()
secondary_pitches = []
for i, row in secondaries.drop('player_name', axis = 1).iterrows():
    max_pitch = row[row == max(row)].index[0]
    secondary_pitches.append(max_pitch)

df_secondary = secondaries.loc[:,['player_name']]
df_secondary['pitch_type'] = secondary_pitches
# Let's merge that information back to the main dataframe and we can then isolate for only rows that reflect preferred secondary pitchers.
pitches_secondary = pitches.merge(df_secondary, how = 'left', on = ['player_name', 'pitch_type'], indicator = True)

In [22]:
df_sec_analysis = pitches_secondary[pitches_secondary.loc[:, '_merge'] == "both"]

# Now that we have that dataframe with only secondary pitches, we can aggregate mean velo and spin rate very simply.
secondary_agg = df_sec_analysis.groupby('pitcher').agg({'release_speed': 'mean', 'release_spin_rate':'mean'}).reset_index()

In [23]:
df_fastball = pitches[pitches.loc[:, "pitch_type"] == "FF"]

In [24]:
secondary_agg.head()



Unnamed: 0,pitcher,release_speed,release_spin_rate
0,405395,62.373684,1627.736842
1,424144,88.689855,2093.0
2,425794,72.845766,2774.673567
3,425844,86.433891,1654.045837
4,425877,53.205,1475.95


In [25]:
# So what I was trying to do here is calculate the mean fastball velocity lost per pitch thrown, but I messed it up 
# because I am only counting fastballs thrown, not all pitches thrown.
# I think a more elegant way of capturing these pitch-over-pitch deltas in broad strokes is 
# by calculating the correlation between pitch number and velo, spin rate, etc. Will do later,
# or someone else can look into it in the meantime.

# Here's aggregating by correlation for fastballs

def calculate_correlation(df, column):
    df[column] = pd.to_numeric(df[column], errors='coerce')  # Ensure the pitch speed/velocity column is numeric
    return df[['pitch_number', column]].corr().iloc[0, 1]

# Group by pitcher and calculate the correlation across all games
def correlation_per_pitcher(df, column, pitch_type):
    correlation_results = df.groupby('pitcher').apply(lambda group: calculate_correlation(group, column)).reset_index()
    correlation_results.rename(columns={0: f'corr_pitch_{pitch_type}_{column}'}, inplace=True)  # Rename the 0 column to 'correlation'
    return correlation_results


In [26]:
df_fb_speed_agg = correlation_per_pitcher(df_fastball, "release_speed", 'fb')
df_fb_spin_agg = correlation_per_pitcher(df_fastball, "release_spin_rate", 'fb')
df_sec_speed_agg = correlation_per_pitcher(df_sec_analysis, "release_speed", 'sec')
df_sec_spin_agg = correlation_per_pitcher(df_sec_analysis, "release_spin_rate", 'sec')

In [29]:
def calculate_correlation_ab(df, column):
    df[column] = pd.to_numeric(df[column], errors='coerce')  # Ensure the pitch speed/velocity column is numeric
    return df[['pitcher_at_bat_number', column]].corr().iloc[0, 1]

# Group by pitcher and calculate the correlation across all games
def correlation_per_pitcher_ab(df, column, pitch_type):
    correlation_results = df.groupby('pitcher').apply(lambda group: calculate_correlation_ab(group, column)).reset_index()
    correlation_results.rename(columns={0: f'corr_at_bat_{pitch_type}_{column}'}, inplace=True)  # Rename the 0 column to 'correlation'
    return correlation_results

In [30]:
ab_fb_speed_agg = correlation_per_pitcher_ab(df_fastball, "release_speed", 'fb')
ab_fb_spin_agg = correlation_per_pitcher_ab(df_fastball, 'release_spin_rate', 'fb')
ab_sec_speed_agg = correlation_per_pitcher_ab(df_sec_analysis, 'release_speed', 'sec')
ab_sec_spin_agg = correlation_per_pitcher_ab(df_sec_analysis, 'release_spin_rate', 'sec')

In [31]:
fastball_metrics = df_fastball.groupby(['pitcher', 'game_pk']).agg(
    mean_velocity = ('release_speed', 'mean'),
    std_dev_velocity = ('release_speed', 'std')
).reset_index()

In [32]:
def calculate_mean_pitch_number_below_threshold(df, column_of_interest, occurrence = 1, pitch_type = "fb"):
    """
    Calculate the mean pitch number where a specified statistic (e.g., release speed or spin rate)
    for a pitcher's fastball falls below its mean minus one standard deviation in each appearance.
    
    Parameters:
    - df: DataFrame containing pitch data.
    - pitch_type_code: The code used to identify fastballs in the 'pitch_type' column.
    - column_of_interest: The column (e.g., 'release_speed' or 'spin_rate') to analyze.
    
    Returns:
    - DataFrame containing each pitcher and their mean pitch number to threshold or max for the specified statistic.
    """

        # Calculate mean and standard deviation of the column of interest per appearance
    stats_per_appearance = df.groupby(['pitcher', 'game_date']).agg(
        mean_stat=(column_of_interest, 'mean'),
        std_dev_stat=(column_of_interest, 'std')
    ).reset_index()

    # Merge the stats back to identify pitches below the threshold
    pitches_with_stats = pd.merge(df, stats_per_appearance, on=['pitcher', 'game_date'])

    # Calculate the threshold (mean - std dev)
    pitches_with_stats['stat_threshold'] = pitches_with_stats['mean_stat'] - pitches_with_stats['std_dev_stat']

    # Mark pitches below the threshold
    pitches_with_stats['below_threshold'] = pitches_with_stats[column_of_interest] < pitches_with_stats['stat_threshold']

    # Determine the Maximum Pitch Number for Each Pitcher's Appearance
    max_pitch_number_per_appearance = df.groupby(['pitcher', 'game_date'])['pitch_number_appearance'].max().reset_index()
    max_pitch_number_per_appearance.rename(columns={'pitch_number_appearance': 'max_pitch_number'}, inplace=True)

    # Filter rows below threshold
    below_threshold = pitches_with_stats[pitches_with_stats['below_threshold']]

    # Rank occurrences of below-threshold pitches
    below_threshold['occurrence_rank'] = below_threshold.groupby(['pitcher', 'game_date']).cumcount() + 1

    # Filter for the specified occurrence
    specific_occurrence = below_threshold[below_threshold['occurrence_rank'] == occurrence]

    # Merge to get the max pitch number where needed
    merged = pd.merge(max_pitch_number_per_appearance, specific_occurrence[['pitcher', 'game_date', 'pitch_number_appearance']], on=['pitcher', 'game_date'], how='left')
    merged['used_pitch_number'] = merged['pitch_number_appearance'].fillna(merged['max_pitch_number'])

    # Calculate the mean pitch number across appearances
    mean_pitch_number = merged.groupby('pitcher')['used_pitch_number'].mean().reset_index()
    mean_pitch_number.rename(columns={'used_pitch_number': f'{pitch_type}_{column_of_interest}_threshold_drop'}, inplace=True)

    mean_pitch_number_merged = pd.merge(mean_pitch_number, df[['player_name', 'pitcher']], on = 'pitcher', how = 'left').drop_duplicates()

    return mean_pitch_number_merged

In [33]:
fb_velo_threshold_agg = calculate_mean_pitch_number_below_threshold(df_fastball, "release_speed", 2)
fb_spinrate_threshold_agg = calculate_mean_pitch_number_below_threshold(df_fastball, "release_spin_rate", 2)
sec_velo_threshold_agg = calculate_mean_pitch_number_below_threshold(df_sec_analysis, "release_speed", 2, 'sec')
sec_spinrate_threshold_agg = calculate_mean_pitch_number_below_threshold(df_sec_analysis, "release_spin_rate", 2, 'sec')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  below_threshold['occurrence_rank'] = below_threshold.groupby(['pitcher', 'game_date']).cumcount() + 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  below_threshold['occurrence_rank'] = below_threshold.groupby(['pitcher', 'game_date']).cumcount() + 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  b

In [34]:
df_roles = (
    pitches.drop_duplicates(subset=['game_pk', 'pitcher']).
    groupby(['pitcher', 'role_key']).count().reset_index().
    sort_values(['pitcher', 'pitch_type'], ascending = False)
    )

df_roles = df_roles.loc[df_roles.groupby('pitcher')['pitch_type'].idxmax()].loc[:, ["pitcher", "role_key"]]

In [35]:
final_aggregated = (
    pd.merge(pd.DataFrame(fb_velo_agg), pd.DataFrame(fb_spin_agg)).
    merge(secondary_agg).
    merge(df_fb_speed_agg).merge(df_fb_spin_agg).
    merge(df_sec_speed_agg).merge(df_sec_spin_agg).
    merge(ab_fb_speed_agg).merge(ab_fb_spin_agg).
    merge(ab_sec_speed_agg).merge(ab_sec_spin_agg).
    merge(fb_velo_threshold_agg).merge(fb_spinrate_threshold_agg).
    merge(sec_velo_threshold_agg).merge(sec_spinrate_threshold_agg).
    merge(df_roles)
)


In [36]:
final_aggregated.sort_values('mean_fb_velo')

Unnamed: 0,pitcher,mean_fb_velo,mean_fb_spin,release_speed,release_spin_rate,corr_pitch_fb_release_speed,corr_pitch_fb_release_spin_rate,corr_pitch_sec_release_speed,corr_pitch_sec_release_spin_rate,corr_at_bat_fb_release_speed,corr_at_bat_fb_release_spin_rate,corr_at_bat_sec_release_speed,corr_at_bat_sec_release_spin_rate,fb_release_speed_threshold_drop,player_name,fb_release_spin_rate_threshold_drop,sec_release_speed_threshold_drop,sec_release_spin_rate_threshold_drop,role_key
455,621433,76.828571,1809.000000,50.850000,1145.518519,-0.540824,-0.341631,0.175152,0.283422,-0.527296,-0.504637,-0.306085,-0.231518,3.500000,"Phillips, Brett",3.500000,10.000000,9.800000,RP
3,425877,81.333333,2112.333333,53.205000,1475.950000,0.613899,0.718051,0.395689,0.297352,0.783505,0.460509,0.380885,0.306180,20.000000,"Molina, Yadier",7.000000,18.000000,19.500000,RP
601,643511,83.111940,1883.477612,82.899607,1905.257176,0.070035,0.133612,0.086905,0.027853,0.297542,0.094811,0.116102,0.064001,11.583333,"Rogers, Tyler",11.583333,11.060185,11.319444,RP
573,642232,84.337500,1999.375000,82.449063,2080.520750,-0.294046,-0.250825,0.009154,0.083898,-0.085625,-0.149661,-0.014209,0.021795,47.500000,"Yarbrough, Ryan",47.500000,44.729730,44.310811,SP
208,572383,84.358333,2060.833333,69.715385,2068.000000,-0.274725,-0.353918,-0.298038,-0.435137,-0.279213,-0.214298,0.479861,-0.184502,13.000000,"Moran, Brian",9.000000,5.000000,3.000000,RP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785,663855,100.314063,2345.937008,99.695078,2069.767150,0.253616,0.018086,0.245213,0.105145,0.071654,-0.211922,-0.022491,0.109956,16.173077,"Hicks, Jordan",17.307692,9.309091,9.709091,RP
1070,679885,100.534507,2242.225352,88.945614,1056.877193,0.356612,0.071333,0.367504,-0.038715,0.048325,0.146489,0.206181,0.016712,12.500000,"Martinez, Justin",7.300000,15.200000,21.500000,RP
1110,682842,100.705882,2248.294118,99.367823,2173.328076,0.172398,-0.085250,-0.039203,-0.069998,-0.299586,0.231721,0.087731,0.025705,14.833333,"Uribe, Abner",14.833333,8.937500,9.968750,RP
1149,690829,100.924845,2357.087500,87.552500,2741.700000,-0.091468,0.032184,-0.221434,-0.195336,0.235603,-0.012573,0.070545,0.145438,8.333333,"Joyce, Ben",11.583333,14.200000,15.300000,RP


In [37]:
final_aggregated.to_csv('aggregated_savant.csv')