In [25]:
#import standard data analysis libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [26]:
import warnings

#silence every warning category for the rest of the session
#NOTE(review): this is a blanket filter, so pandas warnings are hidden too
warnings.simplefilter('ignore')

In [27]:
from functions import *

This section defines the functions that are used later in the notebook.

In [29]:
#main function dictating which functions to call for calculating player stats
def add_to_df_players(df_players, df_savant, at_bat_ids):
    """
    Build the player-season feature table by running each stat helper in turn.

    The function keeps only rows with a non-null 'player_mlb_id', adds
    'years_after_debut', then passes the frame through the helper functions
    (plate appearances / batters faced, fielding counts, play-event counts,
    lineup position, batting stats, innings pitched, fastball velocity,
    pitching stats, batters faced per outing, zone/chase rates, RBIs,
    expected batting average, wOBA / xwOBA, primary position and starter
    percentage). Finally it adds 'age', 'years_before_26' and
    'years_after_28'.

    Parameters:
    -----------
    df_players : pandas.DataFrame
        Player-season rows. Columns read directly by this function:
        'player_mlb_id', 'year', 'debut' (a string whose last four characters
        are the debut year) and 'birthYear'.
    df_savant : pandas.DataFrame
        Baseball Savant event-level data, handed to the helper functions.
    at_bat_ids :
        Passed unchanged to compute_player_pa_and_bf. In this notebook it is
        the second value returned by clean_savant_data; its structure is not
        visible from this function.

    Returns:
    --------
    pandas.DataFrame
        A new frame with the additional columns. The caller's df_players is
        left untouched because all work happens on an explicit copy.

    Raises:
    -------
    ValueError
        If a non-missing 'debut' value does not end in a four-digit year.
    """

    #remove players without an mlb Id
    #.copy() gives an independent frame, so the column assignments below never
    #write into a slice of the caller's data (no SettingWithCopyWarning)
    df_players = df_players.loc[df_players['player_mlb_id'].notna()].copy()

    #calculate years since a player debuted
    #to_numeric turns a missing debut into NaN instead of raising, while a
    #malformed (non-numeric) year still raises ValueError
    debut_year = pd.to_numeric(df_players['debut'].str[-4:])
    df_players['years_after_debut'] = df_players['year'] - debut_year

    #get the counts for the main dataframe
    df_players = compute_player_pa_and_bf(df_players, at_bat_ids, df_savant)

    #calculate where a player played in the field
    df_players = get_fielding_counts(df_players, df_savant)

    #count how many of each stat a player had on the year
    df_players = calculate_all_play_event_counts(df_players, df_savant)

    #calculate average lineup position
    df_players = calculate_lineup_position(df_players, df_savant)

    #calculate general batting stats
    df_players = calculate_batting_stats(df_players, df_savant)

    #innings pitched calculation
    df_players = calculate_innings_pitched(df_players, df_savant)

    #average fastball velocity
    df_players = fastball_velocity(df_players, df_savant)

    #calculate pitching stats
    df_players = calculate_pitching_stats(df_players)

    #add average batters faced per outing for pitchers
    df_players = calculate_batters_faced_in_game(df_players, df_savant)

    #find inzone + chase rate
    df_players = calculate_zone_chase_pct(df_players, df_savant)

    #calculate rbis
    df_players = calculate_rbis(df_players, df_savant)

    #calculate average expected batting avg for batters and pitchers
    df_players = calculate_average_exp_ba(df_players, df_savant, 'batter')
    df_players = calculate_average_exp_ba(df_players, df_savant, 'pitcher')

    #look at WOBA and xWOBA
    df_players = calculate_woba(df_players, df_savant, 'batter')
    df_players = calculate_woba(df_players, df_savant, 'pitcher')
    df_players = calculate_average_xwoba(df_players, df_savant, 'batter')
    df_players = calculate_average_xwoba(df_players, df_savant, 'pitcher')

    #get player position estimates
    df_players = primary_position(df_players)

    #get percentage of times a pitcher started
    df_players = add_sp_percentage(df_players, df_savant)

    #get player age
    df_players['age'] = df_players['year'] - df_players['birthYear']

    #years from 26-28 peak age performance (https://tht.fangraphs.com/how-do-baseball-players-age-part-1/)
    #clip(lower=0) keeps each distance at zero once the player is past / not yet at that bound
    df_players['years_before_26'] = (26 - df_players['age']).clip(lower=0)
    df_players['years_after_28'] = (df_players['age'] - 28).clip(lower=0)

    return df_players

The rest of the notebook runs the data pipeline using the functions defined above.

In [31]:
#load the two input tables from CSV files in the working directory
#event-level Savant export
df_savant_raw = pd.read_csv(filepath_or_buffer='savant_data_2021_2023.csv')
#Lahman people table
df_people = pd.read_csv(filepath_or_buffer='lahman_people.csv')

In [32]:
#build the player-season table: one copy of the people table per season,
#each tagged with its 'year', stacked under a fresh 0..n-1 index
df_players = pd.concat(
    (df_people.assign(year=season) for season in (2021, 2022, 2023)),
    ignore_index=True,
)

In [33]:
#clean the raw Savant data
#clean_savant_data is not defined in this notebook (presumably it comes from the star import of `functions` - confirm)
#it returns two values: the cleaned frame and the at-bat ids that add_to_df_players takes as its third argument
df_savant, at_bat_ids = clean_savant_data(df_savant_raw)

In [34]:
#add the stat columns to df_players based on the cleaned Savant data
#the result is stored under a new name, so df_players itself is not rebound by this cell
df_players_full = add_to_df_players(df_players, df_savant, at_bat_ids)

In [35]:
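# NOTE(review): this batting-split code is disabled and out of date.
# add_to_df_players now takes at_bat_ids as a third argument, so the two calls
# below must be updated before re-enabling. The filters are also applied to
# df_savant_raw rather than to the cleaned df_savant.

# #get savant data for batting splits
# df_savant_r = df_savant_raw[df_savant_raw['p_throws'] == 'R']
# df_savant_l = df_savant_raw[df_savant_raw['p_throws'] == 'L']

# #make df_players for each batting split
# df_players_r = add_to_df_players(df_players, df_savant_r)
# df_players_l = add_to_df_players(df_players, df_savant_l)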

In [36]:
#export the full player table to CSV for use in other .ipynb notebooks
#NOTE(review): to_csv is called without index=False, so the DataFrame index is written as the first column - confirm the downstream notebooks expect that
df_players_full.to_csv('df_players.csv')
#exports for the batting-split tables (disabled together with the split cell)
# df_players_r.to_csv('df_players_right.csv')
# df_players_l.to_csv('df_players_left.csv')

In [37]:
#list every column of the final table, one name per line
for column_name in df_players_full.columns:
    print(column_name)

player_mlb_id
playerID_LAHMAN
birthYear
birthMonth
birthDay
birthCountry
weight
height
bats
throws
debut
birthDate
year
years_after_debut
total_pa
total_bf
base_ended_inn
field_p
field_c
field_1b
field_2b
field_3b
field_ss
field_lf
field_cf
field_rf
batter_strikeout
pitcher_strikeout
batter_caught_stealing_3b
pitcher_caught_stealing_3b
batter_field_out
pitcher_field_out
batter_walk
pitcher_walk
batter_force_out
pitcher_force_out
batter_sac_fly
pitcher_sac_fly
batter_single
pitcher_single
batter_hit_by_pitch
pitcher_hit_by_pitch
batter_double
pitcher_double
batter_grounded_into_double_play
pitcher_grounded_into_double_play
batter_sac_bunt
pitcher_sac_bunt
batter_home_run
pitcher_home_run
batter_fielders_choice
pitcher_fielders_choice
batter_field_error
pitcher_field_error
batter_other_out
pitcher_other_out
batter_caught_stealing_2b
pitcher_caught_stealing_2b
batter_triple
pitcher_triple
batter_strikeout_double_play
pitcher_strikeout_double_play
batter_fielders_choice_out
pitcher_fielder