In [1]:
import numpy as np
import pandas as pd
from scipy.stats import zscore

In [2]:
# Read the MLB pitching stats CSV into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
sp_stats_path = 'Resources/2018_2023_mlb_sp_stats.csv'
sp_pitch_data = pd.read_csv(sp_stats_path)

FileNotFoundError: [Errno 2] No such file or directory: 'Resources/2018_2023_mlb_sp_stats.csv'

In [None]:
# Display the frame exactly as read from the CSV, before any cleaning is applied.
sp_pitch_data

In [None]:
# Exclude the 2020 season: it was the pandemic year and only 60 games were played.
not_2020 = sp_pitch_data['Season'] != 2020

# Columns removed from the working frame.
# NOTE(review): the '.1'-suffixed names look like duplicated columns from the CSV export — confirm.
columns_to_drop = ['Name.1', 'Team.1', 'Season.1', 'NameASCII', 'MLBAMID']

sp_pitch_data = sp_pitch_data.loc[not_2020].drop(columns=columns_to_drop)

sp_pitch_data.head(n=5)


In [None]:
# Pitchers do not all throw the same pitches, which leaves gaps in the data;
# replace every remaining null value in the frame with 0.
sp_pitch_data = sp_pitch_data.fillna(value=0)

# Preview the cleaned and filled DataFrame
sp_pitch_data.head(n=5)


In [None]:
# Select every column whose label contains the literal text "(pi)"
has_pi_label = sp_pitch_data.columns.str.contains("(pi)", regex=False)
columns_to_drop = sp_pitch_data.columns[has_pi_label]

# Remove the selected columns from the frame
sp_pitch_data = sp_pitch_data.drop(columns=columns_to_drop)


In [None]:
# Seasons that feed the per-player averages (2018, 2019, 2021, 2022)
relevant_seasons = [2018, 2019, 2021, 2022]
in_relevant_season = sp_pitch_data['Season'].isin(relevant_seasons)
filtered_pitching_data = sp_pitch_data.loc[in_relevant_season]

# Rows for the 2023 season, which the averages are compared against
sp_data_2023 = sp_pitch_data.loc[sp_pitch_data['Season'] == 2023]

# Per-player mean of ERA, FIP and WHIP across the relevant seasons
rate_stats = ['ERA', 'FIP', 'WHIP']
grouped_data = filtered_pitching_data.groupby('PlayerId')
average_stats = grouped_data[rate_stats].mean()

# Pair each player's averages (suffix _avg) with that player's 2023 values (suffix _2023).
# This is an inner join, so only players present on both sides are kept.
stats_2023 = average_stats.merge(
    sp_data_2023[['PlayerId'] + rate_stats],
    on='PlayerId',
    suffixes=('_avg', '_2023'),
)

# Attach pitcher names: one row per distinct PlayerId/Name pair found in the relevant seasons
player_names = filtered_pitching_data[['PlayerId', 'Name']].drop_duplicates()
pitcher_output = player_names.merge(stats_2023, on='PlayerId')

# Left-join the per-player summary onto every row of sp_pitch_data to retain all columns
merged_data = sp_pitch_data.merge(pitcher_output, on=['PlayerId', 'Name'], how='left')

# Collapse to one row per PlayerId/Name pair by keeping the first occurrence.
# NOTE(review): the season-level columns of the surviving row come from whichever row is
# listed first for that player in sp_pitch_data — confirm that is the intended row.
merged_data = merged_data.drop_duplicates(subset=['PlayerId', 'Name'], keep='first')

# Display the merged result
merged_data.head()

In [None]:
# Write the merged frame to disk as UTF-8 CSV, without the index column
merged_data.to_csv('Resources/merged_data.csv', index=False, encoding="utf-8")

In [None]:
# Constant added to the 2023 columns before they are standardised.
# NOTE(review): this was introduced to avoid a zero standard deviation, but adding the same
# constant to every value leaves a column's spread unchanged — confirm whether it is needed.
epsilon = 1e-6


def _filled(column):
    """Return `merged_data[column]` with missing values replaced by 0."""
    # NOTE(review): rows with no value are scored as if the stat were 0, which also feeds
    # into the column mean/std used by zscore — confirm this is intended.
    return merged_data[column].fillna(0)


# Z-scores of the multi-season averages and of the (shifted) 2023 values for ERA, FIP and WHIP
z_scores_era_avg = zscore(_filled('ERA_avg'))
z_score_era_2023 = zscore(_filled('ERA_2023') + epsilon)
z_scores_fip_avg = zscore(_filled('FIP_avg'))
z_score_fip_2023 = zscore(_filled('FIP_2023') + epsilon)
z_scores_whip_avg = zscore(_filled('WHIP_avg'))
z_score_whip_2023 = zscore(_filled('WHIP_2023') + epsilon)

# Difference per stat: 2023 z-score minus average z-score
z_score_diff_era = z_score_era_2023 - z_scores_era_avg
z_score_diff_fip = z_score_fip_2023 - z_scores_fip_avg
z_score_diff_whip = z_score_whip_2023 - z_scores_whip_avg

# New columns, listed in the order they are appended to the DataFrame
z_score_columns = {
    'z_score_era_avg': z_scores_era_avg,
    'z_score_era_2023': z_score_era_2023,
    'z_score_diff_era': z_score_diff_era,
    'z_score_fip_avg': z_scores_fip_avg,
    'z_score_fip_2023': z_score_fip_2023,
    'z_score_diff_fip': z_score_diff_fip,
    'z_score_whip_avg': z_scores_whip_avg,
    'z_score_whip_2023': z_score_whip_2023,
    'z_score_diff_whip': z_score_diff_whip,
}
for column_name, column_values in z_score_columns.items():
    merged_data[column_name] = column_values

# Display the DataFrame
print(merged_data.head())


In [None]:
# Check standard deviations
print("Standard Deviations:")
print("ERA_avg:", merged_data['ERA_avg'].std())
print("FIP_avg:", merged_data['FIP_avg'].std())
print("WHIP_avg:", merged_data['WHIP_avg'].std())
print("ERA_2023:", merged_data['ERA_2023'].std())
print("FIP_2023:", merged_data['FIP_2023'].std())
print("WHIP_2023:", merged_data['WHIP_2023'].std())

In [None]:

# Dump the ERA z-score arrays for inspection
print("Intermediate Values:")
for label, scores in (
    ("z_scores_era_avg:", z_scores_era_avg),
    ("z_score_era_2023:", z_score_era_2023),
):
    print(label, scores)

In [None]:

# Dump the FIP z-score arrays for inspection
print("Intermediate Values:")
for label, scores in (
    ("z_scores_fip_avg:", z_scores_fip_avg),
    ("z_score_fip_2023:", z_score_fip_2023),
):
    print(label, scores)

In [None]:

# Dump the WHIP z-score arrays for inspection
print("Intermediate Values:")
for label, scores in (
    ("z_scores_whip_avg:", z_scores_whip_avg),
    ("z_score_whip_2023:", z_score_whip_2023),
):
    print(label, scores)

In [None]:
# Write the full pitcher frame (including the z-score columns) to CSV as UTF-8, without the index
merged_data.to_csv('Resources/full_pitcher_data.csv', index=False, encoding="utf-8")


In [None]:
# Preview the first five rows of the merged frame
merged_data.head(n=5)

In [None]:
# Build the ERA learning dataset from merged_data by removing:
#   - the player identifier columns,
#   - the raw 2023 stat columns,
#   - the z-score difference columns of the other two stats (FIP and WHIP).
identifier_columns = ['PlayerId', 'Name']
raw_2023_columns = ['ERA_2023', 'FIP_2023', 'WHIP_2023']
other_diff_columns = ['z_score_diff_fip', 'z_score_diff_whip']
columns_to_drop = identifier_columns + raw_2023_columns + other_diff_columns

for_learning_era = merged_data.drop(columns=columns_to_drop)
for_learning_era.head(n=5)


In [None]:
# Save ERA learning dataset to csv
for_learning_era.to_csv('full_era_learning.csv', encoding="utf-8", index=False)


In [None]:
# Create dataset for FIP learning.
# Built from merged_data, the frame that carries the z-score columns (pitcher_output has none).
# The z-score difference columns are named 'z_score_diff_<stat>'; dropping a label that does not
# exist raises KeyError, so the names must match exactly.
# Removed here: player identifiers, the raw 2023 stats, and the ERA/WHIP difference columns,
# which leaves 'z_score_diff_fip' in the dataset.
columns_to_drop = ['PlayerId', 'Name', 'ERA_2023', 'FIP_2023', 'WHIP_2023',
                   'z_score_diff_era', 'z_score_diff_whip']

for_learning_fip = merged_data.drop(columns=columns_to_drop)
for_learning_fip.head()


In [None]:
# Save FIP learning dataset to csv
for_learning_fip.to_csv('full_fip_learning.csv', encoding="utf-8", index=False)


In [None]:
# Create dataset for WHIP learning.
# Built from merged_data, the frame that carries the z-score columns (pitcher_output has none).
# The z-score difference columns are named 'z_score_diff_<stat>'; dropping a label that does not
# exist raises KeyError, so the names must match exactly.
# Removed here: player identifiers, the raw 2023 stats, and the ERA/FIP difference columns,
# which leaves 'z_score_diff_whip' in the dataset.
columns_to_drop = ['PlayerId', 'Name', 'ERA_2023', 'FIP_2023', 'WHIP_2023',
                   'z_score_diff_era', 'z_score_diff_fip']

for_learning_whip = merged_data.drop(columns=columns_to_drop)
for_learning_whip.head()


In [None]:
# Save WHIP learning dataset to csv
for_learning_whip.to_csv('full_whip_learning.csv', encoding="utf-8", index=False)
