In [1]:
pip install shap

Collecting shap
  Downloading shap-0.42.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/547.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/547.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.9/547.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7 (from shap)
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.42.1 slicer-0.0.7


In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
%matplotlib inline
import joblib
from itertools import product
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
import shap
from scipy.stats import zscore

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [3]:
# Load the MLB pitching stats export into a DataFrame; the path is relative
# to the notebook's working directory.
sp_pitch_data = pd.read_csv(filepath_or_buffer='2018_2023_mlb_sp_stats.csv')

In [4]:
# Preview the first five rows of the raw frame.
sp_pitch_data.head(n=5)

Unnamed: 0,Season,Name,Team,Name.1,Team.1,Season.1,Age,W,L,ERA,...,wOppTeamV,wNetPitV,TG,wOBA,OBP,SLG,wSB,NameASCII,PlayerId,MLBAMID
0,2023,Zack Greinke,KCR,Zack Greinke,KCR,2023,39,1,12,5.530121,...,0.221085,0.221085,132,,,,,Zack Greinke,1943,425844
1,2023,Clayton Kershaw,LAD,Clayton Kershaw,LAD,2023,35,11,4,2.515528,...,0.0,-0.26397,129,,,,,Clayton Kershaw,2036,477132
2,2023,Adam Wainwright,STL,Adam Wainwright,STL,2023,41,3,9,8.61207,...,0.124603,-0.299898,131,,,,,Adam Wainwright,2233,425794
3,2023,Corey Kluber,BOS,Corey Kluber,BOS,2023,37,2,6,6.264,...,0.0,0.0,131,,,,,Corey Kluber,2429,446372
4,2023,Lance Lynn,CHW,Lance Lynn,CHW,2023,36,6,9,6.467967,...,0.262619,0.262619,131,,,,,Lance Lynn,2520,458681


In [5]:
# Exclude the 2020 season: the pandemic-shortened schedule (only 60 games)
# is not comparable with the other seasons.
sp_pitch_data = sp_pitch_data[sp_pitch_data['Season'] != 2020]

# Drop 'Name.1', 'Team.1' and 'Season.1' (in the preview they repeat 'Name',
# 'Team' and 'Season') together with the 'NameASCII' and 'MLBAMID' columns.
columns_to_drop = ['Name.1', 'Team.1', 'Season.1', 'NameASCII', 'MLBAMID']

# errors='ignore' keeps this cell re-runnable: sp_pitch_data is overwritten
# here, so on a second run the columns are already gone and a plain drop
# would raise KeyError.
sp_pitch_data = sp_pitch_data.drop(columns=columns_to_drop, errors='ignore')

sp_pitch_data.head()


Unnamed: 0,Season,Name,Team,Age,W,L,ERA,G,GS,CG,...,ESV,wTeamV,wOppTeamV,wNetPitV,TG,wOBA,OBP,SLG,wSB,PlayerId
0,2023,Zack Greinke,KCR,39,1,12,5.530121,22,22,0,...,1.0,0.0,0.221085,0.221085,132,,,,,1943
1,2023,Clayton Kershaw,LAD,35,11,4,2.515528,19,19,0,...,0.0,-0.26397,0.0,-0.26397,129,,,,,2036
2,2023,Adam Wainwright,STL,41,3,9,8.61207,17,17,0,...,3.0,-0.424501,0.124603,-0.299898,131,,,,,2233
3,2023,Corey Kluber,BOS,37,2,6,6.264,9,9,0,...,0.0,0.0,0.0,0.0,131,,,,,2429
4,2023,Lance Lynn,CHW,36,6,9,6.467967,21,21,0,...,2.0,0.0,0.262619,0.262619,131,,,,,2520


In [6]:
# Replace every remaining null with 0. The motivation is that pitchers do not
# all throw the same pitch types, so pitch-specific columns have gaps.
# NOTE(review): this is a frame-wide fill, so non-pitch columns that were
# blank in the preview (wOBA, OBP, SLG, wSB) become 0.0 as well - confirm
# that is intended.
sp_pitch_data = sp_pitch_data.fillna(value=0)

# Show the first rows of the zero-filled frame.
sp_pitch_data.head(n=5)


Unnamed: 0,Season,Name,Team,Age,W,L,ERA,G,GS,CG,...,ESV,wTeamV,wOppTeamV,wNetPitV,TG,wOBA,OBP,SLG,wSB,PlayerId
0,2023,Zack Greinke,KCR,39,1,12,5.530121,22,22,0,...,1.0,0.0,0.221085,0.221085,132,0.0,0.0,0.0,0.0,1943
1,2023,Clayton Kershaw,LAD,35,11,4,2.515528,19,19,0,...,0.0,-0.26397,0.0,-0.26397,129,0.0,0.0,0.0,0.0,2036
2,2023,Adam Wainwright,STL,41,3,9,8.61207,17,17,0,...,3.0,-0.424501,0.124603,-0.299898,131,0.0,0.0,0.0,0.0,2233
3,2023,Corey Kluber,BOS,37,2,6,6.264,9,9,0,...,0.0,0.0,0.0,0.0,131,0.0,0.0,0.0,0.0,2429
4,2023,Lance Lynn,CHW,36,6,9,6.467967,21,21,0,...,2.0,0.0,0.262619,0.262619,131,0.0,0.0,0.0,0.0,2520


In [8]:
# Collect the labels of every column whose name contains the substring "(pi)".
columns_to_drop = sp_pitch_data.filter(like="(pi)", axis="columns").columns

# Remove those columns from the working frame.
sp_pitch_data = sp_pitch_data.drop(labels=columns_to_drop, axis="columns")


In [9]:
# List the column names that remain after the drops above.
headers = list(sp_pitch_data.columns)
print(headers)

['Season', 'Name', 'Team', 'Age', 'W', 'L', 'ERA', 'G', 'GS', 'CG', 'ShO', 'SV', 'BS', 'IP', 'TBF', 'H', 'R', 'ER', 'HR', 'BB', 'IBB', 'HBP', 'WP', 'BK', 'SO', 'GB', 'FB', 'LD', 'IFFB', 'Balls', 'Strikes', 'Pitches', 'RS', 'IFH', 'BU', 'BUH', 'K/9', 'BB/9', 'K/BB', 'H/9', 'HR/9', 'AVG', 'WHIP', 'BABIP', 'LOB%', 'FIP', 'GB/FB', 'LD%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'IFH%', 'BUH%', 'Starting', 'Start-IP', 'Relieving', 'Relief-IP', 'RAR', 'WAR', 'Dollars', 'tERA', 'xFIP', 'WPA', '-WPA', '+WPA', 'RE24', 'REW', 'pLI', 'inLI', 'gmLI', 'exLI', 'Pulls', 'WPA/LI', 'Clutch', 'FB%.1', 'FBv', 'SL%', 'SLv', 'CT%', 'CTv', 'CB%', 'CBv', 'CH%', 'CHv', 'SF%', 'SFv', 'KN%', 'KNv', 'XX%', 'PO%', 'wFB', 'wSL', 'wCT', 'wCB', 'wCH', 'wSF', 'wKN', 'wFB/C', 'wSL/C', 'wCT/C', 'wCB/C', 'wCH/C', 'wSF/C', 'wKN/C', 'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%', 'Contact%', 'Zone%', 'F-Strike%', 'SwStr%', 'HLD', 'SD', 'MD', 'ERA-', 'FIP-', 'xFIP-', 'K%', 'BB%', 'SIERA', 'RS/9', 'E-F', 'FA% (sc)', 'F

In [11]:
# Seasons that feed each pitcher's baseline averages.
relevant_seasons = [2018, 2019, 2021, 2022]
filtered_pitching_data = sp_pitch_data[sp_pitch_data['Season'].isin(relevant_seasons)]

# The 2023 rows supply the values the baseline is compared against.
sp_data_2023 = sp_pitch_data[sp_pitch_data['Season'] == 2023]

# Per-pitcher unweighted mean of ERA, FIP and WHIP over the baseline seasons
# (each season row counts equally, regardless of workload).
grouped_data = filtered_pitching_data.groupby('PlayerId')
average_stats = grouped_data[['ERA', 'FIP', 'WHIP']].mean()

# Inner-join the baseline means with the 2023 values; only pitchers present
# in both sets survive. Overlapping column names get '_avg' / '_2023'.
# NOTE(review): assumes one 2023 row per PlayerId - otherwise this join
# yields one output row per 2023 row. TODO confirm against the source data.
stats_2023 = pd.merge(
    average_stats,
    sp_data_2023[['PlayerId', 'ERA', 'FIP', 'WHIP']],
    on='PlayerId',
    suffixes=('_avg', '_2023'),
)

# Attach one display name per pitcher. De-duplicating on PlayerId alone
# (rather than on the PlayerId/Name pair) guarantees a single row per pitcher
# even if a name is spelled differently across seasons; the first name
# encountered is kept. Without this, such a pitcher would appear twice in
# pitcher_output with identical stats.
pitcher_output = pd.merge(
    filtered_pitching_data[['PlayerId', 'Name']].drop_duplicates(subset='PlayerId'),
    stats_2023,
    on='PlayerId'
)

# Display result
pitcher_output

Unnamed: 0,PlayerId,Name,ERA_avg,FIP_avg,WHIP_avg,ERA_2023,FIP_2023,WHIP_2023
0,1943,Zack Greinke,3.381562,3.780352,1.121997,5.530121,5.141063,1.274096
1,2036,Clayton Kershaw,2.902238,3.155674,1.012831,2.515528,3.755054,1.015528
2,2233,Adam Wainwright,3.854966,3.989132,1.307504,8.612070,6.024892,1.991379
3,2429,Corey Kluber,4.213119,3.648284,1.298954,6.264000,6.582582,1.536000
4,2520,Lance Lynn,4.011502,3.463603,1.283427,6.467967,5.209300,1.462396
...,...,...,...,...,...,...,...,...
271,17611,Taylor Clarke,5.875647,6.214009,1.507772,0.000000,2.270581,0.000000
272,18383,Michael Soroka,3.092941,3.151605,1.276123,5.468355,6.422480,1.518987
273,12447,Seth Lugo,3.913043,3.856084,1.304348,3.704268,3.746191,1.225610
274,14932,Ben Lively,6.845070,5.146347,1.859155,5.482234,5.752815,1.385787


In [12]:
# Standardise each metric across all pitchers in pitcher_output, once for the
# baseline average and once for the 2023 value.
z_scores_era_avg, z_score_era_2023 = (
    zscore(pitcher_output[source_column]) for source_column in ('ERA_avg', 'ERA_2023')
)
z_scores_fip_avg, z_score_fip_2023 = (
    zscore(pitcher_output[source_column]) for source_column in ('FIP_avg', 'FIP_2023')
)
z_scores_whip_avg, z_score_whip_2023 = (
    zscore(pitcher_output[source_column]) for source_column in ('WHIP_avg', 'WHIP_2023')
)

# Store the z-scores on the frame. The order below fixes the column order
# of the CSV files written later.
for z_column, z_values in (
    ('z_scores_era_avg', z_scores_era_avg),
    ('z_score_era_2023', z_score_era_2023),
    ('z_scores_fip_avg', z_scores_fip_avg),
    ('z_score_fip_2023', z_score_fip_2023),
    ('z_scores_whip_avg', z_scores_whip_avg),
    ('z_score_whip_2023', z_score_whip_2023),
):
    pitcher_output[z_column] = z_values

# Change in standing: 2023 z-score minus baseline z-score, per metric.
# A positive value means the 2023 z-score is higher than the baseline one.
pitcher_output['zscore_difference_era'] = z_score_era_2023 - z_scores_era_avg
pitcher_output['zscore_difference_fip'] = z_score_fip_2023 - z_scores_fip_avg
pitcher_output['zscore_difference_whip'] = z_score_whip_2023 - z_scores_whip_avg

# Show the augmented frame.
pitcher_output

Unnamed: 0,PlayerId,Name,ERA_avg,FIP_avg,WHIP_avg,ERA_2023,FIP_2023,WHIP_2023,z_scores_era_avg,z_score_era_2023,z_scores_fip_avg,z_score_fip_2023,z_scores_whip_avg,z_score_whip_2023,zscore_difference_era,zscore_difference_fip,zscore_difference_whip
0,1943,Zack Greinke,3.381562,3.780352,1.121997,5.530121,5.141063,1.274096,-0.397709,0.101183,-0.443933,0.117758,-0.544849,-0.237736,0.498892,0.561691,0.307114
1,2036,Clayton Kershaw,2.902238,3.155674,1.012831,2.515528,3.755054,1.015528,-0.501079,-0.491784,-0.830967,-0.337950,-0.728177,-0.589648,0.009295,0.493017,0.138529
2,2233,Adam Wainwright,3.854966,3.989132,1.307504,8.612070,6.024892,1.991379,-0.295615,0.707399,-0.314579,0.408353,-0.233318,0.738490,1.003014,0.722932,0.971807
3,2429,Corey Kluber,4.213119,3.648284,1.298954,6.264000,6.582582,1.536000,-0.218377,0.245536,-0.525760,0.591716,-0.247675,0.118716,0.463913,1.117476,0.366392
4,2520,Lance Lynn,4.011502,3.463603,1.283427,6.467967,5.209300,1.462396,-0.261857,0.285656,-0.640183,0.140193,-0.273752,0.018540,0.547513,0.780376,0.292292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271,17611,Taylor Clarke,5.875647,6.214009,1.507772,0.000000,2.270581,0.000000,0.140161,-0.986586,1.063895,-0.826031,0.103003,-1.971786,-1.126747,-1.889926,-2.074789
272,18383,Michael Soroka,3.092941,3.151605,1.276123,5.468355,6.422480,1.518987,-0.459952,0.089033,-0.833488,0.539076,-0.286018,0.095562,0.548986,1.372564,0.381580
273,12447,Seth Lugo,3.913043,3.856084,1.304348,3.704268,3.746191,1.225610,-0.283090,-0.257960,-0.397012,-0.340864,-0.238618,-0.303726,0.025130,0.056148,-0.065108
274,14932,Ben Lively,6.845070,5.146347,1.859155,5.482234,5.752815,1.385787,0.349226,0.091763,0.402400,0.318896,0.693098,-0.085724,-0.257462,-0.083504,-0.778823


In [13]:
# Persist the complete per-pitcher table (identifiers, raw values, z-scores
# and differences) as UTF-8 CSV without the row index.
pitcher_output.to_csv('full_pitcher_data.csv', index=False, encoding="utf-8")


In [14]:
# Assemble the ERA learning frame from pitcher_output by removing:
#   - the identifier columns,
#   - the raw 2023 values,
#   - the FIP and WHIP z-score differences.
# NOTE(review): zscore_difference_era is computed as
# z_score_era_2023 - z_scores_era_avg and both of those columns stay in this
# frame. If zscore_difference_era is the prediction target, they leak it -
# TODO confirm how the downstream model selects its features.
columns_to_drop = [
    'PlayerId', 'Name',
    'ERA_2023', 'FIP_2023', 'WHIP_2023',
    'zscore_difference_fip', 'zscore_difference_whip',
]

for_learning_era = pitcher_output.drop(labels=columns_to_drop, axis="columns")
for_learning_era.head(n=5)


Unnamed: 0,ERA_avg,FIP_avg,WHIP_avg,z_scores_era_avg,z_score_era_2023,z_scores_fip_avg,z_score_fip_2023,z_scores_whip_avg,z_score_whip_2023,zscore_difference_era
0,3.381562,3.780352,1.121997,-0.397709,0.101183,-0.443933,0.117758,-0.544849,-0.237736,0.498892
1,2.902238,3.155674,1.012831,-0.501079,-0.491784,-0.830967,-0.33795,-0.728177,-0.589648,0.009295
2,3.854966,3.989132,1.307504,-0.295615,0.707399,-0.314579,0.408353,-0.233318,0.73849,1.003014
3,4.213119,3.648284,1.298954,-0.218377,0.245536,-0.52576,0.591716,-0.247675,0.118716,0.463913
4,4.011502,3.463603,1.283427,-0.261857,0.285656,-0.640183,0.140193,-0.273752,0.01854,0.547513


In [15]:
# Write the ERA learning frame as UTF-8 CSV without the row index.
for_learning_era.to_csv('full_era_learning.csv', index=False, encoding="utf-8")


In [16]:
# Assemble the FIP learning frame from pitcher_output by removing:
#   - the identifier columns,
#   - the raw 2023 values,
#   - the ERA and WHIP z-score differences.
# NOTE(review): zscore_difference_fip is computed as
# z_score_fip_2023 - z_scores_fip_avg and both of those columns stay in this
# frame. If zscore_difference_fip is the prediction target, they leak it -
# TODO confirm how the downstream model selects its features.
columns_to_drop = [
    'PlayerId', 'Name',
    'ERA_2023', 'FIP_2023', 'WHIP_2023',
    'zscore_difference_era', 'zscore_difference_whip',
]

for_learning_fip = pitcher_output.drop(labels=columns_to_drop, axis="columns")
for_learning_fip.head(n=5)


Unnamed: 0,ERA_avg,FIP_avg,WHIP_avg,z_scores_era_avg,z_score_era_2023,z_scores_fip_avg,z_score_fip_2023,z_scores_whip_avg,z_score_whip_2023,zscore_difference_fip
0,3.381562,3.780352,1.121997,-0.397709,0.101183,-0.443933,0.117758,-0.544849,-0.237736,0.561691
1,2.902238,3.155674,1.012831,-0.501079,-0.491784,-0.830967,-0.33795,-0.728177,-0.589648,0.493017
2,3.854966,3.989132,1.307504,-0.295615,0.707399,-0.314579,0.408353,-0.233318,0.73849,0.722932
3,4.213119,3.648284,1.298954,-0.218377,0.245536,-0.52576,0.591716,-0.247675,0.118716,1.117476
4,4.011502,3.463603,1.283427,-0.261857,0.285656,-0.640183,0.140193,-0.273752,0.01854,0.780376


In [17]:
# Write the FIP learning frame as UTF-8 CSV without the row index.
for_learning_fip.to_csv('full_fip_learning.csv', index=False, encoding="utf-8")


In [18]:
# Assemble the WHIP learning frame from pitcher_output by removing:
#   - the identifier columns,
#   - the raw 2023 values,
#   - the ERA and FIP z-score differences.
# NOTE(review): zscore_difference_whip is computed as
# z_score_whip_2023 - z_scores_whip_avg and both of those columns stay in
# this frame. If zscore_difference_whip is the prediction target, they leak
# it - TODO confirm how the downstream model selects its features.
columns_to_drop = [
    'PlayerId', 'Name',
    'ERA_2023', 'FIP_2023', 'WHIP_2023',
    'zscore_difference_era', 'zscore_difference_fip',
]

for_learning_whip = pitcher_output.drop(labels=columns_to_drop, axis="columns")
for_learning_whip.head(n=5)


Unnamed: 0,ERA_avg,FIP_avg,WHIP_avg,z_scores_era_avg,z_score_era_2023,z_scores_fip_avg,z_score_fip_2023,z_scores_whip_avg,z_score_whip_2023,zscore_difference_whip
0,3.381562,3.780352,1.121997,-0.397709,0.101183,-0.443933,0.117758,-0.544849,-0.237736,0.307114
1,2.902238,3.155674,1.012831,-0.501079,-0.491784,-0.830967,-0.33795,-0.728177,-0.589648,0.138529
2,3.854966,3.989132,1.307504,-0.295615,0.707399,-0.314579,0.408353,-0.233318,0.73849,0.971807
3,4.213119,3.648284,1.298954,-0.218377,0.245536,-0.52576,0.591716,-0.247675,0.118716,0.366392
4,4.011502,3.463603,1.283427,-0.261857,0.285656,-0.640183,0.140193,-0.273752,0.01854,0.292292


In [19]:
# Write the WHIP learning frame as UTF-8 CSV without the row index.
for_learning_whip.to_csv('full_whip_learning.csv', index=False, encoding="utf-8")
