In [38]:
import pandas as pd
import json
from scipy import stats
from sklearn.metrics import mean_squared_error, accuracy_score

In [7]:
# Input locations, relative to the notebook's working directory.
# raw_data_path: the raw CSV handed to DataProcessor below.
# config_path: the JSON column configuration handed to DataProcessor below.
raw_data_path = 'data/statcast_2015-2024.csv'
config_path = 'data/config.json'

In [8]:
from DataProc.DataProcessor import DataProcessor

# Build the project's DataProcessor from the raw CSV and the column config.
# Construction already reads the CSV (see the read_csv warning in this cell's output).
# NOTE(review): one_hot=False presumably keeps categorical columns un-encoded — confirm in DataProcessor.
processor = DataProcessor(
    raw_data_path,
    config_path,
    one_hot=False,
)


  self.raw_data = pd.read_csv(raw_data_path)


Selected Columns: ['events', 'launch_speed', 'pitch_type', 'game_date', 'release_speed', 'release_pos_x', 'release_pos_z', 'batter', 'pitcher', 'stand', 'p_throws', 'hit_location', 'balls', 'strikes', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'outs_when_up', 'hc_x', 'hc_y', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot', 'launch_angle', 'release_spin_rate', 'release_extension', 'game_pk', 'release_pos_y', 'at_bat_number', 'batter_name', 'pitcher_name']
Raw Data Shape: (6653226, 87)


In [None]:
# Fetch the processed frame from the DataProcessor; every later cell works from this.
# NOTE(review): the numeric columns shown by .head() look standardized — confirm in DataProcessor.
processed_data = processor.get_processed_data()

In [17]:
# Peek at the first five processed rows to sanity-check columns and scaling.
processed_data.head(n=5)

Unnamed: 0,events,launch_speed,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,batter,pitcher,stand,...,sz_top,sz_bot,launch_angle,release_spin_rate,release_extension,game_pk,release_pos_y,at_bat_number,batter_name,pitcher_name
0,strikeout,-0.627173,ST,2024-06-30,-1.056521,-1.498172,-2.80244,680869,623149,R,...,0.302539,-0.679867,-0.606538,0.692401,0.440371,747182,-0.5358,72,zack gelof,paul sewald
1,S,1.645207,ST,2024-06-30,-1.089609,-1.53509,-2.936805,680869,623149,R,...,0.451288,-0.194194,0.685068,0.905474,0.633444,747182,-0.753158,72,zack gelof,paul sewald
2,S,1.081636,FF,2024-06-30,0.333173,-1.455979,-2.5721,680869,623149,R,...,0.451288,-0.194194,2.231081,0.851456,0.826517,747182,-0.861837,72,zack gelof,paul sewald
3,B,-0.627173,ST,2024-06-30,-0.891082,-1.503446,-2.84083,680869,623149,R,...,-0.292455,-1.327432,-0.606538,0.668393,0.633444,747182,-0.79663,72,zack gelof,paul sewald
4,S,1.668473,FF,2024-06-30,0.184277,-1.44543,-2.744855,680869,623149,R,...,0.451288,-0.194194,2.094093,1.142555,0.826517,747182,-0.970516,72,zack gelof,paul sewald


In [15]:
# Chronological hold-out: rows dated before the cutoff form the training split,
# rows on or after it form the test split.
# NOTE(review): the comparison is against a string literal, so this assumes
# game_date holds ISO-formatted (YYYY-MM-DD) values — confirm.
split_cutoff = "2023-04-01"
game_dates = processed_data['game_date']

train_data = processed_data[game_dates < split_cutoff]
test_data = processed_data[game_dates >= split_cutoff]

In [16]:
# Show (rows, columns) for the training split, then the test split.
split_shapes = (train_data.shape, test_data.shape)
print(*split_shapes)

(5438614, 36) (1110231, 36)


In [14]:
# Labels are the config entries whose settings carry a truthy 'label' flag.
label_columns = [name for name, spec in processor.config.items() if spec.get('label', False)]

# Labels scored with accuracy; every other label is treated as numerical.
categorical_columns = ['events', 'hit_location']
numerical_columns = [name for name in label_columns if name not in categorical_columns]

# Print the three column groups
for heading, columns in (
    ("Label columns:", label_columns),
    ("Categorical columns:", categorical_columns),
    ("Numerical columns:", numerical_columns),
):
    print(heading, columns)

Label columns: ['events', 'launch_speed', 'hit_location', 'hc_x', 'hc_y', 'launch_angle']
Categorical columns: ['events', 'hit_location']
Numerical columns: ['launch_speed', 'hc_x', 'hc_y', 'launch_angle']


In [20]:
def _most_frequent(values):
    """Return the most common non-null value in `values`, or NaN if there is none.

    `Series.mode()` ignores nulls by default and comes back empty for an
    all-null group, so indexing it with `[0]` would raise. That case is
    guarded here. Ties resolve to the first entry of `mode()`, as before.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else float("nan")


# Per-batter baseline from the training split: the mean of each numerical label
# and the most frequent value of each categorical label.
player_stats_numerical = train_data.groupby('batter')[numerical_columns].mean().reset_index()
player_stats_categorical = train_data.groupby('batter')[categorical_columns].agg(_most_frequent).reset_index()


In [26]:
# Combine the numerical and categorical per-batter baselines into one table keyed by batter.
player_stats = pd.merge(player_stats_numerical, player_stats_categorical, on='batter')
player_stats.head()

Unnamed: 0,batter,launch_speed,hc_x,hc_y,launch_angle,events,hit_location
0,112526,-0.31079,-0.107899,-0.022762,-0.321052,S,0.0
1,115629,-0.627173,-0.43026,-0.426236,-0.606538,S,0.0
2,116338,-0.075254,0.063978,0.084884,-0.119328,S,0.0
3,120074,-0.08258,0.072284,-0.016393,-0.109503,B,0.0
4,121347,-0.179386,-0.089969,-0.081154,-0.212362,S,0.0


In [32]:
# Attach each batter's training-set baseline to the test rows. Label columns coming
# from player_stats collide with the test columns and therefore get the '_pred' suffix.
baseline_labels = [col for col in player_stats.columns if col != 'batter']
scored_columns = baseline_labels + [f"{col}_pred" for col in baseline_labels]

test_data_with_stats = (
    test_data
    .merge(player_stats, on='batter', how='left', suffixes=('', '_pred'))
    # Some batters in the test data were not present in training, so their
    # predictions are NaN: drop those rows. Restrict the check to the columns
    # that are actually scored, so a NaN in an unrelated feature column does
    # not silently shrink the evaluation set.
    .dropna(subset=scored_columns)
)
test_data_with_stats.head()

Unnamed: 0,events,launch_speed,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,batter,pitcher,stand,...,release_pos_y,at_bat_number,batter_name,pitcher_name,launch_speed_pred,hc_x_pred,hc_y_pred,launch_angle_pred,events_pred,hit_location_pred
6,field_out,-0.627173,ST,2024-06-30,-1.122697,-1.503446,-2.860025,663527,623149,R,...,-0.405385,71,tyler nevin,paul sewald,-0.00492,-0.067907,-0.069658,0.021228,S,0.0
7,S,1.172117,FF,2024-06-30,0.300085,-1.519268,-2.956,663527,623149,R,...,-0.818366,71,tyler nevin,paul sewald,-0.00492,-0.067907,-0.069658,0.021228,S,0.0
8,S,-0.627173,ST,2024-06-30,-1.106153,-1.482349,-2.61049,663527,623149,R,...,-0.557536,71,tyler nevin,paul sewald,-0.00492,-0.067907,-0.069658,0.021228,S,0.0
9,strikeout,-0.627173,FF,2024-06-30,0.333173,-1.455979,-2.744855,668832,623149,L,...,-0.601008,70,kyle mccann,paul sewald,-0.094625,0.16912,0.368802,-0.072562,S,0.0
10,B,-0.627173,CH,2024-06-30,-0.725642,-1.382141,-2.64888,668832,623149,L,...,-0.557536,70,kyle mccann,paul sewald,-0.094625,0.16912,0.368802,-0.072562,S,0.0


In [36]:
# Mean squared error of the per-batter mean baseline, one entry per numerical label.
mse_scores_numerical = {}
for target in numerical_columns:
    observed = test_data_with_stats[target]
    baseline = test_data_with_stats[f"{target}_pred"]
    mse_scores_numerical[target] = mean_squared_error(observed, baseline)

In [37]:
# Report the baseline MSE for each numerical label
mse_heading = "MSE Scores (Numerical):"
print(mse_heading, mse_scores_numerical)

MSE Scores (Numerical): {'launch_speed': 1.0703136880195878, 'hc_x': 1.0113056556892088, 'hc_y': 0.9991666843376507, 'launch_angle': 1.1157709071773647}


In [40]:
# Accuracy of the per-batter most-frequent-value baseline, one entry per categorical label.
accuracy_scores_categorical = {}
for target in categorical_columns:
    observed = test_data_with_stats[target]
    baseline = test_data_with_stats[f"{target}_pred"]
    accuracy_scores_categorical[target] = accuracy_score(observed, baseline)

# Report the baseline accuracy for each categorical label
print("Accuracy Scores (Categorical):", accuracy_scores_categorical)

Accuracy Scores (Categorical): {'events': 0.4055333770911234, 'hit_location': 0.7760943011366489}
