#### Using both models because of missing data

In [49]:
import pandas as pd
import os
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import json

#### Data Load and Prepare 

In [50]:
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/Coding Projects/woba modeling/data')
df = pd.read_csv('pitch/pitch_cleaned.csv')

In [51]:
df['year'] = pd.to_datetime(df['year']).dt.year

In [52]:
df = df[df['description'] == 'hit_into_play']

In [53]:
df = (df.reset_index()).drop(columns=['index'])

#### Bat Tracking

In [54]:
with open(r'C:/Users/dalto/OneDrive/Pictures/Documents/Projects/Coding Projects/woba modeling/data/parameters/m2_bat_tracking.json', 'r') as f:
    bat_tracking_params = json.load(f)

In [55]:
bt_train = df[df['attack_direction'].notna()] 
bt_train = bt_train.dropna()

In [56]:
X = (bt_train[['attack_angle', 'launch_speed', 'attack_direction', 'zone', 'swing_path_tilt']])
y = bt_train['woba_value']
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=26) 

In [57]:
bt_models = {}
quantiles = [0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95]
for q in quantiles:
    quantile_model = lgb.LGBMRegressor(**bat_tracking_params[str(q)], alpha=q, random_state=26, n_jobs=-1)
    quantile_model.fit(x_train, y_train, 
                       eval_set=[(x_val, y_val)], 
                       callbacks=[lgb.early_stopping(stopping_rounds=75, verbose=False)])
    bt_models[q] = quantile_model

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001119 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1034
[LightGBM] [Info] Number of data points in the train set: 202071, number of used features: 5
[LightGBM] [Info] Start training from score 0.371120
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000579 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1034
[LightGBM] [Info] Number of data points in the train set: 202071, number of used features: 5
[LightGBM] [Info] Start training from score 0.371120
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bi

#### Ev Direction

In [58]:
ev_train = df[df['launch_speed'].notna()] # mcar 
ev_train = ev_train.dropna()

In [59]:
with open(r'C:/Users/dalto/OneDrive/Pictures/Documents/Projects/Coding Projects/woba modeling/data/parameters/ev_dir_params.json', 'r') as f:
    ev_dir_params = json.load(f)

In [60]:
X = (ev_train[['launch_speed', 'launch_angle']])
y = ev_train['woba_value']
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.10, random_state=26) 

In [61]:
models = {}
quantiles = [0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95]
for q in quantiles:
    quantile_model = lgb.LGBMRegressor(**ev_dir_params[str(q)], alpha=q, random_state=26, n_jobs=-1)
    quantile_model.fit(x_train, y_train, 
                       eval_set=[(x_val, y_val)], 
                       callbacks=[lgb.early_stopping(stopping_rounds=40, verbose=False)])
    models[q] = quantile_model

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000336 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 406
[LightGBM] [Info] Number of data points in the train set: 213957, number of used features: 2
[LightGBM] [Info] Start training from score 0.371348
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000314 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 406
[LightGBM] [Info] Number of data points in the train set: 213957, number of used features: 2
[LightGBM] [Info] Start training from score 0.371348
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000280 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough,

#### Predection

In [62]:
# Define the feature sets for each model type
model_1_features = ['attack_angle', 'launch_speed', 'attack_direction', 'zone', 'swing_path_tilt']
model_2_features = ['launch_speed', 'launch_angle']

mask = df[model_1_features].notna().all(axis=1)
full_predictions = pd.DataFrame(index=df.index)

for q in quantiles:
    pred_pre_impact = bt_models[q].predict(df.loc[mask, model_1_features])
    pred_post_impact = models[q].predict(df.loc[~mask, model_2_features])

    quantile_column = f'q_{q}'

    full_predictions.loc[mask, quantile_column] = pred_pre_impact
    full_predictions.loc[~mask, quantile_column] = pred_post_impact



In [63]:
full_predictions['name'] = df['batter']
full_predictions['year'] = df['year']
quantile_cols = sorted([col for col in full_predictions.columns if col.startswith('q_')])
full_predictions[quantile_cols] = np.sort(full_predictions[quantile_cols].values, axis=1)
full_predictions[quantile_cols] = full_predictions[quantile_cols].clip(lower=0)
full_predictions[quantile_cols] = full_predictions[quantile_cols].clip(upper=2.01775) # average hr woba over last 8 years
full_predictions = full_predictions.reset_index()
cols = ['name', 'year'] + [col for col in full_predictions.columns if col not in ['name', 'year', 'index']]
full_predictions = full_predictions[cols]

In [64]:
full_predictions.to_csv('quantile_predections/full_preds.csv')