### Using xgb to model wobacon

In [1]:
import pandas as pd
import os
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, PredefinedSplit
from sklearn.metrics import mean_squared_error

In [2]:
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/Coding Projects/woba modeling/data/')
df = pd.read_csv('pitch_cleaned.csv').drop(columns=['Unnamed: 0'])

##### Cleaning for Task

In [3]:
df = df[df['description'] == 'hit_into_play']
df = df[df['attack_direction'].notna()] # mar, but higher substinally higher avg woba on missing. going to adress with a launch angle, launch speed model
df = df[df['launch_speed'].notna()] # mcar 

In [4]:
df['zone'] = df['zone'].astype('category')
df = df.dropna()

In [5]:
X = (df[['zone', 'launch_speed', 'swing_path_tilt', 'attack_angle', 'attack_direction']])
y = df['woba_value']

##### Train Test Val

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=26)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.125, random_state=26) 

##### Hyper Parameters Tuning

In [None]:
model = xgb.XGBRegressor(random_state=26, n_jobs=-1, metric=['mae', 'rmse'])

In [8]:
rnd_search_params = {
    'learning_rate': np.linspace(0.005, 0.2, 10),          
    'num_leaves': np.linspace(2, 100, 10, dtype=int),
    'max_depth': np.linspace(1, 15, 20, dtype=int),  
    'min_data_in_leaf': np.linspace(1, 30, 10, dtype=int),         
    'subsample': np.linspace(0.4, 0.9, 7),               
    'max_bin': np.linspace(200, 600, 10, dtype=int),
    'colsample_bytree': np.linspace(0.6, 1.0, 5),
    'n_estimators': np.linspace(200, 2000, 15, dtype=int),
    'lambda_l2': np.linspace(0.01, 0.5, 15)
}

x_combined = pd.concat([x_train, x_val])
y_combined = pd.concat([y_train, y_val])

test_fold = np.zeros(len(x_combined))
test_fold[:len(x_train)] = -1
pds = PredefinedSplit(test_fold)

fit_params = {'eval_set': [(x_val, y_val)], 'verbose': False}

rnd_searcher = RandomizedSearchCV(model, param_distributions=rnd_search_params, cv=pds,
                                n_iter=500, random_state=26, verbose=4, n_jobs=-1 
)

In [9]:
search = rnd_searcher.fit(x_combined, y_combined, **fit_params)
print(search.best_params_)
print(search.best_score_)
print(search.feature_names_in_)

Fitting 1 folds for each of 500 candidates, totalling 500 fits


ValueError: 
All the 500 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
500 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dalto\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dalto\anaconda3\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\dalto\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1081, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dalto\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 596, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
                    ^^^^^^^^^^^^^^^
  File "c:\Users\dalto\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1008, in _create_dmatrix
    return DMatrix(**kwargs, nthread=self.n_jobs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dalto\anaconda3\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\dalto\anaconda3\Lib\site-packages\xgboost\core.py", line 878, in __init__
    handle, feature_names, feature_types = dispatch_data_backend(
                                           ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dalto\anaconda3\Lib\site-packages\xgboost\data.py", line 1223, in dispatch_data_backend
    return _from_pandas_df(
           ^^^^^^^^^^^^^^^^
  File "c:\Users\dalto\anaconda3\Lib\site-packages\xgboost\data.py", line 640, in _from_pandas_df
    df, feature_names, feature_types = _transform_pandas_df(
                                       ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dalto\anaconda3\Lib\site-packages\xgboost\data.py", line 603, in _transform_pandas_df
    pandas_check_dtypes(data, enable_categorical)
  File "c:\Users\dalto\anaconda3\Lib\site-packages\xgboost\data.py", line 569, in pandas_check_dtypes
    _invalid_dataframe_dtype(data)
  File "c:\Users\dalto\anaconda3\Lib\site-packages\xgboost\data.py", line 356, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:zone: category


In [167]:
grid = {'boosting_type': ['dart', 'gdbt'],
        'subsample': [0.45, 0.5, 0.55], 
        'num_leaves':  [8, 9, 10], 
        'n_estimators': [350, 400, 450], 
        'min_data_in_leaf': [16, 18, 20], 
        'max_depth': [8, 9, 10], 
        'max_bin': [63], 
        'learning_rate': [0.05 ,0.1, 0.15], 
        'lambda_l2': [0.5, 1, 3], 
        'colsample_bytree': [0.75,0.8, 0.85]
        }

grid_searcher = GridSearchCV(model, param_grid=grid, cv=pds, verbose=4, n_jobs=-1)

In [None]:
grid_searcher.fit(x_combined, y_combined, **fit_params)
print(grid_searcher.best_params_)
print(grid_searcher.best_score_)
print(grid_searcher.feature_names_in_)

Fitting 1 folds for each of 13122 candidates, totalling 13122 fits


In [163]:
grid_best = {'boosting_type': 'dart','colsample_bytree': 0.8, 'lambda_l2': 0.5, 'learning_rate': 0.1, 'max_bin': 63, 'max_depth': 9, 'min_data_in_leaf': 18, 'n_estimators': 400, 'num_leaves': 9, 'subsample': 0.5, 'early_stopping_rounds': 40}

In [159]:
model = model.set_params(**grid_best)
model = model.fit(x_train, y_train, eval_set=[[x_val, y_val]], callbacks=[lgb.early_stopping(stopping_rounds=40, verbose=False)])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003444 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 266
[LightGBM] [Info] Number of data points in the train set: 160406, number of used features: 5
[LightGBM] [Info] Start training from score 0.372028
Training until validation scores don't improve for 40 rounds


In [160]:
y_pred = model.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
results_df = pd.DataFrame({'actual': y_test, 'predicted': y_pred})
results_df = results_df.join(df[['batter', 'year']])
print(f'RMSE: {rmse}')

RMSE: 0.5033683310184024


In [161]:
print(model.feature_importances_)

[258 428 211 341 354]


In [165]:
grouped_results = results_df.groupby(['year', 'batter'])[['actual', 'predicted']].agg(['mean', 'count'])
grouped_results.columns = ['_'.join(col).strip() for col in grouped_results.columns.values]
grouped_results = grouped_results.reset_index()
grouped_rmse = np.sqrt(mean_squared_error(grouped_results['actual_mean'], grouped_results['predicted_mean']))
qualified_results = grouped_results[grouped_results['actual_count'] > 40]
qualified_rmse = np.sqrt(mean_squared_error(qualified_results['actual_mean'], qualified_results['predicted_mean']))
print(f'RMSE for batters with more than 60 plate appearances: {qualified_rmse}')
print(f'Grouped RMSE {grouped_rmse}')

RMSE for batters with more than 60 plate appearances: 0.07396243072837254
Grouped RMSE 0.17253456466657058
