In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

from joblib import dump

In [2]:
# Load the modelling table; the first CSV column becomes the row index.
train_df = pd.read_csv(
    '../Data/model_data.csv',
    index_col=0,
)

  mask |= (ar1 == a)


This notebook is dedicated to tuning the hyperparameters of the XGBoost model for use in the main Challenge notebook. Here we will tune three hyperparameters: `n_estimators`, `max_depth`, and `learning_rate`. I chose these parameters specifically because, in my research, they were the ones most commonly tuned for classification problems. Because this is a very large dataset, we also set the `subsample` parameter to 0.33 during the search, so that training uses a randomly selected 33% of the instances. This accepts slightly more bias in exchange for lower variance, and it also significantly speeds up training, which is what we're after.

In [3]:
# Target vector: the `is_swing` column.
y = train_df['is_swing']

# Columns kept out of the feature matrix: the id columns, the pitch call,
# the date and the target itself.
non_feature_cols = [
    'pitcher_id', 'batter_id', 'stadium_id', 'umpire_id', 'catcher_id',
    'pitch_call', 'is_swing', 'pitch_id', 'date',
]
X = train_df.drop(columns=non_feature_cols)

# One-hot encode the categorical columns. Each key is a source column and each
# value is the prefix given to the indicator columns generated from it.
dummy_prefixes = {
    'level': 'level',
    'pitcher_side': 'pitcher',
    'batter_side': 'batter',
    'pitch_type': 'is',
}
X = pd.get_dummies(
    X,
    columns=list(dummy_prefixes),
    prefix=dummy_prefixes,
    dtype='int64',
)

In [4]:
# Stage 1 of the sequential search: number of boosting rounds.
# Candidates are 500, 600, ..., 1500 (np.arange excludes the stop value).
param_grid = {'xgb__n_estimators': np.arange(500, 1600, 100)}

# The classifier sits in a one-step pipeline named 'xgb', which is why the
# grid keys carry the 'xgb__' prefix. subsample=.33 is set for every fit in
# this search; all other hyperparameters stay at their defaults.
steps = [('xgb', XGBClassifier(seed=34, subsample=.33))]
pipeline = Pipeline(steps)

# Exhaustive search over the grid with 3-fold cross-validation.
cv_1 = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3)
cv_1.fit(X, y)

# Keep the winning value so the later stages can reuse it.
n_estimators = cv_1.best_params_['xgb__n_estimators']
print(cv_1.best_params_, cv_1.best_score_)

<IPython.core.display.Javascript object>

{'xgb__n_estimators': 1500} 0.7826578143525036


In [5]:
# Stage 2 of the sequential search: maximum tree depth.
# Candidates are 3, 5, 7 and 9 (np.arange excludes the stop value).
param_grid = {'xgb__max_depth': np.arange(3, 10, 2)}

# Same one-step pipeline as before, now with n_estimators fixed to the value
# selected by the previous search.
steps = [('xgb', XGBClassifier(n_estimators=n_estimators, seed=34, subsample=.33))]
pipeline = Pipeline(steps)

# Exhaustive search over the grid with 3-fold cross-validation.
cv_2 = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3)
cv_2.fit(X, y)

# Keep the winning value so the later stages can reuse it.
max_depth = cv_2.best_params_['xgb__max_depth']
print(cv_2.best_params_, cv_2.best_score_)

<IPython.core.display.Javascript object>

{'xgb__max_depth': 5} 0.7830036649737934


In [6]:
# Stage 3 of the sequential search: learning rate.
# n_estimators and max_depth are fixed to the values selected by the two
# previous searches; subsample=.33 is kept for every fit in this search.
steps = [('xgb', XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, seed=34, subsample=.33))]

# Candidates are 0.05, 0.10, ..., 0.30. np.linspace is used instead of
# np.arange with a float step: with arange the number of elements (and whether
# the stop value sneaks in) depends on floating-point rounding, whereas
# linspace always yields exactly `num` points including both endpoints.
# Rounding to 2 decimals removes representation noise such as
# 0.15000000000000002 from the reported best parameter.
learning_rate_grid = np.round(np.linspace(0.05, 0.30, num=6), 2)
param_grid = {'xgb__learning_rate': learning_rate_grid}

pipeline = Pipeline(steps)

# Exhaustive search over the grid with 3-fold cross-validation.
cv_3 = GridSearchCV(pipeline, param_grid, cv=3)
cv_3.fit(X, y)
print(cv_3.best_params_, cv_3.best_score_)

# Keep the winning value for the final model.
learning_rate = cv_3.best_params_['xgb__learning_rate']

<IPython.core.display.Javascript object>

{'xgb__learning_rate': 0.05} 0.7841352088736265


In [7]:
# Build the final classifier from the three tuned values. Note that
# `subsample` is not passed here, so the library default applies rather than
# the .33 used during the grid searches.
xgb = XGBClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    seed=34,
)

# Fit on the complete feature matrix and target; as the last expression of the
# cell, the fitted estimator is also displayed.
xgb.fit(X, y)

XGBClassifier(learning_rate=0.05, max_depth=5, n_estimators=1500, seed=34)

Now that the model has been fitted with its tuned hyperparameters, we'll save it so that the other notebook can load it to examine its metrics and to make predictions on the testing set.

In [8]:
# Serialize the fitted classifier with joblib. The path is relative, so the
# file is written to the current working directory.
model_filename = 'tuned_xgboost_model.pkl'

# joblib's dump returns the list of file names it wrote, which the cell displays.
dump(xgb, filename=model_filename)

['tuned_xgboost_model.pkl']