In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, auc, matthews_corrcoef
from sklearn.pipeline import Pipeline

import xgboost as xgb
from xgboost import XGBClassifier

from joblib import dump

In [2]:
train_df = pd.read_csv('../Data/2020-train.csv')

In [3]:
is_strike = []
is_strike_list = ['InPlay', 'StrikeCalled', 'StrikeSwinging', 'FoulBall']
for i in train_df['pitch_call']:
    if i in is_strike_list:
        is_strike.append(1)
    else: 
        is_strike.append(0)
train_df['is_strike'] = is_strike

In [4]:
def clean_and_wrangle(df):
    
    df = df.drop(df[df['release_speed'].isnull()].index)
    df = df.drop(df[df['outs'].isnull()].index)
    df['spin_rate'].fillna(df['spin_rate'].mean(), inplace=True)
    df = df.drop(df[df['tilt'].isnull()].index)
    df = df.drop(df[df['pitch_type'].isnull()].index)
    df = df.reset_index().drop('index', axis=1)
    
    df['tilt'] = df['tilt'].map(lambda x: sum(a*int(t) for a, t in zip([3600, 60], x.split(':'))) \
                                          if ':' in x else int(x[:-5]))
    return df

In [5]:
train_df = clean_and_wrangle(train_df)

In [6]:
X = train_df.drop(['pitcher_id', 'batter_id', 'stadium_id', 'umpire_id', 'catcher_id', 'pitch_call', 'is_strike', 'pitch_id'], axis=1)
y = train_df['is_strike']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=34)

X_train = pd.get_dummies(X_train, prefix=['pitcher', 'batter', 'is'], columns=['pitcher_side', 'batter_side', 'pitch_type'])
X_test = pd.get_dummies(X_test, prefix=['pitcher', 'batter', 'is'], columns=['pitcher_side', 'batter_side', 'pitch_type'])

In [7]:
steps = [('xgb', XGBClassifier(seed=34))]
param_grid = {'xgb__n_estimators': np.arange(200, 1100, 100)}
pipeline = Pipeline(steps)
cv_1 = GridSearchCV(pipeline, param_grid, cv=3)
cv_1.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('xgb',
                                        XGBClassifier(base_score=0.5,
                                                      booster='gbtree',
                                                      colsample_bylevel=1,
                                                      colsample_bynode=1,
                                                      colsample_bytree=1,
                                                      gamma=0,
                                                      learning_rate=0.1,
                                                      max_delta_step=0,
                                                      max_depth=3,
                                                      min_child_weight=1,
                                                      missing=None,
                                                      n_estimators=100,
                  

In [8]:
print(cv_1.best_params_, cv_1.best_score_)
n_estimators = cv_1.best_params_['xgb__n_estimators']

{'xgb__n_estimators': 900} 0.8690059652694787


In [9]:
steps = [('xgb', XGBClassifier(n_estimators=n_estimators, seed=34))]
param_grid = {'xgb__max_depth': np.arange(3,10,2)}
pipeline = Pipeline(steps)
cv_2 = GridSearchCV(pipeline, param_grid, cv=3)
cv_2.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('xgb',
                                        XGBClassifier(base_score=0.5,
                                                      booster='gbtree',
                                                      colsample_bylevel=1,
                                                      colsample_bynode=1,
                                                      colsample_bytree=1,
                                                      gamma=0,
                                                      learning_rate=0.1,
                                                      max_delta_step=0,
                                                      max_depth=3,
                                                      min_child_weight=1,
                                                      missing=None,
                                                      n_estimators=900,
                  

In [10]:
print(cv_2.best_params_, cv_2.best_score_)
max_depth = cv_2.best_params_['xgb__max_depth']

{'xgb__max_depth': 5} 0.8697434097759663


In [11]:
steps = [('xgb', XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, seed=34))]
param_grid = {'xgb__gamma': [0, .1, .25, .5, 1]}
pipeline = Pipeline(steps)
cv_3 = GridSearchCV(pipeline, param_grid, cv=3)
cv_3.fit(X_train, y_train)
print(cv_3.best_params_, cv_3.best_score_)
gamma = cv_3.best_params_['xgb__gamma']

{'xgb__gamma': 0.5} 0.8699524670372843


In [12]:
steps = [('xgb', XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, gamma=gamma, seed=34))]
param_grid = {'xgb__learning_rate': np.arange(0.05, 0.35, .05)}
pipeline = Pipeline(steps)
cv_4 = GridSearchCV(pipeline, param_grid, cv=3)
cv_4.fit(X_train, y_train)
print(cv_4.best_params_, cv_4.best_score_)
learning_rate = cv_4.best_params_['xgb__learning_rate']

{'xgb__learning_rate': 0.1} 0.8699524670372843


In [19]:
steps = [('xgb', XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, gamma=gamma, seed=34))]
param_grid = {'xgb__colsample_bytree': np.arange(0.5, 1.1, .1)}
pipeline = Pipeline(steps)
cv_5 = GridSearchCV(pipeline, param_grid, cv=3)
cv_5.fit(X_train, y_train)
print(cv_5.best_params_, cv_5.best_score_)
colsample_bytree = cv_5.best_params_['xgb__colsample_bytree']

xgboost.core.XGBoostError: value 1.1 for Parameter colsample_bytree exceed bound [0,1]



{'xgb__colsample_bytree': 0.7999999999999999} 0.8700880106227418


In [26]:
steps = [('xgb', XGBClassifier(max_depth=max_depth, gamma=gamma, learning_rate=learning_rate, 
                               colsample_bytree=colsample_bytree, seed=34))]
param_grid = {'xgb__n_estimators': np.arange(200, 1100, 100)}
pipeline = Pipeline(steps)
cv_final = GridSearchCV(pipeline, param_grid, cv=3)
cv_final.fit(X_train, y_train)
print(cv_final.best_params_, cv_final.best_score_)
n_estimators = cv_final.best_params_['xgb__n_estimators']

{'xgb__n_estimators': 800} 0.8701109839502493


In [27]:
xgb = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, 
                    gamma=gamma, learning_rate=learning_rate, subsample=subsample, seed=34)
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0.5,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=800, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=34,
              silent=None, subsample=1.0, verbosity=1)

In [28]:
model_filename = 'xgboost_model.pkl'
dump(xgb, model_filename)

['xgboost_model.pkl']

In [29]:
xgb_pred = xgb.predict(X_test)

print(classification_report(xgb_pred, y_test))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82     52217
           1       0.90      0.90      0.90     92879

    accuracy                           0.87    145096
   macro avg       0.86      0.86      0.86    145096
weighted avg       0.87      0.87      0.87    145096

