# Game of Zones: Using ML to Predict the Outcome of an MLB At Bat

Goal: predict where the batter is most likely to hit the ball (zones of the field) in an at-bat given the situation and the pitcher he is facing
    
Input Data: 
- the pitcher's repertoire: given that each pitcher has a different arsenal of pitches and each pitch moves differently, we use a cluster analysis to categorize pitch types.  In this way, we put each pitcher on the same footing.

- pitcher stats such as groundball and flyball rates

- the game situation: the inning (and top/bottom), the number of outs, positions of baserunners, the count, positions of fielders(?)

- the batter's priors: distribution of batted balls into zones

- any other batter data?

Output: 
- probabilities for each zone on the field where the batter can hit the ball

- contributing factors for each prediction (things the defensive team could use to intervene)

In [None]:
%load_ext autoreload
%autoreload 2
from pybaseball import statcast, pitching_stats
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from matplotlib import patches
%matplotlib inline

from get_data import get_data, get_hit_zone, get_pitch_data, get_situation_data
from pitch_clustering import pitch_clustering
from batter_zone import get_batter_zone_data

# (start_date, end_date) pairs, one per season, handed to get_data below.
# use Statcast data (from 2015-2018) so we can get spin rate, etc.
# NOTE(review): the ranges look like regular-season opening/closing dates -- confirm.
train_data_dates = [('2015-04-05', '2015-10-04'),      # 2015 data
                    ('2016-04-03', '2016-10-02'),       # 2016 data
                    ('2017-04-02', '2017-10-01'),       # 2017 data
                    ('2018-03-29', '2018-10-01')]       # 2018 data

### Build the Outcome

In [None]:
# get the outcome data (get_data is given get_hit_zone and the training date ranges)
outcome_data = get_data(get_hit_zone, train_data_dates)

# write to file; later cells re-read ./outcome.csv instead of rebuilding it
outcome_data.to_csv("./outcome.csv", index=False)

print(f"Shape of the outcome data: {outcome_data.shape}")
# last expression of the cell, so the notebook renders the first rows
outcome_data.head()

### Build the Pitcher Data

In [None]:
# get the pitch data (get_data is given get_pitch_data and the training date ranges)
pitch_data = get_data(get_pitch_data, train_data_dates)

# write to file; the clustering cell re-reads ./pitch.csv
pitch_data.to_csv("./pitch.csv", index=False)

# print the number of pitchers in the data set
print(f"Number of pitchers in the data: {len(pitch_data['pitcher'].unique())}")

print(f"Shape of training data: {pitch_data.shape}")

# last expression of the cell, so the notebook renders the first rows
pitch_data.head()

### Perform PCA & Clustering on Pitch Data... and Add in Groundball/Flyball Percentages

In [None]:
# read in pitch data
pitch_data = pd.read_csv("./pitch.csv")

# perform PCA and K-Means clustering
pitch_data = pitch_clustering(pitch_data)

# write to file
# NOTE: this overwrites the raw ./pitch.csv that was just read, so re-running
# this cell feeds already-clustered data back into pitch_clustering.
# NOTE(review): unlike every other to_csv call in this notebook, index=False is
# not passed, so the DataFrame index is written as the first CSV column. If
# pitch_clustering returns a plain RangeIndex this adds an "Unnamed: 0" column
# that will flow into the merged feature set; if the index carries 'pitcher'
# it is required for the later merge -- confirm which applies before changing.
pitch_data.to_csv("./pitch.csv")

# last expression of the cell, so the notebook renders the first rows
pitch_data.head()

### Build the Batter's Prior Zone Distribution

In [None]:
# use the hit zone data from the outcome (calculated above and cached in ./outcome.csv)
batter_zone_data = pd.read_csv("./outcome.csv")

# derive the per-batter zone table from the outcome rows
# NOTE(review): presumably one row per batter with batter_zone_1..4 columns
# (those are the columns naive_model reads later) -- confirm in batter_zone.py
batter_zone_data_pct = get_batter_zone_data(batter_zone_data)

# write to file; the merge cell re-reads ./batter_zones.csv
batter_zone_data_pct.to_csv("./batter_zones.csv", index=False)

print(batter_zone_data_pct.shape)
# last expression of the cell, so the notebook renders the first rows
batter_zone_data_pct.head()

### Get the Game Situation Features

In [None]:
# get the game situation data (get_data is given get_situation_data and the training date ranges)
situation_data = get_data(get_situation_data, train_data_dates)

# write to file; the merge cell re-reads ./situation.csv
situation_data.to_csv("./situation.csv", index=False)

print(situation_data.shape)
# last expression of the cell, so the notebook renders the first rows
situation_data.head()

### Combine the Game Situation, Pitcher and Batter Features along with the Outcome

In [None]:
# Load the four cached tables written by the earlier cells.
outcome_df = pd.read_csv("./outcome.csv")            # the outcome of the at-bat
game_situation_df = pd.read_csv("./situation.csv")   # game situation data
batter_zone_df = pd.read_csv("./batter_zones.csv")   # batter's prior hit zone distribution

# Pitch type data: the player name is not used, and the pitcher id is cast
# to int before it is used as a merge key.
pitch_type_df = pd.read_csv("./pitch.csv").drop('player_name', axis=1)
pitch_type_df['pitcher'] = pitch_type_df['pitcher'].astype(int)

# Combine all of the data sources into one dataframe (default inner joins):
# situation + pitcher features + batter priors, then attach the outcome on
# the at-bat identifier columns.
full_data = outcome_df.merge(
    game_situation_df
    .merge(pitch_type_df, on="pitcher")
    .merge(batter_zone_df, on="batter"),
    on=['game_pk', 'index', 'batter', 'pitcher'],
)

print(full_data.shape)
# last expression of the cell, so the notebook renders the first rows
full_data.head()

### Split the Data into Train/Test Feature and Target Sets

In [None]:
# identifier columns: they label an at-bat but are not model features
index_cols = ['game_pk', 'index', 'batter', 'pitcher']

# keep index columns in a separate dataframe for future use; this must happen
# BEFORE they are dropped from full_data, otherwise the selection raises KeyError
index_data = full_data[index_cols]

# drop index columns for training
full_data = full_data.drop(index_cols, axis=1)

# split the dataframe into a feature set and an outcome column
X = full_data.drop('hit_zone', axis=1)
y = full_data['hit_zone']

# split the data into train/test sets (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4256)

### Train an XGBoost Model

In [None]:
# ----------------------
# train an XGBoost model
# ----------------------

# The grid search below is disabled; a single XGBClassifier with default
# hyperparameters is fit instead. The grid is kept for reference.
# NOTE(review): learning_rate lists 0.5 between 0.01 and 0.1 -- possibly meant 0.05.

# small set of hyperparameters to optimize over
# xgb_params = {"max_depth": (3, 5, 10, 15, 20),
#               "learning_rate": (0.01, 0.5, 0.1, 0.2, 0.4),
#               "gamma": (0, 33, 66, 100),
#               "min_child_weight": (0, 33, 66, 100),
#               "colsample_bytree": (0.5, 0.75, 1),
#               "subsample": (0.5, 0.75, 1),}

# # perform the parameter grid search using 5-fold cross validation
# xgb_opt = GridSearchCV(XGBClassifier(objective='multi:softprob', num_class=4), 
#                        param_grid=xgb_params, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# multi-class classifier over 4 classes that outputs per-class probabilities
# NOTE(review): naive_model labels the zones 1-4; if hit_zone uses the same
# labels, newer XGBoost releases may require classes 0..num_class-1 -- confirm
# against the installed version.
xgb_opt = XGBClassifier(objective='multi:softprob', num_class=4)

# perform fit and make predictions
xgb_opt.fit(X_train, y_train)
y_pred = xgb_opt.predict(X_test)        # predicted class per test row
y_prob = xgb_opt.predict_proba(X_test)  # per-class probabilities per test row

# compute accuracy as a percentage rounded to one decimal place
accuracy = round(accuracy_score(y_test, y_pred) * 100, 1)
def naive_model(df):
    """The naive model: pick the zone with the largest prior probability.

    Reads the columns ``batter_zone_1`` .. ``batter_zone_4`` of ``df`` and,
    for every row, returns the zone number (1-4) whose column holds the
    largest value. The result is a Series aligned with ``df``'s index; the
    input dataframe is left untouched.
    """
    zone_of_column = {f"batter_zone_{zone}": zone for zone in (1, 2, 3, 4)}
    priors = df[list(zone_of_column)].rename(columns=zone_of_column)
    return priors.idxmax(axis=1)
# naive baseline prediction for every test row, as a plain NumPy array.
# Series.as_matrix() was removed in pandas 1.0; .values is the equivalent
# accessor and works on old and new pandas alike.
y_naive = naive_model(X_test).values

# compute naive accuracy as a percentage rounded to one decimal place
naive_accuracy = round(accuracy_score(y_test, y_naive) * 100, 1)

print(f"Accuracy of the Naive model: {naive_accuracy}%")
print(f"Accuracy of the XGBoost model: {accuracy}%")

# print the confusion matrix of the XGBoost predictions on the test set
print()
print("The Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))

In [None]:
# list every training feature next to its importance from the fitted model,
# scaled by 100, one tab-separated pair per line
features = X_train.columns.tolist()
importances = list(xgb_opt.feature_importances_)
for name, weight in zip(features, importances):
    print(name + "\t" + str(weight * 100.))