In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import set_random_seed
import os

np.random.seed(123)
set_random_seed(1234)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, classification_report

from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical

import tensorflow as tf
from tensorflow import keras

In [2]:
# Import 'all_pitches.csv'
all_pitches = pd.read_csv('../Data/all_pitches.csv')

In [3]:
all_pitches.head()

Unnamed: 0,game_id,batSide_code,batSide_des,batter,batter_id,call_des,inning_top_bot,pitchHand_code,pitchHand_des,pitch_type,...,strike_left,strike_down_right,strike_down,strike_down_left,call_B,call_C,call_F,call_H,call_S,is_out
0,413661,L,Left,Matt Carpenter,572761,Called Strike,top,L,Left,FF,...,0,0,0,0,0,1,0,0,0,0
1,413661,L,Left,Matt Carpenter,572761,Swinging Strike,top,L,Left,FF,...,0,0,0,0,0,0,0,0,1,0
2,413661,L,Left,Matt Carpenter,572761,Foul,top,L,Left,FF,...,0,0,0,0,0,0,1,0,0,0
3,413661,L,Left,Matt Carpenter,572761,Ball,top,L,Left,FF,...,0,0,0,0,1,0,0,0,0,0
4,413661,L,Left,Matt Carpenter,572761,Ball,top,L,Left,CU,...,0,0,0,0,1,0,0,0,0,0


In [4]:
calls = all_pitches.loc[:,'call_B':'is_out']

In [5]:
calls_pct = calls.mean()
calls_pct

call_B    0.360146
call_C    0.168416
call_F    0.173895
call_H    0.064159
call_S    0.114559
is_out    0.173634
dtype: float64

## Simple Model

In [6]:
# Extract predictors
predictors = all_pitches.loc[:,'count_balls':'strike_down_left'].drop(['pitch_locx', 'pitch_locy'], axis=1)

In [7]:
# Extract target variable
targets = calls['is_out']

In [8]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(predictors,
                                                    targets, 
                                                    test_size = 0.25, 
                                                    stratify = targets,
                                                    random_state = 123)

In [13]:
base_checkpoint_path = 'base_training/cp.ckpt'
base_checkpoint_dir = os.path.dirname(base_checkpoint_path)

# Create checkpoint callback
base_cp_callback = tf.keras.callbacks.ModelCheckpoint(base_checkpoint_path,
                                                    save_weights_only = True,
                                                    verbose=1)

In [14]:
# Instatiate a Sequential model
model_base = Sequential()

# Create a basic network
model_base.add(Dense(40, activation = 'relu', input_shape = (X_train.shape[1],)))
model_base.add(Dense(1, activation = 'sigmoid'))

# Create a compiler
model_base.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [15]:
# Set up early stopping monitor
early_stopping_monitor = EarlyStopping(patience=3)

In [17]:
# fit the training data to the basic model
model_base.fit(X_train,
               y_train,
               validation_split = 0.25,
               epochs = 20,
               callbacks = [early_stopping_monitor, base_cp_callback])

Train on 1197789 samples, validate on 399263 samples
Epoch 1/20

Epoch 00001: saving model to base_training/cp.ckpt
Epoch 2/20

Epoch 00002: saving model to base_training/cp.ckpt
Epoch 3/20

Epoch 00003: saving model to base_training/cp.ckpt
Epoch 4/20

Epoch 00004: saving model to base_training/cp.ckpt
Epoch 5/20

Epoch 00005: saving model to base_training/cp.ckpt
Epoch 6/20

Epoch 00006: saving model to base_training/cp.ckpt
Epoch 7/20

Epoch 00007: saving model to base_training/cp.ckpt
Epoch 8/20

Epoch 00008: saving model to base_training/cp.ckpt
Epoch 9/20

Epoch 00009: saving model to base_training/cp.ckpt
Epoch 10/20

Epoch 00010: saving model to base_training/cp.ckpt
Epoch 11/20

Epoch 00011: saving model to base_training/cp.ckpt
Epoch 12/20

Epoch 00012: saving model to base_training/cp.ckpt
Epoch 13/20

Epoch 00013: saving model to base_training/cp.ckpt
Epoch 14/20

Epoch 00014: saving model to base_training/cp.ckpt
Epoch 15/20

Epoch 00015: saving model to base_training/cp.c

<keras.callbacks.History at 0x1da8cbde5f8>

In [18]:
# Make predictions on the test set
score, acc = model_base.evaluate(X_test, y_test)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.3896250728908871
Test accuracy: 0.824713393981086


In [19]:
# Make predictions on the test data
predictions = model_base.predict_classes(X_test)

# Print the confusion matrix
confusion_matrix(y_test, predictions)

array([[424226,  15691],
       [ 77623,  14811]], dtype=int64)

In [20]:
fpr, tpr, thresholds = roc_curve(y_test, predictions)

print(roc_auc_score(y_test, predictions))

0.5622825777854022


This simple model was able to accurately predict outs roughly 82.64% of the time on the test set. That may seem like a promising result but if the model were to predict only no outs, it would perform the same. It's also worth noting the models true-positive rate is almost nonexistent.

### Increase Model Capacity

In [None]:
# Instatiate a Sequential model
model_outs = Sequential()

# Create a 3 layers
model_outs.add(Dense(300, activation = 'relu', input_shape = (X_train.shape[1],)))
model_outs.add(Dense(100, activation = 'relu'))
model_outs.add(Dense(100, activation = 'relu'))
model_outs.add(Dense(1, activation = 'sigmoid'))

# Create a compiler
model_outs.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [None]:
# fit the training data to the basic model
model_outs.fit(X_train,
               y_train,
               validation_split = 0.25,
               epochs = 20,
               callbacks = [early_stopping_monitor])

In [None]:
# Make predictions on the test set
score, acc = model_outs.evaluate(X_test, y_test)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
# Make predictions on the test data
predictions = model_outs.predict_classes(X_test)

# Print the confusion matrix
confusion_matrix(y_test, predictions)

Increasing the model capacity improved model performance. Not only is it more accurate the the baseline model, but it also has a much better true-positive rate.

This model still needs improvement but next will be to add different predictors and target variables to the model to see how models improve.

## Calls, No Player ID's

Predictors stay the same but target labels change.

In [None]:
# Split the data into train and test sets
X_call_train, X_call_test, y_call_train, y_call_test = train_test_split(predictors,
                                                                        calls, 
                                                                        test_size = 0.25, 
                                                                        stratify = calls,
                                                                        random_state = 123)

In [None]:
# Instatiate a Sequential model
model_calls = Sequential()

# Create a basic network
model_calls.add(Dense(300, activation = 'relu', input_shape = (X_call_train.shape[1],)))
model_calls.add(Dense(100, activation = 'relu'))
model_calls.add(Dense(100, activation = 'relu'))
model_calls.add(Dense(y_call_train.shape[1], activation = 'sigmoid'))

# Create a compiler
model_calls.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['categorical_accuracy'])

In [None]:
# fit the training data to the basic model
model_calls.fit(X_call_train,
                y_call_train,
                validation_split = 0.25,
                epochs = 20,
                callbacks = [early_stopping_monitor])

In [None]:
# Make predictions on the test set
score, acc = model_calls.evaluate(X_call_test, y_call_test)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
# Make predictions on the test data
predictions = model_calls.predict(X_call_test)

In [None]:
matrix = confusion_matrix(y_call_test.values.argmax(axis=1), predictions.argmax(axis=1))
matrix

Looking at the confusion matrix, it appears the model wasn't able to pick up which pitches resulted in hits. More capacity may be needed

## ``is_out``, Player ID's

In [None]:
# Extract predictors
predictors_ids = predictors.drop(['pitchHand_isRight', 'batSide_isRight'], axis = 1).join(all_pitches.loc[:,['batter_id', 'pitcher_id']])

In [None]:
# Split the data into train and test sets
X_id_train, X_id_test, y_id_train, y_id_test = train_test_split(predictors_ids,
                                                                targets, 
                                                                test_size = 0.25, 
                                                                stratify = targets,
                                                                random_state = 123)

In [None]:
scaler = StandardScaler()

X_id_train_scaled = scaler.fit_transform(X_id_train)

In [None]:
X_id_test_scaled = scaler.transform(X_id_test)

In [None]:
# Instatiate a Sequential model
model_ids = Sequential()

# Create a 3 layers
model_ids.add(Dense(300, activation = 'relu', input_shape = (X_id_train_scaled.shape[1],)))
model_ids.add(Dense(100, activation = 'relu'))
model_ids.add(Dense(1, activation = 'sigmoid'))

# Create a compiler
model_ids.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [None]:
# fit the training data to the basic model
model_ids.fit(X_id_train_scaled,
              y_id_train,
              validation_split = 0.25,
              epochs = 20,
              callbacks = [early_stopping_monitor])

In [None]:
# Make predictions on the test set
score, acc = model_ids.evaluate(X_id_test_scaled, y_id_test)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
# Make predictions on the test data
predictions = model_ids.predict_classes(X_id_test_scaled)

# Print the confusion matrix
confusion_matrix(y_id_test, predictions)

## Calls, Player ID's

In [None]:
# Split the data into train and test sets
X_callid_train, X_callid_test, y_callid_train, y_callid_test = train_test_split(predictors_ids,
                                                                                calls, 
                                                                                test_size = 0.25, 
                                                                                stratify = calls,
                                                                                random_state = 123)

In [None]:
scaler_calls = StandardScaler()

X_callid_train_scaled = scaler_calls.fit_transform(X_callid_train)

In [None]:
X_callid_test_scaled = scaler_calls.transform(X_callid_test)

In [None]:
# Instatiate a Sequential model
model_calls_id = Sequential()

# Create a basic network
model_calls_id.add(Dense(300, activation = 'relu', input_shape = (X_callid_train_scaled.shape[1],)))
model_calls_id.add(Dense(100, activation = 'relu'))
model_calls_id.add(Dense(100, activation = 'relu'))
model_calls_id.add(Dense(y_callid_train.shape[1], activation = 'sigmoid'))

# Create a compiler
model_calls_id.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['categorical_accuracy'])

In [None]:
# fit the training data to the basic model
model_calls_id.fit(X_callid_train_scaled,
                   y_callid_train,
                   validation_split = 0.25,
                   epochs = 20,
                   callbacks = [early_stopping_monitor])

In [None]:
# Make predictions on the test set
score, acc = model_calls.evaluate(X_callid_test_scaled, y_callid_test)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
# Make predictions on the test data
predictions = model_calls.predict(X_callid_test_scaled)

In [None]:
matrix = confusion_matrix(y_callid_test.values.argmax(axis=1), predictions.argmax(axis=1))
matrix