In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, classification_report

from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical

In [2]:
# Import 'all_pitches.csv'
all_pitches = pd.read_csv('../Data/all_pitches.csv')

In [3]:
all_pitches.head()

Unnamed: 0,game_id,batSide_code,batSide_des,batter,batter_id,call_des,inning_top_bot,pitchHand_code,pitchHand_des,pitch_type,...,strike_left,strike_down_right,strike_down,strike_down_left,call_B,call_C,call_F,call_H,call_S,is_out
0,413661,L,Left,Matt Carpenter,572761,Called Strike,top,L,Left,FF,...,0,0,0,0,0,1,0,0,0,0
1,413661,L,Left,Matt Carpenter,572761,Swinging Strike,top,L,Left,FF,...,0,0,0,0,0,0,0,0,1,0
2,413661,L,Left,Matt Carpenter,572761,Foul,top,L,Left,FF,...,0,0,0,0,0,0,1,0,0,0
3,413661,L,Left,Matt Carpenter,572761,Ball,top,L,Left,FF,...,0,0,0,0,1,0,0,0,0,0
4,413661,L,Left,Matt Carpenter,572761,Ball,top,L,Left,CU,...,0,0,0,0,1,0,0,0,0,0


In [67]:
calls = all_pitches.loc[:,'call_B':'is_out']

In [5]:
calls_pct = calls.mean()
calls_pct

call_B    0.360146
call_C    0.168416
call_F    0.173895
call_H    0.064159
call_S    0.114559
is_out    0.173634
dtype: float64

## ``is_out``, No Player ID's

This first iteration of the model will use generic player and pitch information to predict outs. The percentage of pitches that resulted in an out calculated in the ``calls_pct`` table is around 17.36% so I'll be basing the models performance on this statistic. In other words, if the model were to predict no outs for all pitches, it'll have an accuracy of around 82.64% which doesn't provide any insight.

### Simple Model

In [6]:
# Extract predictors
predictors = all_pitches.loc[:,'count_balls':'strike_down_left'].drop(['pitch_locx', 'pitch_locy'], axis=1)

In [7]:
# Extract target variable
targets = calls['is_out']

In [8]:
# Split the data into train and test sets
pitches_train, pitches_test, targets_train, targets_test = train_test_split(predictors,
                                                                            targets, 
                                                                            test_size = 0.25, 
                                                                            stratify = targets,
                                                                            random_state = 123)

In [9]:
# Instatiate a Sequential model
model = Sequential()

# Create a basic network
model.add(Dense(37, activation = 'relu', input_shape = (predictors.shape[1],)))
model.add(Dense(1, activation = 'sigmoid'))

# Create a compiler
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [10]:
# Set up early stopping monitor
early_stopping_monitor = EarlyStopping(patience=5)

In [11]:
# fit the training data to the basic model
model.fit(pitches_train,
          targets_train,
          validation_split = 0.25,
          epochs = 20,
          callbacks = [early_stopping_monitor])

Train on 1197789 samples, validate on 399263 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


<keras.callbacks.History at 0x20e8c064908>

In [12]:
# Make predictions on the test set
score, acc = model.evaluate(pitches_test, targets_test)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.39254891093538735
Test accuracy: 0.8263739525241407


In [16]:
# Make predictions on the test data
predictions = model.predict_classes(pitches_test)

# Print the confusion matrix
confusion_matrix(targets_test, predictions)

array([[439875,     42],
       [ 92388,     46]], dtype=int64)

In [27]:
fpr, tpr, thresholds = roc_curve(targets_test, predictions)

print(roc_auc_score(targets_test, predictions))

0.5002010899119723


This simple model was able to accurately predict outs roughly 82.64% of the time on the test set. That may seem like a promising result but if the model were to predict only no outs, it would perform the same. It's also worth noting the models true-positive rate is almost nonexistent.

### Increase Model Capacity

In [44]:
# Instatiate a Sequential model
model = Sequential()

# Create a 3 layers
model.add(Dense(300, activation = 'relu', input_shape = (predictors.shape[1],)))
model.add(Dense(100, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))

# Create a compiler
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [45]:
# fit the training data to the basic model
model.fit(pitches_train,
          targets_train,
          validation_split = 0.25,
          epochs = 20,
          callbacks = [early_stopping_monitor])

Train on 1197789 samples, validate on 399263 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x20eb342bb70>

In [46]:
# Make predictions on the test set
score, acc = model.evaluate(pitches_test, targets_test)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.3864145097797152
Test accuracy: 0.8269318551109678


In [47]:
# Make predictions on the test data
predictions = model.predict_classes(pitches_test)

# Print the confusion matrix
confusion_matrix(targets_test, predictions)

array([[433538,   6379],
       [ 85754,   6680]], dtype=int64)

Increasing the model capacity improved model performance. Not only is it more accurate the the baseline model, but it also has a much better true-positive rate.

This model still needs improvement but next will be to add different predictors and target variables to the model to see how models improve.

## Calls, No Player ID's

Predictors stay the same but target labels change.

In [68]:
# Split the data into train and test sets
pitches_train, pitches_test, calls_train, calls_test = train_test_split(predictors,
                                                                        calls, 
                                                                        test_size = 0.25, 
                                                                        stratify = calls,
                                                                        random_state = 123)

In [90]:
# Instatiate a Sequential model
model = Sequential()

# Create a basic network
model.add(Dense(300, activation = 'relu', input_shape = (predictors.shape[1],)))
model.add(Dense(100, activation = 'relu'))
model.add(Dense(calls_train.shape[1], activation = 'sigmoid'))

# Create a compiler
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [10]:
# Set up early stopping monitor
# early_stopping_monitor = EarlyStopping(patience=5)

In [91]:
# fit the training data to the basic model
model.fit(pitches_train,
          calls_train,
          validation_split = 0.25,
          epochs = 20,
          callbacks = [early_stopping_monitor])

Train on 1197789 samples, validate on 399263 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20


<keras.callbacks.History at 0x2104668a780>

In [92]:
# Make predictions on the test set
score, acc = model.evaluate(pitches_test, calls_test)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.3415373240703978
Test accuracy: 0.8580178707789186


In [93]:
# Make predictions on the test data
predictions = model.predict_classes(pitches_test)

In [94]:
predictions

array([5, 5, 0, ..., 0, 0, 5], dtype=int64)

In [None]:
# Print the confusion matrix
classification_report(calls_test, predictions)