In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import set_random_seed

np.random.seed(123)
set_random_seed(1234)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, classification_report

from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
# Import 'all_pitches.csv'
all_pitches = pd.read_csv('../Data/all_pitches.csv')

In [3]:
all_pitches.head()

Unnamed: 0,game_id,batSide_code,batSide_des,batter,batter_id,call_des,inning_top_bot,pitchHand_code,pitchHand_des,pitch_type,...,strike_left,strike_down_right,strike_down,strike_down_left,call_B,call_C,call_F,call_H,call_S,is_out
0,413661,L,Left,Matt Carpenter,572761,Called Strike,top,L,Left,FF,...,0,0,0,0,0,1,0,0,0,0
1,413661,L,Left,Matt Carpenter,572761,Swinging Strike,top,L,Left,FF,...,0,0,0,0,0,0,0,0,1,0
2,413661,L,Left,Matt Carpenter,572761,Foul,top,L,Left,FF,...,0,0,0,0,0,0,1,0,0,0
3,413661,L,Left,Matt Carpenter,572761,Ball,top,L,Left,FF,...,0,0,0,0,1,0,0,0,0,0
4,413661,L,Left,Matt Carpenter,572761,Ball,top,L,Left,CU,...,0,0,0,0,1,0,0,0,0,0


In [4]:
calls = all_pitches.loc[:,'call_B':'is_out']

In [5]:
calls_pct = calls.mean()
calls_pct

call_B    0.360146
call_C    0.168416
call_F    0.173895
call_H    0.064159
call_S    0.114559
is_out    0.173634
dtype: float64

## ``is_out``, No Player ID's

This first iteration of the model will use generic player and pitch information to predict outs. The percentage of pitches that resulted in an out calculated in the ``calls_pct`` table is around 17.36% so I'll be basing the models performance on this statistic. In other words, if the model were to predict no outs for all pitches, it'll have an accuracy of around 82.64% which doesn't provide any insight.

### Simple Model

In [6]:
# Extract predictors
predictors = all_pitches.loc[:,'count_balls':'strike_down_left'].drop(['pitch_locx', 'pitch_locy'], axis=1)

In [7]:
# Extract target variable
targets = calls['is_out']

In [8]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(predictors,
                                                    targets, 
                                                    test_size = 0.25, 
                                                    stratify = targets,
                                                    random_state = 123)

In [9]:
# Instatiate a Sequential model
model_base = Sequential()

# Create a basic network
model_base.add(Dense(40, activation = 'relu', input_shape = (X_train.shape[1],)))
model_base.add(Dense(1, activation = 'sigmoid'))

# Create a compiler
model_base.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [10]:
# Set up early stopping monitor
early_stopping_monitor = EarlyStopping(patience=3)

In [11]:
# fit the training data to the basic model
model_base.fit(X_train,
               y_train,
               validation_split = 0.25,
               epochs = 20,
               callbacks = [early_stopping_monitor])

Train on 1197789 samples, validate on 399263 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


<keras.callbacks.History at 0x1da536d4400>

In [12]:
# Make predictions on the test set
score, acc = model_base.evaluate(X_test, y_test)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 2.798642316433014
Test accuracy: 0.8263664386845875


In [13]:
# Make predictions on the test data
predictions = model_base.predict_classes(X_test)

# Print the confusion matrix
confusion_matrix(y_test, predictions)

array([[439917,      0],
       [ 92434,      0]], dtype=int64)

In [14]:
fpr, tpr, thresholds = roc_curve(y_test, predictions)

print(roc_auc_score(y_test, predictions))

0.5


This simple model was able to accurately predict outs roughly 82.64% of the time on the test set. That may seem like a promising result but if the model were to predict only no outs, it would perform the same. It's also worth noting the models true-positive rate is almost nonexistent.

### Increase Model Capacity

In [15]:
# Instatiate a Sequential model
model_outs = Sequential()

# Create a 3 layers
model_outs.add(Dense(300, activation = 'relu', input_shape = (X_train.shape[1],)))
model_outs.add(Dense(100, activation = 'relu'))
model_outs.add(Dense(100, activation = 'relu'))
model_outs.add(Dense(1, activation = 'sigmoid'))

# Create a compiler
model_outs.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [16]:
# fit the training data to the basic model
model_outs.fit(X_train,
               y_train,
               validation_split = 0.25,
               epochs = 20,
               callbacks = [early_stopping_monitor])

Train on 1197789 samples, validate on 399263 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


<keras.callbacks.History at 0x1da212ebf28>

In [17]:
# Make predictions on the test set
score, acc = model_outs.evaluate(X_test, y_test)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.38958796286646724
Test accuracy: 0.8265524062135298


In [19]:
# Make predictions on the test data
predictions = model_outs.predict_classes(X_test)

# Print the confusion matrix
confusion_matrix(y_test, predictions)

array([[438641,   1276],
       [ 91059,   1375]], dtype=int64)

Increasing the model capacity improved model performance. Not only is it more accurate the the baseline model, but it also has a much better true-positive rate.

This model still needs improvement but next will be to add different predictors and target variables to the model to see how models improve.

## Calls, No Player ID's

Predictors stay the same but target labels change.

In [20]:
# Split the data into train and test sets
X_call_train, X_call_test, y_call_train, y_call_test = train_test_split(predictors,
                                                                        calls, 
                                                                        test_size = 0.25, 
                                                                        stratify = calls,
                                                                        random_state = 123)

In [23]:
# Instatiate a Sequential model
model_calls = Sequential()

# Create a basic network
model_calls.add(Dense(300, activation = 'relu', input_shape = (X_call_train.shape[1],)))
model_calls.add(Dense(100, activation = 'relu'))
model_calls.add(Dense(100, activation = 'relu'))
model_calls.add(Dense(y_call_train.shape[1], activation = 'sigmoid'))

# Create a compiler
model_calls.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['categorical_accuracy'])

In [24]:
# fit the training data to the basic model
model_calls.fit(X_call_train,
                y_call_train,
                validation_split = 0.25,
                epochs = 20,
                callbacks = [early_stopping_monitor])

Train on 1197789 samples, validate on 399263 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20


<keras.callbacks.History at 0x1da2221dbe0>

In [25]:
# Make predictions on the test set
score, acc = model_calls.evaluate(X_call_test, y_call_test)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.3410769599902492
Test accuracy: 0.5038348758612505


In [26]:
# Make predictions on the test data
predictions = model_calls.predict(X_call_test)

In [27]:
matrix = confusion_matrix(y_call_test.values.argmax(axis=1), predictions.argmax(axis=1))
matrix

array([[156504,  20594,   4218,      0,   2369,   8039],
       [  6894,  66822,   5925,      0,   1899,   8116],
       [ 17134,  25199,  17847,      0,   2969,  29424],
       [  3702,  11022,   5570,      0,   1277,  12584],
       [ 25532,  12235,   7253,      0,   3847,  12119],
       [  8594,  19036,   9887,      0,   2543,  23197]], dtype=int64)

Looking at the confusion matrix, it appears the model wasn't able to pick up which pitches resulted in hits. More capacity may be needed

## ``is_out``, Player ID's

In [29]:
# Extract predictors
predictors_ids = predictors.drop(['pitchHand_isRight', 'batSide_isRight'], axis = 1).join(all_pitches.loc[:,['batter_id', 'pitcher_id']])

In [31]:
# Split the data into train and test sets
X_id_train, X_id_test, y_id_train, y_id_test = train_test_split(predictors_ids,
                                                                targets, 
                                                                test_size = 0.25, 
                                                                stratify = targets,
                                                                random_state = 123)

In [32]:
scaler = StandardScaler()

X_id_train_scaled = scaler.fit_transform(X_id_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [33]:
X_id_test_scaled = scaler.transform(X_id_test)

  """Entry point for launching an IPython kernel.


In [34]:
# Instatiate a Sequential model
model_ids = Sequential()

# Create a 3 layers
model_ids.add(Dense(300, activation = 'relu', input_shape = (X_id_train_scaled.shape[1],)))
model_ids.add(Dense(100, activation = 'relu'))
model_ids.add(Dense(1, activation = 'sigmoid'))

# Create a compiler
model_ids.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [35]:
# fit the training data to the basic model
model_ids.fit(X_id_train_scaled,
              y_id_train,
              validation_split = 0.25,
              epochs = 20,
              callbacks = [early_stopping_monitor])

Train on 1197789 samples, validate on 399263 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


<keras.callbacks.History at 0x1da22ca6198>

In [36]:
# Make predictions on the test set
score, acc = model_ids.evaluate(X_id_test_scaled, y_id_test)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.3887841421499502
Test accuracy: 0.8255324024923878


In [37]:
# Make predictions on the test data
predictions = model_ids.predict_classes(X_id_test_scaled)

# Print the confusion matrix
confusion_matrix(y_id_test, predictions)

array([[429977,   9940],
       [ 82938,   9496]], dtype=int64)

## Calls, Player ID's

In [40]:
# Split the data into train and test sets
X_callid_train, X_callid_test, y_callid_train, y_callid_test = train_test_split(predictors_ids,
                                                                                calls, 
                                                                                test_size = 0.25, 
                                                                                stratify = calls,
                                                                                random_state = 123)

In [41]:
scaler_calls = StandardScaler()

X_callid_train_scaled = scaler_calls.fit_transform(X_callid_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [42]:
X_callid_test_scaled = scaler_calls.transform(X_callid_test)

  """Entry point for launching an IPython kernel.


In [43]:
# Instatiate a Sequential model
model_calls_id = Sequential()

# Create a basic network
model_calls_id.add(Dense(300, activation = 'relu', input_shape = (X_callid_train_scaled.shape[1],)))
model_calls_id.add(Dense(100, activation = 'relu'))
model_calls_id.add(Dense(100, activation = 'relu'))
model_calls_id.add(Dense(y_callid_train.shape[1], activation = 'sigmoid'))

# Create a compiler
model_calls_id.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['categorical_accuracy'])

In [44]:
# fit the training data to the basic model
model_calls_id.fit(X_callid_train_scaled,
                   y_callid_train,
                   validation_split = 0.25,
                   epochs = 20,
                   callbacks = [early_stopping_monitor])

Train on 1197789 samples, validate on 399263 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


<keras.callbacks.History at 0x1da22f35d30>

In [45]:
# Make predictions on the test set
score, acc = model_calls.evaluate(X_callid_test_scaled, y_callid_test)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 1.900280924400127
Test accuracy: 0.39021622951718316


In [46]:
# Make predictions on the test data
predictions = model_calls.predict(X_callid_test_scaled)

In [47]:
matrix = confusion_matrix(y_callid_test.values.argmax(axis=1), predictions.argmax(axis=1))
matrix

array([[152318,  24459,   1335,   1344,   1527,  10741],
       [ 38763,  43984,    230,   1170,    875,   4634],
       [ 43423,  31440,    899,   1366,   1563,  13882],
       [ 14467,  13562,    268,    564,    429,   4865],
       [ 36695,  14804,    543,    925,    925,   7094],
       [ 28537,  23121,    556,   1102,    899,   9042]], dtype=int64)