In [5]:
import pandas as pd
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import boto3
import random

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

import sklearn.metrics as metrics

In [6]:
random.seed(16)

In [7]:
def plot_multiclass_roc(clf, X_test, y_test, n_classes, figsize=(17, 6)):
    
    y_score = clf.predict_proba(X_test)

    # structures
    fpr = dict()
    tpr = dict()
    #hpr = dict()
    roc_auc = dict()

    # calculate dummies once
    y_test_dummies = pd.get_dummies(y_test, drop_first=False).values
    y_test_columns = ['Fastball', 'Breaking Ball', 'Changeup']
    
    
    for i in range(n_classes):
        fpr[i], tpr[i], _ = metrics.roc_curve(y_test_dummies[:, i], y_score[:, i])
        roc_auc[i] = metrics.auc(fpr[i], tpr[i])

    # roc for each class
    fig, ax = plt.subplots(figsize=figsize)
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    for i in range(n_classes):
        ax.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f) for %s' % (roc_auc[i], y_test_columns[i])) #% (roc_auc[i], i))
    ax.legend(loc="best")
    ax.grid(alpha=.4)
    sns.despine()
    plt.show()

def multiclass_classification_metrics(gs, X_test, y_test):
    
    y_hat = gs.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_hat)

    #specificity = true negative/(true neagtive +false positive)
    specificity = 1984/(1984+21) 

    sensitivity =  metrics.recall_score(y_test, y_hat, average='macro')

    precision = metrics.precision_score(y_test, y_hat, average='macro')

    f1 = metrics.f1_score(y_test, y_hat, average='macro')
    
    
    print('My accuracy is: ', round(accuracy,4))
    print('My specificity is: ', round(specificity, 4))
    print('My sensitivity is: ', round(sensitivity,4))
    print('My precision is: ', round(precision,4))
    print('My f1 score is: ', round(precision,4))
    
#     cm = np.array([['True Negative', 'False Positive'],
#                             ['False Negative', 'True Positive']])

#     cm = pd.DataFrame(cm,columns = ['Pred Offspeed', 'Pred Fastball'], 
#                       index = ['Actual Offspeed','Actual Fastball'])
    
    # Displaying sample confusion matrix
#     display(cm)

    # Displaying actual confusion matrix 
    metrics.plot_confusion_matrix(gs, X_test, y_test, cmap='Accent', 
                          values_format='d', display_labels=[ 'Change-up',
                                                              'Breaking Ball', 
                                                             'Fastball']);
    
    plot_multiclass_roc(gs, X_test, y_test, 3, figsize=(17, 6))

def binary_classification_metrics(gs, X_test, y_test):
    
    y_hat = gs.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_hat)

    #specificity = true negative/(true neagtive +false positive)
    specificity = 1984/(1984+21) 

    sensitivity =  metrics.recall_score(y_test, y_hat)

    precision = metrics.precision_score(y_test, y_hat)

    f1 = metrics.f1_score(y_test, y_hat)
    print('My accuracy is: ', round(accuracy,4))
    print('My specificity is: ', round(specificity, 4))
    print('My sensitivity is: ', round(sensitivity,4))
    print('My precision is: ', round(precision,4))
    print('My f1 score is: ', round(precision,4))
    
    cm = np.array([['True Negative', 'False Positive'],
                            ['False Negative', 'True Positive']])

    cm = pd.DataFrame(cm,columns = ['Pred Offspeed', 'Pred Fastball'], 
                      index = ['Actual Offspeed','Actual Fastball'])
    
    # Displaying sample confusion matrix
    display(cm)

    # Displaying actual confusion matrix 
    metrics.plot_confusion_matrix(gs, X_test, y_test, cmap='Accent', 
                          values_format='d', display_labels=['Offspeed Pitch', 
                                                             'Fastball Pitch']);
    
    metrics.plot_roc_curve(gs, X_test, y_test)
    # add worst case scenario line
    plt.plot([0, 1], [0, 1])
    plt.title('ROC AUC Curve');
    
    return f'My ROC AUC score is: {metrics.roc_auc_score(y_test, y_hat)}'

In [8]:
# Specifying s3 as boto3 resource and connecting to bucket
s3 = boto3.resource('s3')
bucketname = 'baseballstats'

# Name of files within the bucket
binary_label = 'statcast_data/clean_2019_binary_label.csv'

# multiclass 2019 DataFrame
multiclass_label = 'statcast_data/clean_2019_3.csv'

# Specifying location of files
multiclass_obj = s3.Object(bucketname, multiclass_label)
binary_obj = s3.Object(bucketname, binary_label)

# Getting the body of the files
multiclass = multiclass_obj.get()['Body']
binary = binary_obj.get()['Body']

# Reading the boto3 object into a csv

# df is my default for binary labels
df = pd.read_csv(multiclass)

# df_3 is my default for 
binary_df = pd.read_csv(binary)

In [9]:
df['pitch'].value_counts(normalize=True)

3    0.584354
2    0.286061
1    0.129585
Name: pitch, dtype: float64

In [11]:
binary_df['pitch'].value_counts(normalize=True)

1    0.584354
0    0.415646
Name: pitch, dtype: float64

No need to dummy independent features because the random forest will find the best splits for a feature.
Random forest is a tree-based model and therefore there is no need to standardize the independent variables. 

In [None]:
X = df.drop(columns=['pitch'])
y = df['pitch']

In [4]:
y.value_counts(normalize=True)

Unnamed: 0.1,Unnamed: 0,balls,strikes,on_1b,outs_when_up,inning,pitch,1,2,3,...,of_fielding_alignment_4th outfielder,of_fielding_alignment_Standard,Starting_Pitcher,risp,run_diff,inning_topbot_Top,previous_type_B,previous_type_S,woba_value_y,launch_speed
0,0,3.0,2.0,0.0,2.0,9.0,2,0,1,5,...,0,1,0,0,4.0,0,0,1,0.366304,90.160819
1,1,3.0,2.0,0.0,2.0,9.0,3,0,1,4,...,0,1,0,0,4.0,0,1,0,0.366304,90.160819
2,2,2.0,2.0,0.0,2.0,9.0,3,0,1,3,...,0,1,0,0,4.0,0,0,1,0.366304,90.160819
3,3,2.0,1.0,0.0,2.0,9.0,3,0,1,2,...,0,1,0,0,4.0,0,1,0,0.366304,90.160819
4,4,1.0,1.0,0.0,2.0,9.0,2,0,0,2,...,0,1,0,0,4.0,0,0,1,0.366304,90.160819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484312,484312,0.0,0.0,0.0,2.0,5.0,3,0,0,0,...,0,1,0,0,3.0,1,0,0,0.266270,88.245098
484313,484313,1.0,0.0,1.0,1.0,4.0,3,1,0,0,...,0,1,0,0,-2.0,1,1,0,0.257237,86.421008
484314,484314,0.0,0.0,1.0,1.0,4.0,1,0,0,0,...,0,1,0,0,-2.0,1,0,0,0.257237,86.421008
484315,484315,0.0,1.0,1.0,0.0,2.0,1,0,0,1,...,0,1,0,1,0.0,1,0,1,0.257237,86.421008


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, 
                                                    random_state=16, stratify=y)

In [None]:
# Random Forest does not require standardization
pipe = Pipeline([
    ('rf', RandomForestClassifier(random_state=16))
])

Gridsearched over many parameters over the course of a week:

params = {
    'rf__n_estimators': [250, 500],
    'rf__max_depth': [10, 25, 50],
    'rf__class_weight': ['balanced'],
    'rf__min_samples_split': [2, 20, 40]
    'rf__min_samples_split': [250, 500, 1000],
    'rf__min_weight_fraction_leaf': [0.12],
    'rf__max_features': [0.33, 0.5],
    'rf__min_impurity_decrease': [0, 0.1],
    'rf__ccp_alpha': [0.005, 0.05, 0.1],
    'rf__max_samples': [None, 100_000, 250_000],
    'rf__validation_fraction': [0.2]
    'n_iter_no_change': [2, 10]
}


I found that parameters that reduced variance error also brought my accuracy far below the baseline. Therefore, I made the conscious decision to overfit my model in order to obtain the best accuracy score for a pitcher who relies on a fastball most of the time. My logistic regression was able to account for pitchers with a more even mix, so I wanted to have a model that best fit a fastball-reliant pitcher. 


In [None]:
params = {
    'rf__n_estimators': [300],
    'rf__max_depth': [None],
    'rf__class_weight': ['balanced'],
    'rf__min_samples_split': [2],
    'rf__min_samples_leaf': [1],
    'rf__max_features': [0.33, 0.5],
    'rf__max_samples': [None],
}

In [None]:
grid = GridSearchCV(pipe, params, cv = 5,
                    scoring = 'accuracy', verbose = 0, n_jobs=-1)

grid.fit(X_train, y_train)

In [None]:
grid.score(X_train, y_train)
# Accuracy score: 0.9998033099336794

![figure-1](../files/rf-mc-accuracy-precision.png)

![figure-1](../files/download-4.png)

![figure-1](../files/download-5.png)

Binary Classification 

In [None]:
binary_df

In [None]:
X_2 = binary_df.drop(columns=['pitch'])
y_2 = binary_df['pitch']

In [None]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.25, 
                                                    random_state=16, stratify=y)

In [None]:
grid_2 = GridSearchCV(pipe, params, cv = 5,
                    scoring = 'accuracy', verbose = 1, n_jobs = -1)

grid_2.fit(X_train_2, y_train_2)

In [None]:
grid_2.score(X_train_2, y_train_2)
# score 0.9998088504989279

In [None]:
binary_classification_metrics(grid_2, X_test_2, y_test_2)

![figure-1](../files/rf-binary.png)

![figure-1](../files/download-6.png)

![figure-1](../files/download-7.png)