In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import boto3

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.preprocessing import MinMaxScaler
import sklearn.metrics as metrics

In [2]:
random.seed(16)

In [3]:
def plot_multiclass_roc(clf, X_test, y_test, n_classes, figsize=(17, 6)):
    
    y_score = clf.predict_proba(X_test)

    # structures
    fpr = dict()
    tpr = dict()
    #hpr = dict()
    roc_auc = dict()

    # calculate dummies once
    y_test_dummies = pd.get_dummies(y_test, drop_first=False).values
    y_test_columns = ['Fastball', 'Breaking Ball', 'Changeup']
    
    
    for i in range(n_classes):
        fpr[i], tpr[i], _ = metrics.roc_curve(y_test_dummies[:, i], y_score[:, i])
        roc_auc[i] = metrics.auc(fpr[i], tpr[i])

    # roc for each class
    fig, ax = plt.subplots(figsize=figsize)
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    for i in range(n_classes):
        ax.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f) for %s' % (roc_auc[i], y_test_columns[i])) #% (roc_auc[i], i))
    ax.legend(loc="best")
    ax.grid(alpha=.4)
    sns.despine()
    plt.show()

In [4]:
def multiclass_classification_metrics(gs, X_test, y_test):
    
    y_hat = gs.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_hat)

    #specificity = true negative/(true neagtive +false positive)
    specificity = 1984/(1984+21) 

    sensitivity =  metrics.recall_score(y_test, y_hat, average='macro')

    precision = metrics.precision_score(y_test, y_hat, average='macro')

    f1 = metrics.f1_score(y_test, y_hat, average='macro')
    
    
    print('My accuracy is: ', round(accuracy,4))
    print('My specificity is: ', round(specificity, 4))
    print('My sensitivity is: ', round(sensitivity,4))
    print('My precision is: ', round(precision,4))
    print('My f1 score is: ', round(precision,4))
    
#     cm = np.array([['True Negative', 'False Positive'],
#                             ['False Negative', 'True Positive']])

#     cm = pd.DataFrame(cm,columns = ['Pred Offspeed', 'Pred Fastball'], 
#                       index = ['Actual Offspeed','Actual Fastball'])
    
    # Displaying sample confusion matrix
#     display(cm)

    # Displaying actual confusion matrix 
    metrics.plot_confusion_matrix(gs, X_test, y_test, cmap='Accent', 
                          values_format='d', display_labels=[ 'Change-up',
                                                              'Breaking Ball', 
                                                             'Fastball']);
    
    plot_multiclass_roc(gs, X_test, y_test, 3, figsize=(17, 6))

In [5]:
def binary_classification_metrics(gs, X_test, y_test):
    
    y_hat = gs.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_hat)

    #specificity = true negative/(true neagtive +false positive)
    specificity = 1984/(1984+21) 

    sensitivity =  metrics.recall_score(y_test, y_hat)

    precision = metrics.precision_score(y_test, y_hat)

    f1 = metrics.f1_score(y_test, y_hat)
    print('My accuracy is: ', round(accuracy,4))
    print('My specificity is: ', round(specificity, 4))
    print('My sensitivity is: ', round(sensitivity,4))
    print('My precision is: ', round(precision,4))
    print('My f1 score is: ', round(precision,4))
    
    cm = np.array([['True Negative', 'False Positive'],
                            ['False Negative', 'True Positive']])

    cm = pd.DataFrame(cm,columns = ['Pred Offspeed', 'Pred Fastball'], 
                      index = ['Actual Offspeed','Actual Fastball'])
    
    # Displaying sample confusion matrix
    display(cm)

    # Displaying actual confusion matrix 
    metrics.plot_confusion_matrix(gs, X_test, y_test, cmap='Accent', 
                          values_format='d', display_labels=['Offspeed Pitch', 
                                                             'Fastball Pitch']);
    
    metrics.plot_roc_curve(gs, X_test, y_test)
    # add worst case scenario line
    plt.plot([0, 1], [0, 1])
    plt.title('ROC AUC Curve');
    
    return f'My ROC AUC score is: {metrics.roc_auc_score(y_test, y_hat)}'

In [6]:
# Specifying s3 as boto3 resource and connecting to bucket
s3 = boto3.resource('s3')
bucketname = 'baseballstats'

# Name of files within the bucket
binary_label = 'statcast_data/clean_2019_binary_label.csv'

# multiclass 2019 DataFrame
multiclass_label = 'statcast_data/clean_2019_3.csv'

# Specifying location of files
multiclass_obj = s3.Object(bucketname, multiclass_label)
binary_obj = s3.Object(bucketname, binary_label)

# Getting the body of the files
multiclass = multiclass_obj.get()['Body']
binary = binary_obj.get()['Body']

# Reading the boto3 object into a csv

# df is my default for binary labels
df = pd.read_csv(multiclass)

# df_3 is my default for 
binary_df = pd.read_csv(binary)

In [7]:
# Specifying s3 as boto3 resource and connecting to bucket
s3 = boto3.resource('s3')
bucketname = 'baseballstats'

# Name of files within the bucket
binary_label = 'statcast_data/clean_2019_binary_label.csv'

# multiclass 2019 DataFrame
multiclass_label = 'statcast_data/clean_2019_3.csv'

# Specifying location of files
multiclass_obj = s3.Object(bucketname, multiclass_label)
binary_obj = s3.Object(bucketname, binary_label)

# Getting the body of the files
multiclass = multiclass_obj.get()['Body']
binary = binary_obj.get()['Body']

# Reading the boto3 object into a csv

# df is my default for binary labels
df = pd.read_csv(multiclass)

# df_3 is my default for 
binary_df = pd.read_csv(binary)

## Dummy Independent Variables

Logistic regression performed better with dummy variables, therefore I wanted to perform my KNN model with the same dummy variables.

In [9]:
df = pd.get_dummies(data=df, columns=['balls', 'strikes', 'outs_when_up',
                                           'inning', 'previous_pitch', 
                                            'previous_zone'], drop_first=True)

In [10]:
df.isna().sum()

on_1b                        0
pitch                        0
changeup_in_sequence         0
breaking_ball_in_sequence    0
fastball_in_sequence         0
                            ..
previous_zone_9.0            0
previous_zone_11.0           0
previous_zone_12.0           0
previous_zone_13.0           0
previous_zone_14.0           0
Length: 62, dtype: int64

In [11]:
X = df.drop(columns='pitch')
y = df['pitch']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=16, stratify=y)

In [13]:
knn_pipe = Pipeline([
    ('mm', MinMaxScaler()),
    ('knn', KNeighborsClassifier())
    #'knn_c', KNeighborsClassifier())
])

knn_params = {
    'knn__algorithm': ['brute'],
    'knn__n_neighbors': [250, 500],
    'knn__weights': ['uniform'],
    'knn__p':[1,2]
}


Best params:
{'knn__algorithm': 'brute',
 'knn__n_neighbors': 250,
 'knn__p': 1,
 'knn__weights': 'uniform'}

In [14]:
knn_gridsearch = GridSearchCV(knn_pipe, knn_params, cv=3, verbose=1, n_jobs=-1, scoring='accuracy')
knn_gridsearch.fit(X_train, y_train)

In [15]:
multiclass_classification_metrics(knn_gridsearch, X_test, y_test)

In [16]:
knn_gridsearch.score(X_train, y_train)
# Value is 0.5888401934765385

![figure-2](../files/MC-KNN-accuracy-precision.png)

![figure-1](../files/download.png)



![figure-1](../files/download-1.png)


## Binary Classification 

In [17]:
binary_df = pd.get_dummies(data=binary_df, columns=['balls', 'strikes', 'outs_when_up',
                                           'inning', 'previous_pitch', 
                                            'previous_zone'], drop_first=True)

In [18]:
X_2 = binary_df.drop(columns='pitch')
y_2 = binary_df['pitch']

In [19]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.25, 
                                                    random_state=16, stratify=y)

In [20]:
knn_gridsearch_2 = GridSearchCV(knn_pipe, knn_params, cv=3, verbose=0, n_jobs=-1)
knn_gridsearch_2.fit(X_train_2, y_train_2)

Best Params:
{'knn__algorithm': 'brute',
 'knn__n_neighbors': 250,
 'knn__p': 1,
 'knn__weights': 'uniform'}

In [21]:
knn_gridsearch_2.score(X_train_2, y_train_2)
# Score is 0.6048524270446071

In [22]:
binary_classification_metrics(knn_gridsearch_2, X_test_2, y_test_2)

![figure-2](../files/Binary-accuracy-precision.png) 

![figure-1](../files/download-2.png)

![figure-1](../files/download-3.png)

## Model Interpretations

* KNN did not significantly improve the accuracy of my model. 
* Furthermore, I tested SVM Classifier with PCA selection, and Gaussian Naive Bayes Classifier. All of the scores were not worth displaying because they could not beat the logistic regression or random forest models. 