In [59]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [60]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, f1_score

In [61]:
pbpfoc4gomdlF = pd.read_csv('../data/pbpfoc4gomdlF.csv')

In [62]:
X = pbpfoc4gomdlF[['game_seconds_remaining','ydstogo','rush_attempt','pass_length_deep','qb_dropback','defteam_score']]
y = pbpfoc4gomdlF['fourth_down_converted']

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)                                                   

In [64]:
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [65]:
knn = KNeighborsClassifier()
knn.fit(X_train_sc, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [66]:
knn.score(X_train, y_train)

0.6047215496368039

In [67]:
knn.score(X_test, y_test)

0.6196189131968949

In [68]:
y_preds = knn.predict(X_test)

In [69]:
confusion_matrix(y_test, y_preds)

array([[583, 126],
       [413, 295]])

In [70]:
tn, fp, fn, tp = confusion_matrix(y_test, y_preds).ravel()

In [71]:
print("True Negatives: " + str(tn))
print("False Positives: " + str(fp))
print("False Negatives: " + str(fn))
print("True Positives: " + str(tp))
print('===========================')
acc = (tp + tn) / (tp + tn + fp + fn)
print(f'Accuracy: {round(acc,4)}')
prec = tp / (tp + fp)
print(f'Precision: {round(prec,4)}')
spec = tn / (tn + fp)
print(f'Specificity (Recall): {round(spec,4)}')
sens = tp / (tp + fn)
print(f'Sensitivity: {round(sens,4)}')

True Negatives: 583
False Positives: 126
False Negatives: 413
True Positives: 295
Accuracy: 0.6196
Precision: 0.7007
Specificity (Recall): 0.8223
Sensitivity: 0.4167


In [72]:
# calculate baseline score
y.value_counts()

0.0    2383
1.0    2338
Name: fourth_down_converted, dtype: int64

In [74]:
baseline_score = 2338 / (2383+2338)
baseline_score

0.4952340605803855