In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation and
# convergence warnings — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, f1_score

In [5]:
# Load the play-by-play modelling table from the project's data folder.
pbpfoc4gomdlF = pd.read_csv(
    filepath_or_buffer='../data/pbpfoc4gomdlF.csv',
)

In [6]:
# Count missing values per column (isna is the same method as isnull).
pbpfoc4gomdlF.isna().sum()

Unnamed: 0                       0
date_id                          0
yardline_100                     0
quarter_seconds_remaining        0
half_seconds_remaining           0
game_seconds_remaining           0
qtr                              0
down                             0
goal_to_go                       0
ydstogo                          0
play_type                        0
yards_gained                     4
shotgun                          0
no_huddle                        0
qb_dropback                      0
qb_scramble                      0
air_yards                     1929
yards_after_catch             3257
posteam_timeouts_remaining       0
posteam_score                    0
defteam_score                    0
score_differential               0
fourth_down_converted            0
fourth_down_failed               0
rush_attempt                     0
pass_attempt                     0
temp                           965
humd                          1284
wspd                

In [7]:
# Predictor columns fed to the classifier.
X = pbpfoc4gomdlF[[
    'game_seconds_remaining',
    'ydstogo',
    'rush_attempt',
    'pass_length_deep',
    'qb_dropback',
    'defteam_score',
]]

# Target: whether the fourth-down attempt was converted.
y = pbpfoc4gomdlF['fourth_down_converted']

In [8]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
# Hyper-parameter grid for the logistic regression search:
# regularisation strength C crossed with the penalty type.
parameters = dict(
    C=[0.001, 0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
)

In [10]:
# 5-fold grid search over `parameters`, selecting the candidate with the best precision.
# The solver is pinned to 'liblinear' because the grid includes penalty='l1', which the
# 'lbfgs' solver (the LogisticRegression default from scikit-learn 0.22 onward) rejects.
# On older versions, where the default is reported as solver='warn', liblinear is what
# gets used anyway, so the search result is unchanged there.
gs_results = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear', random_state=42),
    param_grid=parameters,
    scoring='precision',
    cv=5,
).fit(X_train, y_train)

In [11]:
# Show the estimator refit with the best-scoring parameter combination from the search.
gs_results.best_estimator_

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [12]:
# Fit the configuration selected by the grid search (C=10, L1 penalty) on the training split.
# solver='liblinear' is named explicitly: the L1 penalty is not supported by 'lbfgs', the
# default solver from scikit-learn 0.22 onward. On versions that report solver='warn',
# liblinear is already what runs, so the fitted model is the same.
lr = LogisticRegression(C=10, penalty='l1', solver='liblinear', random_state=42)
lr.fit(X_train, y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [13]:
# `lr` was already fitted on (X_train, y_train) in the preceding cell; fitting it a second
# time on the same data with the same seed only repeats the work. Display the fitted
# estimator instead (same repr as the value returned by .fit()).
lr

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [14]:
# Mean accuracy on the training split.
lr.score(X=X_train, y=y_train)

0.6346852300242131

In [15]:
# Mean accuracy on the held-out test split.
lr.score(X=X_test, y=y_test)

0.6224417784050812

In [16]:
# Predicted class labels for the held-out test rows.
y_preds = lr.predict(X=X_test)

In [17]:
# Confusion matrix on the test split: rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_preds)

array([[404, 305],
       [230, 478]])

In [18]:
# Unpack the 2x2 confusion matrix row by row:
# first row is the true-negative class (tn, fp), second row the true-positive class (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_true=y_test, y_pred=y_preds)

In [19]:
# Report the four confusion-matrix cells, then the metrics derived from them.
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")
print('===========================')

# Accuracy: share of all predictions that are correct.
acc = (tp + tn) / (tp + tn + fp + fn)
print(f'Accuracy: {round(acc,4)}')

# Precision: share of predicted positives that are truly positive.
prec = tp / (tp + fp)
print(f'Precision: {round(prec,4)}')

# Specificity: share of actual negatives correctly predicted negative (true-negative rate).
spec = tn / (tn + fp)
print(f'Specificity: {round(spec,4)}')

# Sensitivity: share of actual positives correctly predicted positive. This is the metric
# also known as recall, so the "(Recall)" tag belongs here and not on specificity.
sens = tp / (tp + fn)
print(f'Sensitivity (Recall): {round(sens,4)}')

True Negatives: 404
False Positives: 305
False Negatives: 230
True Positives: 478
Accuracy: 0.6224
Precision: 0.6105
Specificity (Recall): 0.5698
Sensitivity: 0.6751


In [20]:
# Class balance of the target: number of rows per class (missing values are not counted).
# These counts are the basis for the baseline score.
y.value_counts()

0.0    2383
1.0    2338
Name: fourth_down_converted, dtype: int64

In [21]:
# Share of non-missing target rows in the positive class (fourth_down_converted == 1),
# i.e. the accuracy/precision obtained by always predicting "converted".
# Computed from `y` rather than from hand-copied counts so it stays in sync with the data.
# NOTE(review): the class counts shown above put class 0 slightly ahead (2383 vs 2338), so a
# majority-class accuracy baseline would be 1 - baseline_score — confirm which is intended.
baseline_score = (y == 1).sum() / y.notna().sum()
baseline_score

0.4952340605803855