In [17]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

import tensorflow as tf
import tensorflow.keras.layers as layers

#### Starting the data analysis

In [10]:
nfl_df = pd.read_csv('../data/cleaned_data.csv')

In [None]:
away_cols = [col for col in nfl_df.columns if 'away' in col]
nfl_df.drop(away_cols, axis=1, inplace=True)

In [None]:
nfl_df.describe()

Unnamed: 0,home_team,home_st_downs,home_passing_st_downs,home_rushing_st_downs,home_st_downs_from_penalties,home_rd_down_efficiency,home_th_down_efficiency,home_total_plays,home_total_yards,home_total_drives,...,home_rushing,home_rushing_attempts,home_yards_per_rush,home_red_zone_made_att,home_penalties,home_turnovers,home_fumbles_lost,home_defensive_special_teams_tds,home_possession,home_wins
count,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,...,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0
mean,15.427061,20.295278,12.182523,6.036646,2.07611,0.393012,0.263789,63.659619,351.127555,11.508809,...,112.804087,26.768147,4.144397,0.482692,0.126379,1.399577,0.558844,0.341085,1808.558844,0.564482
std,9.247468,4.927634,3.911551,2.97238,1.528705,0.141181,0.402143,8.347047,84.108843,1.733402,...,52.111857,7.643905,1.331432,0.350088,0.045779,1.278674,0.763148,0.820624,258.007506,0.495999
min,0.0,5.0,2.0,0.0,0.0,0.0,0.0,36.0,106.0,0.0,...,7.0,6.0,0.7,0.0,0.0,0.0,0.0,0.0,1085.0,0.0
25%,7.0,17.0,10.0,4.0,1.0,0.3,0.0,58.0,292.0,10.0,...,77.0,21.0,3.3,0.2,0.101868,0.0,0.0,0.0,1637.0,0.0
50%,16.0,20.0,12.0,6.0,2.0,0.388889,0.0,64.0,349.0,11.0,...,104.0,26.0,4.0,0.5,0.120879,1.0,0.0,0.0,1809.0,1.0
75%,23.0,24.0,15.0,8.0,3.0,0.5,0.5,69.0,408.0,13.0,...,141.0,32.0,4.9,0.75,0.142857,2.0,1.0,0.0,1993.0,1.0
max,31.0,40.0,25.0,19.0,10.0,0.846154,1.0,92.0,625.0,18.0,...,352.0,54.0,10.6,1.0,1.0,7.0,4.0,6.0,2569.0,1.0


In [None]:
nfl_df.corr()

Unnamed: 0,home_team,home_st_downs,home_passing_st_downs,home_rushing_st_downs,home_st_downs_from_penalties,home_rd_down_efficiency,home_th_down_efficiency,home_total_plays,home_total_yards,home_total_drives,...,home_rushing,home_rushing_attempts,home_yards_per_rush,home_red_zone_made_att,home_penalties,home_turnovers,home_fumbles_lost,home_defensive_special_teams_tds,home_possession,home_wins
home_team,1.0,-0.066051,-0.011456,-0.096987,0.004982,-0.024526,0.00785,-0.030412,-0.072112,-0.009078,...,-0.062358,-0.056882,-0.043932,-0.028863,-0.014204,0.004345,-0.001265,0.022796,-0.008188,-0.042908
home_st_downs,-0.066051,1.0,0.734739,0.500388,0.370459,0.519295,0.104975,0.679953,0.777565,-0.095459,...,0.34027,0.344951,0.180593,0.229265,-0.033474,-0.030155,-0.018032,-0.07829,0.546366,0.226353
home_passing_st_downs,-0.011456,0.734739,1.0,-0.133229,0.068673,0.390836,0.103935,0.566405,0.655377,-0.038773,...,-0.185199,-0.127436,-0.15158,0.146755,-0.027917,0.083402,0.038097,-0.094985,0.304029,0.021373
home_rushing_st_downs,-0.096987,0.500388,-0.133229,1.0,0.009474,0.335954,0.030817,0.289263,0.414545,-0.096832,...,0.817264,0.705823,0.545228,0.148765,-0.023635,-0.166397,-0.08054,0.014243,0.441767,0.341845
home_st_downs_from_penalties,0.004982,0.370459,0.068673,0.009474,1.0,0.020632,0.012516,0.180047,0.023438,-0.020213,...,-0.018367,0.065604,-0.090153,0.074251,0.009487,0.012932,0.000994,-0.037011,0.124265,0.010264
home_rd_down_efficiency,-0.024526,0.519295,0.390836,0.335954,0.020632,1.0,-0.121319,0.302829,0.485504,-0.329191,...,0.266116,0.341697,0.082819,0.238892,-0.032656,-0.115218,-0.026663,-0.033869,0.3782,0.285271
home_th_down_efficiency,0.00785,0.104975,0.103935,0.030817,0.012516,-0.121319,1.0,0.150239,0.05298,-0.03238,...,-0.021258,0.015708,-0.051963,0.016023,0.012422,0.012295,0.002869,-0.037769,0.095404,-0.003695
home_total_plays,-0.030412,0.679953,0.566405,0.289263,0.180047,0.302829,0.150239,1.0,0.539669,0.306177,...,0.192563,0.382507,-0.069101,0.002331,-0.038334,0.13043,0.058446,-0.120998,0.697739,0.068074
home_total_yards,-0.072112,0.777565,0.655377,0.414545,0.023438,0.485504,0.05298,0.539669,1.0,0.018376,...,0.429977,0.342207,0.311719,0.173023,-0.058739,-0.062427,-0.02214,-0.052668,0.468796,0.30754
home_total_drives,-0.009078,-0.095459,-0.038773,-0.096832,-0.020213,-0.329191,-0.03238,0.306177,0.018376,1.0,...,-0.007975,0.087362,-0.094406,-0.005825,-0.027178,0.306562,0.186329,0.023666,0.089754,0.013488


##### Because 4th downs occur at such a low rate in games we can get rid of the 4th down columns. 
##### It also looks like the sack yardage doesn't play into the winning probability of the home team and home penalties

In [None]:
removals = [
    'home_th_down_efficiency', 
    'home_total_plays', 
    'home_total_drives', 
    'home_sacks_yards_lost',
    'home_st_downs_from_penalties', 
    'home_penalties']
nfl_df.drop(removals, axis=1, inplace=True)

In [None]:
home_win_corr = nfl_df.corr()['home_wins']
print(home_win_corr)

home_team                          -0.042908
home_st_downs                       0.226353
home_passing_st_downs               0.021373
home_rushing_st_downs               0.341845
home_rd_down_efficiency             0.285271
home_total_yards                    0.307540
home_yards_per_play                 0.320065
home_passing                        0.077728
home_comp_att                       0.337767
home_yards_per_pass                 0.441426
home_interceptions_thrown          -0.374637
home_rushing                        0.380635
home_rushing_attempts               0.583820
home_yards_per_rush                 0.034106
home_red_zone_made_att              0.145718
home_turnovers                     -0.409263
home_fumbles_lost                  -0.204258
home_defensive_special_teams_tds    0.214480
home_possession                     0.392262
home_wins                           1.000000
Name: home_wins, dtype: float64


#### Separating the labels from the DataFrame

In [11]:
labels = nfl_df.pop('home_wins')

In [12]:
def min_max_scaler(data: pd.Series) -> pd.Series:
    """ Function to transform the data into the range 0 to 1. """
    min_val = data.min()
    max_val = data.max()

    f = lambda x: (x - min_val) / (max_val - min_val)

    return data.map(f)

In [13]:
nfl_df_scaled = nfl_df.apply(min_max_scaler)

In [None]:
nfl_df_scaled.head()

Unnamed: 0,home_team,home_st_downs,home_passing_st_downs,home_rushing_st_downs,home_rd_down_efficiency,home_total_yards,home_yards_per_play,home_passing,home_comp_att,home_yards_per_pass,home_interceptions_thrown,home_rushing,home_rushing_attempts,home_yards_per_rush,home_red_zone_made_att,home_turnovers,home_fumbles_lost,home_defensive_special_teams_tds,home_possession
0,0.483871,0.285714,0.391304,0.157895,0.393939,0.314066,0.346154,0.34433,0.557796,0.352,0.0,0.217391,0.270833,0.363636,0.0,0.142857,0.25,0.0,0.315364
1,0.806452,0.6,0.565217,0.421053,0.393939,0.620424,0.487179,0.606186,0.473118,0.608,0.2,0.310145,0.5625,0.282828,0.0,0.142857,0.0,0.0,0.70283
2,0.645161,0.2,0.217391,0.157895,0.181818,0.200385,0.166667,0.187629,0.086022,0.168,0.8,0.266667,0.333333,0.383838,0.0,0.571429,0.0,0.333333,0.240566
3,0.967742,0.657143,0.913043,0.263158,0.525253,0.622351,0.525641,0.672165,0.573477,0.496,0.6,0.22029,0.25,0.393939,0.0,0.428571,0.0,0.0,0.477763
4,0.709677,0.314286,0.478261,0.052632,0.422078,0.342967,0.307692,0.503093,0.585146,0.4,0.2,0.037681,0.208333,0.060606,0.0,0.285714,0.25,0.0,0.440701


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    nfl_df_scaled, labels, test_size=0.3, random_state=42)

In [15]:
X_train.shape

(993, 50)

In [18]:
input_data = layers.Input(shape=(X_train.shape[1],))
x = layers.Dense(64, activation='relu')(input_data)
x = layers.Dense(32, activation='relu')(x)
x = layers.Dense(16, activation='relu')(x)
x = layers.Dense(8, activation='relu')(x)
output_layer = layers.Dense(1)(x)

model = tf.keras.Model(inputs=input_data, outputs=output_layer, name='linear_regression')

In [19]:
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.003),
    metrics=['accuracy']
)
model.fit(X_train, y_train, epochs=100, validation_split=0.3)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fcfe04b4ad0>

In [20]:
y_pred = model.predict(X_test)

In [21]:
y_pred = [1 if y >= 0.5 else 0 for y in y_pred]

In [22]:
list(zip(y_test, y_pred))

[(1, 1),
 (0, 0),
 (1, 1),
 (0, 1),
 (0, 0),
 (1, 1),
 (1, 1),
 (0, 0),
 (1, 1),
 (0, 0),
 (1, 1),
 (0, 0),
 (0, 0),
 (1, 1),
 (0, 0),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (0, 0),
 (1, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (1, 1),
 (1, 1),
 (0, 0),
 (1, 1),
 (0, 1),
 (0, 0),
 (1, 1),
 (1, 0),
 (0, 0),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (0, 0),
 (1, 1),
 (1, 1),
 (1, 1),
 (0, 0),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (0, 0),
 (1, 0),
 (1, 0),
 (0, 0),
 (1, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (0, 0),
 (1, 1),
 (0, 0),
 (0, 0),
 (1, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (1, 1),
 (0, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (1, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (1, 1),
 (1, 1),
 (0, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (1, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (0, 0),
 (1, 1),
 (0, 0),
 (1, 1),
 (1, 1),
 (1, 1),
 

In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[153  29]
 [ 16 228]]
0.8943661971830986
