In [77]:
# TensorFlow and tf.keras
import tensorflow as tf

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping



# PreProcessing Data

In [78]:

file_path = 'finalMergedData.csv'
df = pd.read_csv(file_path)

Delete the features that we are able to acess before the play starts, including passResult, passLength, penaltyYards, playNullifiedByPenalty, passPossibility

In [79]:
# Add features to delete
features_to_delete = ['passResult', 'passProbability','prePenaltyPlayResult', 'penaltyYards', 'playNullifiedByPenalty', 'foulName1', 'foulName2', 'foulNFLId1', 'foulNFLId2', 'homeTeamWinProbabilityAdded', 'visitorTeamWinProbilityAdded','expectedPointsAdded', 'expectedPoints' ]
# Drop the unwanted features
df = df.drop(features_to_delete, axis=1)

In [80]:
# delete a few descriptive features:
Features_to_delete = ['playDescription', 'gameId', 'ballCarrierId', 'ballCarrierDisplayName', 'collegeName', 'Full Name', 'Team']
df = df.drop(Features_to_delete, axis=1)

In [81]:
column_data_types = df.dtypes
print(column_data_types)
df

playId                                int64
quarter                               int64
down                                  int64
yardsToGo                             int64
possessionTeam                       object
defensiveTeam                        object
yardlineSide                         object
yardlineNumber                        int64
gameClock                            object
preSnapHomeScore                      int64
preSnapVisitorScore                   int64
passLength                          float64
playResult                            int64
absoluteYardlineNumber                int64
offenseFormation                     object
defendersInTheBox                   float64
preSnapHomeTeamWinProbability       float64
preSnapVisitorTeamWinProbability    float64
frameId                               int64
playDirection                        object
x                                   float64
y                                   float64
distanceToEndZone               

Unnamed: 0,playId,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,gameClock,preSnapHomeScore,...,x,y,distanceToEndZone,height_x,weight_x,Position,yearsPro,overall_rating,teamId,age
0,3537,4,1,10,ATL,TB,ATL,41,7:52,21,...,63.87,29.23,63.87,6-3,261,TE,2.0,52.0,14.0,27.0
1,121,1,3,4,ATL,NO,ATL,42,13:41,0,...,52.77,26.42,67.23,6-3,261,TE,2.0,52.0,14.0,27.0
2,1217,2,4,1,ATL,LA,LA,13,5:30,14,...,101.06,8.47,18.94,6-3,261,TE,2.0,52.0,14.0,27.0
3,749,1,2,14,ATL,CLE,CLE,35,2:11,3,...,91.15,51.02,28.85,6-3,261,TE,2.0,52.0,14.0,27.0
4,959,2,2,7,ATL,CAR,ATL,49,13:40,0,...,66.42,39.05,53.58,6-3,261,TE,2.0,52.0,14.0,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11343,3088,3,2,6,BUF,KC,BUF,29,2:05,17,...,38.12,19.84,81.88,6-5,251,TE,3.0,64.0,3.0,27.0
11344,2612,3,1,10,DEN,IND,DEN,25,8:51,6,...,93.12,23.99,93.12,6-0,225,HB,3.0,63.0,27.0,25.0
11345,1204,2,2,7,DAL,NYG,NYG,14,9:58,3,...,99.29,14.20,20.71,6-3,195,WR,0.0,71.0,11.0,23.0
11346,2064,3,2,10,IND,JAX,IND,34,8:56,17,...,47.00,15.72,73.00,6-4,220,WR,2.0,64.0,10.0,24.0


In [82]:
# Combine all unique values from the relevant columns into one list
unique_teams = pd.unique(df[['possessionTeam', 'defensiveTeam', 'yardlineSide']].values.ravel('K'))

# Create a label encoder object
label_encoder = LabelEncoder()

# Fit the label encoder on the unique values
label_encoder.fit(unique_teams)

# Transform each column with the fitted label encoder
df['possessionTeam'] = label_encoder.transform(df['possessionTeam'])
df['defensiveTeam'] = label_encoder.transform(df['defensiveTeam'])
df['yardlineSide'] = label_encoder.transform(df['yardlineSide'])
df['offenseFormation'] = label_encoder.fit_transform(df['offenseFormation'])
df['Position'] = label_encoder.fit_transform(df['Position'])
df['playDirection'] = label_encoder.fit_transform(df['playDirection'])

In [83]:


# df = pd.get_dummies(df, columns=['possessionTeam', 'defensiveTeam', 'yardlineSide', 'offenseFormation', 'Position', 'playDirection'])
def clock_to_minutes(time_str):
    hours, minutes = map(int, time_str.split(':'))
    return hours * 60 + minutes
def height_to_cm(height):
    feet, inches = height.split('-')
    return int(feet) * 30.48 + int(inches) * 2.54
# Apply this function to each entry in the 'gameClock' column
df['gameClock'] = df['gameClock'].apply(clock_to_minutes)
df['height_x'] = df['height_x'].apply(height_to_cm)

df

Unnamed: 0,playId,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,gameClock,preSnapHomeScore,...,x,y,distanceToEndZone,height_x,weight_x,Position,yearsPro,overall_rating,teamId,age
0,3537,4,1,10,1,29,1,41,472,21,...,63.87,29.23,63.87,190.50,261,7,2.0,52.0,14.0,27.0
1,121,1,3,4,1,22,1,42,821,0,...,52.77,26.42,67.23,190.50,261,7,2.0,52.0,14.0,27.0
2,1217,2,4,1,1,16,16,13,330,14,...,101.06,8.47,18.94,190.50,261,7,2.0,52.0,14.0,27.0
3,749,1,2,14,1,7,7,35,131,3,...,91.15,51.02,28.85,190.50,261,7,2.0,52.0,14.0,27.0
4,959,2,2,7,1,4,1,49,820,0,...,66.42,39.05,53.58,190.50,261,7,2.0,52.0,14.0,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11343,3088,3,2,6,3,15,3,29,125,17,...,38.12,19.84,81.88,195.58,251,7,3.0,64.0,3.0,27.0
11344,2612,3,1,10,9,13,9,25,531,6,...,93.12,23.99,93.12,182.88,225,2,3.0,63.0,27.0,25.0
11345,1204,2,2,7,8,23,23,14,598,3,...,99.29,14.20,20.71,190.50,195,8,0.0,71.0,11.0,23.0
11346,2064,3,2,10,13,14,13,34,536,17,...,47.00,15.72,73.00,193.04,220,8,2.0,64.0,10.0,24.0


In [84]:
df['defendersInTheBox'] = df['defendersInTheBox'].fillna(df['defendersInTheBox'].mean())
df['passLength'] = df['passLength'].fillna(df['defendersInTheBox'].mean())
# Check if any row has at least one missing value
# Check if any row has at least one missing value
rows_with_missing_values = df.isnull().any(axis=1)
sum(rows_with_missing_values)
scaler = StandardScaler()

# Fit the scaler to the data and transform the data
columns_to_standardize = [col for col in df.columns if col not in ['playResult', 'passLength']]

# Subset the DataFrame to include only the columns to standardize
df_subset = df[columns_to_standardize]
df_standardized = scaler.fit_transform(df_subset)
df_subset = pd.DataFrame(df_standardized, columns=df_subset.columns)
df[columns_to_standardize] = df_subset
df

Unnamed: 0,playId,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,gameClock,preSnapHomeScore,...,x,y,distanceToEndZone,height_x,weight_x,Position,yearsPro,overall_rating,teamId,age
0,1.323072,1.274946,-0.918198,0.397173,-1.483311,1.471248,-1.512065,0.915164,0.113727,1.025407,...,0.143207,0.247832,0.027177,0.977365,2.277991,1.030017,-0.669202,-3.014546,-0.201423,0.548816
1,-1.588305,-1.361688,1.588349,-1.126446,-1.483311,0.718273,-1.512065,0.995623,1.416317,-1.193672,...,-0.310287,-0.046835,0.165632,0.977365,2.277991,1.030017,-0.669202,-3.014546,-0.201423,0.548816
2,-0.654210,-0.482810,2.841622,-1.888255,-1.483311,0.072865,0.065051,-1.337696,-0.416266,0.285714,...,1.662616,-1.929143,-1.824240,0.977365,2.277991,1.030017,-0.669202,-3.014546,-0.201423,0.548816
3,-1.053075,-1.361688,0.335075,1.412919,-1.483311,-0.895246,-0.881219,0.432408,-1.159003,-0.876660,...,1.257740,2.532818,-1.415882,0.977365,2.277991,1.030017,-0.669202,-3.014546,-0.201423,0.548816
4,-0.874097,-0.482810,0.335075,-0.364636,-1.483311,-1.217950,-1.512065,1.558838,1.412585,-1.193672,...,0.247388,1.277596,-0.396840,0.977365,2.277991,1.030017,-0.669202,-3.014546,-0.201423,0.548816
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11343,0.940400,0.396068,0.335075,-0.618573,-1.270331,-0.034703,-1.301783,-0.050347,-1.181397,0.602726,...,-0.908818,-0.736840,0.769310,1.743698,1.760168,1.030017,-0.257731,-1.728845,-1.397013,0.548816
11344,0.534716,0.396068,-0.918198,0.397173,-0.631393,-0.249839,-0.670937,-0.372184,0.333936,-0.559649,...,1.338225,-0.301655,1.232473,-0.172135,0.413829,-0.793824,-0.257731,-1.835987,1.211546,-0.221733
11345,-0.665290,-0.482810,0.335075,-0.364636,-0.737883,0.825841,0.801038,-1.257237,0.584003,-0.876660,...,1.590302,-1.328273,-1.751304,0.977365,-1.139639,1.394785,-1.492143,-0.978853,-0.527493,-0.992282
11346,0.067668,0.396068,0.335075,0.397173,-0.205435,-0.142271,-0.250372,0.351949,0.352598,0.602726,...,-0.546022,-1.168880,0.403395,1.360531,0.154918,1.394785,-0.669202,-1.728845,-0.636183,-0.607007


In [85]:
import pandas as pd
import seaborn as sns


# Assuming df is your DataFrame and 'playResult' is your target variable
X = df.drop('playResult', axis=1) # Features
y_final = df['playResult'] # Target variable
y_aux = df['passLength']

# Correlation analysis
# correlation_matrix = df.corr()
# print(correlation_matrix['playResult'].sort_values(ascending=False))
# plt.figure(figsize=(30,20))
# sns.heatmap(correlation_matrix, annot=True, fmt=".2f")
# plt.show()


In [86]:
# frameId is unrealeted, decide to delete it
df = df.drop('frameId', axis=1)

# Split data into trainig, validation and test sets

In [87]:

# Let's say you have y_aux as your auxiliary target (intermediary play result)
# and y_final as your final target to predict

# First, you split your data into training and a temporary set (combining validation and test)
X_train, X_temp, y_aux_train, y_aux_temp, y_final_train, y_final_temp = train_test_split(
    X, y_aux, y_final, test_size=0.4, random_state=42
)

# Now split the temporary set further into validation and test sets for both auxiliary and final targets
X_val, X_test, y_aux_val, y_aux_test, y_final_val, y_final_test = train_test_split(
    X_temp, y_aux_temp, y_final_temp, test_size=0.5, random_state=42
)


# Set up model

In [88]:
#simplified model without tuning
def create_model(number_of_features, learning_rate, dropout_rate, activation):
    # Input layer
    inputs = Input(shape=(number_of_features,))
    x = inputs

    x = Dense(20, activation=activation)(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(20, activation=activation)(x)
    x = Dropout(dropout_rate)(x)
    # Auxiliary output (assuming it's a regression task)
    auxiliary_output = Dense(1, name='aux_output')(x)
    x = Dense(20, activation=activation)(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(20, activation=activation)(x)
    x = Dropout(dropout_rate)(x)

    # Main output (assuming it's a regression task)
    main_output = Dense(1, name='main_output')(x)

    # Define model
    model = Model(inputs=inputs, outputs=[main_output, auxiliary_output])

    # Compile model
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer,
                  loss={'main_output': 'mean_squared_error', 'aux_output': 'mean_squared_error'},
                  loss_weights={'main_output': 1., 'aux_output': 0.4},
                  metrics={'main_output': 'mean_squared_error', 'aux_output': 'mean_squared_error'})

    return model


# Define the EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_main_output_loss',  # Monitor the validation loss of the main output
    mode='min',
    patience=5,  # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True  # Restore model weights from the epoch with the best value of the monitored quantity
)

# Hyperparameters
model = create_model(
    number_of_features=X_train.shape[1],
    learning_rate=0.01,
    dropout_rate=0.0,
    activation='relu'
)

# Fit the model with the EarlyStopping callback
history = model.fit(
    X_train, [y_final_train, y_aux_train],
    validation_data=(X_val, [y_final_val, y_aux_val]),
    epochs=100,  # Set to a high value; EarlyStopping will determine the actual number
    batch_size=32,
    callbacks=[early_stopping]  # Add the EarlyStopping callback here
)





Epoch 1/100
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 806us/step - aux_output_mean_squared_error: 15.5293 - loss: 76.6106 - main_output_mean_squared_error: 70.3994 - val_aux_output_mean_squared_error: 1.9837 - val_loss: 62.6003 - val_main_output_mean_squared_error: 61.8185
Epoch 2/100
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 439us/step - aux_output_mean_squared_error: 1.0709 - loss: 54.0936 - main_output_mean_squared_error: 53.6656 - val_aux_output_mean_squared_error: 1.1768 - val_loss: 59.5793 - val_main_output_mean_squared_error: 59.1244
Epoch 3/100
[1m129/213[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 394us/step - aux_output_mean_squared_error: 0.8098 - loss: 55.5547 - main_output_mean_squared_error: 55.2308

  current = self.get_monitor_value(logs)


[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 507us/step - aux_output_mean_squared_error: 0.7799 - loss: 55.1311 - main_output_mean_squared_error: 54.8185 - val_aux_output_mean_squared_error: 0.4378 - val_loss: 57.5259 - val_main_output_mean_squared_error: 57.3663
Epoch 4/100
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 441us/step - aux_output_mean_squared_error: 0.6010 - loss: 52.1267 - main_output_mean_squared_error: 51.8853 - val_aux_output_mean_squared_error: 0.8055 - val_loss: 57.8958 - val_main_output_mean_squared_error: 57.5901
Epoch 5/100
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 436us/step - aux_output_mean_squared_error: 0.7769 - loss: 47.8366 - main_output_mean_squared_error: 47.5260 - val_aux_output_mean_squared_error: 0.4504 - val_loss: 58.2718 - val_main_output_mean_squared_error: 58.1077
Epoch 6/100
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 443us/step - aux_output_mean_squared_error:

In [89]:
print(history.history.keys())
val_loss = np.mean(history.history['val_loss'])
print(f"Validation loss: {val_loss}")

dict_keys(['aux_output_mean_squared_error', 'loss', 'main_output_mean_squared_error', 'val_aux_output_mean_squared_error', 'val_loss', 'val_main_output_mean_squared_error'])
Validation loss: 63.962181930541995


In [90]:
from sklearn.metrics import mean_absolute_error
predictions = model.predict(X_test)

# If your model has multiple outputs, you will get multiple arrays of predictions
# For example, if your model has two outputs (main output and auxiliary output)
mean_absolute_error(predictions[0],y_final_test)

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 533us/step


4.6460918018996455

In [91]:
# with tuning
def create_model(number_of_features, learning_rate, n_layers, n_units, dropout_rate, activation):
    # Input layer
    inputs = Input(shape=(number_of_features,))
    x = inputs

    # Add hidden layers dynamically based on n_layers
    for i in range(n_layers):
        x = Dense(n_units, activation=activation)(x)
        x = Dropout(dropout_rate)(x)

    # Auxiliary output (assuming it's a regression task)
    auxiliary_output = Dense(1, name='aux_output')(x)

    # Additional layers leading to the main task (if needed)
    for i in range(n_layers):
        x = Dense(n_units, activation=activation)(x)
        x = Dropout(dropout_rate)(x)

    # Main output (assuming it's a regression task)
    main_output = Dense(1, name='main_output')(x)

    # Define model
    model = Model(inputs=inputs, outputs=[main_output, auxiliary_output])

    # Compile model
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer,
                  loss={'main_output': 'mean_squared_error', 'aux_output': 'mean_squared_error'},
                  loss_weights={'main_output': 1., 'aux_output': 0.4},
                  metrics={'main_output': 'mean_squared_error', 'aux_output': 'mean_squared_error'})

    return model


# Define the EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_main_output_loss',  # Monitor the validation loss of the main output
    patience=5,  # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True  # Restore model weights from the epoch with the best value of the monitored quantity
)

# Hyperparameters
learning_rates = [0.001, 0.0001]
number_of_layers_options = [1, 2, 3]
units_options = [32, 64, 128]
dropout_rates = [0.0, 0.2]
activations = ['relu', 'tanh', 'sigmoid']

# Iterate over hyperparameters
for lr in learning_rates:
    for n_layers in number_of_layers_options:
        for n_units in units_options:
            for dropout_rate in dropout_rates:
                for activation in activations:
                    # Create and compile a new model
                    model = create_model(
                        number_of_features=X_train.shape[1],
                        learning_rate=lr,
                        n_layers=n_layers,
                        n_units=n_units,
                        dropout_rate=dropout_rate,
                        activation=activation
                    )

                    # Fit the model with the EarlyStopping callback
                    history = model.fit(
                        X_train, [y_final_train, y_aux_train],
                        validation_data=(X_val, [y_final_val, y_aux_val]),
                        epochs=100,  # Set to a high value; EarlyStopping will determine the actual number
                        batch_size=32,
                        callbacks=[early_stopping]  # Add the EarlyStopping callback here
                    )

                    # Evaluate the model using the validation set
                    val_loss = np.mean(history.history['val_main_output_loss'])
                    print(f"Validation loss with lr={lr}, n_layers={n_layers}, n_units={n_units}, dropout_rate={dropout_rate}, activation={activation}: {val_loss}")




Epoch 1/100
[1m143/213[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 353us/step - aux_output_mean_squared_error: 32.5258 - loss: 110.7149 - main_output_mean_squared_error: 97.7045   

ValueError: EarlyStopping callback received monitor=val_main_output_loss but Keras isn't able to automatically determine whether that metric should be maximized or minimized. Pass `mode='max'` in order to do early stopping based on the highest metric value, or pass `mode='min'` in order to use the lowest value.