<font color=teal>
_______________________________________
</font>


### <font color=teal>Goal:</font>

- Merge play actions and offense/defense power scores into a play by play dataset focused on play-calling

### <font color=teal>Input:</font>

- pbp_actions.parquet
- defense_power.parquet
- offense_power.parquet


### <font color=teal>Steps:</font>
- merge offense and defense scores into each play based on which team is on offense and which is on defense
- save the final play-calling dataset


### <font color=teal>Code:</font>
- /src module



### <font color=teal>Output:</font>

- nfl_pbp_play_calls.parquet



<font color=teal>
_______________________________________
</font>

In [None]:
import os
import sys

# Make the project's source directory importable from this notebook.
# NOTE(review): later cells run `from src import *`, which needs the *parent* of
# `src` on sys.path rather than `src` itself — confirm this is the intended path.
sys.path.append(os.path.abspath("../src"))

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense
from tensorflow.keras.models import Model

import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and pandas copy warnings raised by later cells.
warnings.filterwarnings('ignore')

In [None]:
from src import *

In [None]:
# Debug toggle; not referenced by the cells visible in this part of the notebook.
DEBUG = False

# Root folder of the parquet inputs (`get_config` is presumably provided by `from src import *`).
data_directory = get_config('data_directory')

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names, so prefer the new name and fall back to the legacy one.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)

In [None]:
# Load the play-calling dataset from the configured data directory.
full_path = os.path.join(data_directory, "nfl_pbp_play_calls.parquet")
# `pd` is not imported explicitly in this notebook; presumably it is re-exported by `from src import *`.
pbp_actions_df = pd.read_parquet(full_path)
# Preview the first rows.
pbp_actions_df.head()


Many of these are interesting and needed just to understand and validate the information, but they have varying effectiveness for a play call predictor

- drop: season, week, play counter, -- unless we can use this to weight more recent seasons
- not sure:
        - drive - we get a sense of time using seconds remaining, point differential, yards_to_goal, etc.
        - posteam - we could label this, but really offense and defense power identifies the team better for this type of application
        - defteam - it would just take a lot longer to train - defense power is perhaps just as effective
        - down - again, interesting for understanding what's going on, but not really for a play call predictor
        -
        -
- Keepers
        - point differential - float
        - ydstogo - float
        - yards_to_goal - int64
        - game seconds remaining  - float
        - action - label
        - yards_gained - float
        - points gained - int
        - defense power - float
        - offense power - float
        -

In [None]:
# Columns retained for modelling: the play label, game-state features, the team
# power scores and the two outcome columns.
keepers = [
    'action',
    'point_differential',
    'ydstogo',
    'yards_to_goal',
    'game_seconds_remaining',
    'defense_power',
    'offense_power',
    'yards_gained',
    'points_gained',
]

# Take an explicit copy so that later in-place clean-up operates on an independent
# frame rather than on a selection derived from `pbp_actions_df`.
df = pbp_actions_df[keepers].copy()
df.head()

In [None]:
# Drop every row that contains a missing value. Rebinding the name avoids an
# in-place mutation of a frame that was derived by column selection.
df = df.dropna(axis=0)

In [None]:
# Guard: no missing values may remain anywhere in the frame.
assert not df.isna().values.any()

In [None]:
# Number of columns still stored as int64.
len(df.select_dtypes(include='int64').columns)

In [None]:
from src.utils import assert_and_alert

# Cast every int64 column to float.
dtype_mapping = dict.fromkeys(df.select_dtypes(include='int64').columns, 'float')
df = df.astype(dtype_mapping)

# Sanity check: no int64 column may remain after the cast.
assert_and_alert(
    len(df.select_dtypes(include='int64').columns) == 0,
    "expected that all integers to be converted to float",
)

In [None]:
from src.utils import label_encode

# Encode the 'action' column with the project helper; it returns the encoded frame
# together with a lookup that is indexed by column name below.
# NOTE(review): the exact structure of `labels['action']` is defined in src.utils — confirm.
encoded_df, labels = label_encode(df, ['action'])
labels['action']


In [None]:
# sns.pairplot(encoded_df[['point_differential',
#                          'ydstogo',
#                          'yards_to_goal',
#                          'game_seconds_remaining',
#                          'defense_power',
#                          'offense_power',
#                          'yards_gained',
#                          'points_gained']], diag_kind='kde');

In [None]:
from keras import Sequential
from keras.src import regularizers
from keras.src.optimizers import Adam, RMSprop


def run_simple_nn(X_train, X_test, y_train, y_test):
    """Train and evaluate a small fully connected regression network.

    Args:
        X_train: 2-D training feature matrix.
        X_test: 2-D hold-out feature matrix; also used as validation data while fitting.
        y_train: Training targets.
        y_test: Hold-out targets.

    Returns:
        The Keras ``History`` object returned by ``model.fit``.
    """
    # Set parameters
    learning_rate = .01
    activation_function = "relu"
    output_function = "linear"
    loss_function = "mean_squared_error"
    # Pass the learning rate explicitly; it was previously declared but never used.
    optimizer = RMSprop(learning_rate=learning_rate)

    # Create a neural network model. The input width comes from the data passed
    # in, not from a notebook-level global.
    model = Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation=activation_function))
    model.add(Dense(64, activation=activation_function))
    model.add(Dense(1, activation=output_function))  # single linear output for regression

    # Mean absolute error is reported because accuracy is not meaningful for a
    # continuous target trained with a squared-error loss.
    model.compile(
        optimizer=optimizer,
        loss=loss_function,
        metrics=['mae'],
    )

    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train the model
    r = model.fit(
        X_train,
        y_train,
        epochs=10,
        batch_size=128,
        validation_data=(X_test, y_test),
        callbacks=[early_stopping],
    )

    score = model.evaluate(X_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test MAE:', score[1])

    return r

In [None]:

from sklearn.metrics import mean_squared_error
from typing import Set
import xgboost as xgb

def run_xgboost(X_train, X_test, y_train, y_test) -> float:
    """Fit an XGBoost regressor and report its hold-out mean squared error.

    Args:
        X_train: Training feature matrix.
        X_test: Hold-out feature matrix.
        y_train: Training targets.
        y_test: Hold-out targets.

    Returns:
        The mean squared error of the predictions on ``X_test``.
    """
    # Convert the data into DMatrix format
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Set the parameters for XGBoost
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
    }

    # Train the XGBoost model; no boosting-round count is given, so the library default applies.
    model = xgb.train(params, dtrain)

    # Predict on the test set
    y_pred = model.predict(dtest)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    print('Mean Squared Error:', mse)
    return mse

In [None]:
# Regression target: yards gained, kept as a single-column 2-D array.
y = encoded_df[['yards_gained']].to_numpy()
# Features: every column except the two outcome columns.
X = encoded_df.drop(['yards_gained', 'points_gained'], axis=1).to_numpy()

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Split first, then fit the scaler on the training rows only, so the hold-out rows
# do not influence the min/max used for scaling.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Retained so that any cell expecting the fully scaled matrix still finds it.
X_scaled = scaler.transform(X)

# mse = run_xgboost(X_train, X_test, y_train, y_test)
run_simple_nn(X_train, X_test, y_train, y_test)

In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.wrappers.scikit_learn import KerasRegressor


from scikeras.wrappers import KerasRegressor

from sklearn.model_selection import GridSearchCV

# Define a function to create the neural network model
def create_model(learning_rate=0.01, activation='relu', optimizer='adam'):
    """Build and compile a one-hidden-layer regression network.

    Args:
        learning_rate: Step size applied to the optimizer.
        activation: Activation of the hidden layer.
        optimizer: Keras optimizer identifier (e.g. ``'adam'``) or optimizer instance.

    Returns:
        A compiled Keras ``Sequential`` model with a single output unit.
    """
    from keras import optimizers as keras_optimizers

    # Resolve the identifier to an optimizer instance so the requested learning
    # rate is actually applied; the argument used to be accepted but ignored.
    opt = keras_optimizers.get(optimizer)
    opt.learning_rate = learning_rate

    model = Sequential()
    # The input width is read from the notebook-level feature matrix `X`.
    model.add(Dense(32, input_dim=X.shape[1], activation=activation))
    model.add(Dense(1))
    model.compile(optimizer=opt, loss='mean_squared_error')
    return model

# Create a KerasRegressor wrapper; scikeras takes the build function as `model=`
# (`build_fn` is the deprecated spelling).
model = KerasRegressor(model=create_model)

# Define the hyperparameters to search over. The `model__` prefix routes each
# value to the matching keyword argument of `create_model`; un-prefixed names that
# are not wrapper constructor arguments are rejected by scikeras.
param_grid = {
    'model__learning_rate': [0.01, 0.1, 0.001],
    'model__activation': ['relu', 'tanh'],
    'model__optimizer': ['adam', 'rmsprop'],
}

# Perform grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_result = grid.fit(X_train, y_train)

# Print the best parameters and score
print("Best Parameters: ", grid_result.best_params_)
print("Best Score: ", grid_result.best_score_)


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on the full feature matrix and replace X with its scaled version.
# NOTE(review): X presumably still contains the label-encoded 'action' column, which
# is rescaled here too — confirm that is intended before using it as an embedding index.
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)


In [None]:
# First element of every entry in the 'action' label lookup.
actions = [entry[0] for entry in labels['action']]
actions

In [None]:
# import tensorflow as tf
# from tensorflow.keras.layers import Input, Embedding, Dense
# from tensorflow.keras.models import Model
#
#
#
# # Define the input layers
# action_input = Input(shape=(1,))
# metrics_input = Input(shape=(X.shape[1]-1,))  # Shape correction
#
# # Create the embedding layer for actions
# embedding_dim = 8
# action_embed = Embedding(input_dim=len(actions), output_dim=embedding_dim)(action_input)
# action_flatten = tf.keras.layers.Flatten()(action_embed)
#
# # Combine the input layers and action embedding
# X_concat = tf.keras.layers.Concatenate()([action_flatten, metrics_input])
# x = Dense(32, activation='relu')(X_concat)
# x = Dense(32, activation='relu')(x)
#
# # Task-specific layers for YARDS prediction
# yards_layer = Dense(16, activation='relu')(x)
# yards_output = Dense(1, activation='sigmoid', name='yards_output')(yards_layer)
#
# # Task-specific layers for POINTS prediction
# points_layer = Dense(16, activation='relu')(x)
# points_output = Dense(1, activation='sigmoid', name='points_output')(points_layer)
#
# # Define the model with multiple outputs
# model = Model(inputs=[action_input, metrics_input], outputs=[yards_output, points_output])
#


In [None]:
def feed_nn(X_df, y_df):
    """Split feature and target frames into the list inputs of a two-input, two-output network.

    Args:
        X_df: DataFrame with an 'action' column plus the remaining metric columns.
        y_df: DataFrame with 'yards_gained' and 'points_gained' columns.

    Returns:
        A tuple ``(X, y)`` where ``X`` is ``[actions, metrics]`` and ``y`` is
        ``[yards, points]``; every element is a NumPy array.
    """
    # The action column feeds one model input, all other columns the second input.
    _actions = X_df['action'].values
    _metrics = X_df.drop(columns=['action']).values

    yards = y_df['yards_gained'].values
    points = y_df['points_gained'].values

    return [_actions, _metrics], [yards, points]

In [None]:
def run_complex_model(X_train, X_test, y_train, y_test):
    """Train a two-headed network that embeds the play action and predicts yards and points.

    Column 0 of the feature matrices is fed to the embedding input and the
    remaining columns to the dense input. Column 0 of the target matrices is the
    yards target and column 1 the points target.

    Args:
        X_train: 2-D training features with the encoded action in column 0.
        X_test: 2-D hold-out features with the same layout; used as validation data.
        y_train: 2-D training targets (yards, points).
        y_test: 2-D hold-out targets (yards, points).

    Returns:
        The Keras ``History`` object returned by ``model.fit``.
    """
    # Define the input layers. The metrics width is derived from the data passed
    # in (all columns except the action), not from a notebook-level global.
    action_input = Input(shape=(1,))
    metrics_input = Input(shape=(X_train.shape[1] - 1,))

    # Create the embedding layer for actions; the vocabulary size comes from the
    # notebook-level `actions` list.
    embedding_dim = 8
    action_embed = Embedding(input_dim=len(actions), output_dim=embedding_dim)(action_input)
    action_flatten = tf.keras.layers.Flatten()(action_embed)

    # Combine the input layers and action embedding
    X_concat = tf.keras.layers.Concatenate()([action_flatten, metrics_input])
    x = Dense(32, activation='relu')(X_concat)
    x = Dense(32, activation='relu')(x)

    # Task-specific layers for YARDS prediction
    yards_layer = Dense(16, activation='relu')(x)
    yards_output = Dense(1, name='yards_output')(yards_layer)

    # Task-specific layers for POINTS prediction
    # NOTE(review): a sigmoid head with binary cross-entropy assumes a 0/1 target —
    # confirm that the points column is binary.
    points_layer = Dense(16, activation='relu')(x)
    points_output = Dense(1, activation='sigmoid', name='points_output')(points_layer)

    # Define the model with multiple outputs
    model = Model(inputs=[action_input, metrics_input], outputs=[yards_output, points_output])

    # Compile the model
    model.compile(optimizer='adam',
                  loss={'yards_output': 'mean_squared_error',
                        'points_output': 'binary_crossentropy'},
                  metrics={'yards_output': 'mae',
                           'points_output': 'accuracy'})

    # Define the early stopping callback
    # NOTE(review): with epochs=1 below, a patience of 5 can never trigger.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train the model
    r = model.fit(x=[X_train[:, 0], X_train[:, 1:]], y=[y_train[:, 0], y_train[:, 1]],
                  epochs=1,
                  batch_size=64,
                  validation_data=([X_test[:, 0], X_test[:, 1:]], [y_test[:, 0], y_test[:, 1]]),
                  callbacks=[early_stopping])

    return r

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)


In [None]:
# X_train, y_train = feed_nn(X_train0, y_train0)
# X_test, y_test = feed_nn(X_test0, y_test0)

# X_train = X_train0.values
# X_test = X_test0.values
# y_train = y_train0.values
# y_test = y_test0.values

In [None]:
# Train the simple network on the first target column. The function defined in this
# notebook is `run_simple_nn`; `run_simple_model` does not exist here.
r = run_simple_nn(X_train, X_test, y_train[:, 0], y_test[:, 0])


In [None]:
# Show the `history` attribute of the training result (presumably per-epoch metrics).
r.history