# NFL Big Data Bowl - LSTM with `x_next`, `y_next` and `frame` column

**Goal**: Predict `(x_next, y_next)` using **only pre-snap frames** (`frame == "pre"`)

- One DataFrame
- Uses your existing `x_next`, `y_next`
- **No NaN leakage** from post-snap kinematics
- Masking for variable sequence lengths
- Ready for full training

In [None]:
# !pip install -q tensorflow scikit-learn pandas numpy

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.preprocessing.sequence import pad_sequences

## 1. Load your data (replace with your actual path)

In [None]:
# to_predict = pd.read_csv('your_processed_data.csv')
# For demo, we'll assume it's already loaded
# to_predict.head()

## 2. Add `frame` column (pre/post snap)

In [None]:
# Tag every row as "pre" or "post". 'is_snap' is a placeholder name: swap in
# your actual snap indicator column. Rows where it equals 1 become "pre",
# all other rows become "post".
# NOTE(review): confirm that 1 marks pre-snap rows and not just the snap frame.
to_predict["frame"] = np.where(to_predict["is_snap"].eq(1), "pre", "post")

## 3. Define features and targets

In [None]:
# Model inputs, one column per feature, grouped by kind. The order here is the
# order of the feature axis in every array built from these columns.
feature_cols = (
    # Per-frame tracking columns (presumably position, speed, acceleration,
    # direction, orientation -- confirm against the data dictionary).
    ["x", "y", "s", "a", "dir", "o"]
    + ["play_direction_num"]
    # Indicator-style columns for player position and role.
    + ["player_position_WR", "player_position_RB", "player_position_QB"]
    + ["player_role_Targeted", "player_role_Passer", "player_role_Def"]
    + ["ball_land_x", "ball_land_y"]
)

# Regression targets; these columns are expected to already exist in the DF.
target_cols = ["x_next", "y_next"]

## 4. Build pre-snap → (x_next, y_next) sequences

In [None]:
def build_pre_target_sequences(df, feature_cols, target_cols, max_pre_len=96):
    """Build left-padded pre-snap feature/target sequences, one per player-play.

    Rows with ``frame == "pre"`` are grouped by ``(game_id, play_id, nfl_id)``,
    ordered by ``frame_id_total`` and truncated to their last ``max_pre_len``
    frames. Shorter sequences are padded with 0.0 at the front so that the
    most recent frame always sits at the final timestep.

    Args:
        df: DataFrame containing ``game_id``, ``play_id``, ``nfl_id``,
            ``frame``, ``frame_id_total`` plus every feature/target column.
        feature_cols: List of input column names.
        target_cols: List of target column names.
        max_pre_len: Number of timesteps in the padded output (must be >= 1).

    Returns:
        Tuple ``(X_pad, y_pad, mask)`` of float32 arrays shaped
        ``(n_seq, max_pre_len, len(feature_cols))``,
        ``(n_seq, max_pre_len, len(target_cols))`` and
        ``(n_seq, max_pre_len)``. ``mask`` is 1.0 on real frames and 0.0 on
        padding. The shapes hold even when no pre-snap rows exist
        (``n_seq == 0``).

    Raises:
        ValueError: If ``max_pre_len`` is smaller than 1.

    Note:
        NaN values in feature or target columns are copied through unchanged;
        clean them beforehand if the loss must stay finite.
    """
    if max_pre_len < 1:
        # values[-0:] would silently keep *all* frames, so reject it up front.
        raise ValueError("max_pre_len must be >= 1")

    # Filter once instead of per group; groups without pre-snap rows simply
    # never show up in the groupby below.
    pre_rows = df[df["frame"] == "pre"]

    sequences = []
    for _, grp in pre_rows.groupby(["game_id", "play_id", "nfl_id"]):
        if grp.empty:
            # Can only happen for unobserved categorical key combinations.
            continue
        ordered = grp.sort_values("frame_id_total")
        sequences.append(
            (
                ordered[feature_cols].values[-max_pre_len:],
                ordered[target_cols].values[-max_pre_len:],
            )
        )

    # Pre-allocate zero-filled outputs so the rank is fixed regardless of how
    # many sequences were found.
    n_seq = len(sequences)
    X_pad = np.zeros((n_seq, max_pre_len, len(feature_cols)), dtype="float32")
    y_pad = np.zeros((n_seq, max_pre_len, len(target_cols)), dtype="float32")
    mask = np.zeros((n_seq, max_pre_len), dtype="float32")

    for i, (X_seq, y_seq) in enumerate(sequences):
        n_real = len(X_seq)
        # Right-align the real frames; everything before them stays as padding.
        X_pad[i, -n_real:] = X_seq
        y_pad[i, -n_real:] = y_seq
        mask[i, -n_real:] = 1.0

    return X_pad, y_pad, mask

# Number of timesteps in every padded sequence; also reused as the model's
# input length further down.
SEQ_PRE_LEN = 96

X_pre, y_tgt, pre_mask = build_pre_target_sequences(
    df=to_predict,
    feature_cols=feature_cols,
    target_cols=target_cols,
    max_pre_len=SEQ_PRE_LEN,
)

# Quick shape sanity check of the padded arrays.
print("X_pre:", X_pre.shape)
print("y_tgt:", y_tgt.shape)

## 5. Scale only real pre-snap frames

In [None]:
# Standardize features using only real (mask == 1) timesteps. Padded rows are
# never written to, so they keep the 0.0 fill value they were created with.
# NOTE(review): the scaler is fit on every sequence before the train/val split
# further down, so validation frames contribute to the mean/std. Consider
# fitting on training sequences only.
is_real = pre_mask == 1
X_scaled = X_pre.copy()

scaler = StandardScaler()
real_frames = X_pre[is_real]
scaled_real = scaler.fit_transform(real_frames)
X_scaled[is_real] = scaled_real

## 6. Train/val split

In [None]:
# Hold out 20% of the sequences for validation. Index positions are split
# (rather than the arrays) so features and targets stay aligned.
# NOTE(review): each sequence is one (game, play, player), so players from the
# same play can end up on both sides of the split -- confirm this is acceptable.
n_sequences = X_scaled.shape[0]
idx_train, idx_val = train_test_split(
    np.arange(n_sequences),
    test_size=0.2,
    random_state=42,
)

X_train, y_train = X_scaled[idx_train], y_tgt[idx_train]
X_val, y_val = X_scaled[idx_val], y_tgt[idx_val]

## 7. Build LSTM model (one-step-ahead with masking)

In [None]:
def build_model(seq_len, n_feat):
    """Create and compile the per-timestep LSTM regressor.

    Args:
        seq_len: Number of timesteps in each input sequence.
        n_feat: Number of features per timestep.

    Returns:
        A compiled Keras model mapping ``(seq_len, n_feat)`` inputs to a
        ``(seq_len, 2)`` output, i.e. one 2-value prediction per timestep.
        It is compiled with Adam (learning rate 1e-3), MSE loss and an RMSE
        metric.
    """
    seq_input = layers.Input(shape=(seq_len, n_feat))

    # Timesteps whose features all equal 0.0 (the padding value) are masked.
    hidden = layers.Masking(mask_value=0.0)(seq_input)
    # return_sequences=True keeps one hidden state per timestep.
    hidden = layers.LSTM(128, return_sequences=True)(hidden)
    hidden = layers.Dropout(0.2)(hidden)
    # The same dense head is applied independently at every timestep.
    hidden = layers.TimeDistributed(layers.Dense(64, activation="relu"))(hidden)
    next_xy = layers.TimeDistributed(layers.Dense(2, activation="linear"))(hidden)

    net = models.Model(seq_input, next_xy)
    net.compile(
        optimizer=optimizers.Adam(1e-3),
        loss="mse",
        metrics=["RootMeanSquaredError"],
    )
    return net

# The feature count is read from the last axis of the training array so the
# model always matches the data that was actually built.
model = build_model(seq_len=SEQ_PRE_LEN, n_feat=X_train.shape[-1])
model.summary()

## 8. Train

In [None]:
# Fit for a fixed 50 epochs, evaluating on the held-out split after each one.
# verbose=2 prints one summary line per epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=50,
    verbose=2,
    validation_data=(X_val, y_val),
)

## 9. Predict on test set

In [None]:
def predict_test(df_test):
    """Run the trained model on the pre-snap sequences found in ``df_test``.

    Sequences are built with the same feature/target columns and length as
    training, and real timesteps are scaled with the already-fitted global
    ``scaler``; padded timesteps are left untouched.

    Args:
        df_test: DataFrame with the columns ``build_pre_target_sequences``
            reads.

    Returns:
        Whatever ``model.predict`` returns for the scaled sequences.
    """
    features, _, valid = build_pre_target_sequences(
        df_test, feature_cols, target_cols, SEQ_PRE_LEN
    )
    is_real = valid == 1
    # transform (not fit_transform): reuse the statistics learned in training.
    features[is_real] = scaler.transform(features[is_real])
    return model.predict(features, batch_size=128)

# preds = predict_test(test_df)