In [250]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

from catboost import CatBoostClassifier
import catboost
catboost.__version__

RANDOM_SEED = 42
tf.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [251]:
df = pd.read_csv("../dataset/nfl_encoded_v2.csv")
df.head()

Unnamed: 0,play_id,posteam,defteam,posteam_type,yardline_100,qtr,down,ydstogo,goal_to_go,score_differential,...,drive,posteam_timeouts_remaining,defteam_timeouts_remaining,shotgun,no_huddle,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,side_of_field,play_type
0,68,PIT,TEN,home,58.0,1,1,10,0,0.0,...,1,3,3,0,0,893.0,1793.0,3593.0,PIT,1
1,92,PIT,TEN,home,53.0,1,2,5,0,0.0,...,1,3,3,0,0,856.0,1756.0,3556.0,PIT,0
2,113,PIT,TEN,home,56.0,1,3,8,0,0.0,...,1,3,3,1,0,815.0,1715.0,3515.0,PIT,1
3,162,TEN,PIT,away,98.0,1,1,10,0,0.0,...,2,3,3,0,0,796.0,1696.0,3496.0,TEN,0
4,183,TEN,PIT,away,98.0,1,2,10,0,0.0,...,2,3,3,0,0,760.0,1660.0,3460.0,TEN,1


Identify categorical and numeric features

In [252]:
# Identify categorical vs numeric features
categorical_features = ["posteam", "defteam", "posteam_type","game_half", "side_of_field"]
numeric_features = X.select_dtypes(include=["float32", "float64"]).columns.tolist()

# REMOVE categorical features that are actually numeric
numeric_features = [col for col in numeric_features if col not in categorical_features]

print("Categorical features:", categorical_features)
df[categorical_features].head()

Categorical features: ['posteam', 'defteam', 'posteam_type', 'game_half', 'side_of_field']


Unnamed: 0,posteam,defteam,posteam_type,game_half,side_of_field
0,PIT,TEN,home,Half1,PIT
1,PIT,TEN,home,Half1,PIT
2,PIT,TEN,home,Half1,PIT
3,TEN,PIT,away,Half1,TEN
4,TEN,PIT,away,Half1,TEN


Label encode all categorical columns for PyTorch

In [253]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}

df_encoded = df.copy()

for col in categorical_features:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

Split data for PyTorch using the encoded dataframe

In [254]:
X = df_encoded.drop(columns=["play_type", "play_id"], errors="ignore")
y = df_encoded["play_type"]

categorical_features = ["posteam", "defteam", "posteam_type", "game_half", "side_of_field"]

numeric_features = [
    col for col in df_encoded.columns
    if col not in categorical_features + ["play_type", "play_id"]
]

cat_cardinalities = [df_encoded[col].nunique() for col in categorical_features]

#First split: train (70%) and temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

#Second split: validation (15%) and test (15%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("Training shape:", X_train.shape)
print("Validation shape:", X_val.shape)
print("Testing shape:", X_test.shape)

Training shape: (223067, 19)
Validation shape: (47800, 19)
Testing shape: (47801, 19)


Train CatBoost using the ORIGINAL dataframe (not encoded)

In [255]:
X_cb = df.drop(columns=["play_type", "play_id"], errors="ignore")
y_cb = df["play_type"]

X_train_cb, X_temp_cb, y_train_cb, y_temp_cb = train_test_split(
    X_cb, y_cb, test_size=0.3, random_state=42, stratify=y_cb
)

X_val_cb, X_test_cb, y_val_cb, y_test_cb = train_test_split(
    X_temp_cb, y_temp_cb, test_size=0.5, random_state=42, stratify=y_temp_cb
)

cat_features_cb = X_cb.select_dtypes("object").columns.tolist()
cat_feature_indices_cb = [X_cb.columns.get_loc(col) for col in cat_features_cb]



In [256]:
cat_feature_indices = [X.columns.get_loc(col) for col in categorical_features]

print("Categorical features:", categorical_features)
print("Categorical feature indices:", cat_feature_indices)

Categorical features: ['posteam', 'defteam', 'posteam_type', 'game_half', 'side_of_field']
Categorical feature indices: [0, 1, 2, 9, 18]


K fold cross validation and OOF predictions

In [257]:
# Set up K-Fold cross-validation
from sklearn.model_selection import KFold
import numpy as np

RANDOM_SEED = 42
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

# Initialize array for storing OOF predictions
oof_proba = np.zeros(len(y_train))

# Generate OOF predictions
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_cb), 1):
    print(f"Fold {fold}/{n_splits}")

    X_tr, X_val_fold = X_train_cb.iloc[train_idx], X_train_cb.iloc[val_idx]
    y_tr, y_val_fold = y_train_cb.iloc[train_idx], y_train_cb.iloc[val_idx]

    fold_model = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        depth=6,
        learning_rate=0.1,
        iterations=1200,
        od_type='Iter',
        od_wait=50,
        random_seed=RANDOM_SEED + fold,
        verbose=0
    )

    fold_model.fit(
        X_tr, y_tr,
        cat_features=cat_feature_indices_cb,
        eval_set=(X_val_fold, y_val_fold),
        use_best_model=True,
        verbose=0
    )

    # Store fold predictions for OOF
    oof_proba[val_idx] = fold_model.predict_proba(X_val_fold)[:, 1]

cat_model_full = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    depth=6,
    learning_rate=0.1,
    iterations=1200,
    od_type='Iter',
    od_wait=50,
    random_seed=RANDOM_SEED,
    verbose=100
)

cat_model_full.fit(
    X_train_cb, y_train_cb,
    cat_features=cat_feature_indices_cb,
    eval_set=(X_val_cb, y_val_cb),
    use_best_model=True
)

# Get validation and test predictions
val_proba = cat_model_full.predict_proba(X_val_cb)[:, 1]
test_proba = cat_model_full.predict_proba(X_test_cb)[:, 1]



Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5
0:	test: 0.7760523	best: 0.7760523 (0)	total: 267ms	remaining: 5m 19s
100:	test: 0.8032336	best: 0.8032336 (100)	total: 22.7s	remaining: 4m 7s
200:	test: 0.8063418	best: 0.8063418 (200)	total: 44.8s	remaining: 3m 42s
300:	test: 0.8073007	best: 0.8073007 (300)	total: 1m 6s	remaining: 3m 17s
400:	test: 0.8077827	best: 0.8077827 (400)	total: 1m 25s	remaining: 2m 49s
500:	test: 0.8080014	best: 0.8080014 (500)	total: 1m 43s	remaining: 2m 24s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8080167417
bestIteration = 504

Shrink model to first 505 iterations.


In [267]:
cat_model_full = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    depth=6,
    learning_rate=0.1,
    iterations=1200,
    od_type='Iter',
    od_wait=80,
    random_seed=RANDOM_SEED,
    verbose=100
)

cat_model_full.fit(
    X_train_cb, y_train_cb,
    cat_features=cat_feature_indices_cb,
    eval_set=(X_val_cb, y_val_cb),
    use_best_model=True
)

# Get validation and test predictions
val_proba = cat_model_full.predict_proba(X_val_cb)[:, 1]
test_proba = cat_model_full.predict_proba(X_test_cb)[:, 1]

0:	test: 0.7760523	best: 0.7760523 (0)	total: 173ms	remaining: 3m 27s
100:	test: 0.8032336	best: 0.8032336 (100)	total: 16.2s	remaining: 2m 56s
200:	test: 0.8063418	best: 0.8063418 (200)	total: 35.6s	remaining: 2m 56s
300:	test: 0.8073007	best: 0.8073007 (300)	total: 54.5s	remaining: 2m 42s
400:	test: 0.8077827	best: 0.8077827 (400)	total: 1m 15s	remaining: 2m 30s
500:	test: 0.8080014	best: 0.8080014 (500)	total: 1m 36s	remaining: 2m 14s
600:	test: 0.8080895	best: 0.8081097 (594)	total: 1m 58s	remaining: 1m 57s
700:	test: 0.8081632	best: 0.8081632 (700)	total: 2m 22s	remaining: 1m 41s
800:	test: 0.8082217	best: 0.8082572 (789)	total: 2m 47s	remaining: 1m 23s
900:	test: 0.8082866	best: 0.8082970 (896)	total: 3m 10s	remaining: 1m 3s
1000:	test: 0.8084354	best: 0.8084482 (992)	total: 3m 31s	remaining: 42s
Stopped by overfitting detector  (80 iterations wait)

bestTest = 0.8084601836
bestIteration = 1006

Shrink model to first 1007 iterations.


Extract CatBoost Leaf Indices

In [268]:
train_leaves = cat_model_full.calc_leaf_indexes(catboost.Pool(X_train_cb, cat_features=cat_feature_indices_cb))
val_leaves = cat_model_full.calc_leaf_indexes(catboost.Pool(X_val_cb, cat_features=cat_feature_indices_cb))
test_leaves = cat_model_full.calc_leaf_indexes(catboost.Pool(X_test_cb, cat_features=cat_feature_indices_cb))

n_trees = train_leaves.shape[1]
n_leaf_values = max(train_leaves.max(), val_leaves.max(), test_leaves.max()) + 1


Convert leaves into embeddings in PyTorch

In [269]:
import torch
from torch.utils.data import Dataset
import numpy as np

class PlayDataset(Dataset):
    def __init__(self, X_num, X_cat, leaf_idx, y):
        self.X_num = torch.tensor(X_num.values.astype(np.float32))
        self.X_cat = torch.tensor(X_cat.values.astype(np.int64))
        self.leaf_idx = torch.tensor(leaf_idx.astype(np.int64))
        self.y = torch.tensor(y.values.astype(np.float32))

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return (
            self.X_num[idx],
            self.X_cat[idx],
            self.leaf_idx[idx],
            self.y[idx]
        )

In [270]:
X_num_train = X_train[numeric_features]
X_cat_train = X_train[categorical_features]

X_num_val = X_val[numeric_features]
X_cat_val = X_val[categorical_features]

X_num_test = X_test[numeric_features]
X_cat_test = X_test[categorical_features]

X_num_train_full = X_num_train.copy()
X_num_val_full   = X_num_val.copy()
X_num_test_full  = X_num_test.copy()

X_num_train_full['cat_proba'] = oof_proba
X_num_val_full['cat_proba']   = val_proba
X_num_test_full['cat_proba']  = test_proba

Neural Network with CatBoost Leaf Embeddings
- Each tree has about 30-60 leaves 9depending on depth)
- We embed each tree's leaf index and then concat them.

In [271]:
import torch.nn as nn
import torch.nn.functional as F

class CatBoostEmbeddingMLP(nn.Module):
    def __init__(self, num_numeric_features, cat_dims, embed_dim,
                 num_leaf_features, hidden_dim=256, dropout=0.2):
        super().__init__()

        # categorical embeddings
        self.embeddings = nn.ModuleList([
            nn.Embedding(cat_dim, embed_dim) for cat_dim in cat_dims
        ])
        cat_total_dim = len(cat_dims) * embed_dim

        # input dim = numeric + cat embed + leaf indices
        input_dim = num_numeric_features + cat_total_dim + num_leaf_features

        self.mlp = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.35),     # <<< ADD DROPOUT HERE
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.25),     # <<< ADD DROPOUT HERE
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x_num, x_cat, leaf_idx):
        cat_embeds = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        cat_embeds = torch.cat(cat_embeds, dim=1)

        x = torch.cat([x_num, cat_embeds, leaf_idx], dim=1)
        return self.mlp(x)

Training Loop

In [272]:
from torch.utils.data import DataLoader
import torch.optim as optim

train_dataset = PlayDataset(X_num_train_full, X_cat_train, train_leaves, y_train)
val_dataset   = PlayDataset(X_num_val_full, X_cat_val, val_leaves, y_val)
test_dataset  = PlayDataset(X_num_test_full, X_cat_test, test_leaves, y_test)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=256)
test_loader  = DataLoader(test_dataset,  batch_size=1024)

num_numeric_features = X_num_train_full.shape[1] 

model = CatBoostEmbeddingMLP(
    num_numeric_features=num_numeric_features,
    cat_dims=cat_cardinalities,      # list of cardinalities for each categorical column
    embed_dim=8,                     # categorical embedding dimension
    num_leaf_features=n_trees, 
    hidden_dim=256,
    dropout=0.2                      
)

# exclude embedding layers from weight decay
emb_params = []
other_params = []

for name, param in model.named_parameters():
    if "embeddings" in name:
        emb_params.append(param)
    else:
        other_params.append(param)

optimizer = torch.optim.Adam([
    {"params": other_params, "weight_decay": 1e-4},
    {"params": emb_params, "weight_decay": 0.0}
], lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

In [273]:
print("num_numeric_features =", X_num_train_full.shape[1])
print("num_categorical_features =", len(cat_cardinalities))
print("n_trees =", cat_model_full.tree_count_)
print("n_leaf_values =", n_leaf_values)
print("leaf_idx dimension =", train_leaves.shape)

num_numeric_features = 15
num_categorical_features = 5
n_trees = 1007
n_leaf_values = 64
leaf_idx dimension = (223067, 1007)


Training

In [274]:
for epoch in range(10):
    model.train()
    for x_num, x_cat, leaf_idx, y in train_loader:
        optimizer.zero_grad()
        logits = model(x_num, x_cat, leaf_idx).squeeze()
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} complete")


Epoch 1 complete
Epoch 2 complete
Epoch 3 complete
Epoch 4 complete
Epoch 5 complete
Epoch 6 complete
Epoch 7 complete
Epoch 8 complete
Epoch 9 complete
Epoch 10 complete


Predict

In [275]:
model.eval()
all_preds = []

with torch.no_grad():
    for x_num, x_cat, leaf_idx, _ in test_loader:
        logits = model(x_num, x_cat, leaf_idx).squeeze()
        probs = torch.sigmoid(logits)
        all_preds.append(probs.cpu().numpy())

all_preds = np.concatenate(all_preds)
test_labels = (all_preds >= 0.5).astype(int)

Evaluate Model

In [276]:
# --- Evaluate model ---
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

# Convert y_test (Pandas Series → numpy array)
y_test_np = y_test.values

# Accuracy
acc = accuracy_score(y_test_np, test_labels)

# AUC (needs probabilities, not labels)
auc = roc_auc_score(y_test_np, all_preds)

# Confusion matrix
cm = confusion_matrix(y_test_np, test_labels)

# Classification report
report = classification_report(y_test_np, test_labels)

print("Accuracy:", acc)
print("AUC:", auc)
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", report)


Accuracy: 0.717035208468442
AUC: 0.7811817449025499

Confusion Matrix:
 [[14782  5094]
 [ 8432 19493]]

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.74      0.69     19876
           1       0.79      0.70      0.74     27925

    accuracy                           0.72     47801
   macro avg       0.71      0.72      0.71     47801
weighted avg       0.73      0.72      0.72     47801

