In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

from catboost import CatBoostClassifier
import catboost
catboost.__version__

'1.2.8'

In [5]:
df = pd.read_csv("../dataset/nfl_filtered.csv")
df.head()

Unnamed: 0,play_id,posteam,defteam,posteam_type,yardline_100,qtr,down,ydstogo,goal_to_go,score_differential,...,drive,posteam_timeouts_remaining,defteam_timeouts_remaining,shotgun,no_huddle,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,side_of_field,play_type
0,68,PIT,TEN,home,58.0,1,1.0,10,0.0,0.0,...,1,3.0,3.0,0,0,893.0,1793.0,3593.0,PIT,pass
1,92,PIT,TEN,home,53.0,1,2.0,5,0.0,0.0,...,1,3.0,3.0,0,0,856.0,1756.0,3556.0,PIT,run
2,113,PIT,TEN,home,56.0,1,3.0,8,0.0,0.0,...,1,3.0,3.0,1,0,815.0,1715.0,3515.0,PIT,pass
3,162,TEN,PIT,away,98.0,1,1.0,10,0.0,0.0,...,2,3.0,3.0,0,0,796.0,1696.0,3496.0,TEN,run
4,183,TEN,PIT,away,98.0,1,2.0,10,0.0,0.0,...,2,3.0,3.0,0,0,760.0,1660.0,3460.0,TEN,pass


In [6]:
df["play_type"] = df["play_type"].map({"run": 0, "pass": 1}).astype(int)

In [7]:
if "play_id" in df.columns:
    df = df.drop(columns=["play_id"])

X = df.drop(columns=["play_type"])
y = df["play_type"]

categorical_features = X.select_dtypes(include=["object"]).columns.tolist()

print("Feature shape:", X.shape)
print("Target shape:", y.shape)
print("Categorical features:", categorical_features)

Feature shape: (318668, 19)
Target shape: (318668,)
Categorical features: ['posteam', 'defteam', 'posteam_type', 'game_half', 'side_of_field']


In [8]:
#First split: train (70%) and temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

#Second split: validation (15%) and test (15%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("Training shape:", X_train.shape)
print("Validation shape:", X_val.shape)
print("Testing shape:", X_test.shape)

Training shape: (223067, 19)
Validation shape: (47800, 19)
Testing shape: (47801, 19)


In [9]:
cat_feature_indices = [X.columns.get_loc(col) for col in categorical_features]

print("Categorical features:", categorical_features)
print("Categorical feature indices:", cat_feature_indices)

Categorical features: ['posteam', 'defteam', 'posteam_type', 'game_half', 'side_of_field']
Categorical feature indices: [0, 1, 2, 9, 18]


In [10]:
cat_model = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    depth=6,
    learning_rate=0.1,
    iterations=500,
    od_type='Iter',
    od_wait=50,
    random_seed=42,
    verbose=100 #prints training progress every 100 iterations
)


In [11]:
cat_model.fit(
    X_train, y_train, cat_features=cat_feature_indices,
    eval_set=(X_val, y_val), use_best_model=True
)

0:	test: 0.7760523	best: 0.7760523 (0)	total: 297ms	remaining: 2m 28s
100:	test: 0.8032336	best: 0.8032336 (100)	total: 13.7s	remaining: 54.1s
200:	test: 0.8063418	best: 0.8063418 (200)	total: 30.7s	remaining: 45.7s
300:	test: 0.8073007	best: 0.8073007 (300)	total: 47.8s	remaining: 31.6s
400:	test: 0.8077827	best: 0.8077827 (400)	total: 1m 4s	remaining: 15.9s
499:	test: 0.8080009	best: 0.8080009 (499)	total: 1m 20s	remaining: 0us

bestTest = 0.8080009079
bestIteration = 499



<catboost.core.CatBoostClassifier at 0x202c3c3cc10>

Extract CatBoost Leaf Indices

In [18]:
# Leaf indices for each sample
train_leaves = cat_model.calc_leaf_indexes(catboost.Pool(X_train, cat_features=cat_feature_indices))
val_leaves   = cat_model.calc_leaf_indexes(catboost.Pool(X_val,   cat_features=cat_feature_indices))
test_leaves  = cat_model.calc_leaf_indexes(catboost.Pool(X_test,  cat_features=cat_feature_indices))


n_trees = train_leaves.shape[1]
n_leaf_values = train_leaves.max() + 1


Convert leaves into embeddings in PyTorch

In [22]:
import torch
from torch.utils.data import Dataset
import numpy as np

class PlayDataset(Dataset):
    def __init__(self, X_numeric, leaf_indices, y):
        # Convert DataFrame to float32 numpy array
        self.X_numeric = torch.tensor(X_numeric.values.astype(np.float32))
        self.leaf_indices = torch.tensor(leaf_indices.astype(np.int64))
        self.y = torch.tensor(y.values.astype(np.float32))

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X_numeric[idx], self.leaf_indices[idx], self.y[idx]



Neural Network with CatBoost Leaf Embeddings
- Each tree has about 30-60 leaves 9depending on depth)
- We embed each tree's leaf index and then concat them.

In [23]:
import torch.nn as nn
import torch.nn.functional as F

class CatBoostEmbeddingMLP(nn.Module):
    def __init__(self, n_numeric_features, n_trees, n_leaf_values, embed_dim=8):
        super().__init__()

        # Embedding layer: (total possible leaf ids → dense vectors)
        self.leaf_emb = nn.Embedding(n_leaf_values, embed_dim)

        # MLP input size = numeric features + (trees × embed_dim)
        mlp_input_dim = n_numeric_features + n_trees * embed_dim

        self.fc1 = nn.Linear(mlp_input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x_num, leaf_idx):
        # leaf_idx shape: (batch_size, n_trees)
        leaf_vectors = self.leaf_emb(leaf_idx)          # (batch, n_trees, embed_dim)
        leaf_vectors = leaf_vectors.view(leaf_vectors.size(0), -1)

        x = torch.cat([x_num, leaf_vectors], dim=1)

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return torch.sigmoid(self.fc3(x))


Training Loop

In [24]:
from torch.utils.data import DataLoader
import torch.optim as optim

train_dataset = PlayDataset(X_train, train_leaves, y_train)
val_dataset   = PlayDataset(X_val, val_leaves, y_val)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=256)

model = CatBoostEmbeddingMLP(
    n_numeric_features=X_train.shape[1],
    n_trees=n_trees,
    n_leaf_values=n_leaf_values,
    embed_dim=8
)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCELoss()


ValueError: could not convert string to float: 'CLE'

Training

In [None]:
for epoch in range(10):
    model.train()
    for x_num, leaf_idx, y in train_loader:
        optimizer.zero_grad()
        pred = model(x_num, leaf_idx).squeeze()
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} complete")


Predict

In [None]:
model.eval()
all_preds = []

with torch.no_grad():
    for x_num, leaf_idx, _ in test_loader:
        pred = model(x_num, leaf_idx).squeeze()
        all_preds.append(pred.cpu().numpy())

all_preds = np.concatenate(all_preds)

# 4. Convert to class labels
test_labels = (all_preds >= 0.5).astype(int)

print(all_preds[:10])   # probabilities
print(test_labels[:10]) # 0/1 labels