In [1]:
!pip install nfl_data_py
!pip install pandas numpy torch tabulate matplotlib tqdm scikit-learn

Collecting nfl_data_py
  Downloading nfl_data_py-0.3.3-py3-none-any.whl.metadata (12 kB)
Collecting pandas<2.0,>=1.0 (from nfl_data_py)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting appdirs>1 (from nfl_data_py)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting fastparquet>0.5 (from nfl_data_py)
  Downloading fastparquet-2024.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet>0.5->nfl_data_py)
  Downloading cramjam-2.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading nfl_data_py-0.3.3-py3-none-any.whl (13 kB)
Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Downloading fastparquet-2024.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m


In [2]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
import numpy as np
import nfl_data_py as nfl
from tabulate import tabulate
import matplotlib.pyplot as plt
from datetime import datetime
import os
from tqdm.notebook import tqdm
import math
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU type: {torch.cuda.get_device_name(0)}")

Using device: cuda
GPU type: Tesla T4


In [3]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        ce_loss = torch.nn.functional.mse_loss(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1-pt)**self.gamma * ce_loss
        return focal_loss.mean()

def create_curriculum_batches(dataset, start_seq_len=50, end_seq_len=None, epochs_per_stage=5):
    """Creates curriculum learning stages"""
    if end_seq_len is None:
        end_seq_len = max(len(seq) for seq in dataset.qb_seqs)

    num_stages = math.ceil((end_seq_len - start_seq_len) / 50)
    stages = []

    for i in range(num_stages):
        curr_len = min(start_seq_len + i * 50, end_seq_len)
        stages.extend([curr_len] * epochs_per_stage)

    return stages

class EarlyStopping:
    def __init__(self, patience=7, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss > self.best_loss - self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_loss = val_loss
            self.counter = 0

In [4]:
# Load data
years = [2022, 2023, 2024]
print("Loading play by play data...")
play_by_play = nfl.import_pbp_data(years, downcast=True)

# Filter for passing plays
pass_plays = play_by_play[play_by_play['pass_attempt'] == 1].copy()
pass_plays = pass_plays.sort_values(['game_id', 'play_id'])

# Fill missing values for new features
numeric_columns = ['defenders_in_box', 'number_of_pass_rushers', 'temp',
                  'wind', 'shotgun', 'no_huddle']
pass_plays[numeric_columns] = pass_plays[numeric_columns].fillna(0)

print(f"Total plays: {len(pass_plays)}")

Loading play by play data...
2022 done.
2023 done.
2024 done.
Downcasting floats.
Total plays: 54998


In [5]:
def create_sequence_features(play_by_play, qb_name, game_id):
    """Creates a sequence of play-by-play data with enhanced context"""
    qb_plays = play_by_play[
        (play_by_play['passer_player_name'] == qb_name) &
        (play_by_play['pass_attempt'] == 1)
    ]

    if isinstance(game_id, float) and np.isinf(game_id):
        previous_plays = qb_plays
    else:
        previous_plays = qb_plays[qb_plays['game_id'].astype(str) < str(game_id)]

    sequence = []
    for _, play in previous_plays.iterrows():
        play_stats = [
            # Core play stats
            play['yards_gained'] if not np.isnan(play['yards_gained']) else 0,
            play['pass_touchdown'] if not np.isnan(play['pass_touchdown']) else 0,
            play['complete_pass'] if not np.isnan(play['complete_pass']) else 0,
            play['air_yards'] if not np.isnan(play['air_yards']) else 0,
            play['yards_after_catch'] if not np.isnan(play['yards_after_catch']) else 0,
            play['qb_hit'] if not np.isnan(play['qb_hit']) else 0,
            play['sack'] if not np.isnan(play['sack']) else 0,

            # Game situation
            play['score_differential'] if not np.isnan(play['score_differential']) else 0,
            play['qtr'] if not np.isnan(play['qtr']) else 0,
            play['down'] if not np.isnan(play['down']) else 0,
            play['ydstogo'] if not np.isnan(play['ydstogo']) else 0,
            play['yardline_100'] if not np.isnan(play['yardline_100']) else 0,

            # Defensive pressure
            play['defenders_in_box'] if not np.isnan(play['defenders_in_box']) else 0,
            play['number_of_pass_rushers'] if not np.isnan(play['number_of_pass_rushers']) else 0,

            # Weather conditions
            play['temp'] if not np.isnan(play['temp']) else 70,
            play['wind'] if not np.isnan(play['wind']) else 0,

            # Binary indicators
            play['shotgun'] if not np.isnan(play['shotgun']) else 0,
            play['no_huddle'] if not np.isnan(play['no_huddle']) else 0,
        ]
        sequence.append(play_stats)

    sequence = np.array(sequence)

    # Apply linear weighting to emphasize recent plays
    if len(sequence) > 0:
        weights = np.linspace(1.0, 1.5, len(sequence))
        sequence = sequence * weights[:, np.newaxis]

    return sequence

def create_defense_sequence(play_by_play, def_team, game_id):
    """Creates a sequence of play-by-play data for a defense with enhanced context"""
    def_plays = play_by_play[
        (play_by_play['defteam'] == def_team) &
        (play_by_play['pass_attempt'] == 1)
    ]

    if isinstance(game_id, float) and np.isinf(game_id):
        previous_plays = def_plays
    else:
        previous_plays = def_plays[def_plays['game_id'].astype(str) < str(game_id)]

    sequence = []
    for _, play in previous_plays.iterrows():
        play_stats = [
            # Same features as QB sequence
            play['yards_gained'] if not np.isnan(play['yards_gained']) else 0,
            play['pass_touchdown'] if not np.isnan(play['pass_touchdown']) else 0,
            play['complete_pass'] if not np.isnan(play['complete_pass']) else 0,
            play['air_yards'] if not np.isnan(play['air_yards']) else 0,
            play['yards_after_catch'] if not np.isnan(play['yards_after_catch']) else 0,
            play['qb_hit'] if not np.isnan(play['qb_hit']) else 0,
            play['sack'] if not np.isnan(play['sack']) else 0,

            play['score_differential'] if not np.isnan(play['score_differential']) else 0,
            play['qtr'] if not np.isnan(play['qtr']) else 0,
            play['down'] if not np.isnan(play['down']) else 0,
            play['ydstogo'] if not np.isnan(play['ydstogo']) else 0,
            play['yardline_100'] if not np.isnan(play['yardline_100']) else 0,

            play['defenders_in_box'] if not np.isnan(play['defenders_in_box']) else 0,
            play['number_of_pass_rushers'] if not np.isnan(play['number_of_pass_rushers']) else 0,

            play['temp'] if not np.isnan(play['temp']) else 70,
            play['wind'] if not np.isnan(play['wind']) else 0,

            play['shotgun'] if not np.isnan(play['shotgun']) else 0,
            play['no_huddle'] if not np.isnan(play['no_huddle']) else 0,
        ]
        sequence.append(play_stats)

    sequence = np.array(sequence)

    if len(sequence) > 0:
        weights = np.linspace(1.0, 1.5, len(sequence))
        sequence = sequence * weights[:, np.newaxis]

    return sequence

In [6]:
class NFLDataset(Dataset):
    def __init__(self, qb_sequences, def_sequences, y, qb_names, def_teams, indices, max_seq_len=2000):
        self.qb_seqs = [torch.FloatTensor(qb_sequences[i]) for i in indices]
        self.def_seqs = [torch.FloatTensor(def_sequences[i]) for i in indices]
        self.y = torch.FloatTensor(y[indices])
        self.qb_idx = torch.LongTensor([qb_to_idx[qb] for qb in qb_names[indices]])
        self.team_idx = torch.LongTensor([team_to_idx[team] for team in def_teams[indices]])
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.y)  # Return the number of samples

    def __getitem__(self, idx):
        qb_seq = self.qb_seqs[idx]
        def_seq = self.def_seqs[idx]

        # Truncate sequences if they're too long
        if len(qb_seq) > self.max_seq_len:
            qb_seq = qb_seq[-self.max_seq_len:]
        if len(def_seq) > self.max_seq_len:
            def_seq = def_seq[-self.max_seq_len:]

        return (
            qb_seq,
            def_seq,
            self.qb_idx[idx],
            self.team_idx[idx],
            self.y[idx]
        )

def pad_sequences(sequences, max_len=None):
    """Pad sequences to the same length"""
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)

    padded_seqs = []
    for seq in sequences:
        if len(seq) == 0:
            padded_seq = np.zeros((max_len, seq.shape[1] if len(seq.shape) > 1 else 1))
        else:
            pad_length = max_len - len(seq)
            if pad_length > 0:
                padding = np.zeros((pad_length, seq.shape[1]))
                padded_seq = np.vstack([seq, padding])
            else:
                padded_seq = seq[:max_len]
        padded_seqs.append(padded_seq)

    return np.array(padded_seqs)

def collate_fn(batch):
    """Custom collate function to handle variable-length sequences"""
    qb_seqs, def_seqs, qb_idx, team_idx, y = zip(*batch)

    # Pad sequences
    qb_seqs_padded = pad_sequences([seq.numpy() for seq in qb_seqs])
    def_seqs_padded = pad_sequences([seq.numpy() for seq in def_seqs])

    return (
        torch.FloatTensor(qb_seqs_padded),
        torch.FloatTensor(def_seqs_padded),
        torch.stack(qb_idx),
        torch.stack(team_idx),
        torch.stack(y)
    )

In [7]:
class QBPerformancePredictor(nn.Module):
    def __init__(self, num_qbs, num_teams, max_seq_len=2000):  # Increased max sequence length
        super().__init__()

        self.qb_feature_dim = 18
        self.def_feature_dim = 18
        self.hidden_dim = 128
        self.max_seq_len = max_seq_len

        # Feature embedding
        self.feature_embedding = nn.Linear(self.qb_feature_dim, 64)

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=64,
            nhead=4,
            dim_feedforward=256,
            dropout=0.1,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=3)

        # Identity embeddings with positional encoding
        self.qb_embedding = nn.Embedding(num_qbs, 32)
        self.team_embedding = nn.Embedding(num_teams, 32)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_seq_len, 64))

        # Rest of the architecture remains the same...

    def forward(self, qb_seq, def_seq, qb_idx, team_idx):
        # Truncate sequences if they're too long
        if qb_seq.size(1) > self.max_seq_len:
            qb_seq = qb_seq[:, -self.max_seq_len:, :]
        if def_seq.size(1) > self.max_seq_len:
            def_seq = def_seq[:, -self.max_seq_len:, :]

        # Project features
        qb_embedded = self.feature_embedding(qb_seq)
        def_embedded = self.feature_embedding(def_seq)

        # Add positional encoding
        seq_len = qb_embedded.size(1)
        qb_embedded = qb_embedded + self.positional_encoding[:, :seq_len, :]
        def_embedded = def_embedded + self.positional_encoding[:, :seq_len, :]

        # Transform sequences
        qb_encoded = self.transformer_encoder(qb_embedded)
        def_encoded = self.transformer_encoder(def_embedded)

        # Pool sequences with attention
        qb_pooled = self.attention_pool(qb_encoded)
        def_pooled = self.attention_pool(def_encoded)

        # Get identity embeddings
        qb_emb = self.qb_embedding(qb_idx)
        team_emb = self.team_embedding(team_idx)

        # Combine features
        combined = torch.cat([qb_pooled, def_pooled, qb_emb, team_emb], dim=1)

        # Main prediction path
        x1 = self.fc1(combined)
        x1 = self.layer_norm1(x1)
        x1 = self.relu(x1)
        x1 = self.dropout(x1)

        x2 = self.fc2(x1)
        x2 = self.layer_norm2(x2)
        x2 = self.relu(x2)
        x2 = self.dropout(x2)

        main_out = self.fc3(x2)

        # Auxiliary prediction path
        aux_x = self.aux_fc1(combined)
        aux_x = self.relu(aux_x)
        aux_out = self.aux_fc2(aux_x)

        return main_out, aux_out

In [8]:
def train_epoch(model, train_loader, optimizer, criterion, aux_criterion, device):
    model.train()
    total_loss = 0
    total_main_loss = 0
    total_aux_loss = 0

    with tqdm(train_loader, desc='Training') as pbar:
        for batch in pbar:
            qb_seq, def_seq, qb_idx, team_idx, y = [b.to(device) for b in batch]

            optimizer.zero_grad()

            # Get main and auxiliary predictions
            main_pred, aux_pred = model(qb_seq, def_seq, qb_idx, team_idx)

            # Calculate losses
            main_loss = criterion(main_pred, y)

            # Create auxiliary targets (completion %, TD rate, INT rate)
            aux_targets = torch.stack([
                y[:, 3],  # completion percentage
                y[:, 1] / torch.clamp(y[:, 0], min=1),  # TD rate
                y[:, 2] / torch.clamp(y[:, 0], min=1)   # INT rate
            ], dim=1).to(device)

            aux_loss = aux_criterion(aux_pred, aux_targets)

            # Combined loss
            loss = main_loss + 0.3 * aux_loss

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            total_loss += loss.item()
            total_main_loss += main_loss.item()
            total_aux_loss += aux_loss.item()

            pbar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'main_loss': f'{main_loss.item():.4f}',
                'aux_loss': f'{aux_loss.item():.4f}'
            })

    return total_loss / len(train_loader), total_main_loss / len(train_loader), total_aux_loss / len(train_loader)

def validate(model, val_loader, criterion, aux_criterion, device):
    model.eval()
    total_loss = 0
    total_main_loss = 0
    total_aux_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            qb_seq, def_seq, qb_idx, team_idx, y = [b.to(device) for b in batch]

            main_pred, aux_pred = model(qb_seq, def_seq, qb_idx, team_idx)

            main_loss = criterion(main_pred, y)

            aux_targets = torch.stack([
                y[:, 3],
                y[:, 1] / torch.clamp(y[:, 0], min=1),
                y[:, 2] / torch.clamp(y[:, 0], min=1)
            ], dim=1).to(device)

            aux_loss = aux_criterion(aux_pred, aux_targets)
            loss = main_loss + 0.3 * aux_loss

            total_loss += loss.item()
            total_main_loss += main_loss.item()
            total_aux_loss += aux_loss.item()

    return total_loss / len(val_loader), total_main_loss / len(val_loader), total_aux_loss / len(val_loader)

In [9]:
# Create sequences and prepare data
print("\nCreating sequences...")
qb_sequences = []
def_sequences = []
y_data = []

# Group plays by game for target creation
game_stats = pass_plays.groupby(['game_id', 'passer_player_name', 'defteam']).agg({
    'yards_gained': 'sum',
    'pass_touchdown': 'sum',
    'interception': 'sum',
    'complete_pass': 'sum',
    'pass_attempt': 'sum',
    'sack': 'sum'
}).reset_index()

game_stats['completion_percentage'] = (game_stats['complete_pass'] / game_stats['pass_attempt'] * 100).round(1)

for _, game in game_stats.iterrows():
    # Create sequences
    qb_seq = create_sequence_features(pass_plays, game['passer_player_name'], game['game_id'])
    def_seq = create_defense_sequence(pass_plays, game['defteam'], game['game_id'])

    # Skip if no historical data
    if len(qb_seq) == 0 or len(def_seq) == 0:
        continue

    # Create target variables
    target = [
        game['yards_gained'],
        game['pass_touchdown'],
        game['interception'],
        game['completion_percentage'],
        game['sack']
    ]

    qb_sequences.append(qb_seq)
    def_sequences.append(def_seq)
    y_data.append(target)

# Create QB and team indices
print("\nCreating indices...")
qb_to_idx = {qb: idx for idx, qb in enumerate(game_stats['passer_player_name'].unique())}
team_to_idx = {team: idx for idx, team in enumerate(game_stats['defteam'].unique())}

# Scale target variables
scaler = StandardScaler()
y = np.array(y_data)
y_scaled = scaler.fit_transform(y)

# Split into train and test sets
print("\nSplitting data...")
train_size = int(0.8 * len(y_scaled))
indices = np.arange(len(y_scaled))
np.random.shuffle(indices)
train_idx = indices[:train_size]
test_idx = indices[train_size:]

# Create data loaders
# Define maximum sequence length
max_seq_len = 2000  # Adjust this value as needed

# Create data loaders with max_seq_len
train_dataset = NFLDataset(
    qb_sequences, def_sequences, y_scaled,
    game_stats['passer_player_name'].values,
    game_stats['defteam'].values, train_idx,
    max_seq_len=max_seq_len  # Pass max_seq_len to the dataset
)

test_dataset = NFLDataset(
    qb_sequences, def_sequences, y_scaled,
    game_stats['passer_player_name'].values,
    game_stats['defteam'].values, test_idx,
    max_seq_len=max_seq_len  # Pass max_seq_len to the dataset
)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)

# Initialize model with max_seq_len
model = QBPerformancePredictor(
    num_qbs=len(qb_to_idx),
    num_teams=len(team_to_idx),
    max_seq_len=max_seq_len  # Pass max_seq_len to the model
).to(device)

# Initialize training components
criterion = FocalLoss()
aux_criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2)

# Training loop
num_epochs = 50
early_stopping = EarlyStopping(patience=10, min_delta=0.001)

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    train_loss, train_main_loss, train_aux_loss = train_epoch(
        model, train_loader, optimizer, criterion, aux_criterion, device
    )

    val_loss, val_main_loss, val_aux_loss = validate(
        model, test_loader, criterion, aux_criterion, device
    )

    print(f"Training Loss: {train_loss:.4f} (Main: {train_main_loss:.4f}, Aux: {train_aux_loss:.4f})")
    print(f"Validation Loss: {val_loss:.4f} (Main: {val_main_loss:.4f}, Aux: {val_aux_loss:.4f})\n")

    scheduler.step()

    early_stopping(val_loss)
    if early_stopping.early_stop:
        print("Early stopping triggered!")
        break


Creating sequences...

Creating indices...

Splitting data...
Epoch 1/50


Training:   0%|          | 0/86 [00:00<?, ?it/s]

RuntimeError: The size of tensor a (1657) must match the size of tensor b (1406) at non-singleton dimension 1

In [None]:
def predict_qb_performance(qb_name, def_team):
    """Make predictions for a QB against a specific defense"""
    if qb_name not in qb_to_idx:
        raise ValueError(f"Quarterback {qb_name} not found in data.")
    if def_team not in team_to_idx:
        raise ValueError(f"Defense team {def_team} not found in data.")

    qb_seq = create_sequence_features(pass_plays, qb_name, float('inf'))
    def_seq = create_defense_sequence(pass_plays, def_team, float('inf'))

    if len(qb_seq) == 0:
        raise ValueError(f"No historical data found for QB: {qb_name}")
    if len(def_seq) == 0:
        raise ValueError(f"No historical data found for defense: {def_team}")

    qb_seq_tensor = torch.FloatTensor(qb_seq).unsqueeze(0).to(device)
    def_seq_tensor = torch.FloatTensor(def_seq).unsqueeze(0).to(device)
    qb_idx = torch.LongTensor([qb_to_idx[qb_name]]).to(device)
    team_idx = torch.LongTensor([team_to_idx[def_team]]).to(device)

    model.eval()
    with torch.no_grad():
        main_pred, _ = model(qb_seq_tensor, def_seq_tensor, qb_idx, team_idx)

    prediction = scaler.inverse_transform(main_pred.cpu().numpy())

    return {
        'yards_gained': round(float(prediction[0, 0]), 1),
        'pass_touchdown': round(float(prediction[0, 1]), 1),
        'interception': round(float(prediction[0, 2]), 1),
        'completion_percentage': round(float(prediction[0, 3]), 1),
        'sack': round(float(prediction[0, 4]), 1)
    }

# Example usage
try:
    prediction = predict_qb_performance("P.Mahomes", "BUF")
    print("\nPredicted QB Performance:")
    for stat, value in prediction.items():
        print(f"{stat}: {value}")
except ValueError as e:
    print(e)