In [19]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import gc
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
total = pd.read_csv("output.csv")
cat_cols = ['team', 'conference']

Index(['player_id', 'full_name', 'team', 'season', 'week', 'week_start',
       'conference', 'pow_conference', 'games_played_this_week', 'numMinutes',
       'points', 'assists', 'blocks', 'steals', 'reboundsTotal',
       'reboundsDefensive', 'reboundsOffensive', 'fieldGoalsAttempted',
       'fieldGoalsMade', 'threePointersAttempted', 'threePointersMade',
       'freeThrowsAttempted', 'freeThrowsMade', 'turnovers', 'foulsPersonal',
       'plusMinusPoints', 'wins_this_week', 'wins_vs_team_with_all_nba_player',
       'is_win_vs_over_500', 'opponent_has_all_nba', 'avg_opp_score',
       'avg_opp_winrate_prior', 'avg_opp_wins_prior', 'avg_opp_losses_prior',
       'away_games_prior', 'away_losses_prior', 'away_win_streak_prior',
       'away_wins_prior', 'home_games_prior', 'home_losses_prior',
       'home_win_streak_prior', 'home_wins_prior', 'losses_prior',
       'wins_vs_over_500_prior', 'won_player_of_the_week',
       'all_star_this_season', 'mvp_this_season',
       'all_nba_f

In [3]:
# To conduct week by week inference, we will train our model pre-2025-26 season and inference on our 2025-26 weeks
total_pre = total[total['season'] != 2025]
total_inf = total[total['season'] == 2025] # our inference set

In [4]:
# Dropping rows where season is before 2001-02 season
total_pre = total_pre[total_pre['season'] > 2000]

# Neural Networks

While we achieved 100% accuracy for our Light GBM inferences, our model will eventually reach its baseline as mentioned above. Thus, we will explore another model to see if it can achieve a higher baseline accuracy: neural networks

In [None]:
# Assign device to gpu if available
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Training on Apple Silicon GPU (MPS)")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Training on NVIDIA GPU (CUDA)")
else:
    device = torch.device("cpu")
    print("Training on CPU")

print(f"Using device: {device}")

Training on Apple Silicon GPU (MPS)
Using device: mps


### Data Preparation

As extensively shown in our Light GBM model, we need to keep our groups (season, week, conference) in tact when training our model. As such, we need to create a custom batch sampler that keeps entire groups together.

In [6]:
class GroupBatchSampler:
    def __init__(self, group_ids, batch_size):
        self.group_ids = group_ids
        self.batch_size = batch_size

        # Group indices by group_id
        self.groups = {}
        for idx, group_id in enumerate(group_ids):
            if group_id not in self.groups:
                self.groups[group_id] = []
            self.groups[group_id].append(idx)

        self.group_list = list(self.groups.keys())

    def __iter__(self):
        # Shuffle the order of the groups
        np.random.shuffle(self.group_list)

        batch = []
        for group_id in self.group_list:
            group_indices = self.groups[group_id]
            batch.extend(group_indices)

            # Yield batch when reching batch_size
            while len(batch) >= self.batch_size:
                yield batch[:self.batch_size]
                batch = batch[self.batch_size:]

         # Yield remaining samples   
        if len(batch) > 0:
            yield batch
    
    def __len__(self):
        return (len(self.group_ids) + self.batch_size - 1) // self.batch_size

Unlike our Light GBM model, we need to one-hot encode our categorical features (team, conference) for our neural net model.

In [None]:
# From our data above
# Reminder: total_pre = data up to 2024-25 season
X = total_pre.drop(columns=['full_name', 'player_id', 'pow_player_id', 'player_of_the_week', 'won_player_of_the_week', 'all_star_this_season', 'mvp_this_season',
 'all_nba_first_team_this_season', 'all_nba_second_team_this_season', 'all_nba_third_team_this_season', 'week_start', 'pow_conference',
 'is_win_vs_over_500', 'opponent_has_all_nba'])

y = total_pre['won_player_of_the_week']

print(f"Number of features before one-hot encoding: {len(X.columns)}")

# One-hot encode
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Rename conference one-hot encoding column: 'conference_West' -> 'conference'
X = X.rename(columns={'conference_West': 'conference'})

print(f"Number of features after one-hot encoding: {len(X.columns)}")

Number of features before one-hot encoding: 65
Number of features after one-hot encoding: 95


In [8]:
X.columns

Index(['season', 'week', 'games_played_this_week', 'numMinutes', 'points',
       'assists', 'blocks', 'steals', 'reboundsTotal', 'reboundsDefensive',
       'reboundsOffensive', 'fieldGoalsAttempted', 'fieldGoalsMade',
       'threePointersAttempted', 'threePointersMade', 'freeThrowsAttempted',
       'freeThrowsMade', 'turnovers', 'foulsPersonal', 'plusMinusPoints',
       'wins_this_week', 'wins_vs_team_with_all_nba_player', 'avg_opp_score',
       'avg_opp_winrate_prior', 'avg_opp_wins_prior', 'avg_opp_losses_prior',
       'away_games_prior', 'away_losses_prior', 'away_win_streak_prior',
       'away_wins_prior', 'home_games_prior', 'home_losses_prior',
       'home_win_streak_prior', 'home_wins_prior', 'losses_prior',
       'wins_vs_over_500_prior', 'team_pts', 'team_ast', 'team_blk',
       'team_stl', 'team_gms', 'fieldGoalsPercentage',
       'threePointersPercentage', 'freeThrowsPercentage', 'points_mean_season',
       'points_std_season', 'assists_mean_season', 'assists_st

Since we one-hot encoded our categorical features, the number of features will grow a lot due to the number of teams in the NBA. This is totally ok since neural networks can handle high dimensionality

### Neural Network Architecture

Our neural network has three layers that get progressively smaller (256 -> 128 -> 64 neurons), like a funnel that squeezes information down from ~100 input features into a single score per player. The first layer spots basic patterns in the data, the middle layer combines these patterns into bigger ideas like "great scorer" or "well-rounded player," and the final layer picks out what actually matters for winning player of the week. 

Between each layer, we use ReLU activation which lets the network learn that certain combinations of stats matter more together than separately (like how 35 points + 3 wins is way more impressive than just 35 points alone). We also randomly turn off 30% of neurons during training (dropout) so the network doesn't just memorize the data and actually learns real patterns. The final output is a ranking score rather than a probability because we just need to know who scores highest, not the exact chance they win.

In [9]:
class POTWRanker(nn.Module):
    def __init__(self, input_dim, hidden_dims=[256, 128, 64], dropout=0.3):
        super(POTWRanker, self).__init__()
        
        layers = []
        prev_dim = input_dim
        
        # Build hidden layers
        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            prev_dim = hidden_dim
        
        # Output layer (single score per player)
        layers.append(nn.Linear(prev_dim, 1))
        
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

### Ranking Loss Function

Our loss function teaches the model to make sure winners score higher than everyone else in the same week, rather than just predicting winners independently. For every week, it compares the winner to every non-winner (so if there's 1 winner and 50 other players, it makes 50 comparisons) and gives the model a penalty whenever the winner's score isn't at least 1.0 points higher. This is like telling the model "the winner needs to clearly beat everyone else, not just barely edge them out." The key advantage is that the model learns what makes one player better than another in that specific week's context, rather than trying to learn some universal "good player" score. This only works if we keep entire weeks together during training. If we mix players from different weeks, the comparisons don't make sense since those players never actually competed for the same award.

In [10]:
class PairwiseRankingLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(PairwiseRankingLoss, self).__init__()
        self.margin = margin
    
    def forward(self, scores, labels, group_indices):
        total_loss = torch.tensor(0.0, requires_grad=True)
        num_pairs = 0
        
        # For each group (week/conference)
        for group_id in torch.unique(group_indices):
            mask = group_indices == group_id
            group_scores = scores[mask].squeeze()
            group_labels = labels[mask]
            
            # Find winners and non-winners
            winner_mask = group_labels == 1
            non_winner_mask = group_labels == 0
            
            if winner_mask.sum() == 0 or non_winner_mask.sum() == 0:
                continue
            
            winner_scores = group_scores[winner_mask]
            non_winner_scores = group_scores[non_winner_mask]
            
            # Compute all pairwise differences at once
            score_diff = winner_scores.unsqueeze(1) - non_winner_scores.unsqueeze(0)
            
            # Apply margin and clamp
            losses = torch.clamp(self.margin - score_diff, min=0)
            
            total_loss = total_loss + losses.sum()  # Add to tensor
            num_pairs += losses.numel()
        
        # Return tensor (even if zero)
        if num_pairs == 0:
            return torch.tensor(0.0, requires_grad=True)
        
        return total_loss / num_pairs

### Cross-validation Training

Again, we will run time-based cross validation to prevent overfitting results swaying our decisions. We will follow the same format as our cross validation from Light GBM.

Unlike our Light GBM model, in our neural net model, we will scale our data because neural networks are sensitive to feature magnitudes. For example, 'points' (0 to 70+ range) would dominate features like 'fieldGoalsPercentage' (0 to 1 range) during training, causing the model to learn poorly. Standardization ensures all features contribute equally to the model's learning process and helps gradient descent converge faster and more reliably. 

In [None]:
# Hyperparameters
epochs = 50
batch_size = 512
hidden_dims = [512, 256, 128, 64]
dropout = 0.2
lr = 0.001

unique_seasons = sorted(X['season'].unique())
folds = 3  # Reduced from 5 to 3 compared to Light GBM
k = [1, 3, 5, 10]
cv_results = []

for fold in range(1, folds + 1):
    val_season = unique_seasons[-fold]
    train_seasons = unique_seasons[:-fold]
    
    val_mask = X['season'] == val_season
    train_mask = X['season'].isin(train_seasons)
    
    X_train = X[train_mask].copy()
    y_train = y[train_mask].copy()
    X_val = X[val_mask].copy()
    y_val = y[val_mask].copy()

    print(f"Validation {fold}")
    print(f"Val Season: {val_season}")
    print(f"Train Seasons: {train_seasons[0]}-{train_seasons[-1]}")
    print(f"Train size: {len(X_train)}, Val size: {len(X_val)}")
    
    X_train['group_id'] = X_train.groupby(['season', 'week', 'conference']).ngroup()
    train_group_ids = X_train['group_id'].values
    
    X_train_features = X_train.drop(columns=['group_id']).values
    X_val_features = X_val.values
    print(f"Train shape: {X_train_features.shape}")
    print(f"Val shape: {X_val_features.shape}")
    
    # Scaling our data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_features).astype(np.float32)
    X_val_scaled = scaler.transform(X_val_features).astype(np.float32)
    
    print("\nCreating PyTorch dataset...")
    train_dataset = TensorDataset(
        torch.from_numpy(X_train_scaled),
        torch.from_numpy(y_train.values.astype(np.float32)),
        torch.from_numpy(train_group_ids.astype(np.int64))
    )
    
    print("\nCreating GroupBatchSampler and DataLoader...")
    batch_sampler = GroupBatchSampler(train_group_ids, batch_size=batch_size)
    train_loader = DataLoader(train_dataset, batch_sampler=batch_sampler)
    print(f"DataLoader created. Number of batches: {len(train_loader)}")
    
    print("\nInitializing model...")
    input_dim = X_train_scaled.shape[1]
    model = POTWRanker(input_dim, hidden_dims=hidden_dims, dropout=dropout)
    print(f"Model created. Parameters: {sum(p.numel() for p in model.parameters()):,}")

    model = model.to(device)
    print(f"Model moved to {device}")
    
    print("\nSetting up loss and optimizer...")
    criterion = PairwiseRankingLoss(margin=1.0)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    print("Loss and optimizer ready")
    
    print(f"\nTraining {epochs} epochs")
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        
        for batch_X, batch_y, batch_groups in train_loader:
            # Move batch data to GPU
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            batch_groups = batch_groups.to(device)

            optimizer.zero_grad()
            scores = model(batch_X)
            loss = criterion(scores, batch_y, batch_groups)
            loss.backward() # Compute gradients
            optimizer.step() # Update weights * learning rate (Gradient descent)
            
            total_loss += loss.item() # Track running total loss
        
        avg_loss = total_loss / len(train_loader)
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")
    
    print("\nEvaluation")
    
    model.eval() # Set model to evaluation mode
    with torch.no_grad():
        X_val_tensor = torch.from_numpy(X_val_scaled).to(device)
        val_scores = model(X_val_tensor).cpu().squeeze().numpy()
    
    df_val = X_val[['season', 'week', 'conference']].copy().reset_index(drop=True)
    df_val['score'] = val_scores
    df_val['y_true'] = y_val.reset_index(drop=True).values
    
    k_dict = {i: [] for i in k}
    groups = df_val.groupby(['season', 'week', 'conference'], observed=True)
    ranks = []
    
    for _, group in groups:
        if group['y_true'].sum() == 0:
            continue
        
        group_sorted = group.sort_values('score', ascending=False).reset_index(drop=True)
        pos_idx = group_sorted.index[group_sorted['y_true'] == 1]
        
        for i in k:
            top_k = group_sorted.head(i)
            hit = top_k['y_true'].max()
            k_dict[i].append(hit)
        
        for idx in pos_idx:
            ranks.append(idx + 1)
    
    ranks = np.array(ranks)
    curr = {
        "Fold": fold,
        "Top_rank": ranks.min(),
        "Lowest_rank": ranks.max(),
        "Percentiles": np.percentile(ranks, [10, 25, 50, 75, 90])
    }
    for i, hits in k_dict.items():
        curr[f"Top_{i}_avg_hits"] = np.mean(hits)
    cv_results.append(curr)
    
    print(f"\nFold {fold} Results:")
    print(f"Top-1 Accuracy: {curr['Top_1_avg_hits']:.4f}")
    print(f"Top-3 Accuracy: {curr['Top_3_avg_hits']:.4f}")
    print(f"Top-5 Accuracy: {curr['Top_5_avg_hits']:.4f}")
    print(f"Top Rank: {curr['Top_rank']}")
    print(f"Lowest Rank: {curr['Lowest_rank']}")
    
    # Clean up memory after each fold to clear up RAM
    del model, optimizer, criterion, train_loader, train_dataset, batch_sampler
    del X_train, y_train, X_val, y_val
    del X_train_features, X_val_features, X_train_scaled, X_val_scaled
    del train_group_ids
    gc.collect()

Validation 1
Val Season: 2024
Train Seasons: 2001-2023
Train size: 165504, Val size: 7971
  Number of features: 95
Train shape: (165504, 95)
Val shape: (7971, 95)

Creating PyTorch dataset...

Creating GroupBatchSampler and DataLoader...
DataLoader created. Number of batches: 324

Initializing model...
Model created. Parameters: 221,697
Model moved to mps

Setting up loss and optimizer...
Loss and optimizer ready

Training 50 epochs
Epoch 10/50, Loss: 0.0087
Epoch 20/50, Loss: 0.0050
Epoch 30/50, Loss: 0.0025
Epoch 40/50, Loss: 0.0021
Epoch 50/50, Loss: 0.0018

Evaluation

Fold 1 Results:
Top-1 Accuracy: 0.4000
Top-3 Accuracy: 0.6250
Top-5 Accuracy: 0.7250
Top Rank: 1
Lowest Rank: 22
Validation 2
Val Season: 2023
Train Seasons: 2001-2022
Train size: 158395, Val size: 7109
  Number of features: 95
Train shape: (158395, 95)
Val shape: (7109, 95)

Creating PyTorch dataset...

Creating GroupBatchSampler and DataLoader...
DataLoader created. Number of batches: 310

Initializing model...
Mod

In [17]:
# Final Results
cv_df = pd.DataFrame(cv_results)
cv_df

Unnamed: 0,Fold,Top_rank,Lowest_rank,Percentiles,Top_1_avg_hits,Top_3_avg_hits,Top_5_avg_hits,Top_10_avg_hits
0,1,1,22,"[1.0, 1.0, 3.0, 6.0, 10.800000000000004]",0.4,0.625,0.725,0.9
1,2,1,18,"[1.0, 1.0, 2.0, 4.0, 6.199999999999996]",0.485714,0.771429,0.914286,0.971429
2,3,1,35,"[1.0, 1.0, 1.0, 3.0, 7.6000000000000085]",0.595238,0.857143,0.880952,0.952381


In [18]:
print("Average Performance Across Folds:")
for i in k:
    print(f"Top-{i} Accuracy: {cv_df[f'Top_{i}_avg_hits'].mean()}")

Average Performance Across Folds:
Top-1 Accuracy: 0.4936507936507937
Top-3 Accuracy: 0.7511904761904762
Top-5 Accuracy: 0.8400793650793651
Top-10 Accuracy: 0.9412698412698411


Our neural network achieved an average top-1 accuracy of approximately 50% across cross-validation folds, which is competitive with Light GBM's 53% but shows significantly higher variance (ranging from 40% in Fold 1 to 60% in Fold 3). This variance suggests that certain seasons are easier to predict than others, and neural networks are more sensitive to these differences in data distribution compared to tree-based models. While Fold 3's 60% accuracy demonstrates that neural networks can match or exceed LightGBM's performance on some validation sets, the inconsistency makes LightGBM a more stable and reliable choice for deployment. Nonetheless, lets continue on to tuning our neural network.

### Hyper Parameter Tuning

Just like for Light GBM model, lets use optuna to tune our model.

In [22]:
def nn_objective(trial):    
    # Hyperparameters
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True), # Proportion of step when updating weights during gradient descent
        'batch_size': trial.suggest_categorical('batch_size', [128, 256, 512]), # Number of samples processed together before updating weights
        'dropout': trial.suggest_float('dropout', 0.1, 0.5), # Probability of randomly "turning off neurons" during training to prevent overfitting
        'hidden_dim_1': trial.suggest_categorical('hidden_dim_1', [256, 512, 768]),
        'hidden_dim_2': trial.suggest_categorical('hidden_dim_2', [128, 256, 384]),
        'hidden_dim_3': trial.suggest_categorical('hidden_dim_3', [64, 128, 192]),
        'hidden_dim_4': trial.suggest_categorical('hidden_dim_4', [32, 64, 96]),
        'margin': trial.suggest_float('margin', 0.5, 2.0), # Minimum score difference required between winner and non-winner in our loss function
        'epochs': 30, # Fixed to save time during training, decrease from 50 to 30 to decrease trial time
    }
    
    hidden_dims = [params['hidden_dim_1'], params['hidden_dim_2'], 
                   params['hidden_dim_3'], params['hidden_dim_4']]
    
    unique_seasons = sorted(X['season'].unique())
    folds = 2 # Decrease fold due to running multiple trials
    fold_accuracies = []
    
    for fold in range(1, folds + 1):
        val_season = unique_seasons[-fold]
        train_seasons = unique_seasons[:-fold]
        
        val_mask = X['season'] == val_season
        train_mask = X['season'].isin(train_seasons)
        
        X_train = X[train_mask].copy()
        y_train = y[train_mask].copy()
        X_val = X[val_mask].copy()
        y_val = y[val_mask].copy()
        
        # Create group IDs
        X_train['group_id'] = X_train.groupby(['season', 'week', 'conference']).ngroup()
        train_group_ids = X_train['group_id'].values
        
        # Extract features
        X_train_features = X_train.drop(columns=['group_id']).values
        X_val_features = X_val.values
        
        # Scale
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_features).astype(np.float32)
        X_val_scaled = scaler.transform(X_val_features).astype(np.float32)
        
        # Create dataset
        train_dataset = TensorDataset(
            torch.from_numpy(X_train_scaled),
            torch.from_numpy(y_train.values.astype(np.float32)),
            torch.from_numpy(train_group_ids.astype(np.int64))
        )
        
        batch_sampler = GroupBatchSampler(train_group_ids, batch_size=params['batch_size'])
        train_loader = DataLoader(train_dataset, batch_sampler=batch_sampler)
        
        # Initialize model
        input_dim = X_train_scaled.shape[1]
        model = POTWRanker(input_dim, hidden_dims=hidden_dims, dropout=params['dropout'])
        model = model.to(device)
        
        criterion = PairwiseRankingLoss(margin=params['margin'])
        optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])
        
        # Training
        for epoch in range(params['epochs']):
            model.train()
            for batch_X, batch_y, batch_groups in train_loader:
                batch_X = batch_X.to(device)
                batch_y = batch_y.to(device)
                batch_groups = batch_groups.to(device)
                
                optimizer.zero_grad()
                scores = model(batch_X)
                loss = criterion(scores, batch_y, batch_groups)
                loss.backward()
                optimizer.step()
        
        # Evaluation
        model.eval()
        with torch.no_grad():
            X_val_tensor = torch.from_numpy(X_val_scaled).to(device)
            val_scores = model(X_val_tensor).cpu().squeeze().numpy()
        
        df_val = X_val[['season', 'week', 'conference']].copy().reset_index(drop=True)
        df_val['score'] = val_scores
        df_val['y_true'] = y_val.reset_index(drop=True).values
        
        # Calculate Top-1 accuracy
        hits = []
        for _, group in df_val.groupby(['season', 'week', 'conference'], observed=True):
            if group['y_true'].sum() == 0:
                continue
            group_sorted = group.sort_values('score', ascending=False).reset_index(drop=True)
            hit = 1 if group_sorted.iloc[0]['y_true'] == 1 else 0
            hits.append(hit)
        
        fold_accuracy = np.mean(hits)
        fold_accuracies.append(fold_accuracy)
        
        # Report intermediate value for pruning
        trial.report(fold_accuracy, fold)
        
        # Prune unpromising trials
        if trial.should_prune():
            raise optuna.TrialPruned()
        
        # Clean up memory
        del model, optimizer, criterion, train_loader, train_dataset, batch_sampler
        del X_train, y_train, X_val, y_val, train_group_ids
        del X_train_features, X_val_features, X_train_scaled, X_val_scaled
        gc.collect()
    
    # Return mean accuracy across folds
    return np.mean(fold_accuracies)

In [23]:
# Create study
study = optuna.create_study(
    direction='maximize',
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=1), # Does not prune first 5 trials, waits after 1st fold to prune
    sampler=optuna.samplers.TPESampler(seed=42)
)

# Run optimization, using 15 vs. 50 due to long trial time
study.optimize(nn_objective, n_trials=15, show_progress_bar=True)

[I 2025-12-05 00:20:42,749] A new study created in memory with name: no-name-bb01eb73-f063-4b96-9431-4102d5f165c9
Best trial: 0. Best value: 0.425:   7%|▋         | 1/15 [11:02<2:34:40, 662.89s/it]

[I 2025-12-05 00:31:45,640] Trial 0 finished with value: 0.42500000000000004 and parameters: {'learning_rate': 0.0005611516415334506, 'batch_size': 128, 'dropout': 0.1624074561769746, 'hidden_dim_1': 768, 'hidden_dim_2': 256, 'hidden_dim_3': 64, 'hidden_dim_4': 96, 'margin': 1.2871346474483567}. Best is trial 0 with value: 0.42500000000000004.


Best trial: 1. Best value: 0.505357:  13%|█▎        | 2/15 [19:39<2:04:59, 576.86s/it]

[I 2025-12-05 00:40:22,282] Trial 1 finished with value: 0.5053571428571428 and parameters: {'learning_rate': 0.0007309539835912913, 'batch_size': 256, 'dropout': 0.21685785941408728, 'hidden_dim_1': 768, 'hidden_dim_2': 384, 'hidden_dim_3': 128, 'hidden_dim_4': 96, 'margin': 1.7125960221746916}. Best is trial 1 with value: 0.5053571428571428.


Best trial: 1. Best value: 0.505357:  20%|██        | 3/15 [28:04<1:48:50, 544.22s/it]

[I 2025-12-05 00:48:47,661] Trial 2 finished with value: 0.3875 and parameters: {'learning_rate': 0.0004066563313514797, 'batch_size': 256, 'dropout': 0.14881529393791154, 'hidden_dim_1': 768, 'hidden_dim_2': 256, 'hidden_dim_3': 128, 'hidden_dim_4': 32, 'margin': 1.8422410256414732}. Best is trial 1 with value: 0.5053571428571428.


Best trial: 1. Best value: 0.505357:  27%|██▋       | 4/15 [46:52<2:21:59, 774.49s/it]

[I 2025-12-05 01:07:35,165] Trial 3 finished with value: 0.4946428571428571 and parameters: {'learning_rate': 0.0015696396388661144, 'batch_size': 128, 'dropout': 0.11809091556421523, 'hidden_dim_1': 512, 'hidden_dim_2': 128, 'hidden_dim_3': 192, 'hidden_dim_4': 64, 'margin': 0.7980735223012586}. Best is trial 1 with value: 0.5053571428571428.


Best trial: 1. Best value: 0.505357:  33%|███▎      | 5/15 [57:47<2:01:52, 731.27s/it]

[I 2025-12-05 01:18:29,803] Trial 4 finished with value: 0.45357142857142857 and parameters: {'learning_rate': 0.00010257563974185662, 'batch_size': 128, 'dropout': 0.4085081386743783, 'hidden_dim_1': 512, 'hidden_dim_2': 128, 'hidden_dim_3': 192, 'hidden_dim_4': 96, 'margin': 1.2083223877429239}. Best is trial 1 with value: 0.5053571428571428.


Best trial: 1. Best value: 0.505357:  40%|████      | 6/15 [1:05:57<1:37:25, 649.51s/it]

[I 2025-12-05 01:26:40,593] Trial 5 finished with value: 0.4821428571428571 and parameters: {'learning_rate': 0.00017345566642360953, 'batch_size': 256, 'dropout': 0.40838687198182444, 'hidden_dim_1': 512, 'hidden_dim_2': 256, 'hidden_dim_3': 64, 'hidden_dim_4': 32, 'margin': 1.633326707814573}. Best is trial 1 with value: 0.5053571428571428.


Best trial: 1. Best value: 0.505357:  47%|████▋     | 7/15 [1:10:08<1:09:13, 519.16s/it]

[I 2025-12-05 01:30:51,400] Trial 6 pruned. 


Best trial: 1. Best value: 0.505357:  53%|█████▎    | 8/15 [1:15:39<53:34, 459.18s/it]  

[I 2025-12-05 01:36:22,142] Trial 7 pruned. 


Best trial: 1. Best value: 0.505357:  60%|██████    | 9/15 [1:23:50<46:55, 469.26s/it]

[I 2025-12-05 01:44:33,562] Trial 8 pruned. 


Best trial: 1. Best value: 0.505357:  67%|██████▋   | 10/15 [1:27:14<32:15, 387.19s/it]

[I 2025-12-05 01:47:56,997] Trial 9 pruned. 


Best trial: 1. Best value: 0.505357:  73%|███████▎  | 11/15 [1:30:36<22:02, 330.62s/it]

[I 2025-12-05 01:51:19,354] Trial 10 pruned. 


Best trial: 1. Best value: 0.505357:  80%|████████  | 12/15 [1:41:10<21:08, 422.96s/it]

[I 2025-12-05 02:01:53,492] Trial 11 finished with value: 0.4232142857142857 and parameters: {'learning_rate': 0.0013838713864867114, 'batch_size': 128, 'dropout': 0.2095091787743595, 'hidden_dim_1': 256, 'hidden_dim_2': 128, 'hidden_dim_3': 128, 'hidden_dim_4': 64, 'margin': 0.534169419173486}. Best is trial 1 with value: 0.5053571428571428.


Best trial: 1. Best value: 0.505357:  87%|████████▋ | 13/15 [1:45:11<12:15, 367.77s/it]

[I 2025-12-05 02:05:54,286] Trial 12 pruned. 


Best trial: 1. Best value: 0.505357:  93%|█████████▎| 14/15 [1:50:26<05:51, 351.82s/it]

[I 2025-12-05 02:11:09,228] Trial 13 pruned. 


Best trial: 1. Best value: 0.505357: 100%|██████████| 15/15 [1:53:39<00:00, 454.62s/it]

[I 2025-12-05 02:14:22,113] Trial 14 pruned. 





In [25]:
# Results
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best trial value (Top-1 Accuracy): {study.best_value:.4f}")

print("\nBest hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

Number of finished trials: 15
Best trial value (Top-1 Accuracy): 0.5054

Best hyperparameters:
  learning_rate: 0.0007309539835912913
  batch_size: 256
  dropout: 0.21685785941408728
  hidden_dim_1: 768
  hidden_dim_2: 384
  hidden_dim_3: 128
  hidden_dim_4: 96
  margin: 1.7125960221746916


In [26]:
# Get best params
best_params = study.best_params

# Reconstruct hidden dims
hidden_dims_best = [
    best_params['hidden_dim_1'],
    best_params['hidden_dim_2'],
    best_params['hidden_dim_3'],
    best_params['hidden_dim_4']
]

print("\nBest Configuration:")
print(f"  Learning Rate: {best_params['learning_rate']:.6f}")
print(f"  Batch Size: {best_params['batch_size']}")
print(f"  Dropout: {best_params['dropout']:.2f}")
print(f"  Hidden Dims: {hidden_dims_best}")
print(f"  Margin: {best_params['margin']:.2f}")


Best Configuration:
  Learning Rate: 0.000731
  Batch Size: 256
  Dropout: 0.22
  Hidden Dims: [768, 384, 128, 96]
  Margin: 1.71


### Final Model

In [27]:
# Best hyperparameters from optuna trials
epochs = 50
batch_size = best_params['batch_size']
hidden_dims = hidden_dims_best
dropout = best_params['dropout']
lr = best_params['learning_rate']
margin = best_params['margin']

# Train on ALL available data (no hold-out), we have our 2025-26 season data for inference
X_all = X.copy()  # All data from 2001-2024
y_all = y.copy()

# Sort by key
X_all['group_id'] = X_all.groupby(['season', 'week', 'conference']).ngroup()
all_group_ids = X_all['group_id'].values
X_all = X_all.sort_values(by='group_id')
y_all = y_all.loc[X_all.index]

# Extract features
feature_cols = [col for col in X_all.columns if col not in ['group_id']]
X_all_features = X_all[feature_cols].values

# Scale
scaler = StandardScaler()
X_all_scaled = scaler.fit_transform(X_all_features).astype(np.float32)

# Create dataset
train_dataset = TensorDataset(
    torch.from_numpy(X_all_scaled),
    torch.from_numpy(y_all.values.astype(np.float32)),
    torch.from_numpy(all_group_ids.astype(np.int64))
)

batch_sampler = GroupBatchSampler(all_group_ids, batch_size=batch_size)
train_loader = DataLoader(train_dataset, batch_sampler=batch_sampler)

# Train with best hyperparameters from Optuna
model = POTWRanker(input_dim, hidden_dims=hidden_dims, dropout=dropout)
model = model.to(device)

criterion = PairwiseRankingLoss(margin=margin)
optimizer = optim.Adam(model.parameters(), lr=lr)

for epoch in range(100):
    model.train()
    for batch_X, batch_y, batch_groups in train_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        batch_groups = batch_groups.to(device)
        
        optimizer.zero_grad()
        scores = model(batch_X)
        loss = criterion(scores, batch_y, batch_groups)
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/100, Loss: {avg_loss:.4f}")

# Save model
torch.save(model.state_dict(), 'nn_final_model.pth')
torch.save(scaler, 'nn_scaler.pkl') # Save fitted scaler to transform future data
print("Final model saved")


Epoch 10/100, Loss: 0.0011
Epoch 20/100, Loss: 0.0011
Epoch 30/100, Loss: 0.0011
Epoch 40/100, Loss: 0.0011
Epoch 50/100, Loss: 0.0011
Epoch 60/100, Loss: 0.0011
Epoch 70/100, Loss: 0.0011
Epoch 80/100, Loss: 0.0011
Epoch 90/100, Loss: 0.0011
Epoch 100/100, Loss: 0.0011
Final model saved


### Inference

We will now move on to testing our model on data from the current season: 2025-26. We will test our model for each week so far during this season.

In [48]:
# Reminder: total_inf is our inference set containing data from the 2025-26 season
# Select the same features our model was trained on
total_inf = total_inf.reset_index(drop=True)
X_inf = total_inf.drop(columns=['full_name', 'player_id', 'pow_player_id', 'player_of_the_week', 'won_player_of_the_week', 'all_star_this_season', 'mvp_this_season',
 'all_nba_first_team_this_season', 'all_nba_second_team_this_season', 'all_nba_third_team_this_season', 'week_start', 'pow_conference',
 'is_win_vs_over_500', 'opponent_has_all_nba']).reset_index(drop=True)

# True results
y_inf = total_inf['won_player_of_the_week'].reset_index(drop=True)

# One-hot encode
X_inf = pd.get_dummies(X_inf, columns=cat_cols, drop_first=True)

# Rename conference one-hot encoding column: 'conference_West' -> 'conference'
X_inf = X_inf.rename(columns={'conference_West': 'conference'})

# Add missing columns (teams that don't exist anymore) with all zeros
for col in X_all.drop(columns=['group_id']).columns:
    if col not in X_inf.columns:
        X_inf[col] = 0  # Add missing column with zeros
        print(f"Added missing column: {col}")

# Double check our data does not contain null values
has_null_rows = X_inf.isnull().any(axis=1)
num_rows_with_nulls = has_null_rows.sum()
print(f"Number of rows with null values: {num_rows_with_nulls}")

Added missing column: team_Bobcats
Added missing column: team_SuperSonics
Number of rows with null values: 0


In [49]:
X_inf

Unnamed: 0,season,week,games_played_this_week,numMinutes,points,assists,blocks,steals,reboundsTotal,reboundsDefensive,...,team_Spurs,team_Suns,team_Thunder,team_Timberwolves,team_Trail Blazers,team_Warriors,team_Wizards,conference,team_Bobcats,team_SuperSonics
0,2025,44,1,6.76,0.0,0.0,0.0,0.0,2.0,2.0,...,False,False,False,False,False,False,False,True,0,0
1,2025,44,2,15.78,6.0,0.0,1.0,0.0,1.0,1.0,...,False,False,False,False,False,False,False,False,0,0
2,2025,44,4,133.12,92.0,11.0,2.0,2.0,21.0,21.0,...,False,False,False,False,False,False,False,False,0,0
3,2025,44,4,100.52,56.0,24.0,0.0,0.0,13.0,11.0,...,False,False,False,False,False,False,False,False,0,0
4,2025,44,3,97.73,55.0,30.0,0.0,4.0,22.0,20.0,...,False,False,False,False,False,False,False,False,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1492,2025,48,1,4.56,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,True,0,0
1493,2025,48,3,68.75,30.0,1.0,0.0,4.0,14.0,9.0,...,False,False,False,False,False,True,False,True,0,0
1494,2025,48,3,128.70,44.0,12.0,2.0,2.0,10.0,8.0,...,False,False,False,False,False,False,False,False,0,0
1495,2025,48,1,5.48,3.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,0,0


In [51]:
# Inference
X_inf_features = X_inf.values

# Scale features with SAME scaler from training
X_inf_scaled = scaler.transform(X_inf_features).astype(np.float32)

# Convert to tensors and move to device
X_inf_tensor = torch.from_numpy(X_inf_scaled).to(device)

# Predictions
model.eval()
with torch.no_grad():
    pred = model(X_inf_tensor).cpu().squeeze().numpy()

# Evaluate results
results = []
k = [1, 3, 5, 10]
k_dict = {i: [] for i in k}

# Group predictions by season, week, conference
for (season, week, conference), group in X_inf.groupby(['season', 'week', 'conference'], observed=True):
    idx = group.index.to_numpy()  # these are now positions 0..1496

    # Use positional indexing everywhere
    y_group = y_inf.iloc[idx].to_numpy()
    pred_group = pred[idx]

    # Skip if no winner i.e. present week
    if y_group.sum() == 0:
        continue
    
    # Create results dataframe
    df = total_inf.loc[group.index, ['full_name', 'team']].copy()
    df['score'] = pred_group
    df['y_true'] = y_group
    
    # Sort by prediction score
    df_sorted = df.sort_values('score', ascending=False).reset_index(drop=True)
    
    # Find actual winner's rank
    winner_idx = df_sorted[df_sorted['y_true'] == 1].index
    if len(winner_idx) > 0:
        winner_rank = winner_idx[0] + 1  # Convert to 1-indexed
        
        result = {
            'week': week,
            'conference': conference,
            'actual_winner': df_sorted[df_sorted['y_true'] == 1]['full_name'].values[0],
            'predicted_winner': df_sorted.iloc[0]['full_name'],
            'winner_rank': winner_rank,
            'top_5': df_sorted.head(5)[['full_name', 'team', 'score', 'y_true']].to_dict('records')
        }
        results.append(result)
        
        # Calculate top-k accuracy
        for i in k:
            top_k = df_sorted.head(i)
            hit = top_k['y_true'].max()
            k_dict[i].append(hit)
    
print(f"\nTotal weeks evaluated: {len(results)}")


Total weeks evaluated: 8


In [52]:
# Show week by week peformance
for result in results:
    print(f"\nWeek {result['week']} - {result['conference']}:")
    print(f"Actual: {result['actual_winner']}")
    print(f"Predicted: {result['predicted_winner']}")
    print(f"Top 5 predictions:")
    for i, player in enumerate(result['top_5'], 1):
        print(f"{i}. {player['full_name']} ({player['team']}) - Score: {player['score']:.4f}")

# Overall metrics
print("Overall Inference Performance:")

for i in k:
    accuracy = np.mean(k_dict[i])
    print(f"Top-{i} Accuracy: {accuracy:.4f} ({int(accuracy * len(results))}/{len(results)} weeks)")

ranks = [r['winner_rank'] for r in results]
print(f"\nMean Winner Rank: {np.mean(ranks):.2f}")
print(f"Median Winner Rank: {np.median(ranks):.1f}")
print(f"Best Rank: {min(ranks)}")
print(f"Worst Rank: {max(ranks)}")


Week 45 - False:
Actual: Cade Cunningham
Predicted: Cade Cunningham
Top 5 predictions:
1. Cade Cunningham (Pistons) - Score: -3.2761
2. Giannis Antetokounmpo (Bucks) - Score: -9.4162
3. Donovan Mitchell (Cavaliers) - Score: -10.6207
4. Tyrese Maxey (76ers) - Score: -19.6053
5. Evan Mobley (Cavaliers) - Score: -19.8634

Week 45 - True:
Actual: Nikola Jokic
Predicted: Shai Gilgeous-Alexander
Top 5 predictions:
1. Shai Gilgeous-Alexander (Thunder) - Score: -20.0971
2. Alperen Sengun (Rockets) - Score: -20.2184
3. Nikola Jokic (Nuggets) - Score: -22.0988
4. Devin Booker (Suns) - Score: -47.8813
5. Amen Thompson (Rockets) - Score: -50.5350

Week 46 - False:
Actual: Jalen Johnson
Predicted: Scottie Barnes
Top 5 predictions:
1. Scottie Barnes (Raptors) - Score: -6.5365
2. Derrick White (Celtics) - Score: -13.7224
3. Giannis Antetokounmpo (Bucks) - Score: -17.3811
4. Franz Wagner (Magic) - Score: -22.2874
5. Jakob Poeltl (Raptors) - Score: -22.8969

Week 46 - True:
Actual: Nikola Jokic
Predic

In [None]:
print("COMPARISON WITH LIGHTGBM")
print(f"LightGBM Top-1:        53.0%")
print(f"Neural Network Top-1:  {cv_df['Top_1_avg_hits'].mean():.1%}")
print(f"Difference:            {(cv_df['Top_1_avg_hits'].mean() - 0.53)*100:+.1f}%")