In [1]:
%pip install torch
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
import time

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


# Load data and split into training and testing sets

In [2]:
# Load data
X = np.load('data_processing/X_sequences.npy')
y = np.load('data_processing/y_labels.npy')

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42)

# Convert to tensors and split into batches

The entire X shape has 58k sequences. Splitting into batches allows for faster training and better generalization

In [3]:
X_train = torch.FloatTensor(X_train) 
X_test = torch.FloatTensor(X_test)  
y_train = torch.FloatTensor(y_train).unsqueeze(1)
y_test = torch.FloatTensor(y_test).unsqueeze(1)

# create x,y pairs 
train_data = TensorDataset(X_train, y_train)

# split data into batches of 32
train_loader = DataLoader(
    train_data, 
    batch_size=32,    # Process 32 points at a time
    shuffle=True      # Randomize order each epoch
)

# same for test data
test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=32) 
    
                        


# Test with RNN model

This is a many to one RNN model. It takes many shot sequences and predicts a binary outcome. Did the server win or lose. 

https://www.geeksforgeeks.org/machine-learning/introduction-to-recurrent-neural-network/

https://docs.pytorch.org/tutorials/beginner/introyt/modelsyt_tutorial.html

In [4]:
print(f"Train: {y_train.mean():.2%} server wins")
print(f"Test: {y_test.mean():.2%} server wins")
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__() # initialize nn.Module
        
        # RNN model layer
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)

        # compress hidden state to 32 features, then to binary output
        self.firstLayer = nn.Linear(hidden_size, 32)
        self.output = nn.Linear(32, 1)
        
    def forward(self, x):
        # h is final hidden state
        _, h = self.rnn(x)
        
        # relu adds non-linearity and sigmoid for binary output
        x = torch.relu(self.firstLayer(h.squeeze(0)))  
        x = torch.sigmoid(self.output(x))
        return x
    
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# initialize model, loss function, optimizer
model = RNNModel(input_size=8, hidden_size=64)
loss_function = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001) # Adam optimizer updates weights

# Training loop
for epoch in range(50):
    model.train()
    for X_batch, y_batch in train_loader: # process each batch
        optimizer.zero_grad() # zero gradients
        outputs = model(X_batch) # forward pass
        loss = loss_function(outputs, y_batch) # compute loss
        loss.backward() # backpropagate
        print(f"RNN grad: {model.rnn.weight_ih_l0.grad.abs().mean():.6f}") # see why gradients arent flowing
        print(f"FC grad: {model.firstLayer.weight.grad.abs().mean():.6f}")  
        optimizer.step() # update weights

# evaluiate on test set
model.eval()
y_pred = []
y_pred_prob = []

# dont track gradients during eval
with torch.no_grad():
    for X_batch, _ in test_loader:
        outputs = model(X_batch)
        y_pred_prob.extend(outputs.cpu().numpy()) # store predicted probabilities
        predicted = (outputs > 0.5).float() # threshold at 0.5
        y_pred.extend(predicted.cpu().numpy()) # store predicted classes (0,1)

# convert lists to 1D
y_pred = [int(p[0]) for p in y_pred]
y_pred_prob = [p[0] for p in y_pred_prob]

# calculate metrics
acc = accuracy_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_pred_prob)

print(f"Accuracy: {acc:.3f}")
print(f"ROC-AUC:  {roc:.3f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Train: 60.94% server wins
Test: 61.21% server wins
RNN grad: 0.000000
FC grad: 0.000334
RNN grad: 0.000000
FC grad: 0.000186
RNN grad: 0.000000
FC grad: 0.000639
RNN grad: 0.000000
FC grad: 0.000519
RNN grad: 0.000000
FC grad: 0.000008
RNN grad: 0.000000
FC grad: 0.000136
RNN grad: 0.000000
FC grad: 0.000535
RNN grad: 0.000000
FC grad: 0.000420
RNN grad: 0.000000
FC grad: 0.000037
RNN grad: 0.000000
FC grad: 0.000135
RNN grad: 0.000000
FC grad: 0.000508
RNN grad: 0.000000
FC grad: 0.000101
RNN grad: 0.000000
FC grad: 0.000547
RNN grad: 0.000000
FC grad: 0.000189
RNN grad: 0.000000
FC grad: 0.000345
RNN grad: 0.000000
FC grad: 0.000019
RNN grad: 0.000000
FC grad: 0.001411
RNN grad: 0.000000
FC grad: 0.000144
RNN grad: 0.000000
FC grad: 0.000886
RNN grad: 0.000000
FC grad: 0.000417
RNN grad: 0.000000
FC grad: 0.000310
RNN grad: 0.000000
FC grad: 0.000914
RNN grad: 0.000000
FC grad: 0.000782
RNN grad: 0.000000
FC grad: 0.000197
RNN grad: 0.000000
FC grad: 0.000201
RNN grad: 0.000000
FC gr

The RNN gradients are 0.000 during training. This is preventing the model from learning any patterns in the shot sequences. Because there are 86 timesteps per sequence, gradients are multiplied 86 times during backpropogation. Most activation derivatives are <1, repeated multiplication  causes gradients to approach 0 before reaching early weights. This is causing the RNN layer to only predict 1s, the majority class. 