In [6]:
import pandas as pd 
import numpy as np 
import torch
import torch
import torch.nn as nn
import torch.optim as optim
import random

In [2]:
df = pd.read_csv('ProcessedDataset.csv', index_col=0)

In [3]:
def transform_data_to_graph_data(df: pd.DataFrame, 
                                 label: str, 
                                 date_time_col: str = 'DateTime', 
                                 track_name_col: str = 'Track Name'):
    
    # Group the rows by DateTime and Track Name
    grouped = df.groupby([date_time_col, track_name_col])

    graph_data = []
    
    # Iterate over each group
    for (dt, track), group_df in grouped:
        
        # Drop unused columns and convert to numeric
        X = group_df.drop(columns=[label, date_time_col, track_name_col]).astype(float).to_numpy()
        X = torch.tensor(X, dtype=torch.float32)

        # Convert the label column to torch tensor
        y = torch.tensor(group_df[label].to_numpy(), dtype=torch.float32)

        # Append (X, y) to the list
        graph_data.append([X, y])

    return graph_data

In [4]:
graphs = transform_data_to_graph_data(df, 'win')

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GCNSelfLayer(nn.Module):
    """
    A single Graph Convolution layer:
      H_next = A_hat * H * W
    where A_hat is the adjacency matrix (possibly normalized),
    H is the input node features, and W is a learnable weight.
    """

    def __init__(self, in_features, out_features, bias=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Learnable weight matrix: shape (in_features, out_features)
        self.weight = nn.Parameter(torch.Tensor(in_features, out_features))

        # Optional bias: shape (out_features)
        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)

        # Initialize weights
        self.reset_parameters()

    def reset_parameters(self):
        # A simple initialization scheme (e.g., glorot/xavier)
        nn.init.xavier_uniform_(self.weight)
        if self.bias is not None:
            nn.init.zeros_(self.bias)

    def forward(self, H):
        """
        H: [N, in_features]   - node feature matrix
        A_hat: [N, N]         - adjacency matrix (ideally normalized)
        returns: [N, out_features]
        """
        # 1) Fully connected graph - normalised by the degree to avoid vanishing/exploding gradients
        A_hat = torch.eye(H.shape[0])

        # 2) Multiply input features by W
        HW = torch.matmul(H, self.weight)  # [N, out_features]

        # 3) Propagate/aggregate over adjacency
        #    A_hat * (H * W)
        out = torch.matmul(A_hat, HW)       # [N, out_features]

        # 3) Add bias (if any)
        if self.bias is not None:
            out = out + self.bias

        return out
    
class GCNNonSelfLayer(nn.Module):
    """
    A single Graph Convolution layer:
      H_next = A_hat * H * W
    where A_hat is the adjacency matrix (possibly normalized),
    H is the input node features, and W is a learnable weight.
    """

    def __init__(self, in_features, out_features, bias=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Learnable weight matrix: shape (in_features, out_features)
        self.weight = nn.Parameter(torch.Tensor(in_features, out_features))

        # Optional bias: shape (out_features)
        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)

        # Initialize weights
        self.reset_parameters()

    def reset_parameters(self):
        # A simple initialization scheme (e.g., glorot/xavier)
        nn.init.xavier_uniform_(self.weight)
        if self.bias is not None:
            nn.init.zeros_(self.bias)

    def forward(self, H):
        """
        H: [N, in_features]   - node feature matrix
        A_hat: [N, N]         - adjacency matrix (ideally normalized)
        returns: [N, out_features]
        """
        # 1) Fully connected graph - normalised by the degree to avoid vanishing/exploding gradients
        A_hat = torch.ones((H.shape[0], H.shape[0])) - torch.eye(H.shape[0])
        A_hat = A_hat / H.shape[0]

        # 2) Multiply input features by W
        HW = torch.matmul(H, self.weight)  # [N, out_features]

        # 3) Propagate/aggregate over adjacency
        #    A_hat * (H * W)
        out = torch.matmul(A_hat, HW)       # [N, out_features]

        # 3) Add bias (if any)
        if self.bias is not None:
            out = out + self.bias

        return out
    
class GCNLayer(nn.Module):
    """
    A single Graph Convolution layer:
      H_next = A_hat * H * W
    where A_hat is the adjacency matrix (possibly normalized),
    H is the input node features, and W is a learnable weight.
    """

    def __init__(self, in_features, out_features, bias=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Learnable weight matrix: shape (in_features, out_features)
        self.weight = nn.Parameter(torch.Tensor(in_features, out_features))

        # Optional bias: shape (out_features)
        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)

        # Initialize weights
        self.reset_parameters()

    def reset_parameters(self):
        # A simple initialization scheme (e.g., glorot/xavier)
        nn.init.xavier_uniform_(self.weight)
        if self.bias is not None:
            nn.init.zeros_(self.bias)

    def forward(self, H):
        """
        H: [N, in_features]   - node feature matrix
        A_hat: [N, N]         - adjacency matrix (ideally normalized)
        returns: [N, out_features]
        """
        # 1) Fully connected graph - normalised by the degree to avoid vanishing/exploding gradients
        A_hat = torch.ones((H.shape[0], H.shape[0])) / H.shape[0] + torch.eye(H.shape[0])

        # 2) Multiply input features by W
        HW = torch.matmul(H, self.weight)  # [N, out_features]

        # 3) Propagate/aggregate over adjacency
        #    A_hat * (H * W)
        out = torch.matmul(A_hat, HW)       # [N, out_features]

        # 3) Add bias (if any)
        if self.bias is not None:
            out = out + self.bias

        return out

In [6]:
class SimpleGCN1(nn.Module):
    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.gc1 = GCNLayer(in_features, hidden_dim)
        self.linear = nn.Linear(hidden_dim, 1)

    def forward(self, H):
        x = self.gc1(H)
        x = self.linear(x)
        return x
    
class SimpleGCN2(nn.Module):
    def __init__(self, in_features, hidden_dim1, hidden_dim2):
        super().__init__()
        self.gc1 = GCNLayer(in_features, hidden_dim1)
        self.relu = nn.ReLU()
        self.gc2 = GCNLayer(hidden_dim1, hidden_dim2)
        self.linear = nn.Linear(hidden_dim2, 1)

    def forward(self, H):
        x = self.relu(self.gc1(H))
        x = self.gc2(x)
        x = self.linear(x)
        return x

In [388]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

val_fraction = 0.2
data_split_indx = round(len(graphs) * (1-val_fraction))
training_graphs, validation_graphs = graphs[:data_split_indx], graphs[data_split_indx:]

model = SimpleGCN1(graphs[0][0].shape[1], 64)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# 4. Training loop
num_epochs = 500
num_per_epoch = 10000

model.train()
for epoch in range(num_epochs):

    training_epoch_loss = 0
    for i in range(num_per_epoch):
        indx = random.randint(0, len(training_graphs)-1)
        
        # Forward pass
        predictions = model(training_graphs[indx][0])   
        true_labels = torch.reshape(training_graphs[indx][1], (-1, 1))
        loss = criterion(predictions, true_labels)

        # Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        training_epoch_loss += loss.item()

    # Print loss periodically
    if (epoch + 1) % 10 == 0:

        # Calculate Validation loss
        model.eval()
        validation_loss = 0
        for i in range(len(validation_graphs)):

            predictions = model(validation_graphs[i][0])   
            true_labels = torch.reshape(validation_graphs[i][1], (len(validation_graphs[i][1]), 1))
            loss = criterion(predictions, true_labels)

            validation_loss += loss

        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {training_epoch_loss/num_per_epoch:.4f}, Validation Loss: {validation_loss/len(validation_graphs):.4f}")

    

# # 5. Evaluate the model on the entire dataset
# model.eval()
# with torch.no_grad():
#     preds = model(x)
#     final_loss = criterion(preds, y)
    
# print("\nTraining complete!")
# print(f"Final Loss over all data: {final_loss.item():.4f}")


Epoch [10/500], Training Loss: 8.4561, Validation Loss: 6.1249
Epoch [20/500], Training Loss: 6.8188, Validation Loss: 4.4333
Epoch [30/500], Training Loss: 6.0622, Validation Loss: 2.3490
Epoch [40/500], Training Loss: 5.1724, Validation Loss: 1.6676
Epoch [50/500], Training Loss: 5.4594, Validation Loss: 8.2519
Epoch [60/500], Training Loss: 4.5003, Validation Loss: 4.2570
Epoch [70/500], Training Loss: 4.0544, Validation Loss: 0.9388
Epoch [80/500], Training Loss: 3.4256, Validation Loss: 2.0264
Epoch [90/500], Training Loss: 3.4219, Validation Loss: 7.4251
Epoch [100/500], Training Loss: 2.7956, Validation Loss: 5.7092
Epoch [110/500], Training Loss: 2.3113, Validation Loss: 5.5580
Epoch [120/500], Training Loss: 1.9300, Validation Loss: 1.3019


KeyboardInterrupt: 

In [7]:
class Classification(nn.Module):
    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.lin1 = nn.Linear(in_features, hidden_dim)
        self.relu = nn.ReLU()
        self.lin2 = nn.Linear(hidden_dim, 1)

    def forward(self, H):
        x = self.lin1(H)
        x = self.relu(x)
        x = self.lin2(x)
        return x

In [8]:
X = torch.tensor(df.drop(columns=['win', 'DateTime', 'Track Name']).astype(np.float32).to_numpy())
y = torch.tensor(df['win'].astype(np.float32).to_numpy())

In [9]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 32
validation_fraction = 0.1
validation_index = round(len(y)*(1-validation_fraction))

X_train = X[:validation_index,:]
X_validation = X[validation_index:,:]
y_train = y[:validation_index]
y_validation = y[validation_index:]

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

validation_dataset = TensorDataset(X_validation, y_validation)
validation_loader = DataLoader(validation_dataset, batch_size=len(validation_dataset), shuffle=False)

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

model = Classification(X.shape[1], 32)

num_epochs = 500

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(num_epochs):

    epoch_loss = 0
    num_in_epoch = 0
    model.train()
    for batch_x, batch_y in train_loader:
        # 1. Zero the parameter gradients
        optimizer.zero_grad()
        
        # 2. Forward pass
        outputs = model(batch_x)
        
        # 3. Compute the loss (BCEWithLogitsLoss expects raw logits from the final layer)
        loss = criterion(outputs, torch.reshape(batch_y, (-1,1)))
        
        # 4. Backpropagation
        loss.backward()
        
        # 5. Update parameters
        optimizer.step()

        epoch_loss += loss.item()
        num_in_epoch += 1
    
    # Evaluation 
    model.eval()
    for batch_x, batch_y in validation_loader:
        # 2. Forward pass
        outputs = model(batch_x)

        validation_loss = criterion(outputs, torch.reshape(batch_y, (-1,1)))

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/num_in_epoch:.4f}, Validation loss: {validation_loss.item():.4f}")

print("Training complete.")

Epoch 1/500, Loss: 2.3043, Validation loss: 2.1002
Epoch 2/500, Loss: 0.7374, Validation loss: 1.4705
Epoch 3/500, Loss: 0.7067, Validation loss: 0.6810
Epoch 4/500, Loss: 0.6733, Validation loss: 0.4288
Epoch 5/500, Loss: 0.6742, Validation loss: 0.5574
Epoch 6/500, Loss: 0.6496, Validation loss: 0.4397
Epoch 7/500, Loss: 0.6447, Validation loss: 0.5008
Epoch 8/500, Loss: 0.6148, Validation loss: 0.8251
Epoch 9/500, Loss: 0.6057, Validation loss: 0.8874
Epoch 10/500, Loss: 0.5993, Validation loss: 0.3493
Epoch 11/500, Loss: 0.5861, Validation loss: 0.3840
Epoch 12/500, Loss: 0.5859, Validation loss: 0.7026
Epoch 13/500, Loss: 0.5673, Validation loss: 0.6976
Epoch 14/500, Loss: 0.5489, Validation loss: 0.6212
Epoch 15/500, Loss: 0.5321, Validation loss: 0.4282
Epoch 16/500, Loss: 0.5281, Validation loss: 0.4145
Epoch 17/500, Loss: 0.5276, Validation loss: 0.5642
Epoch 18/500, Loss: 0.5062, Validation loss: 0.5810
Epoch 19/500, Loss: 0.4925, Validation loss: 0.3526
Epoch 20/500, Loss: 0

KeyboardInterrupt: 

In [13]:
for batch_x, batch_y in validation_loader:
    outputs = model(batch_x)

In [16]:
torch.nn.Sigmoid(outputs)

TypeError: Sigmoid.__init__() takes 1 positional argument but 2 were given

In [None]:
import math

class CrossAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        """
        Args:
            d_model  (int): Dimensionality of the model (embeddings).
            num_heads (int): Number of attention heads.
        """
        super(CrossAttention, self).__init__()
        
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads  # dimensionality per head
        
        # Learnable linear projections for Q, K, V
        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)
        
        # Final linear layer to recombine all heads
        self.fc_out = nn.Linear(d_model, d_model)
        
    def forward(self, x, context, mask=None):
        """
        Computes cross attention between x (query) and context (key/value).
        
        Args:
            x       (torch.Tensor): [batch_size, seq_len, d_model] – Queries
            context (torch.Tensor): [batch_size, context_len, d_model] – Keys/Values
            mask    (torch.Tensor): [batch_size, 1, 1, context_len] – Optional attention mask (0 where masked)
        
        Returns:
            out (torch.Tensor): The attended representation of x, shape [batch_size, seq_len, d_model].
            attention_weights (torch.Tensor): Attention weights of shape [batch_size, num_heads, seq_len, context_len].
        """
        B, Tx, _ = x.shape
        Bc, Tc, _ = context.shape
        assert B == Bc, "Query and context must have the same batch size."

        # 1. Linear Projections
        Q = self.Wq(x)       # [B, Tx, d_model]
        K = self.Wk(context) # [B, Tc, d_model]
        V = self.Wv(context) # [B, Tc, d_model]

        # 2. Reshape and transpose for multi-head attention
        #    from [B, Tx, d_model] -> [B, num_heads, Tx, depth]
        Q = Q.view(B, Tx, self.num_heads, self.depth).transpose(1, 2)  # [B, num_heads, Tx, depth]
        K = K.view(B, Tc, self.num_heads, self.depth).transpose(1, 2)  # [B, num_heads, Tc, depth]
        V = V.view(B, Tc, self.num_heads, self.depth).transpose(1, 2)  # [B, num_heads, Tc, depth]

        # 3. Scaled Dot-Product Attention
        #    attention_scores: [B, num_heads, Tx, Tc]
        attention_scores = torch.matmul(Q, K.transpose(-1, -2)) / math.sqrt(self.depth)
        
        if mask is not None:
            # mask == 0 where we want to mask
            attention_scores = attention_scores.masked_fill(mask == 0, float('-inf'))
        
        attention_weights = torch.softmax(attention_scores, dim=-1)  # [B, num_heads, Tx, Tc]
        
        # 4. Combine attention weights with values: [B, num_heads, Tx, depth]
        out = torch.matmul(attention_weights, V)
        
        # 5. Reshape back: [B, Tx, num_heads, depth] -> [B, Tx, d_model]
        out = out.transpose(1, 2).contiguous().view(B, Tx, self.d_model)
        
        # 6. Apply final projection
        out = self.fc_out(out)
        
        return out, attention_weights


class SetTransformer(nn.Module):
    def __init__(self, dim_input, num_outputs, dim_output,
            num_inds=32, dim_hidden=128, num_heads=4, ln=False):
        super(SetTransformer, self).__init__()


    def forward(self, X):
        return self.dec(self.enc(X))

In [25]:
graph_X = graphs[0][0]
graph_y = graphs[0][1]

In [26]:
pma = PMA(graph_X.shape[1], 1, 1)

In [27]:
pma(graph_X)

tensor([[[ 0.1238, -0.1221,  0.0420, -0.0408,  0.0734,  0.0364,  0.0165,
           0.0541,  0.0391, -0.0768, -0.0652, -0.1013, -0.0599, -0.0190,
           0.0491, -0.1362, -0.0462, -0.0349,  0.0358, -0.1169, -0.0418,
           0.1106,  0.0367,  0.0574,  0.1245, -0.0562,  0.0933,  0.0673,
          -0.0715, -0.0657,  0.0906,  0.0146, -0.0973, -0.0226,  0.0438,
          -0.1416, -0.1278, -0.1001,  0.0519,  0.1368, -0.0944, -0.1301,
           0.1151, -0.0559,  0.0450, -0.1343, -0.0219, -0.0382,  0.1387,
           0.0247,  0.0907, -0.0661, -0.1329,  0.0242,  0.1082,  0.1007,
          -0.0115,  0.0537,  0.0439,  0.1334, -0.0432,  0.0061,  0.1171,
          -0.0680,  0.1277,  0.1181, -0.0460,  0.0634, -0.0178,  0.0561,
          -0.1169,  0.1064, -0.0148, -0.0384, -0.0318,  0.1126, -0.0193,
           0.0851, -0.1072,  0.0472, -0.0466, -0.0720, -0.0914,  0.0522,
          -0.0345,  0.0587,  0.0566,  0.1409,  0.1207, -0.1106, -0.0406,
          -0.0816, -0.0962, -0.1223, -0.0644,  0.12

IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)