In [1]:
import pandas as pd 
import numpy as np 
import torch
import torch
import torch.nn as nn
import torch.optim as optim
import random
from modelling_functions import split_train_validation_data, average_log_loss_score, get_sklearn_model_results_for_races, betting_results_threshold_method, betting_results_top_bsp_method

In [2]:
df = pd.read_csv('ProcessedDatasetExtra.csv', index_col=0)

In [3]:
# Split train and validation data
train_data, validation_data = split_train_validation_data(df, validation_split=0.1)

In [109]:
# Get feature, target and extra information columns from the dataset
info_columns = [
    'DateTime',
    'Track Name',
    'PlaceInt',
    'First Trading Price',
    '15 mins',
    '10 mins',
    '5 mins',
    '3 mins',
    '2 mins',
    '1 min ',
    'Post Time',
    'industry_implied_prob',
    'betfair_implied_prob',
    'industry_implied_prob_norm',
    'betfair_implied_prob_norm',
    'industry_implied_prob_norm_inv',
    'betfair_implied_prob_norm_inv'
]

info_columns_set = set(info_columns)

target_column = 'win'

info_columns_set.add(target_column)

feature_columns = list(set(train_data.columns).difference(info_columns_set))

In [110]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train_data[feature_columns])

In [112]:
list(train_data[feature_columns].columns)

['placeTop3_number_last_3_by_JockeyTrainer',
 'TimeSinceLastRaceForJockey',
 'place_last_by_Jockey',
 'win_last_5_by_Jockeydistance_bin',
 'avg_place_last_3_by_JockeyTrainer',
 'placetop3_rate_by_HorseTrainer',
 'TimeSinceLastRaceForHorse',
 'avg_place_by_Jockey',
 'win_last_by_JockeyGoing',
 'avg_place_last_5_by_HorseTrainer',
 'win_last_by_HorseJockey',
 'place_last_by_Horse',
 'avg_place_last_5_by_JockeyGoing',
 'placeTop3_number_last_3_by_HorseDistance',
 'placeTop3_number_last_by_JockeyDistance',
 'placeTop3_number_last_5_by_HorseTrainer',
 'placeTop3_number_last_by_Jockeydistance_bin',
 'Pace',
 'placeTop3_number_last_5_by_JockeyDistance',
 'win_rate_by_JockeyDistance',
 'avg_place_by_Horse',
 'win_number_by_Horsedistance_bin',
 'win_rate_by_Horse',
 'placetop3_rate_by_Jockey',
 'num_races_by_JockeyDistance',
 'placeTop3_number_last_by_Jockey',
 'placeTop3_number_last_by_HorseTrainer',
 'avg_place_by_JockeyGoing',
 'win_number_by_Jockey',
 'win_last_by_Horsedistance_bin',
 'place

In [113]:
train_data[feature_columns] = scaler.transform(train_data[feature_columns])
validation_data[feature_columns] = scaler.transform(validation_data[feature_columns])

In [114]:
# Take the race data and create several sets of data from a single race where each
# horse has a turn being the data indexed 0.
def transform_graphical_data_training(data: pd.DataFrame, feature_columns: list[str], target_column: str, shuffle_data: bool = True):

    grouped = data.groupby(['DateTime', 'Track Name'])

    races = []

    for (dt, track), group_df in grouped:

        race = group_df

        race_permutations = []

        for i in range(len(race)):
            rotated_df = pd.concat([race.iloc[i:], race.iloc[:i]], ignore_index=True)
            X = torch.tensor(rotated_df[feature_columns].astype(np.float32).to_numpy())
            y = torch.tensor(rotated_df[target_column].astype(np.float32).to_numpy())
            race_permutations.append([X, y])

        if shuffle_data:
            races = races + race_permutations
        else:
            races.append(race_permutations)

    return races

def transform_graphical_data_eval(data: pd.DataFrame, shuffle_data: bool = True):

    grouped = data.groupby(['DateTime', 'Track Name'])

    races = []

    for (dt, track), group_df in grouped:

        race = group_df

        race_permutations = []

        for i in range(len(race)):
            rotated_df = pd.concat([race.iloc[i:], race.iloc[:i]], ignore_index=True)
            race_permutations.append(rotated_df)

        if shuffle_data:
            races = races + race_permutations
        else:
            races.append(race_permutations)

    return races

In [115]:
train_races = transform_graphical_data_training(train_data, feature_columns=feature_columns, target_column=target_column)

In [116]:
train_races[2][0].shape

torch.Size([12, 143])

In [117]:
class MixLayer(nn.Module):
    """
    A single Graph Convolution layer added to data in index 0:
      H_next = A_hat * H * W
    where A_hat is the adjacency matrix (possibly normalized),
    H is the input node features, and W is a learnable weight.
    """

    def __init__(self, in_features, out_features, bias=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Learnable weight matrix: shape (in_features, out_features)
        self.weight = nn.Parameter(torch.Tensor(in_features, out_features))

        # Optional bias: shape (out_features)
        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)

        # Initialize weights
        self.reset_parameters()

    def reset_parameters(self):
        # A simple initialization scheme (e.g., glorot/xavier)
        nn.init.xavier_uniform_(self.weight)
        if self.bias is not None:
            nn.init.zeros_(self.bias)

    def forward(self, H):
        """
        H: [N, in_features]   - node feature matrix
        A_hat: [N, N]         - adjacency matrix (ideally normalized)
        returns: [N, out_features]
        """

        # 1) Fully connected graph - normalised by the degree to avoid vanishing/exploding gradients
        A_hat = torch.ones((H.shape[0], H.shape[0]))/ H.shape[0]

        # 2) Multiply input features by W
        HW = torch.matmul(H, self.weight)  # [N, out_features]

        # 3) Propagate/aggregate over adjacency
        #    A_hat * (H * W)
        out = torch.matmul(A_hat, HW)       # [N, out_features]

        # 3) Add bias (if any)
        if self.bias is not None:
            out = out + self.bias

        out = torch.concat((H[0,:], out[0]))

        return out

In [118]:
class MixSetClassifier(nn.Module):
    def __init__(self, in_features, hidden_dim, mix_out):
        super().__init__()
        self.lin1 = nn.Linear(in_features, hidden_dim)
        self.relu = nn.ReLU()
        self.mix = MixLayer(hidden_dim, mix_out)
        self.lin2 = nn.Linear(hidden_dim+mix_out, 1)

    def forward(self, H):
        x = self.lin1(H)
        x = self.relu(x)
        x = self.mix(x)
        x = self.lin2(x)
        return x

In [119]:
import torch
import torch.nn as nn
import torch.optim as optim

mix_model = MixSetClassifier(train_races[0][0].shape[1], 64, 64)

num_epochs = 100

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(mix_model.parameters(), lr=1e-5)

num_per_epoch = 50000

for epoch in range(num_epochs):

    epoch_loss = 0
    num_in_epoch = 0
    mix_model.train()
    for indx in range(len(train_races)):

        # indx = random.randint(0, len(train_races)-1)
        train_sample_X = train_races[indx][0]
        train_sample_y = train_races[indx][1]

        # 1. Zero the parameter gradients
        optimizer.zero_grad()
        
        # 2. Forward pass
        outputs = mix_model(train_sample_X)
        
        # 3. Compute the loss (BCEWithLogitsLoss expects raw logits from the final layer)
        loss = criterion(outputs, torch.reshape(train_sample_y[0], (1,)))
        
        # 4. Backpropagation
        loss.backward()
        
        # 5. Update parameters
        optimizer.step()

        epoch_loss += loss.item()
        num_in_epoch += 1
    
    # Evaluation 
    # classification_model.eval()
    # race_results = get_pytorch_model_results_for_races(validation_data, feature_columns, classification_model)
    # average_validation_log_loss = average_log_loss_score(race_results, 'model_prediction')

    # for batch_x, batch_y in validation_loader:
    #     # 2. Forward pass
    #     outputs = classification_model(batch_x)

    #     validation_loss = criterion(outputs, torch.reshape(batch_y, (-1,1)))

    # print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/num_in_epoch:.4f}, Validation average log loss: {average_validation_log_loss:.4f}")

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/num_in_epoch:.4f}")
          
print("Training complete.")

Epoch 1/100, Loss: 0.3093
Epoch 2/100, Loss: 0.3052
Epoch 3/100, Loss: 0.3044
Epoch 4/100, Loss: 0.3039
Epoch 5/100, Loss: 0.3035
Epoch 6/100, Loss: 0.3032
Epoch 7/100, Loss: 0.3029
Epoch 8/100, Loss: 0.3027
Epoch 9/100, Loss: 0.3025
Epoch 10/100, Loss: 0.3022
Epoch 11/100, Loss: 0.3021
Epoch 12/100, Loss: 0.3019
Epoch 13/100, Loss: 0.3017
Epoch 14/100, Loss: 0.3016
Epoch 15/100, Loss: 0.3014
Epoch 16/100, Loss: 0.3013
Epoch 17/100, Loss: 0.3012
Epoch 18/100, Loss: 0.3010
Epoch 19/100, Loss: 0.3009
Epoch 20/100, Loss: 0.3008
Epoch 21/100, Loss: 0.3007
Epoch 22/100, Loss: 0.3005
Epoch 23/100, Loss: 0.3004
Epoch 24/100, Loss: 0.3003
Epoch 25/100, Loss: 0.3002
Epoch 26/100, Loss: 0.3001
Epoch 27/100, Loss: 0.3000
Epoch 28/100, Loss: 0.2999
Epoch 29/100, Loss: 0.2998
Epoch 30/100, Loss: 0.2997
Epoch 31/100, Loss: 0.2996
Epoch 32/100, Loss: 0.2995
Epoch 33/100, Loss: 0.2994
Epoch 34/100, Loss: 0.2993
Epoch 35/100, Loss: 0.2992
Epoch 36/100, Loss: 0.2992
Epoch 37/100, Loss: 0.2991
Epoch 38/1

KeyboardInterrupt: 

In [79]:
validation_races = transform_graphical_data_eval(validation_data, shuffle_data=False)

In [105]:
def get_race_results_for_mix_model(races: list[pd.DataFrame]):

    final_race_results = []

    for race in races:
        y_model = []
        race_final = race[0]

        for race_perm in race:

            X = torch.tensor(race_perm[feature_columns].astype(np.float32).to_numpy())
            y = torch.tensor(race_perm[target_column].astype(np.float32).to_numpy())

            y_horse = mix_model(X)

            y_model.append(y_horse)

        race_final['model_prediction'] = torch.softmax(torch.tensor(y_model), dim=0).detach().numpy()

        final_race_results.append(race_final)

    return final_race_results

In [106]:
race_results = get_race_results_for_mix_model(validation_races)

  race_final['model_prediction'] = torch.softmax(torch.tensor(y_model), dim=0).detach().numpy()
  race_final['model_prediction'] = torch.softmax(torch.tensor(y_model), dim=0).detach().numpy()
  race_final['model_prediction'] = torch.softmax(torch.tensor(y_model), dim=0).detach().numpy()
  race_final['model_prediction'] = torch.softmax(torch.tensor(y_model), dim=0).detach().numpy()
  race_final['model_prediction'] = torch.softmax(torch.tensor(y_model), dim=0).detach().numpy()
  race_final['model_prediction'] = torch.softmax(torch.tensor(y_model), dim=0).detach().numpy()
  race_final['model_prediction'] = torch.softmax(torch.tensor(y_model), dim=0).detach().numpy()
  race_final['model_prediction'] = torch.softmax(torch.tensor(y_model), dim=0).detach().numpy()
  race_final['model_prediction'] = torch.softmax(torch.tensor(y_model), dim=0).detach().numpy()
  race_final['model_prediction'] = torch.softmax(torch.tensor(y_model), dim=0).detach().numpy()
  race_final['model_prediction'] = torch

In [107]:
average_log_loss_score(race_results, 'model_prediction')

1.799522742065966