In [1]:
import pandas as pd 
import numpy as np 
import torch
import torch
import torch.nn as nn
import torch.optim as optim
import random
from modelling_functions import split_train_validation_data, average_log_loss_score, get_sklearn_model_results_for_races, betting_results_threshold_method, betting_results_top_bsp_method

In [2]:
# Load the pre-processed dataset; the first CSV column becomes the row index
DATASET_PATH = 'ProcessedDatasetExtra.csv'
df = pd.read_csv(DATASET_PATH, index_col=0)

In [3]:
# Partition the dataset into a training frame and a validation frame.
# The split value is forwarded unchanged to the project helper
# (presumably the fraction of data held out for validation — confirm in modelling_functions).
VALIDATION_SPLIT = 0.1
train_data, validation_data = split_train_validation_data(df, validation_split=VALIDATION_SPLIT)

In [4]:
# Derive the feature, target and extra-information column names from the dataset.

# Columns that are carried along in the frame but never fed to the model.
info_columns = [
    'DateTime',
    'Track Name',
    'PlaceInt',
    'First Trading Price',
    '15 mins',
    '10 mins',
    '5 mins',
    '3 mins',
    '2 mins',
    '1 min ',  # NOTE(review): trailing space kept as-is; presumably matches the CSV header — verify
    'Post Time',
    'industry_implied_prob',
    'betfair_implied_prob',
    'industry_implied_prob_norm',
    'betfair_implied_prob_norm'
]

target_column = 'win'

# Everything that must be excluded from the model inputs: the info columns plus the target.
info_columns_set = set(info_columns)
info_columns_set.add(target_column)

# Filter the DataFrame's columns in their existing order rather than taking a set
# difference: iteration order of a set of strings changes between interpreter runs
# (hash randomisation), which made the feature order, and therefore the layout of the
# scaler statistics and model weights, differ from one kernel restart to the next.
feature_columns = [column for column in train_data.columns if column not in info_columns_set]

In [11]:
from sklearn.preprocessing import StandardScaler

# Fit the standardiser on the training features only; the same fitted object is
# reused later to transform both the training and the validation features.
scaler = StandardScaler().fit(train_data[feature_columns])

In [49]:
class LogisticRegression(nn.Module):
    """Single linear layer producing one raw output value per input row.

    No sigmoid (or other activation) is applied in `forward`; the caller receives
    the unnormalised output of the linear layer.
    """

    def __init__(self, in_features):
        """Build the layer; `in_features` is the size of each input row."""
        super().__init__()
        self.lin1 = nn.Linear(in_features=in_features, out_features=1)

    def forward(self, H):
        """Apply the linear layer to `H` and return the result."""
        return self.lin1(H)
    
class Classification(nn.Module):
    """One-hidden-layer network: linear -> ReLU -> linear, one raw output per row.

    The final layer's output is returned without any activation.
    """

    def __init__(self, in_features, hidden_dim):
        """Build the layers.

        in_features: size of each input row.
        hidden_dim: width of the hidden layer.
        """
        super().__init__()
        self.lin1 = nn.Linear(in_features=in_features, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.lin2 = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, H):
        """Pass `H` through lin1, ReLU and lin2, in that order."""
        return self.lin2(self.relu(self.lin1(H)))
    
class DeepClassification(nn.Module):
    """Two-hidden-layer network with dropout applied to the first layer's output.

    Layer order in `forward`: lin1 -> dropout -> ReLU -> lin2 -> ReLU -> lin3.
    The final layer's single output per row is returned without any activation.
    """

    def __init__(self, in_features, hidden_dim1, hidden_dim2, dropout):
        """Build the layers.

        in_features: size of each input row.
        hidden_dim1: width of the first hidden layer.
        hidden_dim2: width of the second hidden layer.
        dropout: probability passed to `nn.Dropout`.
        """
        super().__init__()
        self.lin1 = nn.Linear(in_features=in_features, out_features=hidden_dim1)
        self.dropout = nn.Dropout(p=dropout)
        self.relu1 = nn.ReLU()
        self.lin2 = nn.Linear(in_features=hidden_dim1, out_features=hidden_dim2)
        self.relu2 = nn.ReLU()
        self.lin3 = nn.Linear(in_features=hidden_dim2, out_features=1)

    def forward(self, H):
        """Feed `H` through every layer in sequence and return the last layer's output."""
        pipeline = (self.lin1, self.dropout, self.relu1, self.lin2, self.relu2, self.lin3)
        x = H
        for layer in pipeline:
            x = layer(x)
        return x

In [13]:
def _frame_to_tensors(frame):
    """Turn one data frame into a (features, target) pair of torch tensors.

    The feature columns are cast to float32, passed through the already-fitted
    notebook-level `scaler`, and wrapped in a tensor. The target column is cast
    to float32 and wrapped in a tensor.
    """
    scaled_features = np.array(scaler.transform(frame[feature_columns].astype(np.float32)))
    targets = frame[target_column].astype(np.float32).to_numpy()
    return torch.tensor(scaled_features), torch.tensor(targets)


# Same conversion for both splits; the scaler is only applied here, never re-fitted
X_train, y_train = _frame_to_tensors(train_data)
X_validation, y_validation = _frame_to_tensors(validation_data)

In [14]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 32
validation_fraction = 0.1
# The original expression was `round(len(y)*(1-validation_fraction))`, but no `y` is
# defined in any visible cell, so this cell raised NameError on a fresh kernel.
# NOTE(review): `y` is assumed to have been the full target vector (train + validation);
# the value is not used by the loaders below — confirm whether it is still needed.
total_samples = len(y_train) + len(y_validation)
validation_index = round(total_samples * (1 - validation_fraction))

# Training batches are reshuffled on every pass over the loader
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# The whole validation set is served as one batch, in its original order
validation_dataset = TensorDataset(X_validation, y_validation)
validation_loader = DataLoader(validation_dataset, batch_size=len(validation_dataset), shuffle=False)

In [41]:
# Attach model results to the race data using a PyTorch model
def get_pytorch_model_results_for_races(df_validation: pd.DataFrame, feature_columns: list[str], model: nn.Module, target_column: str = 'model_prediction', feature_scaler=None) -> list[pd.DataFrame]:
    """Score every race with `model` and return one annotated DataFrame per race.

    Rows are grouped by ('DateTime', 'Track Name'); each group is treated as one race.
    For each race the feature columns are cast to float32, scaled, passed through the
    model, and the model outputs are normalised with a softmax ACROSS THE ROWS OF THAT
    RACE (dim=0), so the predictions within a race sum to 1.

    Parameters
    ----------
    df_validation : frame containing 'DateTime', 'Track Name' and all `feature_columns`.
        It is not modified; each returned race is an independent copy.
    feature_columns : names of the model input columns, in the order the model expects.
    model : module returning exactly one value per input row (shape (n, 1) or (n,)).
        The caller is responsible for putting it in eval mode.
    target_column : name of the column that receives the per-race probabilities.
    feature_scaler : object with a `transform` method. When omitted, the notebook-level
        `scaler` is used, which matches the original behaviour.

    Returns
    -------
    list of DataFrames, one per race, each with `target_column` added.
    An empty input yields an empty list.
    """
    if feature_scaler is None:
        # Backward-compatible default: the scaler fitted earlier in the notebook
        feature_scaler = scaler

    races = []

    # Pure inference: skip autograd bookkeeping instead of building a graph and detaching
    with torch.no_grad():
        for _, group_df in df_validation.groupby(['DateTime', 'Track Name']):
            # Work on a copy so the new column is never written into a slice of the input
            race = group_df.copy()

            X_race = feature_scaler.transform(race[feature_columns].astype(np.float32))
            # Force float32 so the input dtype matches the default dtype of nn.Module weights
            X_race = torch.as_tensor(np.asarray(X_race), dtype=torch.float32)

            race_probabilities = torch.softmax(model(X_race), dim=0)

            # Flatten to a 1-D numpy array: one plain float per row of the race
            race[target_column] = race_probabilities.reshape(-1).cpu().numpy()

            races.append(race)

    return races

In [51]:
import torch
import torch.nn as nn
import torch.optim as optim

# Model under training: input width is taken from the validation feature tensor
classification_model = LogisticRegression(X_validation.shape[1])

num_epochs = 100

# The models emit raw outputs, so the sigmoid is folded into the loss
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classification_model.parameters(), lr=1e-4)

for epoch in range(num_epochs):

    # ---- training pass ----
    classification_model.train()
    epoch_loss = 0
    num_in_epoch = 0
    for num_in_epoch, (batch_x, batch_y) in enumerate(train_loader, start=1):
        optimizer.zero_grad()

        logits = classification_model(batch_x)

        # Targets arrive as a flat vector; reshape to a column to match the model output
        batch_loss = criterion(logits, batch_y.reshape(-1, 1))

        batch_loss.backward()
        optimizer.step()

        # Accumulate the per-batch mean loss; reported below as an average over batches
        epoch_loss += batch_loss.item()

    # ---- validation pass: per-race predictions scored by the project metric ----
    classification_model.eval()
    race_results = get_pytorch_model_results_for_races(validation_data, feature_columns, classification_model)
    average_validation_log_loss = average_log_loss_score(race_results, 'model_prediction')

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/num_in_epoch:.4f}, Validation average log loss: {average_validation_log_loss:.4f}")

print("Training complete.")

Epoch 1/100, Loss: 0.5201, Validation average log loss: 2.0788
Epoch 2/100, Loss: 0.3341, Validation average log loss: 1.9824
Epoch 3/100, Loss: 0.3022, Validation average log loss: 1.9339
Epoch 4/100, Loss: 0.2970, Validation average log loss: 1.9087
Epoch 5/100, Loss: 0.2946, Validation average log loss: 1.8936
Epoch 6/100, Loss: 0.2930, Validation average log loss: 1.8834
Epoch 7/100, Loss: 0.2918, Validation average log loss: 1.8759
Epoch 8/100, Loss: 0.2909, Validation average log loss: 1.8703
Epoch 9/100, Loss: 0.2902, Validation average log loss: 1.8646
Epoch 10/100, Loss: 0.2897, Validation average log loss: 1.8604
Epoch 11/100, Loss: 0.2892, Validation average log loss: 1.8575
Epoch 12/100, Loss: 0.2888, Validation average log loss: 1.8547
Epoch 13/100, Loss: 0.2885, Validation average log loss: 1.8526
Epoch 14/100, Loss: 0.2882, Validation average log loss: 1.8500
Epoch 15/100, Loss: 0.2880, Validation average log loss: 1.8508
Epoch 16/100, Loss: 0.2878, Validation average lo

In [None]:
# Score the validation races with the current model and display the average log loss
race_results = get_pytorch_model_results_for_races(
    df_validation=validation_data,
    feature_columns=feature_columns,
    model=classification_model,
)
average_log_loss_score(race_results, 'model_prediction')

In [40]:
# Displays the average log loss for the `race_results` currently held in the kernel
# (same call as the last line of the previous cell).
# NOTE(review): this cell's execution count (40) is lower than the training cell's (51),
# so the recorded value below may predate the logged training run — re-run to confirm.
average_log_loss_score(race_results, 'model_prediction')

1.819487061818238