In [24]:
import torch
import numpy as np
from sklearn.calibration import calibration_curve
import sklearn.metrics as skmetrics
import plotly.graph_objects as go
import time

In [37]:
# Number of shots drawn per mini-batch (one optimizer step per batch)
N_BATCH = 200

# Number of complete passes over the training set
N_EPOCH = 35

# Initial step size used by the optimizer for parameter updates
LEARNING_RATE = 0.01

# Multiplicative factor applied to the learning rate at each decay step
LEARNING_RATE_DECAY = 0.75

# Number of epochs between consecutive learning rate decay steps
LEARNING_RATE_DECAY_PERIOD = 4

In [29]:
class NeuralNetwork(torch.nn.Module):
    '''
    Feed-forward classifier built from four hidden fully connected layers
    (128, 256, 512 and 1024 units, each followed by ReLU) and a linear output layer

    Arg(s):
        n_input_feature : int
            size of each input feature vector
        n_output : int
            number of classes scored by the output layer
    '''

    def __init__(self, n_input_feature, n_output):
        super().__init__()

        # Hidden layers, doubling in width at every step
        self.fully_connected_layer_1 = torch.nn.Linear(n_input_feature, 128)
        self.fully_connected_layer_2 = torch.nn.Linear(128, 256)
        self.fully_connected_layer_3 = torch.nn.Linear(256, 512)
        self.fully_connected_layer_4 = torch.nn.Linear(512, 1024)

        # Final projection onto one score per class
        self.output = torch.nn.Linear(1024, n_output)

    def forward(self, x):
        '''
        Pushes a batch of feature vectors through the hidden layers and the output layer

        Arg(s):
            x : torch.Tensor[float32]
                N x n_input_feature batch of inputs
        Returns:
            torch.Tensor[float32]
                N x n_output logits (no softmax is applied here)
        '''

        hidden_layers = (
            self.fully_connected_layer_1,
            self.fully_connected_layer_2,
            self.fully_connected_layer_3,
            self.fully_connected_layer_4,
        )

        activation = x
        for layer in hidden_layers:
            activation = torch.relu(layer(activation))

        return self.output(activation)


In [19]:
def train(model,
          dataloader,
          n_epoch,
          optimizer,
          learning_rate_decay,
          learning_rate_decay_period):
    '''
    Trains the model using optimizer and specified learning rate schedule

    Every batch is indexed as a 2-D tensor: column 1 is read as the label and
    columns 2 onward are fed to the model as input features (column 0 is not
    used here). Labels equal to -1 are remapped to class 0 before the
    cross-entropy loss is computed.

    Arg(s):
        model : torch.nn.Module
            neural network or logistic regression
        dataloader : torch.utils.data.DataLoader
            https://pytorch.org/docs/stable/data.html
            dataloader for training data; any re-iterable object yielding
            batch tensors works, it does not need to define __len__
        n_epoch : int
            number of epochs to train
        optimizer : torch.optim
            https://pytorch.org/docs/stable/optim.html
            optimizer to use for updating weights
        learning_rate_decay : float
            factor the learning rate is multiplied by at each decay step
        learning_rate_decay_period : int
            period to reduce learning rate based on decay e.g. every 2 epoch
    Returns:
        torch.nn.Module : trained network (the same object passed in)
    Raises:
        ValueError : if the dataloader yields no batches during an epoch
    '''
    loss_func = torch.nn.CrossEntropyLoss()

    for epoch in range(n_epoch):

        # Decay once per completed period; `epoch and ...` skips epoch 0
        if epoch and epoch % learning_rate_decay_period == 0:
            for group in optimizer.param_groups:
                group['lr'] = group['lr'] * learning_rate_decay

        total_loss = 0.0
        n_batch_seen = 0

        for batch in dataloader:

            labels = batch[:, 1].long()
            # Map the -1 label onto class index 0, as CrossEntropyLoss
            # expects class indices starting at 0
            labels = labels.masked_fill(labels == -1, 0)
            vectors = batch[:, 2:]

            outputs = model(vectors)

            optimizer.zero_grad()

            loss = loss_func(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss = total_loss + loss.item()
            n_batch_seen = n_batch_seen + 1

        # Average over the batches actually processed instead of
        # len(dataloader), so iterables without __len__ are supported and an
        # empty loader produces a clear error rather than ZeroDivisionError
        if n_batch_seen == 0:
            raise ValueError(
                'dataloader yielded no batches; check the dataset size against '
                'the batch size and drop_last settings')

        mean_loss = total_loss / n_batch_seen

        # Log average loss over the epoch
        print('Epoch={}/{}  Loss: {:.3f}'.format(epoch + 1, n_epoch, mean_loss))

    return model


In [20]:
def evaluate(model, dataloader):
    '''
    Runs the model over every batch without tracking gradients and collects
    labels, predicted probabilities and the values stored in column 0

    Every batch is indexed as a 2-D tensor: column 0 is passed through
    unchanged, column 1 is read as the label and columns 2 onward are fed to
    the model. Labels are returned as stored (no remapping is applied), and the
    model is not switched to eval mode here; the caller is responsible for that.

    Arg(s):
        model : torch.nn.Module
            network producing N x 2 (or wider) logits
        dataloader : torch.utils.data.DataLoader
            iterable yielding batch tensors
    Returns:
        list[int] : label of every sample (column 1 cast to integer)
        list[float] : softmax probability of class index 1 for every sample
        list[float] : column 0 of every sample
    '''

    true_labels = []
    predicted_probs = []
    sb_probs = []

    with torch.no_grad():
        for batch in dataloader:
            labels = batch[:, 1].long()
            vectors = batch[:, 2:]

            # Call the module itself rather than .forward so that any
            # registered forward hooks are executed
            outputs = model(vectors)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)

            predicted_probs.extend(probabilities[:, 1].tolist())
            true_labels.extend(labels.tolist())
            sb_probs.extend(batch[:, 0].tolist())

    return true_labels, predicted_probs, sb_probs

In [70]:
# NOTE(security): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code - only load files from a trusted source.
# astype converts every element to float64 in one call instead of looping
# over elements in Python with np.vectorize(float).
data = np.load('data/shots.npy', allow_pickle=True).astype(np.float64)

In [71]:
# ---------------------------------------------------------------------------
# Set up dataloading
# ---------------------------------------------------------------------------
tensor_data_train = torch.from_numpy(data).float()

dataloader_train = torch.utils.data.DataLoader(
    tensor_data_train,
    batch_size=N_BATCH,
    shuffle=True,
    drop_last=True,
    num_workers=2)

n_class = 2

# ---------------------------------------------------------------------------
# Set up model and optimizer
# ---------------------------------------------------------------------------
# train() reads column 1 as the label and columns 2 onward as features, and
# evaluate() reads column 0 separately, so the feature count is the column
# count minus those two leading columns (instead of a hard-coded 48)
n_input_feature = tensor_data_train.shape[1] - 2

model = NeuralNetwork(n_input_feature, n_class)

optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

# ---------------------------------------------------------------------------
# Train model
# ---------------------------------------------------------------------------
# NOTE(review): no random seed is set, so weight initialisation and batch
# shuffling differ between runs
model.train()

time_start = time.time()

model = train(
    model,
    dataloader_train,
    N_EPOCH,
    optimizer,
    learning_rate_decay=LEARNING_RATE_DECAY,
    learning_rate_decay_period=LEARNING_RATE_DECAY_PERIOD)

time_elapsed = time.time() - time_start
# NOTE(review): '{:3f}' means minimum width 3 with the default six decimals;
# '{:.3f}' may have been intended
print('Total training time: {:3f} seconds'.format(time_elapsed))


Epoch=1/35  Loss: 0.334
Epoch=2/35  Loss: 0.302
Epoch=3/35  Loss: 0.297
Epoch=4/35  Loss: 0.295
Epoch=5/35  Loss: 0.291
Epoch=6/35  Loss: 0.289
Epoch=7/35  Loss: 0.289
Epoch=8/35  Loss: 0.288
Epoch=9/35  Loss: 0.285
Epoch=10/35  Loss: 0.285
Epoch=11/35  Loss: 0.284
Epoch=12/35  Loss: 0.284
Epoch=13/35  Loss: 0.282
Epoch=14/35  Loss: 0.282
Epoch=15/35  Loss: 0.282
Epoch=16/35  Loss: 0.281
Epoch=17/35  Loss: 0.280
Epoch=18/35  Loss: 0.280
Epoch=19/35  Loss: 0.280
Epoch=20/35  Loss: 0.280
Epoch=21/35  Loss: 0.279
Epoch=22/35  Loss: 0.279
Epoch=23/35  Loss: 0.279
Epoch=24/35  Loss: 0.279
Epoch=25/35  Loss: 0.278
Epoch=26/35  Loss: 0.278
Epoch=27/35  Loss: 0.278
Epoch=28/35  Loss: 0.278
Epoch=29/35  Loss: 0.277
Epoch=30/35  Loss: 0.277
Epoch=31/35  Loss: 0.277
Epoch=32/35  Loss: 0.277
Epoch=33/35  Loss: 0.276
Epoch=34/35  Loss: 0.276
Epoch=35/35  Loss: 0.276
Total training time: 83.701338 seconds


In [72]:
def _count_shots_per_bucket(probabilities, n_bucket=10):
    '''
    Assigns probabilities to equal-width buckets over [0, 1] and counts them

    Arg(s):
        probabilities : array-like[float]
            probabilities to bucket
        n_bucket : int
            number of equal-width buckets
    Returns:
        numpy.ndarray[int] : bucket index (0 .. n_bucket - 1) of every probability
        numpy.ndarray[int] : number of probabilities in each bucket, length n_bucket
    '''
    bucket_edges = np.linspace(0, 1, n_bucket + 1)
    bucket_index = np.digitize(probabilities, bins=bucket_edges) - 1

    # digitize places a probability of exactly 1.0 past the last edge (index
    # n_bucket); fold it into the last bucket so the counts always have
    # n_bucket entries and line up with the bucket labels
    bucket_index = np.clip(bucket_index, 0, n_bucket - 1)

    return bucket_index, np.bincount(bucket_index, minlength=n_bucket)


# ---------------------------------------------------------------------------
# Set up dataloading
# ---------------------------------------------------------------------------
# NOTE(review): this tensor is built from the same `data` array the model was
# trained on, so the metrics below are measured on the training data
tensor_data_test = torch.from_numpy(data).float()

dataloader_test = torch.utils.data.DataLoader(
    tensor_data_test,
    batch_size=25,
    shuffle=False,
    drop_last=False,
    num_workers=2)

model.eval()

y_test, predictions_test, sbxg_test = evaluate(model, dataloader_test)

# ---------------------------------------------------------------------------
# Log-loss of both models
# ---------------------------------------------------------------------------
loss_score = skmetrics.log_loss(y_test, predictions_test)
print('Testing Log-loss of Our Model: {:.4f}'.format(loss_score))

loss_score = skmetrics.log_loss(y_test, sbxg_test)
print('Testing Log-loss of StatsBomb Model: {:.4f}'.format(loss_score))

# ---------------------------------------------------------------------------
# Calibration curves
# ---------------------------------------------------------------------------
fig = go.Figure()
fig.update_layout(template='plotly_dark')

prob_true, prob_pred = calibration_curve(y_test, predictions_test, n_bins=10)
fig.add_trace(go.Scatter(x=prob_pred, y=prob_true, mode='markers+lines', name='Our Model'))

prob_true, prob_pred = calibration_curve(y_test, sbxg_test, n_bins=10)
fig.add_trace(go.Scatter(x=prob_pred, y=prob_true, mode='markers+lines', name='StatsBomb'))

# Diagonal reference line: predicted probability equals observed frequency
fig.add_trace(go.Scatter(x=np.linspace(0, 1, 10), y=np.linspace(0, 1, 10), mode='lines', line=dict(dash='dash'), name='Perfect Calibration'))

# ---------------------------------------------------------------------------
# Number of shots per bucket, drawn as bars on the secondary axes
# ---------------------------------------------------------------------------
predictions_test_binned, bin_counts = _count_shots_per_bucket(predictions_test)
bin_labels = [f"{(i / 10)}" for i in range(1,11)]

fig.add_trace(go.Bar(x=bin_labels, y=bin_counts, name="Our Model's Number of Shots per Bucket", yaxis='y2', xaxis='x2', opacity=0.5))

sbxg_test_binned, bin_counts = _count_shots_per_bucket(sbxg_test)
bin_labels = [f"{(i / 10)}" for i in range(1,11)]

fig.add_trace(go.Bar(x=bin_labels, y=bin_counts, name="StatsBomb Model's Number of Shots per Bucket", yaxis='y2', xaxis='x2', opacity=0.5))

fig.update_layout(
    title='Calibration Curves with Uniform Buckets',
    xaxis_title='Mean xG',
    yaxis={
        'range': [0, 1],
        'title': 'Fraction of Actual Goals'
    },
    yaxis2={
        'title': 'Number of Shots',
        'overlaying': 'y',
        'side': 'right',
        'showgrid':False,
    },
    xaxis2={
        'overlaying': 'x',
        'side': 'top',
        'showgrid':False,
        'showticklabels':False
    },
    legend=dict(
        orientation='h',  
    ),
    width=650,
    height=650
)

fig.show()


Testing Log-loss of Our Model: 0.2758
Testing Log-loss of StatsBomb Model: 0.2625
