In [2]:
import numpy as np
import warnings
import time
import sklearn.metrics as skmetrics
from sklearn.calibration import calibration_curve
import plotly.graph_objects as go

# Ignore every warning from here on (no category or module filter is given).
# NOTE(review): this also hides NumPy RuntimeWarnings such as overflow inside np.exp;
# confirm that silencing numerical warnings is intended.
warnings.filterwarnings(action='ignore')

In [3]:
class Optimizer(object):
    '''
    First-order optimizer for a linear model

    Implements full-batch gradient descent with a constant learning rate and
    mini-batch stochastic gradient descent whose learning rate is scaled by a
    polynomial decay t^{-eta_decay_factor}.
    '''

    def __init__(self, alpha, eta_decay_factor, optimizer_type):
        '''
        Arg(s):
            alpha : float
                initial learning rate
            eta_decay_factor : float
                learning rate decay rate (exponent of the polynomial decay)
            optimizer_type : str
                'gradient_descent',
                'stochastic_gradient_descent',
        '''

        self.__alpha = alpha
        self.__eta_decay_factor = eta_decay_factor
        self.__optimizer_type = optimizer_type

    def __compute_gradients(self, w, x, y, loss_func='logistic'):
        '''
        Returns the gradient of the mean loss with respect to the weights

        For the logistic loss log(1 + exp(-y * w^T x)) the gradient is
        -mean(y * x * (1 - sigmoid(y * w^T x))).

        Arg(s):
            w : numpy[float32]
                d x 1 weight vector
            x : numpy[float32]
                d x N feature vector
            y : numpy[float32]
                groundtruth vector, either 1 x N or of length N
            loss_func : str
                loss by default is 'logistic' only for the purpose of the assignment
        Returns:
            numpy[float32] : d x 1 gradients
        Raises:
            ValueError : if loss_func is not 'logistic'
        '''

        if loss_func == 'logistic':
            margins = y * np.matmul(w.T, x)

            # 1 - sigmoid(m) == sigmoid(-m) == exp(-log(1 + exp(m))).
            # Evaluating it through logaddexp avoids the overflow of exp(-m) for very
            # negative margins and the cancellation of 1 - p for large positive margins.
            one_minus_p = np.exp(-np.logaddexp(0, margins))

            return -1 * np.mean(y * x * one_minus_p, axis=1).reshape(w.shape)
        else:
            raise ValueError('Unsupported loss function: {}'.format(loss_func))

    def __polynomial_decay(self, time_step):
        '''
        Computes the polynomial decay factor t^{-a}

        Arg(s):
            time_step : int
                current step in optimization, expected to start at 1
        Returns:
            float : polynomial decay to adjust learning rate
        '''

        return np.power(1 / time_step, (self.__eta_decay_factor))

    def update(self,
               w,
               x,
               y,
               loss_func,
               batch_size,
               time_step):
        '''
        Updates the weight vector with one step of the configured optimizer

        Arg(s):
            w : numpy[float32]
                d x 1 weight vector
            x : numpy[float32]
                d x N feature vector
            y : numpy[float32]
                groundtruth vector, either 1 x N or of length N
            loss_func : str
                loss function to use, should be 'logistic' for the purpose of the assignment
            batch_size : int
                number of examples drawn (with replacement) per stochastic step;
                ignored by 'gradient_descent'
            time_step : int
                current step in optimization; only used by 'stochastic_gradient_descent'
        Returns:
            numpy[float32]: d x 1 weights
        Raises:
            ValueError : if the optimizer type or the loss function is not supported
        '''

        if self.__optimizer_type == 'gradient_descent':
            return w - self.__alpha * self.__compute_gradients(w, x, y, loss_func)
        elif self.__optimizer_type == 'stochastic_gradient_descent':
            batch_idx = np.random.choice(x.shape[1], batch_size)

            # Index the last axis so that both a length-N and a 1 x N label vector work
            x_batch = x[:, batch_idx]
            y_batch = y[..., batch_idx]

            step_size = self.__polynomial_decay(time_step) * self.__alpha
            return w - step_size * self.__compute_gradients(w, x_batch, y_batch, loss_func)
        else:
            raise ValueError('Unsupported optimizer type: {}'.format(self.__optimizer_type))


In [4]:
class LogisticRegression(object):
    '''
    Linear classifier trained by minimizing the logistic loss
    mean(log(1 + exp(-y * w^T x))) with an Optimizer instance

    NOTE(review): this form of the loss is the standard logistic loss for labels
    encoded as -1/+1 -- confirm the encoding of the labels passed to fit.
    '''

    def __init__(self):
        # Define private variables; both are populated by fit
        self.__weights = None
        self.__optimizer = None

    def fit(self,
            x,
            y,
            T,
            alpha,
            eta_decay_factor,
            batch_size,
            optimizer_type,
            loss_func='logistic'):
        '''
        Fits the model to x and y by updating the weight vector
        for T optimizer steps, starting from all-zero weights

        Prints the loss over the full (x, y) every 100 steps.

        Arg(s):
            x : numpy[float32]
                d x N feature vector
            y : numpy[float32]
                groundtruth vector for the N examples
            T : int
                number of iterations to train
            alpha : float
                learning rate
            eta_decay_factor : float
                learning rate decay rate
            batch_size : int
                number of examples per batch
            optimizer_type : str
                'gradient_descent',
                'stochastic_gradient_descent'
            loss_func : str
                loss function to use, by default is 'logistic' only for the purpose of the assignment
        '''

        self.__optimizer = Optimizer(alpha, eta_decay_factor, optimizer_type)

        self.__weights = np.zeros([x.shape[0], 1])

        for t in range(1, T + 1):

            # The loss is only reported, never used by the update, so evaluate it
            # on the full dataset only for the steps that are actually printed
            if (t % 100) == 0:
                loss = self.__compute_loss(x, y, loss_func)
                print('Step={}  Loss={}'.format(t, loss))

            self.__weights = self.__optimizer.update(self.__weights, x, y, loss_func, batch_size, t)

    def predict(self, x):
        '''
        Predicts sigmoid(w^T x) for each feature vector x

        Arg(s):
            x : numpy[float32]
                d x N feature vector
        Returns:
            numpy[float32] : vector of N values in [0, 1] (singleton axes squeezed out)
        '''

        predictions = 1 / (1 + np.exp(-1 * np.matmul(self.__weights.T, x)))
        return predictions.squeeze()

    def __compute_loss(self, x, y, loss_func):
        '''
        Computes the mean logistic loss of the current weights

        Arg(s):
            x : numpy[float32]
                d x N feature vector
            y : numpy[float32]
                groundtruth vector for the N examples
            loss_func : str
                loss function to use, by default is 'logistic' only for the purpose of the assignment
        Returns:
            float : loss
        Raises:
            ValueError : if loss_func is not 'logistic'
        '''

        if loss_func == 'logistic':
            margins = y * np.matmul(self.__weights.T, x)

            # log(1 + exp(-m)) evaluated as logaddexp(0, -m): stays finite when
            # exp(-m) would overflow for strongly misclassified examples
            loss = np.mean(np.logaddexp(0, -1 * margins))
        else:
            raise ValueError('Unsupported loss function: {}'.format(loss_func))

        return loss

In [5]:
# Load the shot table and convert every entry to float.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code -- only load files from a trusted source.
data = np.asarray(np.load('data/shots.npy', allow_pickle=True), dtype=float)

#separate StatsBomb xG, groundtruth, and feature vectors
sbxg = data[:, 0]
y = data[:, 1]
x = data[:, 2:]

#split configuration: 60% train, 20% val, 20% test
SPLIT_SEED = 0
TRAIN_END_FRACTION = 0.60
VAL_END_FRACTION = 0.80

#get random indices for train, val, test splits
#(a dedicated seeded generator makes the split reproducible without touching the global RNG)
split_rng = np.random.default_rng(SPLIT_SEED)
shuffled_indices = split_rng.permutation(x.shape[0])
train_split_idx = int(TRAIN_END_FRACTION * x.shape[0])
val_split_idx = int(VAL_END_FRACTION * x.shape[0])

train_indices = shuffled_indices[0:train_split_idx]
val_indices = shuffled_indices[train_split_idx:val_split_idx]
test_indices = shuffled_indices[val_split_idx:]

#separate dataset into train, val, test splits
x_train, y_train, sbxg_train = x[train_indices, :], y[train_indices], sbxg[train_indices]
x_val, y_val, sbxg_val = x[val_indices, :], y[val_indices], sbxg[val_indices]
x_test, y_test, sbxg_test = x[test_indices, :], y[test_indices], sbxg[test_indices]

#format features into d x N matrices; the label vectors stay 1-D with length N
x_train = np.transpose(x_train, axes=(1, 0))
x_val = np.transpose(x_val, axes=(1, 0))
x_test = np.transpose(x_test, axes=(1, 0))

#sanity checks: every example lands in exactly one split and shapes line up
assert len(train_indices) + len(val_indices) + len(test_indices) == x.shape[0]
assert x_train.shape[1] == y_train.shape[0] == sbxg_train.shape[0]
assert x_val.shape[1] == y_val.shape[0] == sbxg_val.shape[0]
assert x_test.shape[1] == y_test.shape[0] == sbxg_test.shape[0]

In [7]:
# Train our logistic regression model and time the run.
# NOTE(review): the stochastic optimizer draws batches from NumPy's global RNG, which is
# not seeded here, so the numbers below vary from run to run.
model_ours = LogisticRegression()

time_start = time.time()

model_ours.fit(x_train, 
               y_train, 
               T=6000, 
               alpha=1e-4, 
               eta_decay_factor=1e-6, 
               batch_size=8000, 
               optimizer_type='stochastic_gradient_descent')

time_elapsed = time.time() - time_start
# '{:.3f}' prints three decimals ('{:3f}' only set a minimum field width of 3)
print('Total training time: {:.3f} seconds'.format(time_elapsed))

# Predictions are kept in named variables because later cells plot predictions_test
predictions_train = model_ours.predict(x_train)
predictions_val = model_ours.predict(x_val)
predictions_test = model_ours.predict(x_test)

# (split name, groundtruth, our predictions, StatsBomb xG) for every split
evaluation_splits = [
    ('Training', y_train, predictions_train, sbxg_train),
    ('Validation', y_val, predictions_val, sbxg_val),
    ('Testing', y_test, predictions_test, sbxg_test),
]

# Report the log-loss of our model next to the StatsBomb baseline on each split
for split_name, y_split, predictions_split, sbxg_split in evaluation_splits:
    loss_score = skmetrics.log_loss(y_split, predictions_split)
    print('{} Log-loss of Our Model: {:.4f}'.format(split_name, loss_score))

    loss_score = skmetrics.log_loss(y_split, sbxg_split)
    print('{} Log-loss of StatsBomb Model: {:.4f}'.format(split_name, loss_score))


Step=100  Loss=0.3157417184654849
Step=200  Loss=0.31333012967039936
Step=300  Loss=0.31065006202238094
Step=400  Loss=0.3079848881509398
Step=500  Loss=0.30560372835794447
Step=600  Loss=0.3060724432873516
Step=700  Loss=0.303486347693775
Step=800  Loss=0.3020599202056486
Step=900  Loss=0.3017139633404017
Step=1000  Loss=0.3005802738946204
Step=1100  Loss=0.29999141434909266
Step=1200  Loss=0.2994149492149629
Step=1300  Loss=0.2991290891441645
Step=1400  Loss=0.2982383851579517
Step=1500  Loss=0.29819712541423193
Step=1600  Loss=0.29822536382373405
Step=1700  Loss=0.2973895735748442
Step=1800  Loss=0.2967334255196876
Step=1900  Loss=0.29699427696025027
Step=2000  Loss=0.2968673041293801
Step=2100  Loss=0.29788975870802614
Step=2200  Loss=0.2966937489631423
Step=2300  Loss=0.2955956202921629
Step=2400  Loss=0.29523174489017334
Step=2500  Loss=0.29568439650377815
Step=2600  Loss=0.29539012220594385
Step=2700  Loss=0.29480471142025594
Step=2800  Loss=0.2946359252271507
Step=2900  Loss=0.

In [21]:
# Calibration curves on the test split using 10 quantile buckets
# (each bucket holds the same number of predictions)
fig = go.Figure()
fig.update_layout(template='plotly_dark')

# (legend name, predicted probabilities) for each model, in plotting order
quantile_curve_inputs = (
    ('Our Model', predictions_test),
    ('StatsBomb Model', sbxg_test),
)

for trace_name, probabilities in quantile_curve_inputs:
    prob_true, prob_pred = calibration_curve(y_test, probabilities, n_bins=10, strategy='quantile')
    fig.add_trace(go.Scatter(x=prob_pred, y=prob_true, mode='markers+lines', name=trace_name))

# dashed diagonal: a perfectly calibrated model has mean prediction == observed frequency
diagonal = np.linspace(0, 1, 10)
fig.add_trace(go.Scatter(x=diagonal, y=diagonal, mode='lines', line=dict(dash='dash'), name='Perfect Calibration'))

# horizontal legend anchored to the top-right corner of the plotting area
legend_layout = {
    'orientation': 'h',
    'yanchor': 'bottom',
    'y': 1,
    'xanchor': 'right',
    'x': 1,
}

fig.update_layout(
    title='Calibration Curves with Quantile Buckets',
    xaxis_title='Mean xG',
    yaxis_title='Fraction of Actual Goals',
    legend=legend_layout,
    width=600,
    height=600
)

fig.show()

NameError: name 'predictions_test' is not defined

In [16]:
# Calibration curves on the test split using 10 uniform buckets (equal width across
# prediction space), overlaid with a histogram of how many shots fall in each bucket
fig = go.Figure()
fig.update_layout(template='plotly_dark')

#plot calibration curve for our model with 10 uniform buckets (each bucket has equal width across prediction space)
prob_true, prob_pred = calibration_curve(y_test, predictions_test, n_bins=10)
fig.add_trace(go.Scatter(x=prob_pred, y=prob_true, mode='markers+lines', name='Our Model'))

#plot calibration curve for StatsBomb model with 10 uniform buckets (each bucket has equal width across prediction space)
prob_true, prob_pred = calibration_curve(y_test, sbxg_test, n_bins=10)
fig.add_trace(go.Scatter(x=prob_pred, y=prob_true, mode='markers+lines', name='StatsBomb'))

#plot calibration curve for perfect calibration
fig.add_trace(go.Scatter(x=np.linspace(0, 1, 10), y=np.linspace(0, 1, 10), mode='lines', line=dict(dash='dash'), name='Perfect Calibration'))

#shared bucket definition for both histograms; labels are the upper edge of each bucket
n_buckets = 10
bucket_edges = np.linspace(0, 1, n_buckets + 1)
bin_labels = [f"{(i / 10)}" for i in range(1,11)]

#histogram for number of our model's predictions in each bucket
#np.digitize returns n_buckets + 1 for a probability of exactly 1.0 (and 0 for anything below 0),
#so the bucket ids are clipped to keep exactly n_buckets counts matching bin_labels
predictions_test_binned = np.clip(np.digitize(predictions_test, bins=bucket_edges) - 1, 0, n_buckets - 1)
bin_counts = np.bincount(predictions_test_binned, minlength=n_buckets)

fig.add_trace(go.Bar(x=bin_labels, y=bin_counts, name="Our Model's Number of Shots per Bucket", yaxis='y2', xaxis='x2', opacity=0.5))

#histogram for number of StatsBomb model's predictions in each bucket
sbxg_test_binned = np.clip(np.digitize(sbxg_test, bins=bucket_edges) - 1, 0, n_buckets - 1)
bin_counts = np.bincount(sbxg_test_binned, minlength=n_buckets)

fig.add_trace(go.Bar(x=bin_labels, y=bin_counts, name="StatsBomb Model's Number of Shots per Bucket", yaxis='y2', xaxis='x2', opacity=0.5))

fig.update_layout(
    title='Calibration Curves with Uniform Buckets',
    xaxis_title='Mean xG',
    yaxis={
        'range': [0, 1],
        'title': 'Fraction of Actual Goals'
    },
    #secondary axes carry the histogram so the bar heights do not distort the [0, 1] calibration axis
    yaxis2={
        'title': 'Number of Shots',
        'overlaying': 'y',
        'side': 'right',
        'showgrid':False,
    },
    xaxis2={
        'overlaying': 'x',
        'side': 'top',
        'showgrid':False,
        'showticklabels':False
    },
    legend=dict(
        orientation='h',  
    ),
    width=650,
    height=650
)

fig.show()