In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [49]:
# Seed PyTorch and NumPy with one shared value so repeated runs draw the
# same random numbers.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
# Run on the MPS backend when PyTorch reports it as available; otherwise
# fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
device

In [None]:
# Preprocessed dataset used for this run. Alternative inputs that can be
# swapped in by changing the path:
#   "../preprocessed_data/data_without_promoted_teams.csv"
#   "../preprocessed_data/data_with_30_most_important_features.csv"
dataset_path = "../preprocessed_data/data_with_removals_encoded.csv"
data = pd.read_csv(dataset_path)

# Show the first rows as a quick sanity check of the load.
data.head()

In [None]:
# Print pandas' summary of the loaded frame (columns, non-null counts, dtypes).
data.info()

In [53]:
# Turn the 'FTR' target column into integer class ids. The encoder is kept
# so the ids can be mapped back to the original labels later.
label_encoder = LabelEncoder()
data['FTR_encoded'] = label_encoder.fit_transform(data['FTR'])

# Every column except the raw and encoded target is treated as a feature
# and cast to float64; the encoded target keeps its integer dtype.
feature_columns = [col for col in data.columns if col not in ('FTR', 'FTR_encoded')]
data_encoded = data.drop(columns=['FTR']).astype(dict.fromkeys(feature_columns, np.float64))

In [54]:
# Train / validation / test split: 10% of the rows are held out for testing,
# then 20% of the remainder becomes the validation set.
temp_data, test_data = train_test_split(data_encoded, test_size=0.1, random_state=42)
train_data, valid_data = train_test_split(temp_data, test_size=0.2, random_state=42)


def _features_and_labels(frame):
    """Split a frame into a float32 feature tensor and a long label tensor."""
    features = torch.tensor(frame.drop(columns=['FTR_encoded']).values, dtype=torch.float32)
    labels = torch.tensor(frame['FTR_encoded'].values, dtype=torch.long)
    return features, labels


X_train, y_train = _features_and_labels(train_data)
X_valid, y_valid = _features_and_labels(valid_data)


In [55]:
# Regularization strength: the L1/L2 penalty is multiplied by this value
# before being added to the training loss.
lambda_ = 1e-3


class SimpleMLP(nn.Module):
    """Fully connected classifier with two ReLU hidden layers of equal width.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of each of the two hidden layers.
        output_size: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the raw logits for a batch ``x`` (no softmax is applied)."""
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        return self.fc3(x)

    def _sum_of_param_norms(self, order):
        """Sum the vector ``order``-norm of every parameter tensor.

        The norms are computed from the parameters themselves, so the result
        lives on the parameters' device and is already part of the autograd
        graph; no separately created accumulator tensor is needed.
        """
        norms = [torch.linalg.vector_norm(param, ord=order) for param in self.parameters()]
        return torch.stack(norms).sum()

    def l1_loss(self):
        """Return the L1 penalty: the sum of absolute values of all parameters.

        Every tensor yielded by ``self.parameters()`` is included, i.e. biases
        as well as weights.
        """
        return self._sum_of_param_norms(1)

    def l2_loss(self):
        """Return the L2 penalty: the sum of per-tensor Euclidean norms.

        NOTE: each parameter tensor contributes its *unsquared* 2-norm, which
        differs from the conventional squared-L2 (weight-decay) penalty.
        Biases are included as well as weights.
        """
        return self._sum_of_param_norms(2)


In [56]:
# Network dimensions.
input_size = X_train.shape[1]  # one input unit per feature column
hidden_size = 32
output_size = 3  # number of logits produced by the network

# Penalty added to the training loss: None (no penalty), 'l1' or 'l2'.
reg = None

# Optimizer settings. `learning_rate` is now the value actually handed to
# Adam; previously it was set to 0.00001 but ignored, while the optimizer
# hard-coded lr=0.001. The effective rate (0.001) is kept.
learning_rate = 0.001
num_epochs = 100

model = SimpleMLP(input_size, hidden_size, output_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
def calculate_accuracy(outputs, targets):
    """Return the fraction of rows whose highest-scoring class equals the target.

    Args:
        outputs: Score tensor with one row per sample and one column per class.
        targets: Tensor of class indices, one per sample.

    Returns:
        Number of correct predictions divided by ``targets.size(0)``, as a
        Python float.
    """
    predicted_classes = outputs.argmax(dim=1)
    n_correct = torch.eq(predicted_classes, targets).sum().item()
    return n_correct / targets.size(0)

# Per-epoch history; rebuilt from scratch each time this cell runs.
train_accuracies = []
valid_accuracies = []
train_losses = []
valid_losses = []

# The model was moved to `device`, so its inputs and targets must live there
# too; otherwise the forward pass fails with a device mismatch whenever a
# non-CPU device is selected. On CPU these calls are no-ops.
X_train_dev, y_train_dev = X_train.to(device), y_train.to(device)
X_valid_dev, y_valid_dev = X_valid.to(device), y_valid.to(device)

for epoch in range(num_epochs):
    # --- Training step: one full-batch update over the whole training set ---
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_dev)
    loss = criterion(outputs, y_train_dev)

    # Optional penalty, scaled by lambda_. It becomes part of `loss`, so the
    # recorded train loss includes the penalty while the validation loss
    # below does not.
    if reg == 'l1':
        loss += lambda_ * model.l1_loss()
    elif reg == 'l2':
        loss += lambda_ * model.l2_loss()

    loss.backward()
    optimizer.step()

    # Train accuracy uses the outputs computed before this epoch's update.
    train_accuracy = calculate_accuracy(outputs, y_train_dev)
    train_accuracies.append(train_accuracy)
    train_losses.append(loss.item())

    # --- Validation: no gradients, model in eval mode ---
    model.eval()
    with torch.no_grad():
        valid_outputs = model(X_valid_dev)
        valid_loss = criterion(valid_outputs, y_valid_dev)
        valid_accuracy = calculate_accuracy(valid_outputs, y_valid_dev)
        valid_accuracies.append(valid_accuracy)
        valid_losses.append(valid_loss.item())

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}, Train Acc: {train_accuracy:.4f}, Valid Loss: {valid_loss.item():.4f}, Valid Acc: {valid_accuracy:.4f}')

# Draw the training history as two side-by-side panels sharing the same
# epoch axis (1-based epoch numbers).
epochs = range(1, num_epochs + 1)
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: accuracy per epoch.
acc_ax.plot(epochs, train_accuracies, label='Train Accuracy')
acc_ax.plot(epochs, valid_accuracies, label='Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Train and Validation Accuracy')
acc_ax.legend()

# Right panel: loss per epoch.
loss_ax.plot(epochs, train_losses, label='Train Loss')
loss_ax.plot(epochs, valid_losses, label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Train and Validation Loss')
loss_ax.legend()

fig.tight_layout()
plt.show()

# Results for varying the number of hidden units and layers

| Hidden Units | # Hidden Layers | Train Accuracy | Validation Accuracy | Train Loss | Validation Loss |
|--------------|----------------|----------------|---------------------|------------|-----------------|
| 32           | 1              | 0.5441         | 0.5274              | 0.9547     | 0.9837          |
| 32           | 2              | 0.5495         | 0.5323              | 0.9483     | 0.9861          |
| 32           | 3              | 0.5448         | 0.5225              | 0.9546     | 0.9872          |
| 64           | 1              | 0.5546         | 0.5204              | 0.9396     | 0.9911          |
| 64           | 2              | 0.5734         | 0.5225              | 0.9133     | 0.9966          |
| 64           | 3              | 0.5797         | 0.5211              | 0.9051     | 1.0065          |
| 128          | 1              | 0.5794         | 0.5190              | 0.9042     | 0.9936          |
| 128          | 2              | 0.6303         | 0.5211              | 0.8335     | 1.0341          |
| 128          | 3              | 0.7127         | 0.4831              | 0.6893     | 1.1963          |

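Based on the initial results, the model with 64 hidden units and 2 hidden layers was selected as the best, with a train accuracy of 0.5734 and a validation accuracy of 0.5225. Note, however, that the model with 32 hidden units and 2 hidden layers has the highest validation accuracy in the table (0.5323) and a lower validation loss (0.9861 vs. 0.9966).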

Let's see if we can further improve the model with other regularization techniques.

# Dropout results with different inclusion probabilities

| Inclusion Probability | Train Accuracy | Validation Accuracy | Train Loss | Validation Loss |
|-----------------------|----------------|---------------------|------------|-----------------|
| 0.6                   | 0.5355         | 0.5218              | 0.9820     | 0.9822          |
| 0.7                   | 0.5355         | 0.5218              | 0.9820     | 0.9822          |
| 0.85                  | 0.5407         | 0.5197              | 0.9691     | 0.9860          |
| 0.97                  | 0.5450         | 0.5239              | 0.9567     | 0.9865          |

It seems that a very high inclusion probability has a marginal effect on the validation accuracy, though probably not enough to be statistically significant.

# L1 regularization results

| Lambda | Train Accuracy | Validation Accuracy | Train Loss | Validation Loss |
|-----------------------|----------------|---------------------|------------|-----------------|
| 5e-4                  | 0.5330         | 0.5147              | 1.0255     | 0.9831          |
| 1e-3                  | 0.5341         | 0.5183              | 1.0471     | 0.9839          |
| 5e-3                  | 0.4542         | 0.4677              | 1.1763     | 1.0052          |
| 5e-2                  | 0.4542         | 0.4677              | 2.1595     | 1.0731          |

It seems that L1 regularization is not a good fit for this model: it actually had a negative effect on both the validation accuracy and the validation loss.

# L2 regularization results

| Lambda | Train Accuracy | Validation Accuracy | Train Loss | Validation Loss |
|-----------------------|----------------|---------------------|------------|-----------------|
| 5e-4                  | 0.5260         | 0.5126              | 1.1570     | 0.9888          |
| 1e-3                  | 0.5423         | 0.5211              | 0.9658     | 0.9889          |
| 5e-3                  | 0.5420         | 0.5239              | 0.9998     | 0.9840          |
| 5e-2                  | 0.5235         | 0.5140              | 1.1638     | 0.9871          |

It seems that L2 regularization with a lambda of 5e-3 is the best of the lot for this model; however, it still performs slightly worse than the model without regularization. Higher lambdas do, however, help to reduce the model's overfitting.

---

It seems that there are two possible issues here:
1. The model is too simple and is not able to capture the complexity of the data (although increasing the model's complexity decreases the validation accuracy and increases overfitting).
2. The data itself is too noisy or contains too much irrelevant information.

### Next steps:
- Experiment with different feature selections, particularly with fewer features.
- Experiment with different model architectures and layers, as well as different combinations of dropout and regularization.
- Research into different techniques that could help boost model performance