# STA4365 - Convolutional Neural Network Model

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [None]:
# Data Loading + Preprocessing
# NaNs are replaced with a large negative sentinel; this came at the advice of
# other competition participants and appears to have worked well. The feature
# columns used contain very few NaNs (~200 out of 5 million rows), so the
# impact is negligible nonetheless.
# NOTE(review): fillna() is applied to every column of the frame, so any NaN in
# "target" would also become -9e10 - TODO confirm that is intended.
df = pd.read_csv('train.csv').fillna(-9e10)

In [None]:
# Classes

# Converting data to PyTorch tensors + compatibility with CNN

class TimeSeriesDataset(Dataset):
    """Sliding-window dataset pairing each window with the following target.

    Sample ``idx`` consists of the ``window_size`` consecutive rows of ``data``
    starting at row ``idx``, transposed to ``(n_features, window_size)`` so the
    features act as Conv1d channels, together with the target found at row
    ``idx + window_size`` (the step right after the window).

    Args:
        data: 2-D array-like of features (e.g. a DataFrame), one row per step.
        targets: 1-D array-like of targets aligned row-for-row with ``data``.
        window_size: number of consecutive rows in each window.
    """

    def __init__(self, data, targets, window_size):
        # The original objects stay available under their original names.
        self.data = data
        self.targets = targets
        self.window_size = window_size
        # Convert to float32 arrays once so __getitem__ is a cheap NumPy slice
        # instead of a pandas .iloc lookup plus conversion on every sample.
        self._features = np.asarray(data, dtype=np.float32)
        self._labels = np.asarray(targets, dtype=np.float32)

    def __len__(self):
        # Clamp at zero: len() raises ValueError on a negative result, which
        # happens whenever there are fewer rows than window_size.
        return max(0, len(self._features) - self.window_size)

    def __getitem__(self, idx):
        """Return ``(window, target)`` as float32 tensors for sample ``idx``.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_samples = len(self)
        if idx < 0:
            # Support Python-style negative indexing (not in-place, so a
            # tensor index passed by the caller is never mutated).
            idx = idx + n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"index out of range for dataset of length {n_samples}")

        stop = idx + self.window_size
        x = self._features[idx:stop].T
        y = self._labels[stop]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)
    

# CNN - see report for details on model design

class CNN(nn.Module):
    """1-D CNN regressor - see report for details on model design.

    Two Conv1d -> ReLU -> MaxPool1d stages feed a two-layer fully connected
    head. Inputs are shaped ``(batch, input_channels, window_size)`` and the
    output is shaped ``(batch, 1)``.
    """

    def __init__(self, input_channels=9, window_size=30):
        super().__init__()

        # First convolutional stage.
        self.conv1 = nn.Conv1d(input_channels, 64, kernel_size=3)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool1d(2)

        # Second convolutional stage.
        self.conv2 = nn.Conv1d(64, 128, kernel_size=3)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool1d(2)

        # The width of the first dense layer depends on window_size, so it is
        # measured with a dummy pass rather than derived by hand.
        self.flattened_size = self._get_flattened_size(input_channels, window_size)

        # Regression head.
        # NOTE(review): forward() applies no non-linearity between fc1 and
        # fc2 - confirm this matches the design described in the report.
        self.fc1 = nn.Linear(self.flattened_size, 64)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(64, 1)

    def _conv_stack(self, x):
        """Run both conv/ReLU/pool stages and return the feature maps."""
        stage_one = self.pool1(self.relu1(self.conv1(x)))
        return self.pool2(self.relu2(self.conv2(stage_one)))

    def _get_flattened_size(self, input_channels, window_size):
        """Return the per-sample feature count produced by the conv stages."""
        probe = torch.zeros(1, input_channels, window_size)
        # The probe batch holds one sample, so its element count is the
        # flattened size of a single sample.
        return self._conv_stack(probe).numel()

    def forward(self, x):
        """Map ``(batch, input_channels, window_size)`` to ``(batch, 1)``."""
        feature_maps = self._conv_stack(x)
        flat = feature_maps.reshape(feature_maps.size(0), -1)
        hidden = self.dropout(self.fc1(flat))
        return self.fc2(hidden)



In [None]:
# Functions

# Model training - forward / backprop

def train_model(model, train_loader, num_epochs, optimizer, criterion, device):
    """Train ``model`` in place and print the mean batch loss of every epoch.

    Args:
        model: module whose output has a trailing dimension of size 1, i.e.
            ``(batch, 1)``; that dimension is dropped before the loss.
        train_loader: iterable yielding ``(inputs, targets)`` tensor batches.
        num_epochs: number of full passes over ``train_loader``.
        optimizer: optimizer built over ``model``'s parameters.
        criterion: loss callable taking ``(outputs, targets)``.
        device: device the batches are moved to before the forward pass.

    Returns:
        list[float]: mean per-batch loss for each epoch, in order. An epoch in
        which the loader yields no batches is recorded as ``nan``.
    """
    epoch_losses = []

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0

        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            # Drop only the trailing dimension. A bare squeeze() would also
            # remove the batch dimension when a batch holds a single sample,
            # leaving a 0-d output that is broadcast against the (1,) target.
            outputs = model(inputs).squeeze(-1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # Counting batches avoids a ZeroDivisionError on an empty loader and
        # does not require the loader to implement __len__.
        avg_loss = running_loss / num_batches if num_batches else float("nan")
        epoch_losses.append(avg_loss)
        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {avg_loss:.4f}")

    return epoch_losses

# Calculating MAE

def evaluate_mae(model, dataloader, device='cpu'):
    """Return the mean absolute error of ``model`` over ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: module whose output has a trailing dimension of size 1, i.e.
            ``(batch, 1)``.
        dataloader: iterable yielding ``(inputs, targets)`` tensor batches.
        device: device the inputs are moved to; it must be the device the
            model lives on.

    Returns:
        The value computed by ``mean_absolute_error`` over every sample.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    model.eval()
    pred_batches = []
    true_batches = []

    with torch.no_grad():
        for x_batch, y_batch in dataloader:
            outputs = model(x_batch.to(device))
            # Drop only the trailing dimension: a bare squeeze() turns a batch
            # of one into a 0-d tensor, which cannot be iterated or
            # concatenated with the other batches.
            pred_batches.append(outputs.squeeze(-1).cpu())
            # Targets are only compared on the CPU, so they never need to be
            # moved to the model's device.
            true_batches.append(y_batch.cpu())

    if not pred_batches:
        raise ValueError("dataloader yielded no batches; cannot compute MAE")

    y_true = torch.cat(true_batches).numpy()
    y_pred = torch.cat(pred_batches).numpy()
    return mean_absolute_error(y_true, y_pred)

In [None]:
# Full pipeline for all 200 companies

time_steps = 30
learning_rate = 0.001
num_epochs = 6

# Feature columns fed to the CNN; each column becomes one input channel.
feature_columns = ['seconds_in_bucket', "imbalance_size", "imbalance_buy_sell_flag", "reference_price", "matched_size", "bid_price", "bid_size", "ask_price", "ask_size", "wap"]

# The device does not change between companies, so it is selected once.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

mae_results = []
for i in range(200):
    if i == 78:
        # Skipping Company 79 due to issues with dataset; could not identify root cause ahead of the report
        # TODO(review): re-test whether this skip is still needed; a final
        # batch holding a single sample is one candidate cause.
        continue

    # Data loading, preprocessing for each company
    df_comp = df[df["stock_id"] == i]
    X = df_comp[feature_columns]
    y = df_comp["target"]
    # shuffle=False maintains order in the time series dataset (i.e. training set is the first 80% of data)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    train_dataset = TimeSeriesDataset(X_train, y_train, time_steps)
    test_dataset = TimeSeriesDataset(X_test, y_test, time_steps)
    # Loaders are not shuffled either, for the same reason
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Initialize model; its input shape is derived from the settings above so
    # that changing feature_columns or time_steps cannot desynchronize it.
    model = CNN(input_channels=len(feature_columns), window_size=time_steps).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.L1Loss()

    # Train model
    train_model(model, train_loader, num_epochs, optimizer, criterion, device)

    # Get results. The device is passed explicitly: evaluate_mae defaults to
    # 'cpu', which would leave the inputs on the CPU while the model is on CUDA.
    mae = evaluate_mae(model, test_loader, device)
    mae_results.append(mae)
    print("Mean Absolute Error for CNN, Company #", i, ":", mae)

In [None]:
# Results

# Summary statistics over the per-company MAE values collected by the pipeline.
mae_array = np.asarray(mae_results)
print("Median MAE for all companies:", np.median(mae_array))
print("Best MAE:", mae_array.min())

# Boxplot (log-transformed so that the plot stays readable)
mae_resultsLog = np.log(mae_array)
plt.boxplot(mae_resultsLog)