# Optiver Trading At The Close

[Competition](https://www.kaggle.com/competitions/optiver-trading-at-the-close)

[GitHub Repository](https://github.com/cvaisnor/DNN-Final-Project)

[Google Slides Presentation](https://docs.google.com/presentation/d/1Xc5F1_NveFi1il3GqHej2aqVmklR_jZU4kx6ZJhiDEM/edit?usp=sharing)

# General Imports

In [8]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

kaggle_environment = False # True if running on Kaggle, don't forget to add the dataset!

if kaggle_environment:
    data_path = '/kaggle/input/'
else:
    data_path = 'kaggle/input/'

# Setting File Structure

In [9]:
for dirname, _, filenames in os.walk(data_path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

kaggle/input/optiver-trading-at-the-close/.gitkeep
kaggle/input/optiver-trading-at-the-close/public_timeseries_testing_util.py
kaggle/input/optiver-trading-at-the-close/train.csv
kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv
kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv
kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv
kaggle/input/optiver-trading-at-the-close/optiver2023/__init__.py
kaggle/input/optiver-trading-at-the-close/optiver2023/competition.cpython-310-x86_64-linux-gnu.so


# Loading Data

In [10]:
train = pd.read_csv(data_path + 'optiver-trading-at-the-close/train.csv')
revealed_targets = pd.read_csv(data_path + 'optiver-trading-at-the-close/example_test_files/revealed_targets.csv')
test = pd.read_csv(data_path + 'optiver-trading-at-the-close/example_test_files/test.csv')
sample_submission = pd.read_csv(data_path + 'optiver-trading-at-the-close/example_test_files/sample_submission.csv')

# Feature Engineering Section

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5237980 entries, 0 to 5237979
Data columns (total 17 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   stock_id                 int64  
 1   date_id                  int64  
 2   seconds_in_bucket        int64  
 3   imbalance_size           float64
 4   imbalance_buy_sell_flag  int64  
 5   reference_price          float64
 6   matched_size             float64
 7   far_price                float64
 8   near_price               float64
 9   bid_price                float64
 10  bid_size                 float64
 11  ask_price                float64
 12  ask_size                 float64
 13  wap                      float64
 14  target                   float64
 15  time_id                  int64  
 16  row_id                   object 
dtypes: float64(11), int64(5), object(1)
memory usage: 679.4+ MB


In [12]:
# fill null values with zero for now
train = train.fillna(0)

# drop row_id column
train = train.drop(columns=['row_id'])

y = train['target'].values
X = train.drop(columns='target')

from sklearn.preprocessing import StandardScaler

# scale all X columns that are floats
float_columns = [col for col in X.columns if X[col].dtype == 'float64']
scaler = StandardScaler()
X[float_columns] = scaler.fit_transform(X[float_columns])

All feature transformations should be combined into a generate_feature() function. This can then be applied to the test data will in the for loop for the Optiver API.

In [13]:
def generate_features(test):
    test.drop(columns=['row_id'], inplace=True)
    test = scaler.transform(test)
    return test

# Slicing

Generally, don't change the cell below. It is used to slice the data into training and validation sets.

In [14]:
# imports
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# check types of outputs
print('X_train type: ', type(X_train))
print('X_test type: ', type(X_test))
print()
print('y_train type: ', type(y_train))
print('y_test type: ', type(y_test))

X_train type:  <class 'pandas.core.frame.DataFrame'>
X_test type:  <class 'pandas.core.frame.DataFrame'>

y_train type:  <class 'numpy.ndarray'>
y_test type:  <class 'numpy.ndarray'>


In [16]:
# convert to numpy arrays
X_train = X_train.values
X_test = X_test.values

# Model Development

In [17]:
# imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

In [20]:
print('PyTorch Version: ', torch.__version__)
print('CUDA Available: ', torch.cuda.is_available())
if torch.cuda.is_available():
    print('Device Type: ', torch.cuda.get_device_name(0))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

PyTorch Version:  2.1.0
CUDA Available:  False


In [21]:
class PyTorchMLP(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(PyTorchMLP, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size


        self.fc1 = nn.Linear(self.input_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, self.hidden_size)
        self.fc3 = nn.Linear(self.hidden_size, hidden_size)  # Output is a single floating point number
        self.fc4 = nn.Linear(self.hidden_size, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x

    def fit(self, X_train, y_train, X_test, y_test, epochs, batch_size, learning_rate, device, verbose=True):
        self.to(device)
        self.train()
        optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        criterion = nn.MSELoss()
        X_train_t = torch.tensor(X_train, dtype=torch.float32, device=device)
        y_train_t = torch.tensor(y_train, dtype=torch.float32, device=device)
        X_test_t = torch.tensor(X_test, dtype=torch.float32, device=device)
        y_test_t = torch.tensor(y_test, dtype=torch.float32, device=device)
        for epoch in tqdm(range(epochs), desc='Epochs'):
            for i in range(0, len(X_train), batch_size):
                batch_X = X_train_t[i:i+batch_size]
                batch_y = y_train_t[i:i+batch_size]
                optimizer.zero_grad()
                outputs = self(batch_X)
                loss = criterion(outputs, batch_y.view_as(outputs))
                loss.backward()
                optimizer.step()
            self.eval()
            with torch.no_grad():
                y_pred = self(X_test_t)
                test_loss = criterion(y_pred, y_test_t.view_as(y_pred))
                if verbose:
                    print(f'Epoch: {epoch+1}/{epochs}, Loss: {loss.item()}, Test Loss: {test_loss.item()}')
            self.train()

    def predict(self, X_test, device):
        self.to(device)
        self.eval()
        with torch.no_grad():
            X_test_t = torch.tensor(X_test, dtype=torch.float32, device=device)
            y_pred = self(X_test_t)
        return y_pred.cpu().numpy()

In [22]:
# creating the model and fitting it on the data from the original sklearn train test split
model = PyTorchMLP(input_size=X_train.shape[1], hidden_size=128)

In [None]:
model.fit(X_train, y_train, X_test, y_test, epochs=10, batch_size=256, learning_rate=0.001, device=device)

Epochs:  10%|█████▌                                                  | 1/10 [00:30<04:35, 30.60s/it]

Epoch: 1/10, Loss: 65.2214126586914, Test Loss: 89.49969482421875


Epochs:  20%|███████████▏                                            | 2/10 [01:00<04:01, 30.16s/it]

Epoch: 2/10, Loss: 65.23687744140625, Test Loss: 89.50027465820312


Epochs:  30%|████████████████▊                                       | 3/10 [02:13<05:47, 49.62s/it]

Epoch: 3/10, Loss: 65.2374267578125, Test Loss: 89.50032806396484


Epochs:  40%|██████████████████████▍                                 | 4/10 [03:12<05:21, 53.59s/it]

Epoch: 4/10, Loss: 65.2366714477539, Test Loss: 89.50027465820312


# Generating Predictions

Only for Kaggle:
When uploading notebook, make sure to add the Optiver data on the right side of the screen.

In [None]:
if kaggle_environment:
    import optiver2023
    env = optiver2023.make_env()
    iter_test = env.iter_test()

In [None]:
if kaggle_environment:
    # To count how many time the "for loop" runs.
    counter = 0

    # init 3 empty lists
    test_ls, revealed_targets_ls, sample_prediction_ls = [], [], []

    for (test, revealed_targets, sample_prediction) in iter_test:
        # Append the dataframe that API return into the list.
        test_ls.append(test.copy())
        revealed_targets_ls.append(revealed_targets.copy())
        sample_prediction_ls.append(sample_prediction.copy())

        # Generate features
        test = generate_features(test)

        # Writes our predictions 
        sample_prediction["target"] = model.predict(test)
        
        # This line submit our predictions.
        env.predict(sample_prediction)
        counter += 1

    print('\n', '=' * 50, sep="")
    print(f"counter: {counter}")