# Optiver Trading At The Close

[Competition](https://www.kaggle.com/competitions/optiver-trading-at-the-close)

[GitHub Repository](https://github.com/cvaisnor/DNN-Final-Project)

[Google Slides Presentation](https://docs.google.com/presentation/d/1Xc5F1_NveFi1il3GqHej2aqVmklR_jZU4kx6ZJhiDEM/edit?usp=sharing)

# General Imports

In [1]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

kaggle_environment = False # True if running on Kaggle, don't forget to add the dataset!

if kaggle_environment:
    data_path = '/kaggle/input/'
else:
    data_path = 'kaggle/input/'



# Setting File Structure

In [2]:
for dirname, _, filenames in os.walk(data_path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

kaggle/input/optiver-trading-at-the-close/.gitkeep
kaggle/input/optiver-trading-at-the-close/public_timeseries_testing_util.py
kaggle/input/optiver-trading-at-the-close/train.csv
kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv
kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv
kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv
kaggle/input/optiver-trading-at-the-close/example_test_files/.ipynb_checkpoints/test-checkpoint.csv
kaggle/input/optiver-trading-at-the-close/example_test_files/.ipynb_checkpoints/sample_submission-checkpoint.csv
kaggle/input/optiver-trading-at-the-close/optiver2023/__init__.py
kaggle/input/optiver-trading-at-the-close/optiver2023/competition.cpython-310-x86_64-linux-gnu.so


# Loading Data

In [3]:
original_train = pd.read_csv(data_path + 'optiver-trading-at-the-close/train.csv')
revealed_targets = pd.read_csv(data_path + 'optiver-trading-at-the-close/example_test_files/revealed_targets.csv')
test = pd.read_csv(data_path + 'optiver-trading-at-the-close/example_test_files/test.csv')
sample_submission = pd.read_csv(data_path + 'optiver-trading-at-the-close/example_test_files/sample_submission.csv')

# Feature Engineering Section

In [4]:
original_train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5237980 entries, 0 to 5237979
Data columns (total 17 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   stock_id                 int64  
 1   date_id                  int64  
 2   seconds_in_bucket        int64  
 3   imbalance_size           float64
 4   imbalance_buy_sell_flag  int64  
 5   reference_price          float64
 6   matched_size             float64
 7   far_price                float64
 8   near_price               float64
 9   bid_price                float64
 10  bid_size                 float64
 11  ask_price                float64
 12  ask_size                 float64
 13  wap                      float64
 14  target                   float64
 15  time_id                  int64  
 16  row_id                   object 
dtypes: float64(11), int64(5), object(1)
memory usage: 679.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 16 c

In [5]:
split_ratio = 0.8  # 80% for training, 20% for testing
split_idx = int(len(original_train) * split_ratio)

train = original_train.iloc[:split_idx]
test = original_train.iloc[split_idx:]

In [6]:
%%time
train['wap_lag1'] = train.groupby('stock_id')['wap'].shift(1)
test['wap_lag1'] = test.groupby('stock_id')['wap'].shift(1)

train['wap_lag5'] = train.groupby('stock_id')['wap'].shift(5)
test['wap_lag5'] = test.groupby('stock_id')['wap'].shift(5)

train['wap_rolling_mean10'] = train.groupby('stock_id')['wap'].rolling(window=10).mean().reset_index(level=0, drop=True)
test['wap_rolling_mean10'] = test.groupby('stock_id')['wap'].rolling(window=10).mean().reset_index(level=0, drop=True)

train['wap_diff'] = train.groupby('stock_id')['wap'].diff()
test['wap_diff'] = test.groupby('stock_id')['wap'].diff()

train['wap_expanding_mean'] = train.groupby('stock_id')['wap'].expanding().mean().reset_index(level=0, drop=True)
test['wap_expanding_mean'] = test.groupby('stock_id')['wap'].expanding().mean().reset_index(level=0, drop=True)

# train = train.fillna(0)
# test = train.fillna(0)

# train = train.fillna(0)
# test = test.fillna(0)

train.fillna(method='ffill', inplace=True)
test.fillna(method='ffill', inplace=True)
# train.fillna(train.mean(), inplace=True)

# drop row_id column
train = train.drop(columns=['row_id'])
test = test.drop(columns=['row_id'])

y = train['target'].values
y_test = test['target'].values

X = train.drop(columns='target')
X_test = test.drop(columns='target')

# drop_columns = ['date_id', 'time_id', 'seconds_in_bucket']
drop_columns = ['time_id']
X = X.drop(columns=drop_columns)
X_test = X_test.drop(columns=drop_columns)

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# float_columns = [col for col in X.columns if X[col].dtype == 'float64']
# degree2_columns = float_columns + ['stock_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag']
# poly_transformer = ColumnTransformer(
#     transformers=[
#         ('poly2', PolynomialFeatures(degree=(2, 2)), degree2_columns),
#     ],
#     remainder='passthrough'
# )

# pipeline = Pipeline([
#     ('poly_features', poly_transformer)
# ])

# X = pipeline.fit_transform(X)
# X_test = pipeline.transform(X_test)
# scale all X columns that are floats
# float_columns = [col for col in X.columns if X[col].dtype == 'float64']
# scaler = StandardScaler()
# X[float_columns] = scaler.fit_transform(X[float_columns])

# X['wap_lag1'] = X.groupby('stock_id')['wap'].shift(1)
# X_test['wap_lag1'] = X_test.groupby('stock_id')['wap'].shift(1)

X = X.fillna(0)
X_test = X_test.fillna(0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

CPU times: user 4.4 s, sys: 1.56 s, total: 5.96 s
Wall time: 5.27 s


In [7]:
X['wap_lag1']

0          0.000000
1          0.000000
2          0.000000
3          0.000000
4          0.000000
             ...   
4190379    0.998839
4190380    1.000783
4190381    1.002925
4190382    1.002209
4190383    0.999564
Name: wap_lag1, Length: 4190384, dtype: float64

In [8]:
# drop_columns = ['date_id', 'time_id']
# # drop_columns = []
# X = X.drop(columns=drop_columns)
# X_test = X_test.drop(columns=drop_columns)
# # X = pd.get_dummies(X, columns=['stock_id'], prefix='stock')

All feature transformations should be combined into a generate_feature() function. This can then be applied to the test data will in the for loop for the Optiver API.

In [9]:
def generate_features(cumulative_test_df, current_test):
    cumulative_test_df['wap_lag1'] = cumulative_test_df.groupby('stock_id')['wap'].shift(1)
    cumulative_test_df['wap_lag5'] = cumulative_test_df.groupby('stock_id')['wap'].shift(5)
    cumulative_test_df['wap_rolling_mean10'] = cumulative_test_df.groupby('stock_id')['wap'].rolling(window=10).mean().reset_index(level=0, drop=True)
    cumulative_test_df['wap_diff'] = cumulative_test_df.groupby('stock_id')['wap'].diff()
    cumulative_test_df['wap_expanding_mean'] = cumulative_test_df.groupby('stock_id')['wap'].expanding().mean().reset_index(level=0, drop=True)
    cumulative_test_df.fillna(method='ffill', inplace=True)
    cumulative_test_df = cumulative_test_df.drop(columns=['row_id'])
    cumulative_test_df = cumulative_test_df.fillna(0)
    
    # Only return rows corresponding to the current test dataframe
    return cumulative_test_df, cumulative_test_df.iloc[-len(current_test):]


# Slicing

Generally, don't change the cell below. It is used to slice the data into training and validation sets.

In [10]:
# imports
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X
y_train = y 

In [11]:
X_train.mean()

stock_id                   9.923560e+01
date_id                    1.936673e+02
seconds_in_bucket          2.698698e+02
imbalance_size             5.619934e+06
imbalance_buy_sell_flag    8.151998e-04
reference_price            1.000002e+00
matched_size               4.460104e+07
far_price                  9.997329e-01
near_price                 9.985859e-01
bid_price                  9.997305e-01
bid_size                   5.033497e+04
ask_price                  1.000271e+00
ask_size                   5.189606e+04
wap                        9.999981e-01
wap_lag1                   9.999524e-01
wap_lag5                   9.997699e-01
wap_rolling_mean10         9.995875e-01
wap_diff                   4.745914e-08
wap_expanding_mean         1.000009e+00
dtype: float64

In [12]:
# check types of outputs
print('X_train type: ', type(X_train))
print('X_test type: ', type(X_test))
print()
print('y_train type: ', type(y_train))
print('y_test type: ', type(y_test))

X_train type:  <class 'pandas.core.frame.DataFrame'>
X_test type:  <class 'pandas.core.frame.DataFrame'>

y_train type:  <class 'numpy.ndarray'>
y_test type:  <class 'numpy.ndarray'>


In [13]:
# # convert to numpy arrays
# X_train = X_train.values
# X_test = X_test.values

# Model Development

In [14]:
%%time
import lightgbm as lgb
from sklearn.pipeline import Pipeline


# lgbm = lgb.LGBMRegressor(n_jobs=-1, random_state=0, force_col_wise=True,
#                          verbose=-1, boosting_type='gbdt', num_leaves=10,
#                          reg_alpha=0, reg_lambda=0.2, objective='regression_l1')

lgbm = lgb.LGBMRegressor(n_jobs=-1, random_state=0, objective='regression_l1')

# m = Pipeline([
#     ("scaler", StandardScaler()),
#     ("model", lgbm)
# ])

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)
lgbm.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031746 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4326
[LightGBM] [Info] Number of data points in the train set: 4190384, number of used features: 19
[LightGBM] [Info] Start training from score -0.069737
CPU times: user 1min 30s, sys: 5.92 s, total: 1min 36s
Wall time: 11.1 s


In [15]:
# lgbm.score(X_test, y_test)

from sklearn.metrics import mean_absolute_error

predictions = lgbm.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
print(f"Mean Absolute Error on the test set: {mae:.4f}")
predictions2 = lgbm.predict(X_train)
mae2 = mean_absolute_error(y_train, predictions2)
print(f"Mean Absolute Error on the training set: {mae2:.4f}")


Mean Absolute Error on the test set: 5.9880
Mean Absolute Error on the training set: 6.3663


In [16]:
y_test

array([ 10.470152 , -16.819836 ,  -2.580285 , ...,   1.1694431,
        -1.540184 ,  -6.530285 ])

In [17]:
# imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

In [18]:
print('PyTorch Version: ', torch.__version__)
print('CUDA Available: ', torch.cuda.is_available())
if torch.cuda.is_available():
    print('Device Type: ', torch.cuda.get_device_name(0))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

PyTorch Version:  2.1.0
CUDA Available:  False


In [19]:
# convert to numpy arrays
# X_train = X_train.values
# X_test = X_test.values

In [20]:
class PyTorchMLP(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(PyTorchMLP, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size


        self.fc1 = nn.Linear(self.input_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, self.hidden_size)
        self.fc3 = nn.Linear(self.hidden_size, hidden_size)  # Output is a single floating point number
        self.fc4 = nn.Linear(self.hidden_size, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x

    def fit(self, X_train, y_train, X_test, y_test, epochs, batch_size, learning_rate, device, verbose=True):
        self.to(device)
        self.train()
        optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        criterion = nn.L1Loss()
        X_train_t = torch.tensor(X_train, dtype=torch.float32, device=device)
        y_train_t = torch.tensor(y_train, dtype=torch.float32, device=device)
        X_test_t = torch.tensor(X_test, dtype=torch.float32, device=device)
        y_test_t = torch.tensor(y_test, dtype=torch.float32, device=device)
        for epoch in tqdm(range(epochs), desc='Epochs'):
            for i in range(0, len(X_train), batch_size):
                batch_X = X_train_t[i:i+batch_size]
                batch_y = y_train_t[i:i+batch_size]
                optimizer.zero_grad()
                outputs = self(batch_X)
                loss = criterion(outputs, batch_y.view_as(outputs))
                loss.backward()
                optimizer.step()
            self.eval()
            with torch.no_grad():
                y_pred = self(X_test_t)
                test_loss = criterion(y_pred, y_test_t.view_as(y_pred))
                if verbose:
                    print(f'Epoch: {epoch+1}/{epochs}, Loss: {loss.item()}, Test Loss: {test_loss.item()}')
            self.train()

    def predict(self, X_test, device):
        self.to(device)
        self.eval()
        with torch.no_grad():
            X_test_t = torch.tensor(X_test, dtype=torch.float32, device=device)
            y_pred = self(X_test_t)
        return y_pred.cpu().numpy()

In [21]:
# creating the model and fitting it on the data from the original sklearn train test split
# model = PyTorchMLP(input_size=X_train.shape[1], hidden_size=128)
# model.fit(X_train, y_train, X_test, y_test, epochs=2, batch_size=256, learning_rate=0.001, device=device)
model = lgbm

# Generating Predictions

Only for Kaggle:
When uploading notebook, make sure to add the Optiver data on the right side of the screen.

In [22]:
if kaggle_environment:
    import optiver2023
    env = optiver2023.make_env()
    iter_test = env.iter_test()

In [23]:
if kaggle_environment:
    # To count how many time the "for loop" runs.
    counter = 0

    # init 3 empty lists
    test_ls, revealed_targets_ls, sample_prediction_ls = [], [], []
    cumulative_test_df = pd.DataFrame()

    for (test_in, revealed_targets, sample_prediction) in iter_test:
        # Append the dataframe that API return into the list.
        test_ls.append(test_in.copy())
        revealed_targets_ls.append(revealed_targets.copy())
        sample_prediction_ls.append(sample_prediction.copy())

        cumulative_test_df = pd.concat([cumulative_test_df, test_in], axis=0, ignore_index=True)

        # Generate features
        (cumulative_test_df, test_in) = generate_features(cumulative_test_df, test_in)

        # Writes our predictions 
        sample_prediction["target"] = model.predict(test_in)
        
        # This line submit our predictions.
        env.predict(sample_prediction)
        counter += 1

    print('\n', '=' * 50, sep="")
    print(f"counter: {counter}")