In [63]:
import time

import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score

# Data Preprocessing

In [7]:
# Using small dataset for now
TRAIN_END_DATE = 80
TEST_END_DATE = 100
df = pd.read_csv('../archive/optiver2023/train.csv')
df = df[df['date_id'] <= TEST_END_DATE]
df.info()

In [8]:
# Checking missing values
# Far price and near price have a lot of missing values, because this data is only available for last 5 minutes.
# Setting them up to 0 may potentially confuse the model
# For now, we will drop these rows
df.isna().sum()

stock_id                        0
date_id                         0
seconds_in_bucket               0
imbalance_size                 55
imbalance_buy_sell_flag         0
reference_price                55
matched_size                   55
far_price                  596247
near_price                 588085
bid_price                      55
bid_size                        0
ask_price                      55
ask_size                        0
wap                            55
target                         31
time_id                         0
row_id                          0
dtype: int64

In [21]:
# Using only last 5 minutes data so that we have less missing values
df = df[df['seconds_in_bucket'] >= 300]
df = df.fillna(0)

## Split into train and test data

In [36]:
train_df = df[df['date_id'] <= TRAIN_END_DATE]
test_df = df[(df['date_id'] > TRAIN_END_DATE) & (df['date_id'] <= TEST_END_DATE)]
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)
del df

In [37]:
num_stocks = train_df['stock_id'].nunique()
num_dates = train_df['date_id'].nunique()
num_updates = train_df["seconds_in_bucket"].nunique()
num_stocks, num_dates, num_updates

(196, 81, 25)

In [38]:
num_stocks = test_df['stock_id'].nunique()
num_dates = test_df['date_id'].nunique()
num_updates = test_df["seconds_in_bucket"].nunique()
num_stocks, num_dates, num_updates

(197, 20, 25)

In [41]:
# Check if all stocks are present in both train and test data
test_stock_ids = set(test_df['stock_id'])
train_stock_ids = set(train_df['stock_id'])

test_not_in_train = test_stock_ids - train_stock_ids
train_not_in_test = train_stock_ids - test_stock_ids
test_not_in_train, train_not_in_test

({199}, set())

In [45]:
# Deleting stock 199 from test data as it is not present in train data
test_df = test_df[test_df['stock_id'] != 199]
test_stock_ids = set(test_df['stock_id'])
train_stock_ids = set(train_df['stock_id'])

test_not_in_train = test_stock_ids - train_stock_ids
train_not_in_test = train_stock_ids - test_stock_ids
test_df.reset_index(drop=True, inplace=True)
test_not_in_train, train_not_in_test

(set(), set())

In [52]:
# Ensure that all datetimes are present in both train and test data
train_df.groupby(['stock_id', 'date_id']).size().reset_index(name='count')['count'].value_counts(), test_df.groupby(['stock_id', 'date_id']).size().reset_index(name='count')['count'].value_counts()

(count
 25    15669
 Name: count, dtype: int64,
 count
 25    3920
 Name: count, dtype: int64)

## Set up for predicting wap_{t+60}

In [54]:
# Adding our target wap_{t+60} to train and test data
train_df['wap_t+60'] = train_df['wap'].shift(-1)
train_df.tail()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id,wap_t+60
391720,194,80,540,0.0,0,1.002321,5018051.11,1.002321,1.002321,1.001935,27870.3,1.002321,5192.0,1.00226,-26.410221,4454,80_540_194,0.999839
391721,195,80,540,0.0,0,0.999826,18968276.9,0.999826,0.999826,0.999826,25542.9,0.999944,201838.14,0.999839,-0.920296,4454,80_540_195,1.000078
391722,196,80,540,538629.69,1,1.000168,7156600.09,1.00036,1.00036,0.999975,69354.66,1.000168,60112.0,1.000078,0.840426,4454,80_540_196,0.99957
391723,197,80,540,0.0,0,0.999387,16035564.3,0.999387,0.999387,0.999307,94057.23,0.999707,49359.2,0.99957,-5.400181,4454,80_540_197,0.999493
391724,198,80,540,4229576.94,-1,0.999391,87598784.28,0.998815,0.999007,0.999391,733607.88,0.999583,641256.0,0.999493,-2.999902,4454,80_540_198,


In [55]:
# The last row has NaN value for wap_{t+60}, so we will drop it
train_df.drop(train_df.index[-1], inplace=True)

In [56]:
train_df.tail()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id,wap_t+60
391719,193,80,540,0.0,0,1.007618,234759900.0,1.007618,1.007618,1.007618,76932.9,1.007672,34424.13,1.007655,-5.000234,4454,80_540_193,1.00226
391720,194,80,540,0.0,0,1.002321,5018051.0,1.002321,1.002321,1.001935,27870.3,1.002321,5192.0,1.00226,-26.410221,4454,80_540_194,0.999839
391721,195,80,540,0.0,0,0.999826,18968280.0,0.999826,0.999826,0.999826,25542.9,0.999944,201838.14,0.999839,-0.920296,4454,80_540_195,1.000078
391722,196,80,540,538629.69,1,1.000168,7156600.0,1.00036,1.00036,0.999975,69354.66,1.000168,60112.0,1.000078,0.840426,4454,80_540_196,0.99957
391723,197,80,540,0.0,0,0.999387,16035560.0,0.999387,0.999387,0.999307,94057.23,0.999707,49359.2,0.99957,-5.400181,4454,80_540_197,0.999493


In [57]:
# Same for test data
test_df['wap_t+60'] = test_df['wap'].shift(-1)
test_df.drop(test_df.index[-1], inplace=True)
test_df.tail()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id,wap_t+60
97994,193,100,540,3421403.64,1,0.997613,103286300.0,0.998236,0.998236,0.997509,358087.23,0.997613,38446.0,0.997603,-4.190207,5554,100_540_193,0.997584
97995,194,100,540,0.0,0,0.997705,4930833.0,0.997705,0.997705,0.997511,6480.18,0.997705,10699.52,0.997584,2.150536,5554,100_540_194,0.998067
97996,195,100,540,0.0,0,0.998127,16799300.0,0.998127,0.998127,0.998012,49962.43,0.998127,54038.4,0.998067,-1.080036,5554,100_540_195,1.000706
97997,196,100,540,666840.43,-1,1.000626,9102064.0,0.999364,1.000085,1.000626,78670.64,1.000806,98550.24,1.000706,-1.090169,5554,100_540_196,0.999526
97998,197,100,540,8707.0,1,0.999713,11121040.0,0.999799,0.999713,0.999454,85514.11,0.999713,221442.48,0.999526,-0.619888,5554,100_540_197,0.997798


# Predicting wap_{t+60}

In [None]:
# Getting features and target columns
featured_columns = ['imbalance_size', 'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
                    'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
                    'ask_size', 'wap']
target_column = 'wap_t+60'
X_train = train_df[featured_columns]
y_train = train_df[target_column]
X_test = test_df[featured_columns]
y_test = test_df[target_column]

## Baseline Model

In [67]:
print("Baseline Model results")
train_baseline_mae = (0 - y_train).abs().mean()
test_baseline_mae = (0 - y_test).abs().mean()
train_baseline_mse = ((0 - y_train) ** 2).mean()
test_baseline_mse = ((0 - y_test) ** 2).mean()
train_baseline_rmse = train_baseline_mse ** 0.5
test_baseline_rmse = test_baseline_mse ** 0.5

print("Train MAE:", train_baseline_mae)
print("Test MAE:", test_baseline_mae)
print("Train MSE:", train_baseline_mse)
print("Test MSE:", test_baseline_mse)

Baseline Model results
Train MAE: 0.9999099396309644
Test MAE: 1.0003763824732905
Train MSE: 0.9998891160575365
Test MSE: 1.000767511269634


## Random Forest Regressor

In [60]:
start_time = time.time()
rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train, y_train)
end_time = time.time()
print(f'Time taken to train Random Forest Regressor: {end_time-start_time} seconds')

In [65]:
y_pred = rf_regressor.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

Mean Absolute Error: 0.002739488701659801
Mean Squared Error: 1.5724839618146507e-05
Root Mean Squared Error: 0.003965455789458068
R-squared: -0.0767002029695687
