In [1]:
#!rm -r comp642-project && echo "Project 'comp642-project' folder will be updated" || echo "Project 'comp642-project' does not exists"
#!git clone https://github.com/fdraverta/comp642-project.git

!pip install structlog

from google.colab import files
import sys
import os

# Mount Google Drive at /content/drive so files stored there can be read by path
from google.colab import drive
drive.mount('/content/drive')

# Add the project's notebook folder to the import path so the local `featurizer` module can be found
sys.path.append(os.path.join("./comp642-project/FRaverta-Notebooks"))

# Star import: later cells use names such as `TradingAtTheCloseDS` and `pd` that are not imported
# anywhere else in this notebook, so they presumably come from here — TODO confirm and import explicitly
from featurizer import *

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Load the data set from the ZIP archive stored on the mounted drive
data = TradingAtTheCloseDS("/content/drive/MyDrive/trading_the_close_data.zip")
# Train/test split as provided by the data set object
train_data, test_data = data.get_train_test_data()
# Baseline scores for both splits (logged as train_mae / test_mae); reference point for the models below
train_mae, test_mae = data.compute_baseline_model()

2024-04-23 05:15:52 [info     ] Using ZIP file with the data set from the provided path path=/content/drive/MyDrive/trading_the_close_data.zip
2024-04-23 05:15:52 [info     ] Opening ZIP file...
2024-04-23 05:15:52 [info     ] Reading CSV file...            fname=train.csv
2024-04-23 05:16:10 [info     ] Data set loaded successfully.
2024-04-23 05:16:11 [info     ] Baseline model computed.       test_mae=5.8499749700259205 train_mae=6.481303604250604


In [3]:
# Discard every row whose 'wap' value is missing, in both splits
train_data = train_data.dropna(subset=['wap'])
test_data = test_data.dropna(subset=['wap'])

# Model inputs are the categorical columns followed by the numerical ones;
# the target column(s) are named by data.y_column
feature_columns = data.categorical_features + data.numerical_features

x_train = train_data[feature_columns]
y_train = train_data[data.y_column]

x_test = test_data[feature_columns]
y_test = test_data[data.y_column]

# Random Forest on the dataset with all features, NaN feature values set to 0

In [4]:
# Summarise the training features: column dtypes and memory usage
x_train.info()

# Count the NaN values in each training column
x_train.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 4621815 entries, 0 to 4621979
Data columns (total 13 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   stock_id                 int64  
 1   imbalance_buy_sell_flag  int64  
 2   seconds_in_bucket        int64  
 3   imbalance_size           float64
 4   reference_price          float64
 5   matched_size             float64
 6   far_price                float64
 7   near_price               float64
 8   bid_price                float64
 9   bid_size                 float64
 10  ask_price                float64
 11  ask_size                 float64
 12  wap                      float64
dtypes: float64(10), int64(3)
memory usage: 493.7 MB


stock_id                         0
imbalance_buy_sell_flag          0
seconds_in_bucket                0
imbalance_size                   0
reference_price                  0
matched_size                     0
far_price                  2553470
near_price                 2520990
bid_price                        0
bid_size                         0
ask_price                        0
ask_size                         0
wap                              0
dtype: int64

In [5]:
# Summarise the test features: non-null counts, column dtypes and memory usage
x_test.info()

# Count the NaN values in each test column
x_test.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 615945 entries, 4621980 to 5237979
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   stock_id                 615945 non-null  int64  
 1   imbalance_buy_sell_flag  615945 non-null  int64  
 2   seconds_in_bucket        615945 non-null  int64  
 3   imbalance_size           615945 non-null  float64
 4   reference_price          615945 non-null  float64
 5   matched_size             615945 non-null  float64
 6   far_price                275293 non-null  float64
 7   near_price               279975 non-null  float64
 8   bid_price                615945 non-null  float64
 9   bid_size                 615945 non-null  float64
 10  ask_price                615945 non-null  float64
 11  ask_size                 615945 non-null  float64
 12  wap                      615945 non-null  float64
dtypes: float64(10), int64(3)
memory usage: 65.8 MB


stock_id                        0
imbalance_buy_sell_flag         0
seconds_in_bucket               0
imbalance_size                  0
reference_price                 0
matched_size                    0
far_price                  340652
near_price                 335970
bid_price                       0
bid_size                        0
ask_price                       0
ask_size                        0
wap                             0
dtype: int64

In [6]:
# Replace missing 'far_price' / 'near_price' values with 0.
# fillna returns a new frame, so the features are rebound instead of being
# written through .loc on frames that were column-selected from
# train_data / test_data (a pattern that can raise SettingWithCopyWarning).
# Re-running this cell is a no-op once the NaNs are gone.
missing_price_fill = {'far_price': 0, 'near_price': 0}

x_train = x_train.fillna(missing_price_fill)
x_test = x_test.fillna(missing_price_fill)


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import pickle

# Identity transformer. FunctionTransformer() with no `func` passes its input
# through unchanged and, unlike a lambda, can be pickled with the pipeline.
no_op_transformer = FunctionTransformer()

# Numerical pipeline: a single pass-through step, so numerical columns are used as-is
numerical_pipeline = Pipeline([
    ('do_nothing', no_op_transformer)
])

# Categorical pipeline: one-hot encode the categorical columns
categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder())
])

# Route each column group through its own pipeline
preprocessing_pipeline = ColumnTransformer([
    ('num', numerical_pipeline, data.numerical_features),
    ('cat', categorical_pipeline, data.categorical_features)
])

# Fit the preprocessing on the training features only, then reuse the fitted
# encoding for the test features so both matrices share one column layout
x_train_transformed = preprocessing_pipeline.fit_transform(x_train)
x_test_transformed = preprocessing_pipeline.transform(x_test)

# 20 trees, depth capped at 15, fixed seed, all available cores.
# The target frame is flattened to the 1-D array expected by fit().
random_forest = RandomForestRegressor(n_estimators=20, random_state=42, verbose=1, n_jobs=-1,  max_depth=15)
random_forest.fit(x_train_transformed, y_train.values.reshape(-1))

# Optional shortcut: reload a previously saved model instead of retraining.
# NOTE: only unpickle files you created yourself - pickle can execute arbitrary code.
#random_forest = pickle.load(open('random_forest_model.pkl', 'rb'))

y_pred = random_forest.predict(x_test_transformed)

print(f"Random Forest MAE: {mean_absolute_error(y_test, y_pred)}")


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 20.0min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


Random Forest MAE: 5.816998932669315


[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:    0.3s finished


In [8]:
# save random_forest model
#import pickle
#with open('random_forest_model.pkl', 'wb') as f:
#    pickle.dump(random_forest, f)

In [9]:
# Test features side by side with the true target and the model's prediction
df = x_test.assign(Actual=y_test["target"].values, Predicted=y_pred)
df.head(20)

Unnamed: 0,stock_id,imbalance_buy_sell_flag,seconds_in_bucket,imbalance_size,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,Actual,Predicted
4621980,0,-1,0,7218754.93,1.000062,22642240.0,0.0,0.0,0.999873,10813.75,1.000062,5276.0,1.0,1.809597,-1.115551
4621981,1,1,0,238616.53,0.999666,1585546.0,0.0,0.0,0.999223,18022.0,1.000054,1262.59,1.0,-3.420114,-1.687748
4621982,2,1,0,4673615.43,0.999974,2088183.0,0.0,0.0,0.999974,895.52,1.000644,22403.0,1.0,-0.41008,2.459962
4621983,3,-1,0,3022158.48,0.999961,110022500.0,0.0,0.0,0.999961,31167.18,1.000063,50186.24,1.0,1.610518,0.388651
4621984,4,-1,0,51388.69,0.999897,20345960.0,0.0,0.0,0.999897,16852.44,1.0002,32888.73,1.0,9.109974,0.449668
4621985,5,-1,0,891807.74,1.000449,2644355.0,0.0,0.0,0.999921,3788.5,1.000317,15160.0,1.0,-5.710125,1.637159
4621986,6,1,0,5840916.0,1.000132,9726217.0,0.0,0.0,0.999077,8521.5,1.000923,8537.25,1.0,-7.299781,-0.054246
4621987,7,1,0,11330985.67,0.999987,47738000.0,0.0,0.0,0.999987,995.68,1.00065,49817.0,1.0,-7.249713,1.888083
4621988,8,-1,0,11813623.17,0.999961,47325340.0,0.0,0.0,0.999454,262265.0,1.000273,131240.0,1.0,3.85046,0.008653
4621989,9,-1,0,2528948.05,1.000123,17408020.0,0.0,0.0,0.99999,150.12,1.000656,9388.75,1.0,-0.770092,1.888083


In [10]:
# Mean absolute error between the true test targets and the predictions in y_pred
mean_absolute_error(y_test, y_pred)

5.816998932669315

# Two Random Forest models: one for the first 5 minutes, the other for the second 5 minutes


In [11]:
# Start again from the original split, so the NaNs in far_price / near_price
# are present (the previous section replaced them with 0 in x_train / x_test)
train_data, test_data = data.get_train_test_data()

# Discard every row whose 'wap' value is missing, in both splits
train_data = train_data.dropna(subset=['wap'])
test_data = test_data.dropna(subset=['wap'])

# Model inputs are the categorical columns followed by the numerical ones;
# the target column(s) are named by data.y_column
feature_columns = data.categorical_features + data.numerical_features

x_train = train_data[feature_columns]
y_train = train_data[data.y_column]

x_test = test_data[feature_columns]
y_test = test_data[data.y_column]

x_train

Unnamed: 0,stock_id,imbalance_buy_sell_flag,seconds_in_bucket,imbalance_size,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap
0,0,1,0,3180602.69,0.999812,13380276.64,,,0.999812,60651.50,1.000026,8493.03,1.000000
1,1,-1,0,166603.91,0.999896,1642214.25,,,0.999896,3233.04,1.000660,20605.09,1.000000
2,2,-1,0,302879.87,0.999561,1819368.03,,,0.999403,37956.00,1.000298,18995.00,1.000000
3,3,-1,0,11917682.27,1.000171,18389745.62,,,0.999999,2324.90,1.000214,479032.40,1.000000
4,4,-1,0,447549.96,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.10,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4621975,195,-1,540,1755710.13,1.000637,23963918.01,0.999999,0.999999,1.000637,38626.10,1.000743,185519.18,1.000655
4621976,196,-1,540,325741.25,0.999755,8239249.46,0.999262,0.999755,0.999755,42180.16,1.000002,16034.04,0.999934
4621977,197,1,540,572162.23,0.999981,9207011.59,1.000203,1.000203,0.999870,40522.50,0.999981,61781.16,0.999914
4621978,198,0,540,0.00,1.001219,72541114.32,1.001219,1.001219,1.000953,497503.65,1.001219,630439.20,1.001070


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Identity transformer. FunctionTransformer() with no `func` passes its input
# through unchanged and, unlike a lambda, can be pickled with the pipeline.
no_op_transformer = FunctionTransformer()

# Numerical pipeline: a single pass-through step, so numerical columns are used as-is
numerical_pipeline = Pipeline([
    ('do_nothing', no_op_transformer)
])

# Categorical pipeline: one-hot encode the categorical columns
categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder())
])

# Preprocessing for rows without far/near prices: those two columns are left
# out of the numerical column list
preprocessing_pipeline_first_5min = ColumnTransformer([
    ('num', numerical_pipeline, ['seconds_in_bucket', 'imbalance_size', 'reference_price',
                      'matched_size', 'bid_price',
                      'bid_size', 'ask_price', 'ask_size', 'wap']),
    ('cat', categorical_pipeline, data.categorical_features)
])
# Preprocessing for rows that have both prices: all numerical features are used
preprocessing_pipeline_second_5min = ColumnTransformer([
    ('num', numerical_pipeline, data.numerical_features),
    ('cat', categorical_pipeline, data.categorical_features)
])

# A row belongs to the "first 5 min" subset when far_price or near_price is
# missing; every other row (both prices present) is in the "second 5 min" subset.
# The mask is computed once and reused for features and targets.
is_first_5min = x_train['far_price'].isna() | x_train['near_price'].isna()

y_train_first_5min = y_train.loc[is_first_5min]
y_train_second_5min = y_train.loc[~is_first_5min]

# Fit each preprocessing pipeline on its own training subset. The first subset
# has the far/near price columns removed before it is transformed.
x_train_transformed_first_5min = preprocessing_pipeline_first_5min.fit_transform(
    x_train.loc[is_first_5min].drop(['far_price', 'near_price'], axis=1)
)
x_train_transformed_second_5min = preprocessing_pipeline_second_5min.fit_transform(
    x_train.loc[~is_first_5min]
)



In [13]:
# Both forests are built from one shared hyper-parameter set
rf_params = dict(n_estimators=20, random_state=42, verbose=1, n_jobs=-1, max_depth=15)
random_forest_first_5min = RandomForestRegressor(**rf_params)
random_forest_second_5min = RandomForestRegressor(**rf_params)

# Train each forest on its own subset; the target frames are flattened to 1-D arrays
random_forest_first_5min.fit(
    x_train_transformed_first_5min,
    y_train_first_5min.values.reshape(-1),
)
random_forest_second_5min.fit(
    x_train_transformed_second_5min,
    y_train_second_5min.values.reshape(-1),
)

# Custom predict method to use different models based on the 'far_price' condition
def custom_predict(x_test):
    """Predict each row with the model that matches its far/near price availability.

    Rows where 'far_price' or 'near_price' is NaN are sent to
    ``random_forest_first_5min`` (after dropping those two columns); rows where
    both are present are sent to ``random_forest_second_5min``.

    The preprocessing pipelines are applied with ``transform`` only. They must
    already be fitted on the training data: re-fitting them here would derive
    the one-hot encoding from the test rows, which can change the column layout
    the forests were trained on and would overwrite the fitted pipelines.

    Parameters
    ----------
    x_test : pandas.DataFrame
        Feature frame containing 'far_price', 'near_price' and the columns
        selected by the two preprocessing pipelines. Index labels are assumed
        to be unique, since predictions are written back by label.

    Returns
    -------
    pandas.Series
        float64 predictions indexed like ``x_test``.
    """
    # Same routing rule as used to split the training data
    in_first_5min = x_test['far_price'].isna() | x_test['near_price'].isna()

    x_test_first_5min = x_test.loc[in_first_5min].drop(['far_price', 'near_price'], axis=1)
    x_test_second_5min = x_test.loc[~in_first_5min]

    # Explicit float dtype: without it an empty Series defaults to object dtype
    y_pred = pd.Series(index=x_test.index, dtype='float64')

    # Skip a subset with no rows: the estimators reject empty input
    if len(x_test_first_5min) > 0:
        features_first_5min = preprocessing_pipeline_first_5min.transform(x_test_first_5min)
        y_pred.loc[x_test_first_5min.index] = random_forest_first_5min.predict(features_first_5min)

    if len(x_test_second_5min) > 0:
        features_second_5min = preprocessing_pipeline_second_5min.transform(x_test_second_5min)
        y_pred.loc[x_test_second_5min.index] = random_forest_second_5min.predict(features_second_5min)

    return y_pred


# Predict the test set with the two-model routing function and print its mean absolute error
y_pred = custom_predict(x_test)
print(f"Random Forest MAE: {mean_absolute_error(y_test, y_pred)}")


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 11.6min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  8.6min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


Random Forest MAE: 5.820673258663492


[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:    0.1s finished
