In [1]:
# Enable IPython's autoreload extension; mode 2 picks up edits to imported
# project modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from load_data import load_data_from_csv
from data_preprocessor.data_preprocessor import CompositeDataPreprocessor, ReduceMemUsageDataPreprocessor, FillNaPreProcessor
from data_preprocessor.feature_engineering import (
    BasicFeaturesPreprocessor,
    DupletsTripletsPreprocessor,
    MovingAvgPreProcessor,
    RemoveIrrelevantFeaturesDataPreprocessor,
    DropTargetNADataPreprocessor,
    FarNearPriceFillNaPreprocessor,
    MovingAvgFillNaPreprocessor,
    RemoveRecordsByStockDateIdPreprocessor,
)
from data_preprocessor.stock_feature_engineering import (
    StockNormalizeFeaturesPreprocessor,
)
from data_preprocessor.polynomial_features import PolynomialFeaturesPreProcessor
from data_preprocessor.stockid_features import StockIdFeaturesPreProcessor
from data_preprocessor.deep_feature_synthesis import DfsPreProcessor
from data_generator.data_generator import DefaultTrainEvalDataGenerator, ManualKFoldDataGenerator, TimeSeriesKFoldDataGenerator

from model_pipeline.lgb_pipeline import LGBModelPipelineFactory

from model_post_processor.model_post_processor import CompositeModelPostProcessor, SaveModelPostProcessor

from train_pipeline.train_pipeline import DefaultTrainPipeline
from train_pipeline.train_optuna_pipeline import DefaultOptunaTrainPipeline

from train_pipeline.train_pipeline_callbacks import MAECallback
from utils.scoring_utils import ScoringUtils
from model_pipeline.dummy_models import BaselineEstimator

import optuna.integration.lightgbm as lgb
import optuna

import numpy as np

import sys
import pandas as pd

from sklearn import preprocessing

In [3]:
from model_pipeline.mlp_pipeline import MLPModelPipelineFactory

In [4]:
# Run-wide settings: number of CV folds and where trained models are written.
N_fold = 5
model_save_dir = './models/'

# Preprocessing chain handed to CompositeDataPreprocessor below.
# Entries marked "(disabled)" are experiments kept for reference at the
# position they would occupy in the chain.
processors = [
    ReduceMemUsageDataPreprocessor(verbose=True),
    # Drop specific (stock_id, date_id) records from the training data.
    RemoveRecordsByStockDateIdPreprocessor(
        [
            {"stock_id": stock, "date_id": date}
            for stock, date in ((19, 438), (101, 328), (131, 35), (158, 388))
        ]
    ),
    FarNearPriceFillNaPreprocessor(),
    # (disabled) BasicFeaturesPreprocessor(),
    # (disabled) DupletsTripletsPreprocessor(),
    MovingAvgPreProcessor("wap"),
    MovingAvgFillNaPreprocessor("wap", 1.0),
    # (disabled) StockIdFeaturesPreProcessor(),
    # (disabled) DTWKMeansPreprocessor(),
    # (disabled) DfsPreProcessor(),
    DropTargetNADataPreprocessor(),
    RemoveIrrelevantFeaturesDataPreprocessor(['stock_id', 'date_id', 'time_id', 'row_id']),
    # (disabled) FillNaPreProcessor(1.0),
    # (disabled) PolynomialFeaturesPreProcessor(),
]

processor = CompositeDataPreprocessor(processors)


In [5]:
# Root directory passed to load_data_from_csv. The commented alternative is
# the Kaggle input location.
# DATA_PATH = '/kaggle/input'
DATA_PATH = '..'
df_train, df_test, revealed_targets, sample_submission = load_data_from_csv(DATA_PATH)
print(df_train.columns)

# `raw_data` is a second name for the loaded frame (an alias, not a copy).
raw_data = df_train
# Optional: uncomment to work on the first 100000 rows only.
# df_train = df_train[:100000]


Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')


In [6]:
# Run the preprocessing chain on the frame as loaded. Reading from `raw_data`
# rather than `df_train` keeps this cell re-runnable: executing it a second
# time no longer feeds the already-processed frame back into the chain.
# NOTE(review): `raw_data` is an alias, not a copy - if any processor mutates
# its input in place, reload the CSVs before re-running this cell.
df_train = processor.apply(raw_data)

CompositeDataPreprocessor - original df shape: (5237980, 17)
Processing ReduceMemUsageDataPreprocessor...
Memory usage of dataframe is 679.36 MB
Memory usage after optimization is: 304.72 MB
Decreased by 55.15%
dtypes:
stock_id                     int16
date_id                      int16
seconds_in_bucket            int16
imbalance_size             float32
imbalance_buy_sell_flag       int8
reference_price            float32
matched_size               float32
far_price                  float32
near_price                 float32
bid_price                  float32
bid_size                   float32
ask_price                  float32
ask_size                   float32
wap                        float32
target                     float32
time_id                      int16
row_id                      object
dtype: object
ReduceMemUsageDataPreprocessor took 1.56s. New df shape: (5237980, 17).
Processing RemoveRecordsByStockDateIdPreprocessor...
RemoveRecordsByStockDateIdPreprocessor - removi

In [7]:
# Inspect the processed frame: column index as text, then the rich table
# view (rendered truncated for a frame this large).
print(df_train.columns)
display(df_train)

Index(['seconds_in_bucket', 'imbalance_size', 'imbalance_buy_sell_flag',
       'reference_price', 'matched_size', 'far_price', 'near_price',
       'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap', 'target',
       'wap_mov_avg_3_1', 'wap_mov_avg_6_3', 'wap_mov_avg_12_6',
       'wap_mov_avg_24_12'],
      dtype='object')


Unnamed: 0,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,wap_mov_avg_3_1,wap_mov_avg_6_3,wap_mov_avg_12_6,wap_mov_avg_24_12
0,0,3.180603e+06,1,0.999812,13380277.00,1.000000,1.000000,0.999812,60651.500000,1.000026,8493.030273,1.000000,-3.029704,1.000000,,,
1,0,1.666039e+05,-1,0.999896,1642214.25,1.000000,1.000000,0.999896,3233.040039,1.000660,20605.089844,1.000000,-5.519986,1.000000,,,
2,0,3.028799e+05,-1,0.999561,1819368.00,1.000000,1.000000,0.999403,37956.000000,1.000298,18995.000000,1.000000,-8.389950,1.000000,,,
3,0,1.191768e+07,-1,1.000171,18389746.00,1.000000,1.000000,0.999999,2324.899902,1.000214,479032.406250,1.000000,-4.010201,1.000000,,,
4,0,4.475500e+05,-1,0.999532,17860614.00,1.000000,1.000000,0.999394,16485.539062,1.000016,434.100006,1.000000,-7.349849,1.000000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,540,2.440723e+06,-1,1.000317,28280362.00,0.999734,0.999734,1.000317,32257.039062,1.000434,319862.406250,1.000328,2.310276,1.000345,1.000304,1.000318,1.000202
5237976,540,3.495105e+05,-1,1.000643,9187699.00,1.000129,1.000386,1.000643,205108.406250,1.000900,93393.070312,1.000819,-8.220077,1.000816,1.000710,1.000560,1.000506
5237977,540,0.000000e+00,0,0.995789,12725436.00,0.995789,0.995789,0.995789,16790.660156,0.995883,180038.312500,0.995797,1.169443,0.995958,0.996070,0.996130,0.996436
5237978,540,1.000899e+06,1,0.999210,94773272.00,0.999210,0.999210,0.998970,125631.718750,0.999210,669893.000000,0.999008,-1.540184,0.999116,0.999218,0.999305,0.999313


In [19]:
# Model inputs are every column of the processed frame except `target`.
feat_dynamic_real = df_train.columns.tolist()
del feat_dynamic_real[feat_dynamic_real.index("target")]  # ValueError if `target` is absent
num_input_features = len(feat_dynamic_real)
print(num_input_features, feat_dynamic_real)

16 ['seconds_in_bucket', 'imbalance_size', 'imbalance_buy_sell_flag', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap', 'wap_mov_avg_3_1', 'wap_mov_avg_6_3', 'wap_mov_avg_12_6', 'wap_mov_avg_24_12']


In [20]:
# should not have any na features
any_na_values_mask = df_train[feat_dynamic_real].isna().any(axis=1)
print(any_na_values_mask.shape, any_na_values_mask[any_na_values_mask].shape)

(5237760,) (1047552,)


In [21]:
# Display the rows selected by `any_na_values_mask` (at least one NaN feature).
df_train.loc[any_na_values_mask]

Unnamed: 0,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,wap_mov_avg_3_1,wap_mov_avg_6_3,wap_mov_avg_12_6,wap_mov_avg_24_12
0,0,3.180603e+06,1,0.999812,13380277.00,1.0,1.0,0.999812,60651.500000,1.000026,8493.030273,1.000000,-3.029704,1.000000,,,
1,0,1.666039e+05,-1,0.999896,1642214.25,1.0,1.0,0.999896,3233.040039,1.000660,20605.089844,1.000000,-5.519986,1.000000,,,
2,0,3.028799e+05,-1,0.999561,1819368.00,1.0,1.0,0.999403,37956.000000,1.000298,18995.000000,1.000000,-8.389950,1.000000,,,
3,0,1.191768e+07,-1,1.000171,18389746.00,1.0,1.0,0.999999,2324.899902,1.000214,479032.406250,1.000000,-4.010201,1.000000,,,
4,0,4.475500e+05,-1,0.999532,17860614.00,1.0,1.0,0.999394,16485.539062,1.000016,434.100006,1.000000,-7.349849,1.000000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5229175,100,3.760183e+06,-1,0.999851,8075852.00,1.0,1.0,0.999851,11233.250000,0.999968,10119.679688,0.999912,6.380081,0.999864,0.999887,0.999938,
5229176,100,3.930340e+06,-1,1.001157,2805574.25,1.0,1.0,1.001157,57514.378906,1.001414,30848.400391,1.001324,-3.709793,1.001318,1.000936,1.000699,
5229177,100,1.438044e+06,-1,0.999912,3831440.25,1.0,1.0,0.999912,1067.099976,1.000287,78461.250000,0.999917,3.190041,1.000304,1.000505,1.000389,
5229178,100,3.328978e+06,1,1.000053,62579416.00,1.0,1.0,0.999933,113044.656250,1.000173,331281.500000,0.999994,-1.890063,1.000086,1.000196,1.000133,


## Normalize features (TODO: consider normalizing per group, e.g. per stock id)

In [10]:
# Size-valued columns that are standardised per fold before training.
size_columns = {
    "imbalance_size",
    "matched_size",
    "bid_size",
    "ask_size",
}
# Keep only the ones that are actual model inputs, in feature order.
# Filtering the feature list (instead of list(set & set)) gives the same
# members but a deterministic order that does not depend on string hashing,
# and `normalize_columns` is a list from the start.
normalize_columns = [col for col in feat_dynamic_real if col in size_columns]
print(normalize_columns)

['ask_size', 'imbalance_size', 'matched_size', 'bid_size']


In [11]:
# Identifier handed to MLPModelPipelineFactory when the model pipelines are built.
model_name = "20240410_mlp_raw"

In [12]:
# Train/eval split strategies. The k-fold generator takes its fold count from
# the `N_fold` setting instead of repeating the literal, so the two cannot drift.
default_data_generator = DefaultTrainEvalDataGenerator()
k_fold_data_generator = ManualKFoldDataGenerator(n_fold=N_fold)

In [13]:
# Post-processing applied to each trained fold model: a composite with a
# single step, constructed with `model_save_dir` as its save directory.
model_post_processor = CompositeModelPostProcessor(
    [SaveModelPostProcessor(save_dir=model_save_dir)]
)

In [14]:
# Split the processed frame into per-fold train/eval frames.
print("Generate data")
train_dfs, eval_dfs, num_train_eval_sets = k_fold_data_generator.generate(df_train)

# Accumulators filled by the training loop: one model and one training
# result per fold.
models, model_res = [], []

print(f"Start train and tune, num_train_eval_sets: {num_train_eval_sets}")

Generate data
Start train and tune, num_train_eval_sets: 5


In [15]:
# Build the MLP pipeline factory with the feature count derived from the
# processed frame (`num_input_features`) rather than a hard-coded 16, so the
# value follows any change to the preprocessing chain.
mlp_model_pipeline_factory = MLPModelPipelineFactory(model_name, num_input_features)

In [16]:
# X_train_fold, y_train_fold, X_val_fold, y_val_fold = self.model_pipeline.create_XY(train_dfs[fold_index], eval_dfs[fold_index])

In [17]:
# Manual k-fold training loop: for every fold build a fresh model pipeline,
# standardise the size columns, train, collect the model/result and run the
# post-processor.
# NOTE(review): the recorded run of this cell failed inside `fit` because the
# model's first Dense layer expected 12 features while 16 were supplied -
# verify that MLPModelPipelineFactory honours the feature count it is given.
for fold in range(num_train_eval_sets):
    print(f"Training fold {fold} - start")

    model_pipeline = mlp_model_pipeline_factory.create_model_pipeline()
    model_pipeline.init_model()
    print(f"Training fold {fold} - initialized")

    fold_df_train = train_dfs[fold]
    fold_df_eval = eval_dfs[fold]
    print(f"Training fold {fold} - train size: {fold_df_train.shape}, eval size: {fold_df_eval.shape}")

    # Normalize features with one scaler per fold: fit on the training split
    # only, then transform both splits, so no eval statistics leak into training.
    # NOTE: the fold frames are modified in place, so re-running this cell
    # without regenerating the folds re-scales already scaled data.
    scaler = preprocessing.StandardScaler()
    scaler.fit(fold_df_train[normalize_columns])
    fold_df_train[normalize_columns] = scaler.transform(fold_df_train[normalize_columns])
    fold_df_eval[normalize_columns] = scaler.transform(fold_df_eval[normalize_columns])

    # Use the fold locals that were just normalised instead of re-indexing
    # train_dfs / eval_dfs.
    X_train_fold, y_train_fold, X_val_fold, y_val_fold = model_pipeline.create_XY(fold_df_train, fold_df_eval)

    print(f"Training fold {fold} - start training")
    # `train` requires an `eval_res` argument. The original passed IPython's
    # `_` (last cell output), which is arbitrary kernel state; pass a fresh
    # container instead.
    # NOTE(review): presumably a dict the pipeline may fill with evaluation
    # results - confirm against MLPModelPipeline.train.
    eval_res = {}
    train_res = model_pipeline.train(X_train_fold, y_train_fold, X_val_fold, y_val_fold, eval_res)
    fold_model = model_pipeline.get_model()
    models.append(fold_model)
    model_res.append(train_res)
    print(f"Training fold {fold} - finished training")

    # Call the post-processor directly; the original
    # `model_post_processor.model_post_processor.process` looks like a leftover
    # from `self.model_post_processor.process` in the pipeline class.
    model_post_processor.process(fold_model, model_pipeline, fold)
    print(f"Training fold {fold} - finished post processing")

    print(f"Training fold {fold} - end")

print(f"finished training, num_train_eval_sets: {num_train_eval_sets}")



Training fold 0 - start


None
Training fold 0 - initialized
Training fold 0 - train size: (4190208, 17), eval size: (1047552, 17)
Training fold 0 - start training
Epoch 1/10
fit exception


Traceback (most recent call last):
  File "c:\Users\Florence\Documents\Repo\optiver2023a\florence\model_pipeline\mlp_pipeline.py", line 58, in train
    history = self.model.fit(
  File "c:\Users\Florence\anaconda3\envs\optiver2023\lib\site-packages\keras\src\utils\traceback_utils.py", line 122, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "c:\Users\Florence\anaconda3\envs\optiver2023\lib\site-packages\keras\src\layers\input_spec.py", line 227, in assert_input_compatibility
    raise ValueError(
ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense" is incompatible with the layer: expected axis -1 of input shape to have value 12, but received input with shape (32, 16)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(32, 16), dtype=float32)
  • training=True
  • mask=None


ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense" is incompatible with the layer: expected axis -1 of input shape to have value 12, but received input with shape (32, 16)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(32, 16), dtype=float32)
  • training=True
  • mask=None

In [None]:
    # NOTE(review): this cell is an indented fragment that uses `self` and
    # `return`, so it is only valid inside a method body, not as a notebook cell.
    # It invokes every callback in `self.callbacks` with the training artefacts,
    # collects what each one returns, and returns everything as one tuple.
    callback_results = []
    for callback in self.callbacks:
        callback_res = callback.on_callback(models, model_res, train_dfs, eval_dfs, num_train_eval_sets)
        callback_results.append(callback_res)

    return models, model_res, train_dfs, eval_dfs, num_train_eval_sets, callback_results

In [None]:
# Assemble the reusable train pipeline from the MLP factory, the k-fold
# generator, the post-processor and a MAE callback. The factory is given
# `num_input_features` like the manually built factory earlier in the notebook,
# instead of relying on its default feature count.
pipeline = DefaultTrainPipeline(
    MLPModelPipelineFactory(model_name, num_input_features),
    k_fold_data_generator,
    model_post_processor,
    [MAECallback()],
)

In [None]:
# Train through the pipeline on the processed frame. This rebinds `models`,
# `model_res`, `train_dfs`, `eval_dfs` and `num_train_eval_sets`, replacing the
# values produced by the manual fold cells, and adds `callback_results`.
models, model_res, train_dfs, eval_dfs, num_train_eval_sets, callback_results = pipeline.train(df_train)

generate data
start training, num_train_eval_sets: 5
Training fold 0 - start


None
Training fold 0 - initialized
Training fold 0 - train size: (4190313, 13), eval size: (1047579, 13)
Training fold 0 - start training


TypeError: MLPModelPipeline.train() missing 3 required positional arguments: 'eval_X', 'eval_Y', and 'eval_res'

In [None]:
# Average MAE of the trained fold models on their eval frames.
# The original read `lgb_models` / `lgb_eval_dfs`, which no cell in this
# notebook defines (NameError on Restart & Run All); score the models and eval
# frames produced by the training cells instead.
# NOTE(review): confirm ScoringUtils.calculate_mae accepts the MLP models.
mlp_avg_mae = ScoringUtils.calculate_mae(models, eval_dfs)
print(mlp_avg_mae)