In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
# Select the torch backend for Keras. This is set before the project imports in
# the next cell so it is in place when Keras is first imported.
os.environ["KERAS_BACKEND"] = "torch"

In [3]:
from load_data import load_data_from_csv
from data_preprocessor.data_preprocessor import CompositeDataPreprocessor, ReduceMemUsageDataPreprocessor, FillNaPreProcessor
from data_preprocessor.feature_engineering import (
    BasicFeaturesPreprocessor,
    DupletsTripletsPreprocessor,
    MovingAvgPreProcessor,
    RemoveIrrelevantFeaturesDataPreprocessor,
    DropTargetNADataPreprocessor,
    FarNearPriceFillNaPreprocessor,
    MovingAvgFillNaPreprocessor,
    RemoveRecordsByStockDateIdPreprocessor,
)
from data_preprocessor.stock_feature_engineering import (
    StockNormalizeFeaturesPreprocessor,
)
from data_preprocessor.polynomial_features import PolynomialFeaturesPreProcessor
from data_preprocessor.stockid_features import StockIdFeaturesPreProcessor
# from data_preprocessor.deep_feature_synthesis import DfsPreProcessor
from data_generator.data_generator import DefaultTrainEvalDataGenerator, ManualKFoldDataGenerator, TimeSeriesKFoldDataGenerator

from model_post_processor.model_post_processor import CompositeModelPostProcessor, SaveModelPostProcessor, KerasSaveModelPostProcessor

from train_pipeline.train_pipeline import DefaultTrainPipeline

from train_pipeline.train_pipeline_callbacks import MAECallback
from utils.scoring_utils import ScoringUtils
from model_pipeline.dummy_models import BaselineEstimator

import numpy as np

import sys
import pandas as pd
import json

from sklearn import preprocessing

from model_pipeline.mlp_pipeline import MLPModelPipelineFactory

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x1518535c2fc0>
Traceback (most recent call last):
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
             ^^^^^^^^^^^^^^^^

In [4]:
# Fold count (NOTE(review): presumably the number of cross-validation folds — confirm)
# and the directory handed to the model-saving post processor as save_dir.
N_fold = 5
model_save_dir = './models'

# Preprocessing steps that are wrapped into a single CompositeDataPreprocessor below.
# Commented-out entries are optional steps that are currently disabled.
processors = [    
    ReduceMemUsageDataPreprocessor(verbose=True),
    # Specific (stock_id, date_id) combinations passed to the record-removal step.
    RemoveRecordsByStockDateIdPreprocessor([
        {"stock_id": 19, "date_id": 438},
        {"stock_id": 101, "date_id": 328},
        {"stock_id": 131, "date_id": 35},
        {"stock_id": 158, "date_id": 388},
    ]),
    FarNearPriceFillNaPreprocessor(),
    # BasicFeaturesPreprocessor(),
    # DupletsTripletsPreprocessor(),
    # MovingAvgPreProcessor("wap"),
    # MovingAvgFillNaPreprocessor("wap", 1.0),
    # StockIdFeaturesPreProcessor(),   
    # DTWKMeansPreprocessor(),
    # DfsPreProcessor(),
    # DropTargetNADataPreprocessor(),    
    # Identifier columns; they no longer appear in the processed frame's columns.
    RemoveIrrelevantFeaturesDataPreprocessor(['stock_id', 'date_id','time_id', 'row_id']),
    # FillNaPreProcessor(1.0),
    # PolynomialFeaturesPreProcessor(),
]


# Single entry point whose apply() is called on the training frame later on.
processor = CompositeDataPreprocessor(processors)


In [5]:
# Root directory passed to load_data_from_csv; the Kaggle path is kept as an
# alternative for running on the competition platform.
# DATA_PATH = '/kaggle/input'
DATA_PATH = '..'
# Load the four frames; only df_train is used in the cells visible below.
df_train, df_test, revealed_targets, sample_submission = load_data_from_csv(DATA_PATH)
print(df_train.columns)

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')


In [6]:
# Keep a deep copy of the freshly loaded training frame so df_train can be
# reset from it later.
raw_data = df_train.copy(deep=True)
# Optional: shrink the training frame for quick experiments.
# df_train = df_train[:100000]

In [7]:
# Reset df_train from the untouched snapshot. Take a copy rather than aliasing
# raw_data: otherwise any in-place change made by the preprocessors would also
# modify raw_data, and re-running this cell would no longer restore raw data.
df_train = raw_data.copy(deep=True)

In [8]:
# Run the preprocessing steps on the training frame.
# Not idempotent: df_train is overwritten with the processed result, so reset
# it from raw_data before running this cell again.
df_train = processor.apply(df_train)

CompositeDataPreprocessor - original df shape: (5237980, 17)
Processing ReduceMemUsageDataPreprocessor...
Memory usage of dataframe is 679.36 MB
Memory usage after optimization is: 304.72 MB
Decreased by 55.15%
dtypes:
stock_id                     int16
date_id                      int16
seconds_in_bucket            int16
imbalance_size             float32
imbalance_buy_sell_flag       int8
reference_price            float32
matched_size               float32
far_price                  float32
near_price                 float32
bid_price                  float32
bid_size                   float32
ask_price                  float32
ask_size                   float32
wap                        float32
target                     float32
time_id                      int16
row_id                      object
dtype: object
ReduceMemUsageDataPreprocessor took 0.57s. New df shape: (5237980, 17).
Processing RemoveRecordsByStockDateIdPreprocessor...
RemoveRecordsByStockDateIdPreprocessor - removi

In [9]:
# Inspect the processed frame: remaining column names, then the table itself
# (the notebook shows a truncated view).
print(df_train.columns)
display(df_train)

Index(['seconds_in_bucket', 'imbalance_size', 'imbalance_buy_sell_flag',
       'reference_price', 'matched_size', 'far_price', 'near_price',
       'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap', 'target'],
      dtype='object')


Unnamed: 0,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target
0,0,3.180603e+06,1,0.999812,13380277.00,1.000000,1.000000,0.999812,60651.500000,1.000026,8493.030273,1.000000,-3.029704
1,0,1.666039e+05,-1,0.999896,1642214.25,1.000000,1.000000,0.999896,3233.040039,1.000660,20605.089844,1.000000,-5.519986
2,0,3.028799e+05,-1,0.999561,1819368.00,1.000000,1.000000,0.999403,37956.000000,1.000298,18995.000000,1.000000,-8.389950
3,0,1.191768e+07,-1,1.000171,18389746.00,1.000000,1.000000,0.999999,2324.899902,1.000214,479032.406250,1.000000,-4.010201
4,0,4.475500e+05,-1,0.999532,17860614.00,1.000000,1.000000,0.999394,16485.539062,1.000016,434.100006,1.000000,-7.349849
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,540,2.440723e+06,-1,1.000317,28280362.00,0.999734,0.999734,1.000317,32257.039062,1.000434,319862.406250,1.000328,2.310276
5237976,540,3.495105e+05,-1,1.000643,9187699.00,1.000129,1.000386,1.000643,205108.406250,1.000900,93393.070312,1.000819,-8.220077
5237977,540,0.000000e+00,0,0.995789,12725436.00,0.995789,0.995789,0.995789,16790.660156,0.995883,180038.312500,0.995797,1.169443
5237978,540,1.000899e+06,1,0.999210,94773272.00,0.999210,0.999210,0.998970,125631.718750,0.999210,669893.000000,0.999008,-1.540184


In [10]:
# Model inputs are all processed columns except the label column "target".
feat_dynamic_real = list(df_train.columns)
feat_dynamic_real.remove("target")
num_input_features = len(feat_dynamic_real)
print(num_input_features, feat_dynamic_real)

12 ['seconds_in_bucket', 'imbalance_size', 'imbalance_buy_sell_flag', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap']


In [11]:
# Sanity check: no row is expected to have a missing value in any feature column.
# Prints the mask shape followed by the shape of the rows flagged as having NaN.
any_na_values_mask = df_train[feat_dynamic_real].isna().any(axis="columns")
print(any_na_values_mask.shape, any_na_values_mask.loc[any_na_values_mask].shape)

(5237760,) (0,)


## Normalize features (TODO: consider normalizing per group, e.g. per stock_id)

In [12]:
# Size-like columns to standardize, restricted to those present in the feature
# list. An order-preserving filter is used instead of a set intersection so the
# resulting column order is the same in every kernel session (set iteration
# order for strings depends on hash randomisation).
normalize_columns = [
    col
    for col in ("imbalance_size", "matched_size", "bid_size", "ask_size")
    if col in feat_dynamic_real
]
print(normalize_columns)

['matched_size', 'bid_size', 'ask_size', 'imbalance_size']


In [16]:
# Identifier for this experiment; passed to MLPModelPipelineFactory further down.
model_name = "20240422_mlp_raw"

In [17]:
# Data generators for building train/eval splits.
default_data_generator = DefaultTrainEvalDataGenerator()
# Use the N_fold constant from the config cell instead of repeating the literal 5,
# so the fold count is changed in one place only.
k_fold_data_generator = TimeSeriesKFoldDataGenerator(n_fold=N_fold)

In [18]:
# Post-processing invoked once per trained fold model in the training loop;
# currently a single Keras save step configured with save_dir=model_save_dir.
model_post_processor = CompositeModelPostProcessor([
    KerasSaveModelPostProcessor(save_dir=model_save_dir)
])

In [19]:
# Build the per-fold train/eval frames from the processed training data.
print("Generate data")
train_dfs, eval_dfs, num_train_eval_sets = k_fold_data_generator.generate(df_train)

# Fresh accumulators for the fitted fold models and their training results.
models, model_res = [], []

print(f"Start train and tune, num_train_eval_sets: {num_train_eval_sets}")

Generate data
Start train and tune, num_train_eval_sets: 5


In [20]:
# Pass the computed feature count instead of the literal 12 so the value follows
# the feature list when preprocessing steps are enabled or disabled.
# NOTE(review): assumes the second argument is the number of input features —
# confirm against MLPModelPipelineFactory.
mlp_model_pipeline_factory = MLPModelPipelineFactory(model_name, num_input_features)

In [23]:
# Hyper-parameters passed to model_pipeline.init_model for each trained fold.
param = dict(
    layers=0,
    learning_rate=1e-5,
    epochs=5,
    batch_size=256,
)

In [25]:
# Train, evaluate and post-process one model per selected fold.
# Only fold 4 is trained at the moment; switch to the commented-out loop header
# to train every generated fold.
# for fold in range(num_train_eval_sets):
for fold in range(4, 5):
    print(f"Training fold {fold} - {json.dumps(param, indent=2)}")
    
    print(f"Training fold {fold} - start")

    # A fresh pipeline (and model) per fold so folds do not share weights.
    model_pipeline = mlp_model_pipeline_factory.create_model_pipeline()
    model_pipeline.init_model(
        param = param,
        fold=fold,
    )
    print(f"Training fold {fold} - initialized")

    # Work on copies: the scaling below overwrites columns, and writing into the
    # frames stored in train_dfs / eval_dfs would leave them normalized for any
    # later use (including a re-run of this cell) and can trigger pandas'
    # SettingWithCopy behaviour if the stored frames are slices of df_train.
    fold_df_train = train_dfs[fold].copy()
    fold_df_eval = eval_dfs[fold].copy()
    print(f"Training fold {fold} - train size: {fold_df_train.shape}, eval size: {fold_df_eval.shape}")

    # normalize features, scaler per fold, fit on training set, transform both training and validation set
    scaler = preprocessing.StandardScaler()
    scaler.fit(fold_df_train[normalize_columns])
    fold_df_train[normalize_columns] = scaler.transform(fold_df_train[normalize_columns])
    fold_df_eval[normalize_columns] = scaler.transform(fold_df_eval[normalize_columns])

    X_train_fold, y_train_fold, X_val_fold, y_val_fold = model_pipeline.create_XY(fold_df_train, fold_df_eval)

    # Baseline metrics of the untrained model, for comparison with the
    # post-training numbers printed further down.
    train_manual_mae = model_pipeline.eval_once(X_train_fold, y_train_fold)
    eval_manual_mae = model_pipeline.eval_once(X_val_fold, y_val_fold)
    print(f"Training fold {fold} - before training - train_manual_mae: {train_manual_mae}, eval_manual_mae: {eval_manual_mae}")

    print(f"Training fold {fold} - start training")
    # NOTE(review): the meaning of the trailing None argument is not visible here —
    # confirm against the pipeline's train() signature.
    train_res = model_pipeline.train(X_train_fold, y_train_fold, X_val_fold, y_val_fold, None)
    fold_model = model_pipeline.get_model()
    models.append(fold_model)
    model_res.append(train_res)
    print(f"Training fold {fold} - finished training")

    train_manual_mae = model_pipeline.eval_once(X_train_fold, y_train_fold)
    eval_manual_mae = model_pipeline.eval_once(X_val_fold, y_val_fold)
    print(f"Training fold {fold} - after training - train_manual_mae: {train_manual_mae}, eval_manual_mae: {eval_manual_mae}")
    
    # Hand the trained model to the configured post processors.
    model_post_processor.process(fold_model, model_pipeline, fold)
    print(f"Training fold {fold} - finished post processing")

    print(f"Training fold {fold} - end")

print(f"finished training, num_train_eval_sets: {num_train_eval_sets}")

Training fold 4 - {
  "layers": 0,
  "learning_rate": 1e-05,
  "epochs": 5,
  "batch_size": 256
}
Training fold 4 - start


None
Training fold 4 - initialized
Training fold 4 - train size: (4364800, 13), eval size: (872960, 13)
[1m17050/17050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 1ms/step
[1m3410/3410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step
Training fold 4 - before training - train_manual_mae: 7.574123859405518, eval_manual_mae: 7.200293064117432
Training fold 4 - start training
Epoch 1/5
[1m17050/17050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 9ms/step - loss: 6.4943 - mae: 6.4943 - val_loss: 5.9842 - val_mae: 5.9842
Epoch 2/5
[1m 5260/17050[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1:43[0m 9ms/step - loss: 6.4831 - mae: 6.4831

KeyboardInterrupt: 