<table>
<tr>
<td>V1 </td>
<td>Simple feature engineering</td>
<td>second</td>
</tr>
<tr>
<td>V2 </td>
<td>Add pressure and inefficiency</td>
<td>second</td>
</tr>    
</table>



| Tables   |      Are      |  Cool |
|----------|:-------------:|------:|
| col 1 is |  left-aligned | $1600 |
| col 2 is |    centered   |   $12 |
| col 3 is | right-aligned |    $1 |

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import joblib 
import os
import sklearn 

from load_data import load_data_from_csv

In [2]:
from data_preprocessor import CompositeDataPreprocessor
from feature_engineering import EnrichDFDataPreprocessor, RemoveIrrelevantFeaturesDataPreprocessor, DropTargetNADataPreprocessor

In [9]:
from data_generator.data_generator import DefaultTrainEvalDataGenerator, ManualKFoldDataGenerator

from model_pipeline.lgb_pipeline import LGBModelPipelineFactory
from model_pipeline.xgb_pipeline import XGBModelPipelineFactory
from model_pipeline.cbt_pipeline import CatBoostModelPipelineFactory

from model_post_processor.model_post_processor import CompositeModelPostProcessor, SaveModelPostProcessor

from train_pipeline.train_pipeline import DefaultTrainPipeline

from train_pipeline.train_pipeline_callbacks import MAECallback
from utils.scoring_utils import ScoringUtils
from model_pipeline.dummy_models import BaselineEstimator, SimpleEstimator

## Data preprocessing pipeline

In [4]:
# Identifier-style columns handed to RemoveIrrelevantFeaturesDataPreprocessor.
irrelevant_features = ['stock_id', 'date_id', 'time_id', 'row_id']

# Preprocessing steps, in the order they are passed to the composite
# (presumably also the order they are applied -- confirm in CompositeDataPreprocessor).
processors = [
    EnrichDFDataPreprocessor(),
    DropTargetNADataPreprocessor(),
    RemoveIrrelevantFeaturesDataPreprocessor(irrelevant_features),
]
processor = CompositeDataPreprocessor(processors)

### Load data

In [5]:
# Root directory passed to load_data_from_csv. The commented-out value is
# presumably the location to use when running on Kaggle -- confirm.
# DATA_PATH = '/kaggle/input'
DATA_PATH = '..'
# load_data_from_csv returns four objects, unpacked here in order.
df_train, df_test, revealed_targets, sample_submission = load_data_from_csv(DATA_PATH)
# Print the raw column names so they can be compared with the columns after preprocessing.
print(df_train.columns)

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')


### Pre-process data

In [6]:
# Run the composite preprocessor over the training frame.
# NOTE(review): df_train is overwritten with its processed version, so this cell is
# not idempotent -- re-running it feeds already-processed data back into the pipeline.
df_train = processor.apply(df_train)
# Row count and column names after preprocessing.
print(df_train.shape[0])
print(df_train.columns)
# NOTE(review): the tail displayed by this cell contains NaN and very large-magnitude
# (~1e11-1e12) values in the *_imb2 columns -- possibly near-zero denominators in the
# feature computation; worth verifying.
display(df_train.tail())

5237892
Index(['seconds_in_bucket', 'imbalance_size', 'imbalance_buy_sell_flag',
       'reference_price', 'matched_size', 'far_price', 'near_price',
       'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap', 'target',
       'imb_s1', 'imb_s2', 'reference_price_ask_price_bid_price_imb2',
       'reference_price_ask_price_wap_imb2',
       'reference_price_bid_price_wap_imb2', 'ask_price_bid_price_wap_imb2',
       'pressure', 'inefficiency'],
      dtype='object')


Unnamed: 0,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,...,wap,target,imb_s1,imb_s2,reference_price_ask_price_bid_price_imb2,reference_price_ask_price_wap_imb2,reference_price_bid_price_wap_imb2,ask_price_bid_price_wap_imb2,pressure,inefficiency
5237975,540,2440722.89,-1,1.000317,28280361.74,0.999734,0.999734,1.000317,32257.04,1.000434,...,1.000328,2.310276,-0.816784,-0.841104,526921200000.0,9.636364,,9.636364,75.664813,0.086305
5237976,540,349510.47,-1,1.000643,9187699.11,1.000129,1.000386,1.000643,205108.4,1.0009,...,1.000819,-8.220077,0.374254,-0.926706,,0.4602273,792633500000.0,0.460227,1.704028,0.038041
5237977,540,0.0,0,0.995789,12725436.1,0.995789,0.995789,0.995789,16790.66,0.995883,...,0.995797,1.169443,-0.829388,-1.0,-282225600000.0,10.75,,10.75,0.0,0.0
5237978,540,1000898.84,1,0.99921,94773271.05,0.99921,0.99921,0.99897,125631.72,0.99921,...,0.999008,-1.540184,-0.684154,-0.979099,-9.251859e-13,-1.099231e-12,5.315789,5.315789,1.494117,0.010561
5237979,540,1884285.71,-1,1.002129,24073677.32,1.000859,1.001494,1.002129,250081.44,1.002447,...,1.002274,-6.530285,-0.091024,-0.85482,,1.193103,-653021900000.0,1.193103,7.534688,0.078272


## Train

In [8]:
# Create the directory that trained models are written to.
# os.makedirs(..., exist_ok=True) succeeds when the directory already exists and
# raises on a genuine failure, whereas os.system('mkdir models') only returned a
# non-zero exit status that nothing checked, and depended on the host shell.
os.makedirs('models', exist_ok=True)

1

In [9]:
# Number of folds passed to ManualKFoldDataGenerator.
N_fold = 5
# Directory passed to SaveModelPostProcessor as save_dir.
model_save_dir = './models/'

In [10]:
# Two data generators are built here; in the cells visible in this notebook only
# k_fold_data_generator is passed on to a training pipeline.
default_data_generator = DefaultTrainEvalDataGenerator()
k_fold_data_generator = ManualKFoldDataGenerator(n_fold=N_fold)

In [11]:
# Post-processing chain handed to the training pipeline. It currently holds a single
# step, SaveModelPostProcessor, configured with model_save_dir as its save_dir
# (presumably it writes each trained model there -- confirm in its implementation).
model_post_processor = CompositeModelPostProcessor([
    SaveModelPostProcessor(save_dir=model_save_dir)
])

In [12]:
# Assemble the LightGBM training pipeline from the model factory, the k-fold
# data generator, the model post-processor and a list holding one MAE callback.
lgb_pipeline = DefaultTrainPipeline(
    LGBModelPipelineFactory(),
    k_fold_data_generator,
    model_post_processor,
    [MAECallback()],
)

In [13]:
# Run training on the preprocessed frame; train() returns six values, unpacked in order.
# NOTE(review): of these, only lgb_models and lgb_eval_dfs are used by later cells
# visible in this notebook.
lgb_models, lgb_model_res, lgb_train_dfs, lgb_eval_dfs, lgb_num_train_eval_sets, lgb_callback_results = lgb_pipeline.train(df_train)

generate data
start training, num_train_eval_sets: 5
Training fold 0 - start
Training fold 0 - initialized
Training fold 0 - train size: (4190313, 21), eval size: (1047579, 21)
Training fold 0 - start training
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.206606 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4648
[LightGBM] [Info] Number of data points in the train set: 4190313, number of used features: 20
[LightGBM] [Info] Start training from score -0.060201
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[50]	valid_0's l1: 6.40048
Training fold 0 - finished training
Training fold 0 - finished post processing
Training fold 0 - end
Training fold 1 - start
Training fold 1 - initialized
Training fold 1 - train size: (4190313, 21), eval size: (1047579, 21)
Training fold 1 - start training
[LightGBM] [Info] Auto-choosing row-wise multi-thr

In [47]:
# Score the trained fold models against their evaluation frames with
# ScoringUtils.calculate_mae (presumably an average across folds, given the
# variable name -- confirm in ScoringUtils).
lgb_avg_mae = ScoringUtils.calculate_mae(lgb_models, lgb_eval_dfs)
print(lgb_avg_mae)

6.302917211424277


In [8]:
# Reference score: the same MAE helper applied to a BaselineEstimator.
# NOTE(review): this is scored on the whole of df_train, whereas the LightGBM score
# is computed on the per-fold evaluation frames, so the two numbers are not
# measured on the same rows.
baseline_avg_mae = ScoringUtils.calculate_mae([BaselineEstimator()], [df_train])
print(baseline_avg_mae)

6.407770748115235


# Load saved models

In [6]:
# Module-level list that load() appends each deserialized model to.
models = []

In [7]:
def load(modelname, fold, model_dir='models'):
    """Load one saved fold model and append it to the global ``models`` list.

    Args:
        modelname: Prefix of the saved file name, e.g. ``'lgb'``.
        fold: Fold index that forms the second part of the file name.
        model_dir: Directory holding the ``.model`` files. Defaults to
            ``'models'``, the location that was previously hard-coded.

    Returns:
        The deserialized model, which has also been appended to ``models``.
    """
    # joblib.load unpickles the file and can therefore execute arbitrary code;
    # only point this at model files you produced yourself.
    model = joblib.load(f'{model_dir}/{modelname}_{fold}.model')
    models.append(model)
    return model

In [8]:
# Empty the list first so that re-running this cell does not append a second
# copy of every model. clear() mutates in place, so load() keeps appending to
# the same list object.
models.clear()
for fold in range(5):  # fold indices 0..4 of the saved model files
    load('lgb', fold)
    # load('xgb', fold)
    # load('cbt', fold)

In [9]:
# Display the loaded estimators as a quick check of what was restored.
models

[LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1')]

# Submit predictions

In [None]:
# optiver2023 is imported here rather than with the other imports at the top;
# presumably it is only available inside the Kaggle competition environment -- confirm.
import optiver2023
env = optiver2023.make_env()
# Iterator consumed by the submission loop, which unpacks each item as
# (test, revealed_targets, sample_prediction).
iter_test = env.iter_test()

In [None]:
# NOTE(review): `features` is not defined in any cell visible in this notebook, so
# this raises NameError on a fresh kernel -- define the feature-column list first.
len(features)

In [None]:
# Number of test batches that have been predicted and submitted.
counter = 0
# NOTE(review): enrich_df_with_features and features are not defined or imported in
# any cell visible in this notebook. The loop also rebinds revealed_targets, which
# the data-loading cell assigned earlier.
for (test, revealed_targets, sample_prediction) in iter_test:
    # Engineer features for this batch and keep only the columns listed in `features`.
    test_ = enrich_df_with_features(test)[features]
    # Ensemble by taking the element-wise mean of every loaded model's predictions.
    per_model_predictions = [model.predict(test_) for model in models]
    sample_prediction['target'] = np.mean(per_model_predictions, axis=0)
    env.predict(sample_prediction)
    counter += 1