<table>
<tr>
<td>V1 </td>
<td>Simple feature engineering</td>
<td>second</td>
</tr>
<tr>
<td>V2 </td>
<td>Add pressure and inefficiency</td>
<td>second</td>
</tr>    
</table>



| Version   |      Date      |  Score | Score 2
|----------|:-------------:|------:|--:|
| Baseline |  20240110 | 6.407770748115235 |
| Imb, pressure, inefficiency |    20240110   |   6.302917211424277 ||
| Add WAP 30 Moving average | 20240112 |  6.302853946841688  ||
| Add WAP 60, 120, 240 Moving average | 20240112 |  6.3019380482507374 | 6.302636961287867 ||
| Polynomial features | 20240222| 6.300027022932713|6.315463191662042|

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
!pip install lightgbm
!pip install xgboost
!pip install catboost



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import joblib 
import os
import sklearn 

from load_data import load_data_from_csv

In [4]:
from data_preprocessor.data_preprocessor import CompositeDataPreprocessor, ReduceMemUsageDataPreprocessor, FillNaPreProcessor
from data_preprocessor.feature_engineering import EnrichDFDataPreprocessor, MovingAvgPreProcessor, RemoveIrrelevantFeaturesDataPreprocessor, DropTargetNADataPreprocessor
from data_preprocessor.polynomial_features import PolynomialFeaturesPreProcessor

In [5]:
from data_generator.data_generator import DefaultTrainEvalDataGenerator, ManualKFoldDataGenerator

from model_pipeline.lgb_pipeline import LGBModelPipelineFactory
from model_pipeline.xgb_pipeline import XGBModelPipelineFactory
from model_pipeline.cbt_pipeline import CatBoostModelPipelineFactory

from model_post_processor.model_post_processor import CompositeModelPostProcessor, SaveModelPostProcessor

from train_pipeline.train_pipeline import DefaultTrainPipeline

from train_pipeline.train_pipeline_callbacks import MAECallback
from utils.scoring_utils import ScoringUtils
from model_pipeline.dummy_models import BaselineEstimator

## Data preprocessing pipeline

In [6]:
processors = [    
    ReduceMemUsageDataPreprocessor(verbose=True),
    EnrichDFDataPreprocessor(),
    MovingAvgPreProcessor("wap"),    
    DropTargetNADataPreprocessor(),    
    RemoveIrrelevantFeaturesDataPreprocessor(['stock_id', 'date_id','time_id', 'row_id']),
    FillNaPreProcessor(),
    PolynomialFeaturesPreProcessor(),
]
processor = CompositeDataPreprocessor(processors)

### Load data

In [7]:
# DATA_PATH = '/kaggle/input'
DATA_PATH = '..'
df_train, df_test, revealed_targets, sample_submission = load_data_from_csv(DATA_PATH)
print(df_train.columns)

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')


In [8]:
df_train.dtypes

stock_id                     int64
date_id                      int64
seconds_in_bucket            int64
imbalance_size             float64
imbalance_buy_sell_flag      int64
reference_price            float64
matched_size               float64
far_price                  float64
near_price                 float64
bid_price                  float64
bid_size                   float64
ask_price                  float64
ask_size                   float64
wap                        float64
target                     float64
time_id                      int64
row_id                      object
dtype: object

### Pre-process data

In [9]:
df_train = processor.apply(df_train)
print(df_train.shape[0])
print(df_train.columns)
display(df_train.tail())

Processing ReduceMemUsageDataPreprocessor...
Memory usage of dataframe is 679.36 MB
Memory usage after optimization is: 304.72 MB
Decreased by 55.15%
dtypes:
stock_id                     int16
date_id                      int16
seconds_in_bucket            int16
imbalance_size             float32
imbalance_buy_sell_flag       int8
reference_price            float32
matched_size               float32
far_price                  float32
near_price                 float32
bid_price                  float32
bid_size                   float32
ask_price                  float32
ask_size                   float32
wap                        float32
target                     float32
time_id                      int16
row_id                      object
dtype: object
ReduceMemUsageDataPreprocessor took 0.34s. New df shape: (5237980, 17).
Processing EnrichDFDataPreprocessor...
EnrichDFDataPreprocessor took 10.73s. New df shape: (5237980, 25).
Processing MovingAvgPreProcessor...
MovingAvgPreProcess

Unnamed: 0,near_price ask_price,near_price ask_size,near_price wap,near_price imb_s1,near_price imb_s2,near_price reference_price_ask_price_bid_price_imb2,near_price reference_price_ask_price_wap_imb2,near_price reference_price_bid_price_wap_imb2,near_price ask_price_bid_price_wap_imb2,near_price pressure,...,inefficiency wap_mov_avg_6_3,inefficiency wap_mov_avg_12_6,inefficiency wap_mov_avg_24_12,wap_mov_avg_3_1 wap_mov_avg_6_3,wap_mov_avg_3_1 wap_mov_avg_12_6,wap_mov_avg_3_1 wap_mov_avg_24_12,wap_mov_avg_6_3 wap_mov_avg_12_6,wap_mov_avg_6_3 wap_mov_avg_24_12,wap_mov_avg_12_6 wap_mov_avg_24_12,target
5237887,1.000168,319777.3125,1.000062,-0.816566,-0.840881,0.0,9.67134,-46.987495,9.67134,75.644691,...,0.086331,0.086332,0.086322,1.000649,1.000662,1.000547,1.000622,1.000506,1.000519,2.310276
5237888,1.001286,93429.117188,1.001205,0.374398,-0.927064,0.0,0.460882,0.0,0.460882,1.704686,...,0.038068,0.038062,0.03806,1.001526,1.001376,1.001322,1.00127,1.001216,1.001066,-8.220077
5237889,0.991689,179280.171875,0.991604,-0.825896,-0.995789,1569.363403,10.811423,0.0,10.811423,0.0,...,0.0,0.0,0.0,0.992044,0.992104,0.992409,0.992216,0.99252,0.99258,1.169443
5237890,0.998421,669363.8125,0.998219,-0.683613,-0.978325,0.0,0.0,5.337512,5.337512,1.492937,...,0.010553,0.010554,0.010554,0.998334,0.998422,0.998429,0.998523,0.998531,0.998618,-1.540184
5237891,1.003945,300616.03125,1.003772,-0.09116,-0.856097,-1336.994507,1.192255,1217.816772,1.192255,7.545946,...,0.078455,0.07844,0.078424,1.004425,1.004243,1.004032,1.004498,1.004287,1.004105,-6.530285


In [10]:
data = [{'a': 1, 'b': 2, 'c': 3},
        {'a': 10, 'b': 20, 'c': 30}]
 
# Creates DataFrame.
df = pd.DataFrame(data)
df

Unnamed: 0,a,b,c
0,1,2,3
1,10,20,30


In [11]:
int(301/2)

150

In [12]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2, interaction_only=True)
df = poly.fit_transform(df).astype("float32")
df.shape

(2, 7)

## Train

In [13]:
os.system('mkdir models')

mkdir: cannot create directory ‘models’: File exists


256

In [14]:
N_fold = 5
model_save_dir = './models/'

In [15]:
default_data_generator = DefaultTrainEvalDataGenerator()
k_fold_data_generator = ManualKFoldDataGenerator(n_fold=N_fold)

In [16]:
model_post_processor = CompositeModelPostProcessor([
    SaveModelPostProcessor(save_dir=model_save_dir)
])

In [17]:
lgb_pipeline = DefaultTrainPipeline(LGBModelPipelineFactory(), k_fold_data_generator, model_post_processor, [MAECallback()])

In [18]:
lgb_models, lgb_model_res, lgb_train_dfs, lgb_eval_dfs, lgb_num_train_eval_sets, lgb_callback_results = lgb_pipeline.train(df_train)

generate data
start training, num_train_eval_sets: 5
Training fold 0 - start
Training fold 0 - initialized
Training fold 0 - train size: (4190313, 152), eval size: (1047579, 152)
Training fold 0 - start training
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.659221 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38505
[LightGBM] [Info] Number of data points in the train set: 4190313, number of used features: 151
[LightGBM] [Info] Start training from score -0.060201
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[50]	valid_0's l1: 6.41505
Training fold 0 - finished training
Training fold 0 - finished post processing
Training fold 0 - end
Training fold 1 - start
Training fold 1 - initialized
Training fold 1 - train size: (4190313, 152), eval size: (1047579, 152)
Training fold 1 - start training
[LightGBM] [Info] Auto-choosing col-wise mul

In [19]:
lgb_avg_mae = ScoringUtils.calculate_mae(lgb_models, lgb_eval_dfs)
print(lgb_avg_mae)

6.315463191662042


In [20]:
baseline_avg_mae = ScoringUtils.calculate_mae([BaselineEstimator()], [df_train])
print(baseline_avg_mae)

6.40777074786513


# Load

In [None]:
models = []

In [None]:
def load(modelname, fold):
    models.append(joblib.load(f'models/{modelname}_{fold}.model'))

In [None]:
for i in range (0, 5):
    load('lgb', i)
    # load('xgb', i)
    # load('cbt', i)

In [None]:
models

# Submit

In [None]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

In [None]:
test_processors = [
    EnrichDFDataPreprocessor(),
    RemoveIrrelevantFeaturesDataPreprocessor(['stock_id', 'date_id','time_id', 'row_id'])    
]
test_processor = CompositeDataPreprocessor(processors)

In [None]:
counter = 0
cache = pd.DataFrame()

for (test, revealed_targets, sample_prediction) in iter_test:
    test_ = processor.apply(test)
    cache = pd.concat([cache, test_], ignore_index=True, axis=0)  
    sample_prediction['target'] = np.mean([model.predict(test_) for model in models], 0)
    env.predict(sample_prediction)
    counter += 1

In [None]:
# sample_prediction['target'] = 0
# env.predict(sample_prediction)

In [None]:
# counter = 0
# # sample_prediction['target'] = 0
# # env.predict(sample_prediction)
# for (test, revealed_targets, sample_prediction) in iter_test:
# #     print(test.shape)
#     test_ = enrich_df_with_features(test)[features]
# #     print(test_.shape)
# #     print(len(features))
# #     print(len(test_))
#     sample_prediction['target'] = np.mean([model.predict(test_) for model in models], 0)
#     env.predict(sample_prediction)
#     counter += 1