<table>
<tr>
<td>V1 </td>
<td>Simple feature engineering</td>
<td>second</td>
</tr>
<tr>
<td>V2 </td>
<td>Add pressure and inefficiency</td>
<td>second</td>
</tr>    
</table>



| Version   |      Date      |  Score | Score 2
|----------|:-------------:|------:|--:|
| Baseline |  20240110 | 6.407770748115235 |
| Imb, pressure, inefficiency |    20240110   |   6.302917211424277 ||
| Add WAP 30 Moving average | 20240112 |  6.302853946841688  ||
| Add WAP 60, 120, 240 Moving average | 20240112 |  6.3019380482507374 | 6.302636961287867 ||

In [7]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import joblib 
import os
import sklearn 

from load_data import load_data_from_csv

In [16]:
from data_preprocessor.data_preprocessor import CompositeDataPreprocessor, ReduceMemUsageDataPreprocessor, FillNaPreProcessor
from data_preprocessor.feature_engineering import EnrichDFDataPreprocessor, MovingAvgPreProcessor, RemoveIrrelevantFeaturesDataPreprocessor, DropTargetNADataPreprocessor
from data_preprocessor.polynomial_features import PolynomialFeaturesPreProcessor

In [3]:
from data_generator.data_generator import DefaultTrainEvalDataGenerator, ManualKFoldDataGenerator

from model_pipeline.lgb_pipeline import LGBModelPipelineFactory
from model_pipeline.xgb_pipeline import XGBModelPipelineFactory
from model_pipeline.cbt_pipeline import CatBoostModelPipelineFactory

from model_post_processor.model_post_processor import CompositeModelPostProcessor, SaveModelPostProcessor

from train_pipeline.train_pipeline import DefaultTrainPipeline

from train_pipeline.train_pipeline_callbacks import MAECallback
from utils.scoring_utils import ScoringUtils
from model_pipeline.dummy_models import BaselineEstimator

## Data preprocessing pipeline

In [17]:
processors = [    
    ReduceMemUsageDataPreprocessor(),
    EnrichDFDataPreprocessor(),
    MovingAvgPreProcessor("wap"),    
    DropTargetNADataPreprocessor(),    
    RemoveIrrelevantFeaturesDataPreprocessor(['stock_id', 'date_id','time_id', 'row_id']),
    FillNaPreProcessor(),
    PolynomialFeaturesPreProcessor(),
]
processor = CompositeDataPreprocessor(processors)

### Load data

In [5]:
# DATA_PATH = '/kaggle/input'
DATA_PATH = '..'
df_train, df_test, revealed_targets, sample_submission = load_data_from_csv(DATA_PATH)
print(df_train.columns)

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')


### Pre-process data

In [18]:
df_train = processor.apply(df_train)
print(df_train.shape[0])
print(df_train.columns)
display(df_train.tail())

Processing ReduceMemUsageDataPreprocessor...
Processing EnrichDFDataPreprocessor...
Processing MovingAvgPreProcessor...
Processing DropTargetNADataPreprocessor...
Processing RemoveIrrelevantFeaturesDataPreprocessor...
Processing FillNaPreProcessor...
Processing PolynomialFeaturesPreProcessor...


MemoryError: Unable to allocate 32.0 GiB for an array with shape (5237892, 821) and data type float64

In [21]:
data = [{'a': 1, 'b': 2, 'c': 3},
        {'a': 10, 'b': 20, 'c': 30}]
 
# Creates DataFrame.
df = pd.DataFrame(data)
df

Unnamed: 0,a,b,c
0,1,2,3
1,10,20,30


In [25]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2, interaction_only=True)
df = poly.fit_transform(df).astype("float32")
df

array([[1.00e+00, 1.00e+00, 1.00e+00, 1.00e+00, 2.00e+00, 3.00e+00,
        2.00e+00, 3.00e+00, 6.00e+00, 1.00e+00, 2.00e+00, 3.00e+00,
        2.00e+00, 3.00e+00, 6.00e+00, 2.00e+00, 3.00e+00, 2.00e+00,
        3.00e+00, 6.00e+00, 6.00e+00, 4.00e+00, 6.00e+00, 1.20e+01,
        6.00e+00, 9.00e+00, 1.80e+01, 6.00e+00, 1.20e+01, 1.80e+01,
        1.00e+00, 1.00e+00, 2.00e+00, 3.00e+00, 2.00e+00, 3.00e+00,
        6.00e+00, 1.00e+00, 2.00e+00, 3.00e+00, 2.00e+00, 3.00e+00,
        6.00e+00, 2.00e+00, 3.00e+00, 2.00e+00, 3.00e+00, 6.00e+00,
        6.00e+00, 4.00e+00, 6.00e+00, 1.20e+01, 6.00e+00, 9.00e+00,
        1.80e+01, 6.00e+00, 1.20e+01, 1.80e+01, 1.00e+00, 2.00e+00,
        3.00e+00, 2.00e+00, 3.00e+00, 6.00e+00, 1.00e+00, 2.00e+00,
        3.00e+00, 2.00e+00, 3.00e+00, 6.00e+00, 2.00e+00, 3.00e+00,
        2.00e+00, 3.00e+00, 6.00e+00, 6.00e+00, 4.00e+00, 6.00e+00,
        1.20e+01, 6.00e+00, 9.00e+00, 1.80e+01, 6.00e+00, 1.20e+01,
        1.80e+01, 2.00e+00, 3.00e+00, 2.00e+00, 

## Train

In [8]:
os.system('mkdir models')

1

In [9]:
N_fold = 5
model_save_dir = './models/'

In [10]:
default_data_generator = DefaultTrainEvalDataGenerator()
k_fold_data_generator = ManualKFoldDataGenerator(n_fold=N_fold)

In [11]:
model_post_processor = CompositeModelPostProcessor([
    SaveModelPostProcessor(save_dir=model_save_dir)
])

In [12]:
lgb_pipeline = DefaultTrainPipeline(LGBModelPipelineFactory(), k_fold_data_generator, model_post_processor, [MAECallback()])

In [13]:
lgb_models, lgb_model_res, lgb_train_dfs, lgb_eval_dfs, lgb_num_train_eval_sets, lgb_callback_results = lgb_pipeline.train(df_train)

generate data


start training, num_train_eval_sets: 5
Training fold 0 - start
Training fold 0 - initialized
Training fold 0 - train size: (4190313, 25), eval size: (1047579, 25)
Training fold 0 - start training
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.645594 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5668
[LightGBM] [Info] Number of data points in the train set: 4190313, number of used features: 24
[LightGBM] [Info] Start training from score -0.060201
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[50]	valid_0's l1: 6.40127
Training fold 0 - finished training
Training fold 0 - finished post processing
Training fold 0 - end
Training fold 1 - start
Training fold 1 - initialized
Training fold 1 - train size: (4190313, 25), eval size: (1047579, 25)
Training fold 1 - start training
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the ov

In [14]:
lgb_avg_mae = ScoringUtils.calculate_mae(lgb_models, lgb_eval_dfs)
print(lgb_avg_mae)

6.302636961287867


In [15]:
baseline_avg_mae = ScoringUtils.calculate_mae([BaselineEstimator()], [df_train])
print(baseline_avg_mae)

6.407770748115235


# Load

In [21]:
models = []

In [22]:
def load(modelname, fold):
    models.append(joblib.load(f'models/{modelname}_{fold}.model'))

In [23]:
for i in range (0, 5):
    load('lgb', i)
    # load('xgb', i)
    # load('cbt', i)

In [24]:
models

[LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1')]

# Submit

In [25]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

ModuleNotFoundError: No module named 'optiver2023'

In [None]:
test_processors = [
    EnrichDFDataPreprocessor(),
    RemoveIrrelevantFeaturesDataPreprocessor(['stock_id', 'date_id','time_id', 'row_id'])    
]
test_processor = CompositeDataPreprocessor(processors)

In [None]:
counter = 0
cache = pd.DataFrame()

for (test, revealed_targets, sample_prediction) in iter_test:
    test_ = processor.apply(test)
    cache = pd.concat([cache, test_], ignore_index=True, axis=0)  
    sample_prediction['target'] = np.mean([model.predict(test_) for model in models], 0)
    env.predict(sample_prediction)
    counter += 1

In [None]:
# sample_prediction['target'] = 0
# env.predict(sample_prediction)

In [None]:
# counter = 0
# # sample_prediction['target'] = 0
# # env.predict(sample_prediction)
# for (test, revealed_targets, sample_prediction) in iter_test:
# #     print(test.shape)
#     test_ = enrich_df_with_features(test)[features]
# #     print(test_.shape)
# #     print(len(features))
# #     print(len(test_))
#     sample_prediction['target'] = np.mean([model.predict(test_) for model in models], 0)
#     env.predict(sample_prediction)
#     counter += 1