In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [78]:
import lightgbm as lgb 
import xgboost as xgb 
import catboost as cbt 
import numpy as np 
import joblib 
import os
import sklearn 

In [79]:
indices = ['stock_id', 'date_id','time_id', 'row_id','target']

features = ['seconds_in_bucket', 'imbalance_buy_sell_flag',
            'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
            'reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap',
            'imb_s1', 'imb_s2']

In [80]:
def enrich_features(features):
   
    prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    
    for i,a in enumerate(prices):
        for j,b in enumerate(prices):
            if i>j:
                features.append(f'{a}_{b}_imb')    
                    
    for i,a in enumerate(prices):
        for j,b in enumerate(prices):
            for k,c in enumerate(prices):
                if i>j and j>k:
                    features.append(f'{a}_{b}_{c}_imb2')
                    
    features = set(features)

In [81]:
enrich_features(features)

In [82]:
features

['seconds_in_bucket',
 'imbalance_buy_sell_flag',
 'imbalance_size',
 'matched_size',
 'bid_size',
 'ask_size',
 'reference_price',
 'far_price',
 'near_price',
 'ask_price',
 'bid_price',
 'wap',
 'imb_s1',
 'imb_s2',
 'far_price_reference_price_imb',
 'near_price_reference_price_imb',
 'near_price_far_price_imb',
 'ask_price_reference_price_imb',
 'ask_price_far_price_imb',
 'ask_price_near_price_imb',
 'bid_price_reference_price_imb',
 'bid_price_far_price_imb',
 'bid_price_near_price_imb',
 'bid_price_ask_price_imb',
 'wap_reference_price_imb',
 'wap_far_price_imb',
 'wap_near_price_imb',
 'wap_ask_price_imb',
 'wap_bid_price_imb',
 'near_price_far_price_reference_price_imb2',
 'ask_price_far_price_reference_price_imb2',
 'ask_price_near_price_reference_price_imb2',
 'ask_price_near_price_far_price_imb2',
 'bid_price_far_price_reference_price_imb2',
 'bid_price_near_price_reference_price_imb2',
 'bid_price_near_price_far_price_imb2',
 'bid_price_ask_price_reference_price_imb2',
 'b

In [83]:
def enrich_df(df):

    df_ = df.copy()

    df_['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df_['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')
    
    prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    
    for i,a in enumerate(prices):
        for j,b in enumerate(prices):
            if i>j:
                df_[f'{a}_{b}_imb'] = df_.eval(f'({a}-{b})/({a}+{b})')  
                    
    for i,a in enumerate(prices):
        for j,b in enumerate(prices):
            for k,c in enumerate(prices):
                if i>j and j>k:
                    max_ = df_[[a,b,c]].max(axis=1)
                    min_ = df_[[a,b,c]].min(axis=1)
                    mid_ = df_[[a,b,c]].sum(axis=1)-min_-max_

                    df_[f'{a}_{b}_{c}_imb2'] = (max_-mid_)/(mid_-min_ + 0.1)
    
    return df_

In [101]:
model_dict = {
    'lgb': lgb.LGBMRegressor(objective='regression_l1', n_estimators=50),
    'xgb': xgb.XGBRegressor(tree_method='hist', objective='reg:absoluteerror', n_estimators=500, early_stopping_rounds = 100),
    'cbt': cbt.CatBoostRegressor(objective='MAE', iterations=50),
}

In [85]:
models = []

Train


In [86]:
# DATA_PATH = '/kaggle/input'
DATA_PATH = '..'

In [87]:
df_train = pd.read_csv(f'{DATA_PATH}/optiver-trading-at-the-close/train.csv')
df_test = pd.read_csv(f'{DATA_PATH}/optiver-trading-at-the-close/example_test_files/test.csv')
revealed_targets = pd.read_csv(f'{DATA_PATH}/optiver-trading-at-the-close/example_test_files/revealed_targets.csv')
sample_submission = pd.read_csv(f'{DATA_PATH}/optiver-trading-at-the-close/example_test_files/sample_submission.csv')

In [88]:
df_train.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')

In [89]:
df_train_ = enrich_df(df_train)

In [90]:
df_train_.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id', 'imb_s1', 'imb_s2',
       'far_price_reference_price_imb', 'near_price_reference_price_imb',
       'near_price_far_price_imb', 'ask_price_reference_price_imb',
       'ask_price_far_price_imb', 'ask_price_near_price_imb',
       'bid_price_reference_price_imb', 'bid_price_far_price_imb',
       'bid_price_near_price_imb', 'bid_price_ask_price_imb',
       'wap_reference_price_imb', 'wap_far_price_imb', 'wap_near_price_imb',
       'wap_ask_price_imb', 'wap_bid_price_imb',
       'near_price_far_price_reference_price_imb2',
       'ask_price_far_price_reference_price_imb2',
       'ask_price_near_price_reference_price_imb2',
       'ask_price_near_price_far_price_imb2',
       'bid_price_far_price_reference_price_imb2',
  

In [91]:
os.system('mkdir models')

1

In [92]:
df_train_.isna().sum()

stock_id                                           0
date_id                                            0
seconds_in_bucket                                  0
imbalance_size                                   220
imbalance_buy_sell_flag                            0
reference_price                                  220
matched_size                                     220
far_price                                    2894342
near_price                                   2857180
bid_price                                        220
bid_size                                           0
ask_price                                        220
ask_size                                           0
wap                                              220
target                                            88
time_id                                            0
row_id                                             0
imb_s1                                            87
imb_s2                                        

In [93]:
df_train_[features].columns

Index(['seconds_in_bucket', 'imbalance_buy_sell_flag', 'imbalance_size',
       'matched_size', 'bid_size', 'ask_size', 'reference_price', 'far_price',
       'near_price', 'ask_price', 'bid_price', 'wap', 'imb_s1', 'imb_s2',
       'far_price_reference_price_imb', 'near_price_reference_price_imb',
       'near_price_far_price_imb', 'ask_price_reference_price_imb',
       'ask_price_far_price_imb', 'ask_price_near_price_imb',
       'bid_price_reference_price_imb', 'bid_price_far_price_imb',
       'bid_price_near_price_imb', 'bid_price_ask_price_imb',
       'wap_reference_price_imb', 'wap_far_price_imb', 'wap_near_price_imb',
       'wap_ask_price_imb', 'wap_bid_price_imb',
       'near_price_far_price_reference_price_imb2',
       'ask_price_far_price_reference_price_imb2',
       'ask_price_near_price_reference_price_imb2',
       'ask_price_near_price_far_price_imb2',
       'bid_price_far_price_reference_price_imb2',
       'bid_price_near_price_reference_price_imb2',
       'bid

In [94]:
df_train_.shape[0]

5237980

In [95]:
X = df_train_.dropna(subset=['target'])[features].values
X.shape

(5237892, 49)

In [96]:
Y = df_train_.dropna(subset=['target'])['target'].values
Y.shape

(5237892,)

In [97]:
N_fold = 5

Train

In [98]:
index = np.arange(len(X))

In [102]:
def train(model_dict, modelname, fold):
    model = model_dict[modelname]
    match modelname:
        case 'lgb':
            model.fit(X[index%N_fold!=fold], Y[index%N_fold!=fold], 
                eval_set=[(X[index%N_fold==fold], Y[index%N_fold==fold])], 
                callbacks=[lgb.early_stopping(100)]
            )
        case 'xgb':
            model.fit(X[index%N_fold!=fold], Y[index%N_fold!=fold], 
                eval_set=[(X[index%N_fold==fold], Y[index%N_fold==fold])], 
            )
        case 'cbt':
            model.fit(X[index%N_fold!=fold], Y[index%N_fold!=fold], 
                eval_set=[(X[index%N_fold==fold], Y[index%N_fold==fold])], 
                early_stopping_rounds = 100
            )
    models.append(model)
    joblib.dump(model, f'./models/{modelname}_{fold}.model')

In [103]:
for i in range(N_fold):
    print(f"Training fold {i}")
    train(model_dict, 'lgb', i)
    train(model_dict, 'xgb', i)
    train(model_dict, 'cbt', i)

Training fold 0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.367777 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12043
[LightGBM] [Info] Number of data points in the train set: 4190313, number of used features: 49
[LightGBM] [Info] Start training from score -0.060201
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[50]	valid_0's l1: 6.37495
[0]	validation_0-mae:6.46163
[1]	validation_0-mae:6.43183
[2]	validation_0-mae:6.41496
[3]	validation_0-mae:6.40461
[4]	validation_0-mae:6.39809
[5]	validation_0-mae:6.39293
[6]	validation_0-mae:6.38937
[7]	validation_0-mae:6.38692
[8]	validation_0-mae:6.38451
[9]	validation_0-mae:6.38207
[10]	validation_0-mae:6.38069
[11]	validation_0-mae:6.37822
[12]	validation_0-mae:6.37748
[13]	validation_0-mae:6.37607
[14]	validation_0-mae:6.37381
[15]	validation_0-mae:6.37235
[16]	validation_0-mae:6.37196
[

Load

In [None]:
# def load(model_dict, modelname, fold):
#     models.append(joblib.load(f'models/{modelname}_{fold}.model'))

In [None]:
# for i in range (0, 5):
#     load(model_dict, 'lgb', i)

In [None]:
models

Submit

In [None]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

In [None]:
len(features)

In [None]:
counter = 0
# sample_prediction['target'] = 0
# env.predict(sample_prediction)
for (test, revealed_targets, sample_prediction) in iter_test:
#     print(test.shape)
    test_ = enrich_df(test)[features]
#     print(test_.shape)
#     print(len(features))
#     print(len(test_))
    sample_prediction['target'] = np.mean([model.predict(test_) for model in models], 0)
    env.predict(sample_prediction)
    counter += 1