In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import lightgbm as lgb 
import xgboost as xgb 
import catboost as cbt 
import numpy as np 
import joblib 
import os
import sklearn 

from itertools import combinations

## Feature Engineering

In [2]:
nonfeatures = ['stock_id', 'date_id','time_id', 'row_id','target']

# indices = ['stock_id', 'date_id','time_id', 'row_id','target']

# features = ['seconds_in_bucket', 'imbalance_buy_sell_flag',
#             'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
#             'reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap',
#             'imb_s1', 'imb_s2']

# prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
# sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

In [3]:
def enrich_df(df):

    features = ['seconds_in_bucket', 'imbalance_buy_sell_flag',
            'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
            'reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap',
            'imb_s1', 'imb_s2']

    prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    df_ = df.copy()

    df_['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df_['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')  

    # for i,a in enumerate(prices):
    #     for j,b in enumerate(prices):
    #         if i>j:
    #             df_[f'{a}_{b}_imb'] = df_.eval(f'({a}-{b})/({a}+{b})')       

    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")
                   
    # for i,a in enumerate(prices):
    #     for j,b in enumerate(prices):
    #         for k,c in enumerate(prices):
    #             if i>j and j>k:
    #                 max_ = df_[[a,b,c]].max(axis=1)
    #                 min_ = df_[[a,b,c]].min(axis=1)
    #                 mid_ = df_[[a,b,c]].sum(axis=1)-min_-max_

    #                 df_[f'{a}_{b}_{c}_imb2'] = (max_-mid_)/(mid_-min_ + 0.1)

    for a, b, c in combinations( ['reference_price', 'ask_price', 'bid_price', 'wap'], 3):
        maxi = df_[[a,b,c]].max(axis=1)
        mini = df_[[a,b,c]].min(axis=1)
        mid = df_[[a,b,c]].sum(axis=1)-mini-maxi

        df_[f'{a}_{b}_{c}_imb2'] = np.where(mid.eq(mini), np.nan, (maxi - mid) / (mid - mini))
        # np.nan if mid.eq(mini) else (maxi - mid) / (mid - mini)

        # if mid == mini:  # Prevent division by zero
        #     df_[f'{a}_{b}_{c}_imb2'] = np.nan
        # else:
        #     df_[f'{a}_{b}_{c}_imb2'] = (maxi - mid) / (mid - mini)

    return df_

## Models

In [4]:
model_dict = {
    'lgb': lgb.LGBMRegressor(objective='regression_l1', n_estimators=50),
    'xgb': xgb.XGBRegressor(tree_method='hist', objective='reg:absoluteerror', n_estimators=500, early_stopping_rounds = 100),
    'cbt': cbt.CatBoostRegressor(objective='MAE', iterations=50),
}

In [30]:
models = []

# Train


### Load data

In [6]:
# DATA_PATH = '/kaggle/input'
DATA_PATH = '..'

In [7]:
df_train = pd.read_csv(f'{DATA_PATH}/optiver-trading-at-the-close/train.csv')
df_test = pd.read_csv(f'{DATA_PATH}/optiver-trading-at-the-close/example_test_files/test.csv')
revealed_targets = pd.read_csv(f'{DATA_PATH}/optiver-trading-at-the-close/example_test_files/revealed_targets.csv')
sample_submission = pd.read_csv(f'{DATA_PATH}/optiver-trading-at-the-close/example_test_files/sample_submission.csv')

### Memory reduction

In [8]:
def reduce_mem_usage(df, verbose=0):

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)

    if verbose:
        logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        logger.info(f"Decreased by {decrease:.2f}%")

    return df


In [9]:
df_train.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')

### Enrich and train

In [10]:
df_train_ = enrich_df(df_train)

In [11]:
df_train_.tail()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,wap,target,time_id,row_id,imb_s1,imb_s2,reference_price_ask_price_bid_price_imb2,reference_price_ask_price_wap_imb2,reference_price_bid_price_wap_imb2,ask_price_bid_price_wap_imb2
5237975,195,480,540,2440722.89,-1,1.000317,28280361.74,0.999734,0.999734,1.000317,...,1.000328,2.310276,26454,480_540_195,-0.816784,-0.841104,526921200000.0,9.636364,,9.636364
5237976,196,480,540,349510.47,-1,1.000643,9187699.11,1.000129,1.000386,1.000643,...,1.000819,-8.220077,26454,480_540_196,0.374254,-0.926706,,0.4602273,792633500000.0,0.460227
5237977,197,480,540,0.0,0,0.995789,12725436.1,0.995789,0.995789,0.995789,...,0.995797,1.169443,26454,480_540_197,-0.829388,-1.0,-282225600000.0,10.75,,10.75
5237978,198,480,540,1000898.84,1,0.99921,94773271.05,0.99921,0.99921,0.99897,...,0.999008,-1.540184,26454,480_540_198,-0.684154,-0.979099,-9.251859e-13,-1.099231e-12,5.315789,5.315789
5237979,199,480,540,1884285.71,-1,1.002129,24073677.32,1.000859,1.001494,1.002129,...,1.002274,-6.530285,26454,480_540_199,-0.091024,-0.85482,,1.193103,-653021900000.0,1.193103


In [12]:
df_train_.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id', 'imb_s1', 'imb_s2',
       'reference_price_ask_price_bid_price_imb2',
       'reference_price_ask_price_wap_imb2',
       'reference_price_bid_price_wap_imb2', 'ask_price_bid_price_wap_imb2'],
      dtype='object')

In [13]:
os.system('mkdir models')

1

In [14]:
df_train_.isna().sum()

stock_id                                          0
date_id                                           0
seconds_in_bucket                                 0
imbalance_size                                  220
imbalance_buy_sell_flag                           0
reference_price                                 220
matched_size                                    220
far_price                                   2894342
near_price                                  2857180
bid_price                                       220
bid_size                                          0
ask_price                                       220
ask_size                                          0
wap                                             220
target                                           88
time_id                                           0
row_id                                            0
imb_s1                                           87
imb_s2                                          220
reference_pr

In [15]:
features = [c for c in df_train_.columns if c not in nonfeatures]
df_train_[features].columns

Index(['seconds_in_bucket', 'imbalance_size', 'imbalance_buy_sell_flag',
       'reference_price', 'matched_size', 'far_price', 'near_price',
       'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap', 'imb_s1',
       'imb_s2', 'reference_price_ask_price_bid_price_imb2',
       'reference_price_ask_price_wap_imb2',
       'reference_price_bid_price_wap_imb2', 'ask_price_bid_price_wap_imb2'],
      dtype='object')

In [16]:
df_train_.shape[0]

5237980

In [17]:
X = df_train_.dropna(subset=['target'])[features].values
X.shape

(5237892, 18)

In [18]:
X

array([[ 0.00000000e+00,  3.18060269e+06,  1.00000000e+00, ...,
         1.38297872e-01, -1.69335346e+12,  1.38297872e-01],
       [ 0.00000000e+00,  1.66603910e+05, -1.00000000e+00, ...,
         6.34615385e+00,  9.36748722e+11,  6.34615385e+00],
       [ 0.00000000e+00,  3.02879870e+05, -1.00000000e+00, ...,
         6.78815490e-01,  2.77848101e+00,  4.99162479e-01],
       ...,
       [ 5.40000000e+02,  0.00000000e+00,  0.00000000e+00, ...,
         1.07500000e+01,             nan,  1.07500000e+01],
       [ 5.40000000e+02,  1.00089884e+06,  1.00000000e+00, ...,
        -1.09923072e-12,  5.31578947e+00,  5.31578947e+00],
       [ 5.40000000e+02,  1.88428571e+06, -1.00000000e+00, ...,
         1.19310345e+00, -6.53021946e+11,  1.19310345e+00]])

In [19]:
Y = df_train_.dropna(subset=['target'])['target'].values
Y.shape

(5237892,)

In [20]:
N_fold = 5

## Train

In [21]:
# offline_split = df_train['date_id']>(split_day - 45)
# df_offline_train = df_train_feats[~offline_split]
# df_offline_valid = df_train_feats[offline_split]
# df_offline_train_target = df_train['target'][~offline_split]
# df_offline_valid_target = df_train['target'][offline_split]


In [22]:
index = np.arange(len(X))

In [23]:
def train(model_dict, modelname, fold):
    model = model_dict[modelname]
    match modelname:
        case 'lgb':
            model.fit(X[index%N_fold!=fold], Y[index%N_fold!=fold], 
                eval_set=[(X[index%N_fold==fold], Y[index%N_fold==fold])], 
                callbacks=[lgb.early_stopping(100)]
            )
        case 'xgb':
            model.fit(X[index%N_fold!=fold], Y[index%N_fold!=fold], 
                eval_set=[(X[index%N_fold==fold], Y[index%N_fold==fold])], 
            )
        case 'cbt':
            model.fit(X[index%N_fold!=fold], Y[index%N_fold!=fold], 
                eval_set=[(X[index%N_fold==fold], Y[index%N_fold==fold])], 
                early_stopping_rounds = 100
            )
    models.append(model)
    joblib.dump(model, f'./models/{modelname}_{fold}.model')

In [25]:
for i in range(N_fold):
    print(f"Training fold {i}")
    train(model_dict, 'lgb', i)
    # train(model_dict, 'xgb', i)
    # train(model_dict, 'cbt', i)

Training fold 0
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040329 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4138
[LightGBM] [Info] Number of data points in the train set: 4190313, number of used features: 18
[LightGBM] [Info] Start training from score -0.060201
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[50]	valid_0's l1: 6.40007
Training fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.379314 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4138
[LightGBM] [Info] Number of data points in the train set: 4190313, number of used features: 18
[LightGBM] [Info] Start training from score -0.069737
Training until validation scores don't improve for 100 rounds
Did not meet ea

# Load

In [31]:
def load(model_dict, modelname, fold):
    models.append(joblib.load(f'models/{modelname}_{fold}.model'))

In [32]:
for i in range (0, 5):
    load(model_dict, 'lgb', i)
    # load(model_dict, 'xgb', i)
    # load(model_dict, 'cbt', i)

In [33]:
models

[LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1')]

# Evaluate

In [34]:
models

[LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1'),
 LGBMRegressor(n_estimators=50, objective='regression_l1')]

In [35]:
df_train_ = enrich_df(df_train)
df_train_.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id',
       'reference_price_far_price_imb', 'reference_price_near_price_imb',
       'reference_price_ask_price_imb', 'reference_price_bid_price_imb',
       'reference_price_wap_imb', 'far_price_near_price_imb',
       'far_price_ask_price_imb', 'far_price_bid_price_imb',
       'far_price_wap_imb', 'near_price_ask_price_imb',
       'near_price_bid_price_imb', 'near_price_wap_imb',
       'ask_price_bid_price_imb', 'ask_price_wap_imb', 'bid_price_wap_imb',
       'imb_s1', 'imb_s2', 'reference_price_ask_price_bid_price_imb2',
       'reference_price_ask_price_wap_imb2',
       'reference_price_bid_price_wap_imb2', 'ask_price_bid_price_wap_imb2'],
      dtype='object')

In [36]:
df_train['baseline_prediction'] = 0
baseline_mae = (df_train['baseline_prediction'] - df_train['target']).abs().mean()
print(baseline_mae)

6.40777074811524


In [37]:
def print_prediction(column):    
    mae = (df_train[column] - df_train['target']).abs().mean()
    print(mae, '. MAE improvement in basis points: ', (baseline_mae - mae))

In [38]:
simple_mapping = {
    1: 0.1,
    0: 0,
    -1: -0.1
}
df_train['simple_prediction'] = df_train['imbalance_buy_sell_flag'].map(simple_mapping)
print_prediction('simple_prediction')

6.407056596608261 . MAE improvement in basis points:  0.000714151506978844


In [39]:
df_train['model_prediction'] = np.mean([model.predict(df_train_[features]) for model in models], 0)
print_prediction('model_prediction')

6.299541952483538 . MAE improvement in basis points:  0.10822879563170229


In [40]:
df_train['lgb_prediction'] = np.mean([model.predict(df_train_[features]) for model in models if type(model) == lgb.LGBMRegressor], 0)
print_prediction('lgb_prediction')

6.299541952483538 . MAE improvement in basis points:  0.10822879563170229


In [41]:
df_train['cbt_prediction'] = np.mean([model.predict(df_train_[features]) for model in models if type(model) == cbt.core.CatBoostRegressor], 0)
print_prediction('cbt_prediction')

nan . MAE improvement in basis points:  nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [42]:
df_train['xgb_prediction'] = np.mean([model.predict(df_train_[features]) for model in models if type(model) == xgb.sklearn.XGBRegressor], 0)
print_prediction('xgb_prediction')

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


nan . MAE improvement in basis points:  nan


In [43]:
type(models[0]) == lgb.LGBMRegressor
type(models[5]) == cbt.core.CatBoostRegressor
type(models[10]) == xgb.sklearn.XGBRegressor

IndexError: list index out of range

In [None]:
type(models[10])

xgboost.sklearn.XGBRegressor

In [None]:
df_train[['target', 'baseline_prediction', 'simple_prediction', 'model_prediction']]

Unnamed: 0,target,baseline_prediction,simple_prediction,model_prediction
0,-3.029704,0,0.1,0.774883
1,-5.519986,0,-0.1,6.633996
2,-8.389950,0,-0.1,1.212444
3,-4.010200,0,-0.1,6.532201
4,-7.349849,0,-0.1,-0.867726
...,...,...,...,...
5237975,2.310276,0,-0.1,-6.720454
5237976,-8.220077,0,-0.1,-8.196658
5237977,1.169443,0,0.0,-1.715851
5237978,-1.540184,0,0.1,-1.059905


In [None]:
df_train['model_prediction'] = np.mean([model.predict(df_train_[features]) for model in models], 0)

# Submit

In [None]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

In [None]:
len(features)

In [None]:
counter = 0
# sample_prediction['target'] = 0
# env.predict(sample_prediction)
for (test, revealed_targets, sample_prediction) in iter_test:
#     print(test.shape)
    test_ = enrich_df(test)[features]
#     print(test_.shape)
#     print(len(features))
#     print(len(test_))
    sample_prediction['target'] = np.mean([model.predict(test_) for model in models], 0)
    env.predict(sample_prediction)
    counter += 1