In [1]:
import pandas as pd
import numpy as np

from itertools import groupby
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc
from itertools import combinations
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
# Read the competition training data and the three example test-time files.
# The files are read in the order their target names are listed.
train, revealed_targets, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/optiver-trading-at-the-close/train.csv',
        '/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv',
        '/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv',
        '/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv',
    )
)

In [3]:
# Per-stock sum of the median bid size and the median ask size; feat_eng maps
# this onto each row by stock_id.
stock_medians = train.groupby('stock_id')[['bid_size', 'ask_size']].median()
median_vol = stock_medians['bid_size'] + stock_medians['ask_size']

# Rolling standard deviation of the displayed sizes, computed per stock over
# the last 30 / 10 / 5 rows. sort_index() restores the original row order
# before the column is assigned back onto `train`.
for side in ('bid_size', 'ask_size'):
    for window in (30, 10, 5):
        rolled = train.groupby(['stock_id'], as_index=False)[side].rolling(window, min_periods=1).std()
        train[f'{side}_std_{window}'] = rolled.sort_index()[side]

In [4]:
def feat_eng(df):
    """Return a new frame with the engineered order-book features appended.

    The identifier columns ``row_id``, ``time_id`` and ``data_id`` are removed
    when present; every other input column is kept, followed by the derived
    features in a fixed order (the model relies on column order being the same
    at fit and predict time).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``stock_id``, ``bid_size``, ``ask_size``, ``bid_price``,
        ``ask_price``, ``imbalance_size``, ``matched_size``,
        ``reference_price``, ``far_price``, ``near_price`` and ``wap``.
        The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the retained input columns plus the features.

    Notes
    -----
    Reads the module-level ``median_vol`` Series (indexed by stock_id).
    """
    # drop() hands back an independent frame, so the assignments below never
    # write into a slice of the caller's data (no SettingWithCopyWarning).
    df = df.drop(columns=['row_id', 'time_id', 'data_id'], errors='ignore')

    # --- volume features -------------------------------------------------
    df['bid_plus_ask_sizes'] = df['bid_size'] + df['ask_size']
    df['median_vol'] = df['stock_id'].map(median_vol.to_dict())
    is_high_volume = df['bid_plus_ask_sizes'] > df['median_vol']
    df['high_volume'] = np.where(is_high_volume, 1, 0)
    # NOTE(review): the denominator is the sum over whatever batch is passed
    # in, so its scale differs between a full training frame and a single
    # test batch - confirm this is intended.
    df['high_volume_ratio'] = df['high_volume'] * df['bid_plus_ask_sizes'] / df['bid_plus_ask_sizes'].sum()
    df['imbalance_ratio'] = df['imbalance_size'] / df['matched_size']
    df['high_volume_imbalance_size'] = df['high_volume'] * df['imbalance_size']
    # NOTE(review): despite the "low_volume" name this is multiplied by the
    # high_volume flag; kept as-is so the feature values do not change.
    df['low_volume_matched_size'] = df['high_volume'] * df['matched_size']

    # --- price / size interaction features -------------------------------
    df['mid_price'] = (df['ask_price'] + df['bid_price']) / 2
    df['wwap'] = df.eval('(bid_size*bid_price+ask_size*ask_price)/(bid_size+ask_size)')

    df['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')

    df['ask_x_size'] = df.eval('ask_size*ask_price')
    df['bid_x_size'] = df.eval('bid_size*bid_price')

    df['ask_minus_bid'] = df['ask_x_size'] - df['bid_x_size']

    df['mix_diff_price'] = df['ask_minus_bid'].div(df['bid_plus_ask_sizes'])

    df["bid_size_over_ask_size"] = df["bid_size"].div(df["ask_size"])
    df["bid_price_over_ask_price"] = df["bid_price"].div(df["ask_price"])

    prices = ['reference_price', 'far_price', 'near_price', 'ask_price', 'bid_price', 'wap']

    # Difference, product and normalised difference for every pair of prices.
    for first, second in combinations(prices, 2):
        df[f'{first}_minus_{second}'] = (df[first] - df[second]).astype(np.float32)
        df[f'{first}_times_{second}'] = (df[first] * df[second]).astype(np.float32)
        df[f'{first}_{second}_imb'] = df.eval(f'({first}-{second})/({first}+{second})')

    # For every triple of prices: (max - mid) / (mid - min), where mid is the
    # row sum minus the row max and min. A zero denominator yields inf/NaN.
    for trio in combinations(prices, 3):
        trio_values = df[list(trio)]
        max_ = trio_values.max(axis=1)
        min_ = trio_values.min(axis=1)
        mid_ = trio_values.sum(axis=1) - min_ - max_

        df['_'.join(trio) + '_imb2'] = (max_ - mid_) / (mid_ - min_)

    gc.collect()

    return df

In [5]:
%%time

# Regression target as a plain array, and the engineered feature matrix built
# from every remaining column of the training frame.
y = train['target'].to_numpy()
X = feat_eng(train.drop('target', axis=1))

CPU times: user 59.5 s, sys: 12.4 s, total: 1min 11s
Wall time: 1min 11s


In [6]:
%%time

# LightGBM hyper-parameters for the single regressor fitted on all of X / y.
params = {
    'n_estimators': 700,
    'learning_rate': 0.01,
    'max_depth': 7,
    'colsample_bytree': 0.95,
    'min_child_weight': 31,
    'subsample_for_bin': 500000,
    'reg_alpha': 0.01,
}
m = lgb.LGBMRegressor(random_state=51, **params)

m.fit(X, y)

CPU times: user 40min 55s, sys: 12.3 s, total: 41min 8s
Wall time: 11min 31s


In [7]:
# Feature importances of the fitted model, ascending, labelled by column name.
feat_imp = pd.Series(m.feature_importances_, index=X.columns)
feat_imp = feat_imp.sort_values()
# Report the features whose importance is below 10.
print('Columns with poor contribution', feat_imp.loc[feat_imp < 10].index)
# Horizontal bar chart of every feature's importance.
fig = px.bar(x=feat_imp, y=feat_imp.index, orientation='h')
fig.show()

Columns with poor contribution Index(['high_volume'], dtype='object')


In [8]:
# Apply feat_eng to the example test frame, rebinding `test` to the engineered
# copy that feat_eng returns.
test = feat_eng(test)

In [9]:
def zero_sum(prices, volumes):
    """Shift ``prices`` so they sum to zero, weighting the shift by sqrt(volume).

    Each element is reduced by ``sqrt(volume) * step``, where ``step`` is
    chosen so that the adjusted values sum to zero.

    Idea taken from https://github.com/gotoConversion/goto_conversion/

    Parameters
    ----------
    prices : array-like or pandas.Series
        Values to adjust.
    volumes : array-like or pandas.Series
        Non-negative weights, same length as (and aligned with) ``prices``.

    Returns
    -------
    Same kind of object as ``prices``, holding the adjusted values.
    """
    std_error = np.sqrt(volumes)
    total_error = np.sum(std_error)
    if total_error == 0:
        # No volume information at all: dividing by zero would turn every
        # output into NaN/inf, so spread the correction equally instead.
        return prices - np.mean(prices)

    step = np.sum(prices) / total_error
    out = prices - std_error * step

    return out

In [10]:
# Competition API module (not part of this notebook).
import optiver2023
# Create the competition environment, then obtain the iterator that the
# inference loop consumes as (test, revealed_targets, sample_prediction) tuples.
env = optiver2023.make_env()
iter_test = env.iter_test()

In [11]:
# The rolling features for the test stream are defined below. This implementation
# takes too long to get through the test set; suggestions for making it more
# efficient are very welcome.
# Stock ids 0..199.
stock_id = list(range(200))

def rolling_window(a, window):
    """Return a strided 2-D view holding every trailing window of ``a``.

    ``window - 1`` copies of ``a[0]`` are inserted in front of the data, so a
    1-D input of length n produces n rows of ``window`` values, with row i
    ending at element i of the input.
    """
    front_pad = np.full(window - 1, a[0])
    padded = np.insert(a, 0, front_pad)
    n_windows = padded.shape[-1] - window + 1
    view_shape = padded.shape[:-1] + (n_windows, window)
    # Re-use the last axis stride so consecutive rows advance by one element.
    view_strides = padded.strides + (padded.strides[-1],)
    return np.lib.stride_tricks.as_strided(padded, shape=view_shape, strides=view_strides)

def ts_stddev(data, window=10):
    """Standard deviation (NumPy default, ddof=0) of each trailing window of ``data``.

    The windows come from ``rolling_window``; one value is returned per row of
    that view.
    """
    windows = rolling_window(np.array(data), window)
    return windows.std(axis=1)

def rolling_eng(test, test_all, counter):
    """Attach the latest per-stock rolling size volatility to ``test``.

    For each of ``bid_size`` and ``ask_size`` and each window in (30, 10, 5),
    the sample standard deviation (ddof=1) of the stock's most recent
    ``window`` rows in ``test_all`` is written to ``{side}_std_{window}``.
    This mirrors the training features, which are built with
    ``groupby('stock_id')[side].rolling(window, min_periods=1).std()``: a
    stock with a single observation gets NaN, and shorter histories use
    whatever rows exist.

    Parameters
    ----------
    test : pandas.DataFrame
        Current batch; must contain ``stock_id``. Modified in place.
    test_all : pandas.DataFrame
        History in chronological order, including the current batch; must
        contain ``stock_id``, ``bid_size`` and ``ask_size``.
    counter : int
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        ``test`` itself, with six columns added in the order
        bid 30/10/5, ask 30/10/5.
    """
    windows = (30, 10, 5)
    sides = ('bid_size', 'ask_size')

    # Only the newest max(windows) rows of a stock can affect any feature, so
    # shrink the history once instead of re-filtering it for every stock.
    recent = (
        test_all[['stock_id', *sides]]
        .groupby('stock_id', sort=False)
        .tail(max(windows))
    )

    for side in sides:
        for window in windows:
            latest_std = (
                recent.groupby('stock_id', sort=False)
                .tail(window)
                .groupby('stock_id')[side]
                .std()
            )
            # Look values up by stock_id so the result stays aligned with the
            # batch even when stocks are missing or not sorted.
            test[f'{side}_std_{window}'] = test['stock_id'].map(latest_std).to_numpy()
    return test


In [12]:
# Longest window used by rolling_eng (30 / 10 / 5). Rows older than this, per
# stock, can never influence a rolling feature.
MAX_ROLLING_WINDOW = 30

counter = 0
for (test, revealed_targets, sample_prediction) in iter_test:
    test['counter'] = counter
    if counter == 0:
        # Copy so columns later added to `test` do not leak into the history.
        test_all = test.copy()
    else:
        test_all = pd.concat([test_all, test], axis=0)
    # Keep only the newest rows of each stock: the history stays bounded
    # instead of being re-copied in full on every step, and the rolling
    # features are unchanged because no window looks further back.
    test_all = test_all.groupby('stock_id', sort=False).tail(MAX_ROLLING_WINDOW)

    test_roll = rolling_eng(test, test_all, counter)
    feat = feat_eng(test_roll)
    # 'counter' is bookkeeping only and was never a training feature.
    sample_prediction['target'] = m.predict(feat.drop(columns=['counter']))
    # Re-centre the batch so the predictions sum to zero, weighting the
    # adjustment by each row's displayed size.
    sample_prediction['target'] = zero_sum(sample_prediction['target'], test.loc[:, 'bid_size'] + test.loc[:, 'ask_size'])
    env.predict(sample_prediction)
    counter += 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
