## Optiver - Trading at the Close

## 1. Setup

In [1]:
import warnings
from pathlib import Path
import yaml
import json
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb

In [2]:
# Silence every warning for the rest of the notebook
warnings.filterwarnings(action='ignore')

# Both input datasets live under the same /kaggle/input root
kaggle_input_directory = Path('/kaggle/input')
competition_dataset_directory = kaggle_input_directory / 'optiver-trading-at-the-close'
external_dataset_directory = kaggle_input_directory / 'optiver-trading-at-the-close-dataset'

In [3]:
# Only training date_id 480 is kept; the inference loop uses it as the
# "previous day" dataframe on its first iteration
df_train = (
    pd.read_csv(competition_dataset_directory / 'train.csv')
    .drop(columns=['time_id', 'row_id'])
    .astype(np.float32)
)
is_last_training_date = df_train['date_id'] == 480
df_train = df_train.loc[is_last_training_date].reset_index(drop=True)

# Fold imbalance_buy_sell_flag into a signed size and discard the flag column
df_train['imbalance_buy_sell_size'] = df_train.eval('imbalance_size * imbalance_buy_sell_flag').astype(np.float32)
df_train = df_train.drop(columns=['imbalance_buy_sell_flag'])

# Feature name -> pandas eval expression; the results are cast to float32
float32_feature_expressions = {
    # Column-wise size features
    'imbalance_size_matched_size_ratio': 'imbalance_size / matched_size',
    'imbalance_size_matched_size_difference': 'imbalance_size - matched_size',
    'imbalance_size_matched_size_sum': 'imbalance_size + matched_size',
    'imbalance_size_bid_size_ratio': 'imbalance_size / bid_size',
    'matched_size_bid_size_ratio': 'matched_size / bid_size',
    'bid_size_ask_size_ratio': 'bid_size / ask_size',
    # Column-wise price features
    'reference_price_far_price_difference': 'reference_price - far_price',
    'reference_price_near_price_difference': 'reference_price - near_price',
    'reference_price_bid_price_difference': 'reference_price - bid_price',
    'reference_price_ask_price_difference': 'reference_price - ask_price',
    'reference_price_wap_difference': 'reference_price - wap',
    'far_price_near_price_difference': 'far_price - near_price',
    'bid_price_ask_price_difference': 'bid_price - ask_price',
}
for feature_name, feature_expression in float32_feature_expressions.items():
    df_train[feature_name] = df_train.eval(feature_expression).astype(np.float32)

# Column-wise price and size features (stored exactly as eval returns them, without an explicit cast)
uncast_feature_expressions = {
    'reference_price_matched_size': 'reference_price * matched_size',
    'reference_price_imbalance_size': 'reference_price * imbalance_size',
}
for feature_name, feature_expression in uncast_feature_expressions.items():
    df_train[feature_name] = df_train.eval(feature_expression)

df_train

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,reference_price,matched_size,far_price,near_price,bid_price,bid_size,...,bid_size_ask_size_ratio,reference_price_far_price_difference,reference_price_near_price_difference,reference_price_bid_price_difference,reference_price_ask_price_difference,reference_price_wap_difference,far_price_near_price_difference,bid_price_ask_price_difference,reference_price_matched_size,reference_price_imbalance_size
0,0.0,480.0,0.0,5.372506e+06,0.999894,18358364.00,,,0.999894,18186.189453,...,1.518781,,,0.000000,-0.000176,-0.000106,,-0.000176,18356418.00,5.371937e+06
1,1.0,480.0,0.0,0.000000e+00,1.000230,1463335.75,,,0.999966,949.200012,...,0.047584,,,0.000264,-0.000474,0.000230,,-0.000738,1463672.25,0.000000e+00
2,2.0,480.0,0.0,2.668167e+06,1.000514,3178828.25,,,0.999999,194.020004,...,0.003066,,,0.000515,0.000052,0.000514,,-0.000463,3180462.25,2.669538e+06
3,3.0,480.0,0.0,1.009214e+07,0.999887,26401518.00,,,0.999839,41308.000000,...,1.999516,,,0.000048,-0.000194,-0.000113,,-0.000242,26398534.00,1.009100e+07
4,4.0,480.0,0.0,5.413386e+06,1.000016,14120504.00,,,0.999358,18225.000000,...,9.084429,,,0.000658,-0.000055,0.000016,,-0.000713,14120730.00,5.413472e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10995,195.0,480.0,540.0,2.440723e+06,1.000317,28280362.00,0.999734,0.999734,1.000317,32257.039062,...,0.100847,0.000583,0.000583,0.000000,-0.000117,-0.000011,0.000000,-0.000117,28289326.00,2.441497e+06
10996,196.0,480.0,540.0,3.495105e+05,1.000643,9187699.00,1.000129,1.000386,1.000643,205108.406250,...,2.196184,0.000514,0.000257,0.000000,-0.000257,-0.000176,-0.000257,-0.000257,9193607.00,3.497352e+05
10997,197.0,480.0,540.0,0.000000e+00,0.995789,12725436.00,0.995789,0.995789,0.995789,16790.660156,...,0.093262,0.000000,0.000000,0.000000,-0.000094,-0.000008,0.000000,-0.000094,12671849.00,0.000000e+00
10998,198.0,480.0,540.0,1.000899e+06,0.999210,94773272.00,0.999210,0.999210,0.998970,125631.718750,...,0.187540,0.000000,0.000000,0.000240,0.000000,0.000202,0.000000,-0.000240,94698400.00,1.000108e+06


## 2. Models

In [4]:
def load_model(model_directory, model_file_names, model_type):

    """
    Load models and their shared configuration from the given model directory

    Parameters
    ----------
    model_directory: str or pathlib.Path
        Path of the model directory; it must contain config.yaml next to the model files

    model_file_names: list of str
        Names of the model files inside the model directory

    model_type: str (lightgbm or xgboost)
        Model type

    Returns
    -------
    models: dict
        Dictionary of loaded boosters keyed by model file name

    config: dict
        Dictionary of configurations read from config.yaml

    Raises
    ------
    ValueError
        If model_type is neither lightgbm nor xgboost
    """

    # Fail fast instead of silently reusing a previous model (or hitting an unbound name) in the loop below
    supported_model_types = ('lightgbm', 'xgboost')
    if model_type not in supported_model_types:
        raise ValueError(f'Unsupported model_type {model_type!r}, expected one of {supported_model_types}')

    # Accept plain strings as well as Path objects
    model_directory = Path(model_directory)

    # NOTE(review): yaml.FullLoader should only be used on trusted files; consider yaml.safe_load
    # if config.yaml contains nothing but plain scalars, lists and mappings
    with open(model_directory / 'config.yaml', 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    models = {}

    for model_file_name in model_file_names:
        model_path = model_directory / model_file_name
        # Both libraries accept string paths, so pass str rather than a Path object
        if model_type == 'lightgbm':
            model = lgb.Booster(model_file=str(model_path))
        else:
            model = xgb.Booster()
            model.load_model(str(model_path))
        models[model_file_name] = model
        print(f'{model_type} model is loaded from {model_path}')

    return models, config


In [5]:
# Single-seed XGBoost model stored in the external dataset
xgboost_model_directory = external_dataset_directory / 'xgboost_regression_v9'
xgboost_model_file_names = ['model_seed42.json']

models, config = load_model(
    model_directory=xgboost_model_directory,
    model_file_names=xgboost_model_file_names,
    model_type='xgboost',
)


xgboost model is loaded from /kaggle/input/optiver-trading-at-the-close-dataset/xgboost_regression_v9/model_seed42.json


## 3. Inference

In [6]:
# Read precomputed stock weights
with open(external_dataset_directory / 'stock_weights.json', mode='r') as f:
    stock_weights = json.load(f)
stock_weights = {int(stock_id): weight for stock_id, weight in stock_weights.items()}

In [7]:
# Shared building blocks; every public list below is a fresh list object
_id_columns = ['stock_id', 'date_id', 'seconds_in_bucket']
_auction_price_columns = ['reference_price', 'far_price', 'near_price']
_order_book_columns = [
    'imbalance_buy_sell_size', 'matched_size',
    *_auction_price_columns,
    'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap',
]
_price_difference_columns = [
    'reference_price_far_price_difference',
    'reference_price_near_price_difference',
    'reference_price_bid_price_difference',
    'reference_price_ask_price_difference',
    'reference_price_wap_difference',
    'bid_price_ask_price_difference',
]

# Raw columns of a test batch that are cast to float32
columns_to_cast = [
    *_id_columns,
    'imbalance_size', 'imbalance_buy_sell_flag', 'matched_size',
    *_auction_price_columns,
    'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap',
]

# Raw and column-wise features to concatenate on each iteration
test_columns = [
    # Raw features
    *_id_columns,
    'imbalance_buy_sell_size', 'matched_size',
    *_auction_price_columns,
    'bid_price', 'ask_price', 'bid_size', 'ask_size', 'wap',

    # Other features
    'stock_weight',

    # Column-wise size features
    'imbalance_size_matched_size_ratio',
    'imbalance_size_matched_size_sum',
    'imbalance_size_matched_size_difference',
    'imbalance_size_bid_size_ratio',
    'matched_size_bid_size_ratio',
    'bid_size_ask_size_ratio',

    # Column-wise price features
    'reference_price_far_price_difference',
    'reference_price_near_price_difference',
    'reference_price_bid_price_difference',
    'reference_price_ask_price_difference',
    'reference_price_wap_difference',
    'far_price_near_price_difference',
    'bid_price_ask_price_difference',

    # Column-wise price and size features
    'reference_price_matched_size',
    'reference_price_imbalance_size',
]

# Row-wise differences
difference_periods = [1, 2, 3, 4, 5, 10, 15]
difference_columns = [
    *_order_book_columns,
    'imbalance_size_matched_size_sum',
    *_price_difference_columns,
]

# Row-wise ratios (same periods as the differences)
ratio_periods = list(difference_periods)
ratio_columns = ['matched_size', 'imbalance_buy_sell_size', 'bid_size', 'ask_size']

# Rolling statistics (same columns as the differences)
rolling_windows = [3]
rolling_columns = list(difference_columns)

# Current day open differences: the difference columns minus everything built on far_price or near_price
current_day_open_columns = [
    column for column in difference_columns
    if 'far_price' not in column and 'near_price' not in column
]

# Current day 30th differences
current_day_30th_columns = ['imbalance_buy_sell_size', 'matched_size']

# Previous day close difference
previous_day_close_columns = [*_order_book_columns, 'imbalance_size_matched_size_sum']

# Previous day high difference
previous_day_high_columns = list(_order_book_columns)

# Previous day shifts
daily_shift_columns = ['imbalance_buy_sell_size', 'matched_size', 'target']

# Stock ranks inside buckets
stock_pct_rank_columns = list(_order_book_columns)

In [8]:
# Competition API module; imported here rather than with the other imports at the top
import optiver2023

# Build the API environment and the batch iterator consumed by the inference loop below
env = optiver2023.make_env()
iter_test = env.iter_test()

# Set to True to print a progress message for each step of the inference loop
verbose = False

In [9]:
# State carried over from one batch to the next
df_test_current_day = None         # all batches of the running day, concatenated
df_test_previous_day = None        # whole previous day (training date_id 480 on the first iteration)
df_test_current_day_open = None    # first batch of the running day
df_test_current_day_30th = None    # batch at seconds_in_bucket == 300 (NaN placeholder before it arrives)
df_test_previous_day_close = None  # last value per stock on the previous day
df_test_previous_day_high = None   # maximum value per stock on the previous day

# Keys used to align row-wise features with the rows of the current batch
merge_key_columns = ['stock_id', 'date_id', 'seconds_in_bucket']

for batch_idx, (df_test_batch, df_revealed_targets_batch, df_submission_batch) in enumerate(iter_test):

    df_test_batch[columns_to_cast] = df_test_batch[columns_to_cast].astype(np.float32)

    # Merge precomputed stock weights
    df_test_batch['stock_weight'] = df_test_batch['stock_id'].map(stock_weights).astype(np.float32)

    # Fold imbalance_buy_sell_flag into a signed size and discard the flag column
    df_test_batch['imbalance_buy_sell_size'] = df_test_batch.eval('imbalance_size * imbalance_buy_sell_flag')
    df_test_batch = df_test_batch.drop(columns=['imbalance_buy_sell_flag'])

    # Column-wise size features
    df_test_batch['imbalance_size_matched_size_ratio'] = df_test_batch.eval('imbalance_size / matched_size')
    df_test_batch['imbalance_size_matched_size_difference'] = df_test_batch.eval('imbalance_size - matched_size')
    df_test_batch['imbalance_size_matched_size_sum'] = df_test_batch.eval('imbalance_size + matched_size')
    df_test_batch['imbalance_size_bid_size_ratio'] = df_test_batch.eval('imbalance_size / bid_size')
    df_test_batch['matched_size_bid_size_ratio'] = df_test_batch.eval('matched_size / bid_size')
    df_test_batch['bid_size_ask_size_ratio'] = df_test_batch.eval('bid_size / ask_size')

    # Column-wise price features
    df_test_batch['reference_price_far_price_difference'] = df_test_batch.eval('reference_price - far_price')
    df_test_batch['reference_price_near_price_difference'] = df_test_batch.eval('reference_price - near_price')
    df_test_batch['reference_price_bid_price_difference'] = df_test_batch.eval('reference_price - bid_price')
    df_test_batch['reference_price_ask_price_difference'] = df_test_batch.eval('reference_price - ask_price')
    df_test_batch['reference_price_wap_difference'] = df_test_batch.eval('reference_price - wap')
    df_test_batch['far_price_near_price_difference'] = df_test_batch.eval('far_price - near_price')
    df_test_batch['bid_price_ask_price_difference'] = df_test_batch.eval('bid_price - ask_price')

    # Column-wise price and size features
    df_test_batch['reference_price_matched_size'] = df_test_batch.eval('reference_price * matched_size')
    df_test_batch['reference_price_imbalance_size'] = df_test_batch.eval('reference_price * imbalance_size')

    # The first row's bucket is taken as the bucket of the whole batch
    current_seconds_in_bucket = df_test_batch['seconds_in_bucket'].values[0]
    if current_seconds_in_bucket == 0:
        # A batch at seconds_in_bucket == 0 starts a new day: roll the day-level state over

        if df_test_current_day is not None:
            # Revealed targets are merged to current day dataframe before new current day dataframe is created
            df_revealed_targets_batch = df_revealed_targets_batch.loc[:, [
                'stock_id', 'seconds_in_bucket', 'revealed_target'
            ]].rename(columns={'revealed_target': 'target'}).astype(np.float32)
            df_test_current_day = df_test_current_day.merge(
                df_revealed_targets_batch,
                on=['stock_id', 'seconds_in_bucket'],
                how='left'
            )
            # Current day dataframe becomes the previous day dataframe
            df_test_previous_day = df_test_current_day.copy(deep=True)
            if verbose:
                print(f'Iteration {batch_idx} previous date is created from single date')
        else:
            # Training set date_id 480 becomes the previous day dataframe on the first iteration
            df_test_previous_day = df_train.copy(deep=True)
            if verbose:
                print(f'Iteration {batch_idx} training set assigned to previous day')

        # Create the current day dataframe from the first batch
        df_test_current_day = df_test_batch.loc[:, test_columns].copy(deep=True)

        # Snapshot of the opening batch of the current day
        df_test_current_day_open = df_test_batch.loc[:, ['stock_id'] + current_day_open_columns].copy(deep=True).rename(columns={
            column: f'{column}_current_day_open' for column in current_day_open_columns
        })
        # Per-stock close (last value) and high (maximum) of the previous day
        df_test_previous_day_close = df_test_previous_day.groupby(['stock_id'])[previous_day_close_columns].last().rename(columns={
            column: f'{column}_previous_day_close' for column in previous_day_close_columns
        }).reset_index()
        df_test_previous_day_high = df_test_previous_day.groupby(['stock_id'])[previous_day_high_columns].max().rename(columns={
            column: f'{column}_previous_day_high' for column in previous_day_high_columns
        }).reset_index()

        # Create 30th dataframe from the current as a placeholder with NaNs
        # drop=True keeps the old row index from leaking into the batch as a spurious "index" column
        df_test_current_day_30th = df_test_batch.loc[:, ['stock_id'] + current_day_30th_columns].copy(deep=True).rename(columns={
            column: f'{column}_current_day_30th' for column in current_day_30th_columns
        }).reset_index(drop=True)
        df_test_current_day_30th.loc[:, [f'{column}_current_day_30th' for column in current_day_30th_columns]] = np.nan
        if verbose:
            print(f'Iteration {batch_idx} current day 30th is created with nans')

        if verbose:
            print(f'Iteration {batch_idx} single date is first batch')

    else:
        # Concatenate test batch to current day dataframe
        df_test_current_day = pd.concat((df_test_current_day, df_test_batch.loc[:, test_columns]), axis=0).reset_index(drop=True)
        if verbose:
            print(f'Iteration {batch_idx} single date concatenated')

        if current_seconds_in_bucket == 300:
            # Replace the NaN placeholder with the values observed at seconds_in_bucket == 300
            df_test_current_day_30th = df_test_batch.loc[:, ['stock_id'] + current_day_30th_columns].copy(deep=True).rename(columns={
                column: f'{column}_current_day_30th' for column in current_day_30th_columns
            }).reset_index(drop=True)
            if verbose:
                print(f'Iteration {batch_idx} current day 30th is updated with real values')

    # Row-wise features are computed per stock over the whole current day and then
    # left-merged back so that only the rows of the current batch are kept
    df_test_current_day_keys = df_test_current_day.loc[:, merge_key_columns]

    for period in difference_periods:
        df_test_diff_features = df_test_current_day.groupby(['stock_id'])[difference_columns].diff(periods=period).rename(columns={
            column: f'{column}_diff_{period}' for column in difference_columns
        })
        df_test_diff_features = pd.concat((
            df_test_current_day_keys,
            df_test_diff_features
        ), axis=1, ignore_index=False)
        df_test_batch = df_test_batch.merge(df_test_diff_features, on=merge_key_columns, how='left')

    for period in ratio_periods:
        df_test_ratio_features = df_test_current_day.groupby(['stock_id'])[ratio_columns].pct_change(periods=period).rename(columns={
            column: f'{column}_pct_change_{period}' for column in ratio_columns
        })
        df_test_ratio_features = pd.concat((
            df_test_current_day_keys,
            df_test_ratio_features
        ), axis=1, ignore_index=False)
        df_test_batch = df_test_batch.merge(df_test_ratio_features, on=merge_key_columns, how='left')

    for window in rolling_windows:
        # Dropping the stock_id index level restores the row index of the current day dataframe,
        # which is what the column-wise concat below aligns on
        df_rolling_features = df_test_current_day.groupby(['stock_id'])[
            rolling_columns
        ].rolling(window=window, min_periods=1).agg([
            'mean', 'std'
        ]).reset_index(level=(0,), drop=True).astype(np.float32)
        df_rolling_features.columns = df_rolling_features.columns.map(f'_window_{window}_'.join).str.strip('_')
        df_rolling_features = pd.concat((
            df_test_current_day_keys,
            df_rolling_features
        ), axis=1, ignore_index=False)
        df_test_batch = df_test_batch.merge(df_rolling_features, on=merge_key_columns, how='left')

    # Current day open difference features (open value minus current value)
    df_test_batch = df_test_batch.merge(df_test_current_day_open, on=['stock_id'], how='left')
    df_test_batch[[f'{column}_current_day_open_difference' for column in current_day_open_columns]] =\
    df_test_batch[[f'{column}_current_day_open' for column in current_day_open_columns]] -\
    df_test_batch[current_day_open_columns].values

    # Current day 30th difference features (NaN until the batch at seconds_in_bucket == 300 arrives)
    df_test_batch = df_test_batch.merge(df_test_current_day_30th, on=['stock_id'], how='left')
    df_test_batch[[f'{column}_current_day_30th_difference' for column in current_day_30th_columns]] =\
    df_test_batch[[f'{column}_current_day_30th' for column in current_day_30th_columns]] -\
    df_test_batch[current_day_30th_columns].values

    # Previous day close difference features
    df_test_batch = df_test_batch.merge(df_test_previous_day_close, on=['stock_id'], how='left')
    df_test_batch[[f'{column}_previous_day_close_difference' for column in previous_day_close_columns]] =\
    df_test_batch[[f'{column}_previous_day_close' for column in previous_day_close_columns]] -\
    df_test_batch[previous_day_close_columns].values

    # Previous day high difference features
    df_test_batch = df_test_batch.merge(df_test_previous_day_high, on=['stock_id'], how='left')
    df_test_batch[[f'{column}_previous_day_high_difference' for column in previous_day_high_columns]] =\
    df_test_batch[[f'{column}_previous_day_high' for column in previous_day_high_columns]] -\
    df_test_batch[previous_day_high_columns].values

    # Merge daily shift features: previous day values at the same seconds_in_bucket
    df_test_previous_day_values = df_test_previous_day.loc[
        df_test_previous_day['seconds_in_bucket'] == current_seconds_in_bucket,
        daily_shift_columns + ['stock_id']
    ].rename(columns={column: f'{column}_daily_shift' for column in daily_shift_columns})
    df_test_batch = df_test_batch.merge(
        df_test_previous_day_values,
        on=['stock_id'],
        how='left'
    )

    # Percentile rank of each stock within the current batch
    df_test_batch[[f'{column}_stock_pct_rank' for column in stock_pct_rank_columns]] = df_test_batch[stock_pct_rank_columns].rank(pct=True).astype(np.float32)

    # Ratios and percentage changes can divide by zero; turn the infinities into NaN
    df_test_batch = df_test_batch.replace([np.inf, -np.inf], np.nan)

    # Average the predictions of all loaded models
    batch_predictions = np.zeros(df_test_batch.shape[0])
    for model in models.values():
        batch_predictions += (model.predict(xgb.DMatrix(df_test_batch.loc[:, config['training']['features']])) / len(models))

    # Subtract the stock-weighted sum of the batch predictions from every prediction
    df_test_batch['prediction'] = batch_predictions
    df_test_batch['weighted_prediction'] = df_test_batch['prediction'] * df_test_batch['stock_weight']
    df_test_batch['target'] = df_test_batch['prediction'] - df_test_batch['weighted_prediction'].sum()

    # Write the targets into the submission dataframe by row_id and hand it to the API
    df_submission_batch = df_submission_batch.drop(columns=['target']).merge(df_test_batch.loc[:, ['row_id', 'target']], on='row_id', how='left')
    env.predict(df_submission_batch)


This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
