# Section 1: Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import gc

import warnings
warnings.filterwarnings("ignore")
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Section 2: Data Preprocessing

In [2]:
# Load the competition training set from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')
# Currently not loaded (kept for reference):
# revealed_targets = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv')
# Example test file shipped with the competition; used further down as a prediction smoke test.
test = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv')
# Currently not loaded (kept for reference):
# sample_submission = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv')

In [3]:
# Show the column names of the raw training frame.
train.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')

In [4]:
# Show (rows, columns) of the raw training frame.
train.shape

(5237980, 17)

In [5]:
# Preview the first five rows of the raw training frame.
train.head(5)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


## Section 2A: Feature Engineering
> TODO: Feature engineering.


In [6]:
def feat_eng(df):
    """Return the model feature frame derived from ``df``.

    Columns that are not used as model inputs (``row_id``, ``time_id`` and
    ``date_id``) are removed when present; every other column is kept in its
    original order. The function is shared by the training, example-test and
    inference code paths, so a frame that lacks one of these columns (or that
    has already been through ``feat_eng``) is accepted rather than rejected.

    The input frame is never modified; a new DataFrame is returned. The
    function does not remove a ``target`` column -- callers drop it first.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame to convert into model features.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the non-feature columns.
    """
    non_feature_cols = [
        'row_id',
        'time_id',
        'date_id',
        # Engineered features to exclude once they are actually created:
        # 'reference_price_far_price_imb',
        # 'reference_price_minus_near_price',
        # 'reference_price_near_price_imb',
        # 'far_price_near_price_imb',
        # 'far_price_ask_price_imb',
        # 'far_price_bid_price_imb',
        # 'far_price_minus_wap',
        # 'std_size',
        # 'bid_size_over_ask_size',
        # 'ask_price_bid_price_imb',
        # 'near_price_times_wap',
    ]

    # A single non-inplace drop replaces "select columns, then drop inplace":
    # it avoids mutating a frame derived from the caller's data (the pattern
    # behind SettingWithCopyWarning), and errors='ignore' makes the call safe
    # to repeat on a frame whose identifier columns are already gone.
    features = df.drop(columns=non_feature_cols, errors='ignore')

    gc.collect()

    return features

## Section 2B: Ready X, y

> TODO: train_test_split

In [7]:
%%time

# Feature matrix: the training frame without the label, passed through feat_eng.
X = feat_eng(train.drop(columns=['target']))
# Label vector as a NumPy array.
y = train['target'].to_numpy()

# prices = [c for c in train.columns if 'price' in c]
# pca_prices = PCA(n_components=1)
# X['pca_prices'] = pca_prices.fit_transform(X[prices].fillna(1))

CPU times: user 480 ms, sys: 292 ms, total: 773 ms
Wall time: 780 ms


# Section 3: Train Model

> TODO: Hyperparameter tuning (CV),
optimizer,
loss function,
learning-rate scheduler.

In [8]:
%%time

# LightGBM regressor with the 'mae' objective, fitted on the full training set
# (no hold-out split is made in this cell).
m = lgb.LGBMRegressor(
    objective='mae',
    n_estimators=700,
    learning_rate=0.018052307589575444,
    max_depth=9,
    num_leaves=442,
    reg_alpha=0.02216069565875271,
    reg_lambda=0.01223572246957101,
    random_state=42,
)
m.fit(X, y)

CPU times: user 13min 34s, sys: 21.1 s, total: 13min 55s
Wall time: 4min 54s


## Section 3a: Inspect Model
> TODO: Inspect training results (overfitting/underfitting).

In [9]:
# Pair each importance score with its column name and show the pairs in ascending order.
display(sorted(zip(m.feature_importances_, X.columns)))

[(3934, 'imbalance_buy_sell_flag'),
 (10344, 'wap'),
 (10952, 'far_price'),
 (11144, 'near_price'),
 (12483, 'ask_size'),
 (12705, 'reference_price'),
 (13262, 'bid_size'),
 (14392, 'ask_price'),
 (14504, 'bid_price'),
 (17486, 'stock_id'),
 (18164, 'imbalance_size'),
 (18532, 'seconds_in_bucket'),
 (23699, 'matched_size')]

In [10]:
# Display the fitted estimator.
m

## Section 3b: Test Model

In [11]:
# Run the example test frame through the same feature pipeline as the training data.
# NOTE: this rebinds `test`, so from here on the name refers to the feature frame,
# not the raw CSV contents loaded earlier.
test = feat_eng(test)
# test['pca_prices'] = pca_prices.transform(test[prices].fillna(1))

In [12]:
# Show (rows, columns) of the example test feature frame.
test.shape

(33000, 13)

In [13]:
# Smoke test: predict on the example test features; the result is displayed, not stored.
m.predict(test)

array([-1.57572974,  1.56921004,  3.73720539, ...,  1.43214855,
        1.84654655, -2.46154919])

# Section 4: Inference Model

## Section 4a: Helper Function

In [14]:
def zero_sum(prices, volumes):
    """Shift ``prices`` so that they sum to zero, weighting by sqrt(volume).

    Each element is reduced by ``sqrt(volume_i) * step`` where
    ``step = sum(prices) / sum(sqrt(volumes))``, so elements with a larger
    volume absorb a larger share of the correction and the adjusted values
    sum to zero.

    Idea taken from https://github.com/gotoConversion/goto_conversion/

    Parameters
    ----------
    prices : array-like (NumPy array or pandas Series)
        Values to adjust (here: predicted targets).
    volumes : array-like (NumPy array or pandas Series)
        Non-negative weights, element-wise matched with ``prices``.

    Returns
    -------
    Same kind as ``prices - sqrt(volumes)``
        The adjusted values.
    """
    std_error = np.sqrt(volumes)
    total_std_error = np.sum(std_error)

    # With all-zero volumes the weighted formula would divide by zero and turn
    # every output into NaN. Fall back to equal weights, i.e. subtract the mean,
    # which still yields a zero-sum result.
    if total_std_error == 0:
        return prices - np.mean(prices)

    step = np.sum(prices) / total_std_error
    out = prices - std_error * step

    return out

In [15]:
# Competition API module (presumably only importable inside the Kaggle
# competition environment -- TODO confirm), hence imported here rather than at the top.
import optiver2023
# Create the competition environment object.
env = optiver2023.make_env()
# Iterator that the inference loop below consumes batch by batch.
iter_test = env.iter_test()

In [16]:
# Batch counter; nothing reads it yet, kept as in the original (note by Falcon).
counter = 0
for (test, revealed_targets, sample_prediction) in iter_test:
    # Model inputs for the current batch.
    feat = feat_eng(test)
#     feat['pca_prices'] = pca_prices.transform(feat[prices].fillna(1))
    # Raw model output first ...
    sample_prediction['target'] = m.predict(feat)
    # ... then the zero-sum adjustment, weighted by bid_size + ask_size.
    sample_prediction['target'] = zero_sum(
        sample_prediction['target'],
        test['bid_size'] + test['ask_size'],
    )
    # Submit this batch's predictions to the competition API.
    env.predict(sample_prediction)
    counter = counter + 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
