# Section 1: Imports

In [6]:
# basics
import pandas as pd
import numpy as np

# models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

# system
import gc
import os

import warnings
warnings.filterwarnings("ignore")
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)


def is_kaggle_env():
    return 'KAGGLE_KERNEL_RUN_TYPE' in os.environ

if is_kaggle_env():
    print("This is running inside Kaggle!")
else:
    print("This is running outside Kaggle!")


This is running outside Kaggle!


# Section 2: Data Preprocessing

In [29]:
if not is_kaggle_env():
    DATA_DIR = "../data/"
else:
    DATA_DIR = "/kaggle/input/optiver-trading-at-the-close/"
    
train = pd.read_csv(DATA_DIR + 'train.csv')
# revealed_targets = pd.read_csv(DATA_DIR / 'example_test_files/revealed_targets.csv')
test = pd.read_csv(DATA_DIR + 'example_test_files/test.csv')
# sample_submission = pd.read_csv(DATA_DIR / 'example_test_files/sample_submission.csv')

In [30]:
train.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')

In [31]:
train.shape

(5237980, 17)

In [32]:
train.head(5)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


In [33]:
train = train.fillna(1) # simply drop nan values
train.shape

(5237980, 17)

## Section 2A: Feature Engineering
> TODO: Feature Engin.


In [34]:
def feat_eng(df):
    cols = [c for c in df.columns if c not in ['row_id', 'time_id']] # compatible for training, test and inference
    df = df[cols]
    # feature_engineering
    df.drop(columns=[
        'date_id', 
#         'reference_price_far_price_imb',
#         'reference_price_minus_near_price',
#         'reference_price_near_price_imb',
#         'far_price_near_price_imb',
#         'far_price_ask_price_imb',
#         'far_price_bid_price_imb',
#         'far_price_minus_wap',
#         'std_size',
#         'bid_size_over_ask_size',
#         'ask_price_bid_price_imb',
#         'near_price_times_wap'
    ], inplace=True)
        
    gc.collect()
    
    return df

## Section 2B: Ready X, y

> TODO: train_test_split

In [35]:
%%time

y = train['target'].values
X = feat_eng(train.drop(columns='target'))

# prices = [c for c in train.columns if 'price' in c]
# pca_prices = PCA(n_components=1)
# X['pca_prices'] = pca_prices.fit_transform(X[prices].fillna(1))

CPU times: user 211 ms, sys: 176 ms, total: 388 ms
Wall time: 385 ms


In [36]:
display(y.shape)
display(X.shape)

(5237980,)

(5237980, 13)

# Section 3: Train Model

> TODO: Hyperparameters Tuning (CV),
Optimizer,
Loss Function,
Learning Rate Scheduler,

In [24]:
%%time
X = X[:100000]
y = y[:100000]


m = RandomForestRegressor(
    criterion='absolute_error',
#     max_depth=5, 
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)
m.fit(X, y)

CPU times: user 8h 3min 20s, sys: 36.1 s, total: 8h 3min 56s
Wall time: 22min 56s


## Section 3a: Inspect Model
> TODO: Inspect Training results ( Overfit/underfit)

In [25]:
display(sorted(list(zip(m.feature_importances_, X.columns))))

[(0.011251007347893067, 'imbalance_buy_sell_flag'),
 (0.03550823107395644, 'near_price'),
 (0.03640299924368233, 'far_price'),
 (0.0735956830614513, 'wap'),
 (0.07954614646297864, 'reference_price'),
 (0.08051785991492752, 'ask_price'),
 (0.08075408849713815, 'imbalance_size'),
 (0.08142565115467491, 'bid_price'),
 (0.08183499907418827, 'stock_id'),
 (0.09222819006839579, 'seconds_in_bucket'),
 (0.11170180083854137, 'ask_size'),
 (0.11186299949149461, 'bid_size'),
 (0.12337034377067761, 'matched_size')]

In [26]:
m

## Section 3b: Test Model

In [37]:
test = feat_eng(test)
# test['pca_prices'] = pca_prices.transform(test[prices].fillna(1))

In [38]:
test.shape
test = test.fillna(1)

In [39]:
m.predict(test)

array([-1.08476278,  1.52677893,  2.26571559, ...,  2.91283727,
        0.36857723,  0.12997984])

# Section 4: Inference Model

## Section 4a: Helper Function

In [43]:
def zero_sum(prices, volumes): 
#    I got this idea from https://github.com/gotoConversion/goto_conversion/
    std_error = np.sqrt(volumes)
    step = np.sum(prices)/np.sum(std_error)
    out = prices-std_error*step
    
    return out

In [44]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

ModuleNotFoundError: No module named 'optiver2023'

In [42]:
counter = 0 # not sure why its needed but i'll keep it now: Falcon
for (test, revealed_targets, sample_prediction) in iter_test:
    feat = feat_eng(test)
#     feat['pca_prices'] = pca_prices.transform(feat[prices].fillna(1))
    feat = feat.fillna(1)
    sample_prediction['target'] = m.predict(feat)
    sample_prediction['target'] = zero_sum(sample_prediction['target'], test.loc[:,'bid_size'] + test.loc[:,'ask_size'])
    env.predict(sample_prediction)
    counter += 1 

NameError: name 'iter_test' is not defined