In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

kaggle_environment = True # True if running on Kaggle, don't forget to add the dataset!

if kaggle_environment:
    data_path = '/kaggle/input/'
else:
    data_path = 'kaggle/input/'

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/optiver-trading-at-the-close/public_timeseries_testing_util.py
/kaggle/input/optiver-trading-at-the-close/train.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv
/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv
/kaggle/input/optiver-trading-at-the-close/optiver2023/competition.cpython-310-x86_64-linux-gnu.so
/kaggle/input/optiver-trading-at-the-close/optiver2023/__init__.py


In [2]:
train = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')
revealed_targets = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv')
test = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv')
sample_submission = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv')

In [3]:
train['imbalance_size'].interpolate(method='linear', inplace=True)


In [4]:
from sklearn.linear_model import LinearRegression

subset_train_matched = train[['matched_size', 'reference_price']]
train_df = subset_train_matched.dropna()
predict_subset = train[train['matched_size'].isna()]

if not predict_subset.empty:
    # Define the model
    model = LinearRegression()

    # Train the model
    X_train = train_df[['reference_price']]
    y_train = train_df['matched_size']
    model.fit(X_train, y_train)

    # Handle missing values in the prediction subset
    mean_value = X_train['reference_price'].mean()
    X_predict = predict_subset[['reference_price']].copy()
    X_predict['reference_price'].fillna(mean_value, inplace=True)

    # Predict the matched_size values
    predicted_values = model.predict(X_predict)

    # Assign the predicted values back to the main dataframe
    train.loc[predict_subset.index, 'matched_size'] = predicted_values
else:
    print("There are no missing values in 'matched_size'.")

In [5]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

def impute_near_far(test: pd.DataFrame) -> pd.DataFrame:
  
    near_far_test_imp = test[['far_price', 'near_price']]
    
    imputer_test = IterativeImputer(max_iter=10, random_state=0)
    df_imputed_test = imputer_test.fit_transform(near_far_test_imp)
    
    df_imputed_test = pd.DataFrame(df_imputed_test, columns=near_far_test_imp.columns)
    
    test[['far_price', 'near_price']] = df_imputed_test[['far_price', 'near_price']]
    
    return test

train = impute_near_far(train)
test = impute_near_far(test)


In [6]:
train['bid_price'].interpolate(method='linear', inplace=True)

In [7]:
train['ask_price'].interpolate(method='linear', inplace=True)

In [8]:
train['reference_price'].fillna((train['bid_price'] + train['ask_price']) / 2, inplace=True)


In [9]:
train['wap'] = (train['bid_price']*train['ask_size'] + train['ask_price']*train['bid_price']) / (train['bid_size'] + train['ask_size'])


In [10]:
train.isna().sum()

stock_id                    0
date_id                     0
seconds_in_bucket           0
imbalance_size              0
imbalance_buy_sell_flag     0
reference_price             0
matched_size                0
far_price                   0
near_price                  0
bid_price                   0
bid_size                    0
ask_price                   0
ask_size                    0
wap                         0
target                     88
time_id                     0
row_id                      0
dtype: int64

In [11]:
test.isna().sum()


stock_id                   0
date_id                    0
seconds_in_bucket          0
imbalance_size             0
imbalance_buy_sell_flag    0
reference_price            0
matched_size               0
far_price                  0
near_price                 0
bid_price                  0
bid_size                   0
ask_price                  0
ask_size                   0
wap                        0
time_id                    0
row_id                     0
dtype: int64

#### Feature Engineering - Testing with LightGBM and Lagged Features ####

In [12]:
import pandas as pd
import numpy as np

from itertools import groupby
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc
from itertools import combinations
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [13]:
def feat_eng(df, num_lags, columns_to_lag):
    grouping_columns = ['date_id']
    if 'time_id' in df.columns:
        grouping_columns.append('time_id')
    grouped = df.groupby(grouping_columns)
    for col in columns_to_lag:
        for lag in range(1, num_lags + 1):
            df[f'{col}_lag_{lag}'] = grouped[col].shift(lag)
    
    # Drop unwanted columns
    cols = [c for c in df.columns if c not in ['row_id']]
    df = df[cols]
    
    return df

In [14]:
feat_testing = feat_eng(train, 3, ['imbalance_size', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap'])
feat_testing.interpolate(method='linear', inplace=True)
feat_testing.fillna(method='ffill', inplace=True)
feat_testing.fillna(method='bfill', inplace=True)


In [15]:
feat_testing.isna().sum()

stock_id                   0
date_id                    0
seconds_in_bucket          0
imbalance_size             0
imbalance_buy_sell_flag    0
reference_price            0
matched_size               0
far_price                  0
near_price                 0
bid_price                  0
bid_size                   0
ask_price                  0
ask_size                   0
wap                        0
target                     0
time_id                    0
imbalance_size_lag_1       0
imbalance_size_lag_2       0
imbalance_size_lag_3       0
reference_price_lag_1      0
reference_price_lag_2      0
reference_price_lag_3      0
matched_size_lag_1         0
matched_size_lag_2         0
matched_size_lag_3         0
far_price_lag_1            0
far_price_lag_2            0
far_price_lag_3            0
near_price_lag_1           0
near_price_lag_2           0
near_price_lag_3           0
bid_price_lag_1            0
bid_price_lag_2            0
bid_price_lag_3            0
bid_size_lag_1

In [None]:
pip install optuna

#### Alternative Approach using Bayesian Optimization for Hyperparam tuning ####

In [None]:
# import optuna
# import lightgbm as lgb
# from sklearn.metrics import mean_absolute_error
# from sklearn.model_selection import train_test_split

# # Sample data preparation
# y = feat_testing['target'].values
# X = feat_testing.drop(columns='target')
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# def objective(trial):
#     # Define search space
#     params = {
#         'objective': 'regression',
#         'metric': 'mae',
#         'verbosity': -1,
#         'boosting_type': 'gbdt',
#         'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
#         'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
#         'num_leaves': trial.suggest_int('num_leaves', 2, 256),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
#     }
    
#     model = lgb.LGBMRegressor(**params)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return mean_absolute_error(y_test, y_pred)

# # Initiate the study object
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

# # Results
# print('Number of finished trials: ', len(study.trials))
# print('Best trial:')
# trial = study.best_trial

# print('Value: ', trial.value)
# print('Params: ')
# for key, value in trial.params.items():
#     print(f'    {key}: {value}')


In [17]:
%%time

from sklearn.model_selection import train_test_split
import lightgbm as lgb

y = feat_testing['target'].values
X = feat_testing.drop(columns='target')

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#trial 13
params = {
    'lambda_l1': 0.014134741987050156,
    'lambda_l2': 0.000131004124534732,
    'num_leaves': 161,
    'feature_fraction': 0.9167193786275163,
    'bagging_fraction': 0.852338866686305,
    'bagging_freq': 1,
    'min_child_samples': 21
}

# Instantiate the LGBMRegressor with the new parameters
m = lgb.LGBMRegressor(**params, random_state=42)

m.fit(X_train, y_train)
y_pred = m.predict(X_test)

CPU times: user 4min 38s, sys: 5.16 s, total: 4min 43s
Wall time: 1min 28s


In [19]:
from sklearn.metrics import mean_absolute_error

mae_score = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae_score:.4f}")

Mean Absolute Error (MAE): 6.2687


In [20]:
if kaggle_environment:
    import optiver2023
    env = optiver2023.make_env()
    iter_test = env.iter_test()

In [None]:
# counter = 0
# for (test, revealed_targets, sample_prediction) in iter_test:
#     print(test)
#     test['counter'] = counter
#     print(test['counter'])
#     if counter == 0:
#         test_all = test
#     else:
#         test_all = pd.concat([test_all, test], axis = 0)

      
#     test_feat = feat_eng(test, 3, ['imbalance_size', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap'])
#     test_feat.interpolate(method='linear', inplace=True)
#     test_feat.fillna(method='ffill', inplace=True)
#     test_feat.fillna(method='bfill', inplace=True)
#     sample_prediction['target'] = m.predict(test_feat.drop(columns = ['counter']))
#     print(sample_prediction)
#     env.predict(sample_prediction)
#     counter += 1

In [None]:
if kaggle_environment:
    # To count how many time the "for loop" runs.
    counter = 0

    # init 3 empty lists
    test_ls, revealed_targets_ls, sample_prediction_ls = [], [], []

    for (test, revealed_targets, sample_prediction) in iter_test:
        # Append the dataframe that API return into the list.
        test_ls.append(test.copy())
        revealed_targets_ls.append(revealed_targets.copy())
        sample_prediction_ls.append(sample_prediction.copy())
        
        feat_v_testing = feat_eng(test, 3, ['imbalance_size', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap'])
        feat_v_testing.interpolate(method='linear', inplace=True)
        feat_v_testing.fillna(method='ffill', inplace=True)
        feat_v_testing.fillna(method='bfill', inplace=True)
        
        # Writes our predictions 
        sample_prediction["target"] = m.predict(feat_v_testing)
        
        # This line submit our predictions.
        env.predict(sample_prediction)
        counter += 1

    print('\n', '=' * 50, sep="")
    print(f"counter: {counter}")