In [None]:
###### https://www.kaggle.com/code/verracodeguacas/hoarders-ensemble

In [None]:
import os
import gc
import sys
import numpy as np
import pandas as pd 
# pd.set_option("display.max_columns", 50)
# pd.set_option("display.max_rows", 50)

from warnings import filterwarnings
filterwarnings(action="ignore", category=pd.errors.PerformanceWarning)
# pd.options.mode.chained_assignment = None # suppress pandas chained-assignment warnings

# import seaborn as sns
# import matplotlib.pyplot as plt
# %matplotlib inline

import joblib
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, TimeSeriesSplit

import json
import optuna
from itertools import combinations
# from pprint import pprint, pformat
# from optuna.visualization import (plot_optimization_history,
#                                  plot_param_importances,
#                                  plot_parallel_coordinate)

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))
        
PATH = "/kaggle/input/optiver-trading-at-the-close/"
# reduced_PATH = "/kaggle/input/optiver-memoryreduceddatasets/"

In [None]:
# Compact dtypes applied while parsing the CSV so the raw frame stays small.
change_dtypes = dict(
    stock_id=np.uint8,
    date_id=np.uint16,
    seconds_in_bucket=np.uint16,
    imbalance_buy_sell_flag=np.int8,
    time_id=np.uint16,
)

# Read the training data, discard the identifier columns, remove rows that
# have no target, and renumber the index from zero.
train_df = (
    pd.read_csv(PATH + "train.csv", dtype=change_dtypes)
    .drop(columns=["row_id", "time_id"])
    .dropna(subset=["target"])
    .reset_index(drop=True)
)
# test_df = pd.read_csv(PATH + "example_test_files/test.csv", dtype=change_dtypes).drop(columns=["row_id","time_id"])

del change_dtypes

In [None]:
# Display the first rows of the loaded frame as the cell output.
train_df.head()

In [None]:
def reduce_mem_usage(df, verbose=0):
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    Signed integer columns are converted to the narrowest of int8/int16/int32
    that can hold their observed min/max. Floating-point columns wider than
    32 bits are converted to float32 when their finite values fit. Every other
    column (unsigned ints, bools, objects, extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : int or bool, default 0
        When truthy, print the starting and final memory footprint (in MB).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print(f"starting memory: {start_mem}")

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, strings, ...) have no
        # NumPy range information, so they are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.signedinteger):
            c_min = df[col].min()
            c_max = df[col].max()
            # Try the candidates from narrowest to widest and stop at the
            # first one that is both smaller than the current dtype and wide
            # enough for the observed values.
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                is_narrower = np.dtype(candidate).itemsize < col_type.itemsize
                if is_narrower and limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

        elif np.issubdtype(col_type, np.floating) and col_type.itemsize > 4:
            values = df[col].to_numpy()
            # NaN and +/-inf survive the cast unchanged, so only finite values
            # decide whether float32 can represent the column.
            finite = values[np.isfinite(values)]
            limits = np.finfo(np.float32)
            if finite.size == 0 or (limits.min <= finite.min() and finite.max() <= limits.max):
                df[col] = df[col].astype(np.float32)

    if verbose:
        print(f"start memory: {start_mem}")
        end_mem = df.memory_usage().sum() / 1024 ** 2
        print(f"current memory: {end_mem}")

    return df

In [None]:
from numba import njit, prange

@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    """Compute a max/mid/min imbalance ratio for every column triplet and row.

    Parameters
    ----------
    df_values : 2-D array
        Indexed as ``df_values[row, column]``.
    comb_indices : sequence
        Each entry unpacks into three column indices ``(a, b, c)`` into
        ``df_values``.

    Returns
    -------
    2-D array of shape ``(num_rows, num_combinations)``
        Entry ``[j, i]`` is ``(max - mid) / (mid - min)`` of the three values
        selected by combination ``i`` in row ``j``, or NaN when ``mid == min``.
    """
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))
    # The outer loop over combinations uses numba's prange; each iteration
    # writes only to its own output column i.
    for i in prange(num_combinations):
        a,b,c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            # The middle value is what remains of the sum once the extremes are removed.
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val

            # A zero denominator (mid equals min) is reported as NaN instead of dividing.
            if mid_val == min_val:
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)
            
    return imbalance_features


def calculate_triplet_imbalance_numba(price, df):
    """Build the triplet-imbalance feature frame for the columns in ``price``.

    Parameters
    ----------
    price : list
        Column names of ``df``; every 3-combination of them yields one feature.
    df : pandas.DataFrame
        Source frame containing all columns listed in ``price``.

    Returns
    -------
    pandas.DataFrame
        One ``"{a}_{b}_{c}_imb2"`` column per combination, holding the values
        returned by ``compute_triplet_imbalance``.
    """
    df_values = df[price].values

    # Enumerate the name triplets once; they drive both the positional
    # indices handed to the kernel and the output column labels.
    triplets = list(combinations(price, 3))
    position_of = price.index
    comb_indices = [tuple(position_of(name) for name in triplet) for triplet in triplets]

    features_array = compute_triplet_imbalance(df_values, comb_indices)
    columns = ["{}_{}_{}_imb2".format(*triplet) for triplet in triplets]
    return pd.DataFrame(features_array, columns=columns)

In [None]:
def create_features(df, reduce_memory=True):
    """Add the engineered order-book / auction features to ``df``.

    New columns are written onto the frame that is passed in; the returned
    frame additionally has ``imbalance_buy_sell_flag`` removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``stock_id``, ``seconds_in_bucket``,
        ``imbalance_buy_sell_flag`` and the price / size columns listed in
        ``prices`` and ``sizes`` below.
    reduce_memory : bool, default True
        When True, cast ``stock_id`` to uint8, ``seconds_in_bucket`` to uint16
        and every float64 column to float32, printing the memory usage before
        and after.

    Returns
    -------
    pandas.DataFrame
        The feature frame.
    """
    cols_to_drop = ["imbalance_buy_sell_flag"]
    prices = ["reference_price","far_price","near_price","ask_price","bid_price","wap"]
    sizes = ["matched_size","bid_size","ask_size","imbalance_size"]

    df["volume"] = df.eval("ask_size + bid_size")
    # Mid price is the average of the two quotes. The previous expression
    # averaged ask_price with bid_size, mixing a price with a size.
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size - ask_size)/(bid_size + ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size - matched_size)/(matched_size + imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    # Same ratio as size_imbalance; kept so the set of feature columns is unchanged.
    df["bid_size_over_ask_size"] = df["bid_size"].div(df["ask_size"])
    df["bid_price_over_ask_price"] = df["bid_price"].div(df["ask_price"])

    # Pairwise price imbalances.
    # https://www.kaggle.com/code/judith007/lb-5-3393-rapids-gpu-speeds-up-feature-engineer
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    # V2 features (the original author noted the meaning of these was unclear to them).
    df["imbalance_momentum"] = df.groupby(["stock_id"])["imbalance_size"].diff(periods=1)/df["matched_size"]
    df["price_pressure"] = df["imbalance_size"] * (df["ask_price"] - df["bid_price"])
    df["depth_pressure"] = (df["ask_size"] - df["bid_size"]) * (df["far_price"] - df["near_price"])

    # V3 features: per-stock lagged value and relative change. Each window is
    # a number of rows to look back within the same stock_id.
    for col in ["matched_size","imbalance_size","reference_price","imbalance_buy_sell_flag"]:
        per_stock = df.groupby("stock_id")[col]  # grouped once per column
        for window in [1,2,3,10]:
            df[f"{col}_shift_{window}"] = per_stock.shift(window)
            df[f"{col}_ret_{window}"] = per_stock.pct_change(window)

    # Per-stock differences over the same look-back windows.
    for col in ["ask_price","bid_price","ask_size","bid_size"]:
        per_stock = df.groupby("stock_id")[col]
        for window in [1,2,3,10]:
            df[f"{col}_diff_{window}"] = per_stock.diff(window)

    # Triplet imbalances over a subset of the prices and over all sizes.
    for c in [["ask_price","bid_price","wap","reference_price"], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        # Assign by position (.values) so the helper's fresh RangeIndex is
        # not aligned against df's own index.
        df[triplet_feature.columns] = triplet_feature.values

    df = df.drop(columns=cols_to_drop)

    if reduce_memory:
        print(f"Reducing memory usage... (Current: {df.memory_usage().sum() / 1024 / 1024:.1f} MB)")
        df = df.astype({"stock_id":"uint8",
                       "seconds_in_bucket":"uint16",
                       # converts all float64s to float32s
                       **{col:"float32" for col in df.columns if df[col].dtype == "float64"}})
        print(f"Memory usage after reduction: {df.memory_usage().sum() / 1024 / 1024:.1f} MB")

    return df

In [None]:
from IPython.display import FileLink
# Engineer the features, then downcast the resulting frame.
train_df = reduce_mem_usage(create_features(train_df))
print(train_df.shape)

# Pull the label out of the frame and drop date_id so that only model
# inputs remain in train_df.
y = train_df.pop("target")
train_df = train_df.drop(columns=["date_id"])
train_df.head()

In [None]:
# Collect the columns in which more than half of the rows are missing.
# The fraction is taken from the frame itself rather than from a hard-coded
# row count, so it stays correct if the number of rows changes.
null_fraction = train_df.isnull().mean()
for_drops = null_fraction[null_fraction > 0.5].index.tolist()

In [None]:
# Remove the mostly-missing columns gathered in for_drops, then ask the
# garbage collector to reclaim the released memory.
train_df = train_df.drop(columns=for_drops)
# print(train_df.shape)
gc.collect()

In [None]:
def cross_validate(model, X, y, cv):
    """Score ``model`` with the splitter ``cv`` and return the per-fold MAE.

    For every fold, the last 10% of the fold's training rows (unshuffled) are
    held back as the early-stopping set; the model is fitted on the rest and
    evaluated on the fold's test rows.

    Parameters
    ----------
    model : estimator
        Must accept ``eval_set`` and ``callbacks`` in ``fit``.
    X : pandas.DataFrame
        Feature matrix, indexed positionally by the splitter.
    y : pandas.Series
        Target aligned with ``X``.
    cv : splitter
        Object exposing ``n_splits`` and ``split(X)``.

    Returns
    -------
    numpy.ndarray
        One mean absolute error per fold.
    """
    fold_mae = np.zeros(cv.n_splits)
    heavy_rule = "=" * 30

    print("Starting evaluation..")
    print(heavy_rule)
    for fold, (fit_idx, holdout_idx) in enumerate(cv.split(X)):
        X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
        X_holdout, y_holdout = X.iloc[holdout_idx], y.iloc[holdout_idx]

        # Tail of the training rows becomes the early-stopping set.
        X_fit, X_stop, y_fit, y_stop = train_test_split(
            X_fit, y_fit, shuffle=False, test_size=0.1
        )
        stopper = lgb.early_stopping(25, verbose=False)
        model.fit(X_fit, y_fit, eval_set=[(X_stop, y_stop)], callbacks=[stopper])

        holdout_pred = model.predict(X_holdout)
        fold_mae[fold] = mean_absolute_error(holdout_pred, y_holdout)
        print(f"Fold {fold+1}: {fold_mae[fold]:.4f}")

    print("-" * 30)
    print(f"Average MAE = {fold_mae.mean():.4f} ± {fold_mae.std():.2f}")
    print(heavy_rule)

    return fold_mae

In [None]:
### optimizing Optuna
def objective(trial):
    """Optuna objective: hold-out MAE of an LGBMRegressor with sampled params.

    The model is fitted on the first 80% of ``train_df`` / ``y`` and scored on
    the remaining 20% (no shuffling, so the split keeps the row order).
    Scoring on the rows the model was fitted on would reward overfitting
    instead of generalisation.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean absolute error on the hold-out rows (lower is better).
    """
    params = {"random_seed": 123,
             "n_estimators": trial.suggest_int("n_estimators", 300, 1000),
             "num_leaves": trial.suggest_int("num_leaves", 20, 100),
             "max_depth": trial.suggest_int("max_depth", 1, 12),
             "min_child_samples": trial.suggest_int("min_child_samples", 20, 100),
             "subsample": trial.suggest_float("subsample", 0.4, 1.0),
             "subsample_freq": trial.suggest_categorical("subsample_freq", [0,1]),
             "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
             "reg_lambda": trial.suggest_float("reg_lambda", 5e-1, 1.0, log=True)}

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        train_df, y, shuffle=False, test_size=0.2
    )

    model = lgb.LGBMRegressor(**params)
    model.fit(X_fit, y_fit)
    return mean_absolute_error(y_holdout, model.predict(X_holdout))

In [None]:
def run_optimization(objective, n_trials=100, n_jobs=1, best_trial=None):
    """Run an Optuna minimisation study and persist its best parameters.

    Parameters
    ----------
    objective : callable
        Function taking an Optuna trial and returning the value to minimise.
    n_trials : int, default 100
        Number of trials passed to ``study.optimize``.
    n_jobs : int, default 1
        Parallelism passed to ``study.optimize``.
    best_trial : dict or None, default None
        When given, enqueued so the study evaluates these parameters as well.

    Returns
    -------
    optuna.study.Study
        The finished study.

    Side effects
    ------------
    Writes ``/kaggle/working/best_params.json``, changes the working directory
    to ``/kaggle/working/`` and displays a download link to the JSON file.
    """
    # Imported here so the function does not depend on a later cell having
    # imported FileLink first.
    from IPython.display import FileLink, display

    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction="minimize")

    if best_trial is not None:
        print("Enqueuing previous best trial...")
        study.enqueue_trial(best_trial)

    study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs, show_progress_bar=True)

    print(f"Num of finished trials: {len(study.trials)}")
    print(f"Best MAE: {study.best_value:.4f}")

    print("Params")
    print("=" * 10)
    print(study.best_params)

    with open("/kaggle/working/best_params.json", "w") as f:
        json.dump(study.best_params, f)

    # Create the link only after the file is closed (fully flushed). The
    # FileLink path is relative, hence the chdir. display() is required
    # because a bare expression inside a function renders nothing.
    os.chdir("/kaggle/working/")
    display(FileLink("best_params.json"))
    return study

In [None]:
# Accepted values for ``logging_level``. The list previously contained
# "WARNING" twice; the duplicate slot now holds "ERROR".
LOGGING_LEVELS = ["DEBUG","INFO","SUCCESS","WARNING","ERROR"]
def get_objective_function(cv=None, logging_level="info"):
    """Return the Optuna objective that tunes ``num_leaves`` for LightGBM.

    Parameters
    ----------
    cv : splitter or None
        Cross-validation splitter forwarded to ``cross_validate``.
    logging_level : str, default "info"
        Case-insensitive; must be one of ``LOGGING_LEVELS``. It is only
        validated here and does not configure any logger.

    Returns
    -------
    callable
        ``optimize_lgbm(trial) -> float``, the mean cross-validated MAE.

    Raises
    ------
    ValueError
        If ``logging_level`` is not one of ``LOGGING_LEVELS``.
    """
    if logging_level.upper() not in LOGGING_LEVELS:
        raise ValueError(f"Expected logging_level to be one of {LOGGING_LEVELS}, but got '{logging_level}' instead.")

    def optimize_lgbm(trial):
        """Cross-validate an LGBMRegressor with a sampled ``num_leaves``."""
        max_depth = 18
        param_space = {"boosting_type":"gbdt",
            "objective": "mae",
            "subsample": 0.6,
            "n_estimators": 8000, # 700,
            "min_child_samples": 20,
            "learning_rate": 0.087,
            # Upper bound is 75% of the leaves a full tree of max_depth can have.
            "num_leaves": trial.suggest_int("num_leaves", 20, int((2**max_depth) * 0.75)),
            "max_depth": max_depth
        }
        model = lgb.LGBMRegressor(**param_space)
        scores = cross_validate(model, train_df, y, cv=cv)
        return scores.mean()
    return optimize_lgbm

In [None]:
# Primary LightGBM configuration used for the seed ensemble.
# ("num_leaves" was listed twice in this literal; the duplicate is removed.)
lgbm_best_params = {"boosting_type":"gbdt",
            "objective": "mae",
            "subsample": 0.5,  # 0.6
            "num_leaves": 256, #30,
            "n_estimators": 8000, # 700,
            "min_child_samples": 20,
#             "reg_alpha": 0.02,
#             "reg_lambda": 0.01,
            "learning_rate": 0.05,
            "max_depth": 18}

# Alternative configuration; not referenced by the cells visible here.
lgbm_best_params2 = {'objective': 'mae', 
                     'random_state': 1020, 
                     'max_bin': 256, 
                     'boosting_type': 'gbdt', 
                     'learning_rate': 0.015, 
                     'max_depth': 12, 
                     'n_estimators': 1400, 
                     'num_leaves': 300, 
                     'reg_alpha': 0.005, 
                     'reg_lambda': 0.001, 
                     'colsample_bytree': 0.6, 
                     'subsample': 0.0871, 
                     'min_child_samples': 128}

In [None]:
# --- Hyper-parameter search switches -------------------------------------
run_lgbm_optimization = False
n_trials = 10
logging_level = "info"
cv = TimeSeriesSplit(n_splits=3)  # 5

reuse_best_trial = False
# Seed the study with the stored parameters only when explicitly requested.
best_trial = lgbm_best_params if reuse_best_trial else None

import pickle
from IPython.display import clear_output
if run_lgbm_optimization:
    # The plotting helpers are needed only in this branch.
    from optuna.visualization import plot_param_importances, plot_parallel_coordinate

    clear_output(wait=True)
    # Separate name so the `objective` function defined earlier is not shadowed.
    lgbm_objective = get_objective_function(cv=cv, logging_level=logging_level)
    study = run_optimization(lgbm_objective, n_trials=n_trials, n_jobs=1,
                            best_trial=best_trial)
    # study.best_params holds only the tuned keys, so merge them over the
    # fixed settings instead of replacing the whole dict (which would drop
    # objective, n_estimators, learning_rate, ...).
    lgbm_best_params = {**lgbm_best_params, **study.best_params}
    print("best params:", study.best_params)

    if n_trials > 1:
        plot_param_importances(study).show()
        # Plot only parameters the study actually sampled.
        plot_parallel_coordinate(study, params=list(study.best_params)).show()


def save_model(model, name: str):
    """Persist a fitted LGBM model as ``{name}.txt`` and ``{name}.pkl``.

    The text file is the booster's native dump and is written relative to the
    current working directory; the pickle of the full estimator is written to
    ``/kaggle/working/``.

    Load the pickled model with::

        with open("{name}.pkl", "rb") as f:
            model = pickle.load(f)
    """
    # Native text dump of the fitted booster.
    booster = model.booster_
    booster.save_model(f"{name}.txt")

    # Pickle of the complete estimator object.
    pickle_path = f"/kaggle/working/{name}.pkl"
    with open(pickle_path, "wb") as f:
        pickle.dump(model, f)

        
# _ = cross_validate(
#     model=LGBMRegressor(**lgbm_best_params),
#     X=train_df,
#     y=y,
#     cv=TimeSeriesSplit(n_splits=5))      
        
    
# Fixed generator seed so the per-model seeds, and therefore the saved
# models, are reproducible across runs.
rng = np.random.default_rng(42)
seeds = rng.integers(low=0, high=1000, size=3)

# The unshuffled hold-out split is identical for every seed, so compute it
# once instead of on every loop iteration.
X_train, X_test, y_train, y_test = train_test_split(train_df, y, shuffle=False, test_size=0.2)

for i, seed in enumerate(seeds):
    print(f"Fitting model {i} with seed={seed}")
    model = lgb.LGBMRegressor(**lgbm_best_params)
    model.set_params(random_state=int(seed))  # plain int rather than a NumPy scalar

    callbacks = [
        lgb.log_evaluation(period=100),
        lgb.early_stopping(100, verbose=True)
    ]
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=callbacks)

    # Report the score; it was previously computed and then discarded.
    # NOTE(review): X_test also drives early stopping, so this is optimistic.
    holdout_mae = mean_absolute_error(y_test, model.predict(X_test, num_iteration=model.best_iteration_))
    print(f"Model {i} hold-out MAE: {holdout_mae:.4f}")
    save_model(model, name=f"model-{i}")

In [None]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

# `model` is the last estimator fitted by the training loop. It is used as
# is: refitting it here without an eval_set would train all n_estimators
# rounds and throw away the early-stopped iteration count.

# The model was trained on the engineered feature columns, so every test
# batch has to go through create_features and be aligned to those columns.
feature_columns = list(X_train.columns)

# The longest look-back in create_features is 10 rows per stock, so the 11
# most recent batches are enough to fill the lag / diff features.
# NOTE(review): assumes each batch holds one row per stock for one time step — confirm.
MAX_LOOKBACK_BATCHES = 11
recent_batches = []

cnt = 0
for (test, revealed_targets, sample_predict) in iter_test:
    # errors="ignore" because some of these columns may be absent from a batch.
    raw_batch = test.drop(columns=["row_id", "time_id", "currently_scored"], errors="ignore")
    recent_batches.append(raw_batch)
    recent_batches = recent_batches[-MAX_LOOKBACK_BATCHES:]

    # create_features writes columns onto its input, so hand it a fresh
    # concatenation and leave the cached raw batches untouched.
    window_df = pd.concat(recent_batches, ignore_index=True)
    window_features = create_features(window_df, reduce_memory=False)

    # The current batch is the tail of the window. reindex selects the
    # training columns in training order (missing ones become NaN).
    batch_features = window_features.tail(len(test)).reindex(columns=feature_columns)

    sample_predict["target"] = model.predict(batch_features)
    env.predict(sample_predict)
    cnt += 1