In [1]:
import gc
import os

import joblib

import pandas as pd
import numpy as np
import polars as pl

import lightgbm as lgb
import xgboost as xgb
import catboost as cbt

from sklearn.preprocessing import StandardScaler

In [2]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    The frame is modified in place and also returned. Memory usage before and
    after the conversion is printed.

    Rules applied per column:

    * Signed / unsigned NumPy integer columns are cast to the smallest integer
      type of the same signedness whose range contains ``[min, max]``
      (bounds are inclusive, so e.g. 127 still fits ``int8``).
    * NumPy float columns are cast to ``float16`` if their finite range fits,
      otherwise to ``float32``. Finite values that do not fit ``float32`` are
      left untouched instead of being turned into ``inf``.
    * Float columns that are empty, all-NaN or contain ``inf`` are cast to
      ``float32``.
    * Every other dtype (object, bool, datetime, categorical, pandas extension
      dtypes, ...) is left untouched.

    NOTE(review): ``float16`` keeps only ~3 significant decimal digits; confirm
    this precision loss is acceptable for every caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_mem:.2f} MB")

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy integer/float columns are safe to downcast; bools,
        # datetimes, categoricals and extension dtypes are skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in "iuf":
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in "iu":
            candidates = signed_types if col_type.kind == "i" else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # An empty column yields NaN here, so nothing matches and the
                # column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            continue

        if not (np.isfinite(c_min) and np.isfinite(c_max)):
            # Empty, all-NaN or inf-containing column: float32 can represent
            # NaN and inf, so fall back to it.
            df[col] = df[col].astype(np.float32)
            continue

        for candidate in float_types:
            limits = np.finfo(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
        # No match means finite values beyond the float32 range: keep the
        # original dtype rather than overflowing to inf.

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f"Decreased by {decrease:.2f}%")

    return df

In [3]:
# Custom R2 metric for XGBoost
def r2_xgb(y_true, y_pred, sample_weight=None):
    """Return the negated, zero-mean weighted R2 score.

    The score is ``1 - sum(w * (y_pred - y_true)^2) / sum(w * y_true^2)``
    (computed through weighted averages); the denominator gets a tiny epsilon
    to avoid division by zero. The result is negated before being returned,
    presumably so that XGBoost, which minimises an unrecognised custom metric
    during early stopping, effectively maximises R2.

    Parameters
    ----------
    y_true, y_pred : numpy.ndarray
        Targets and predictions of equal shape.
    sample_weight : numpy.ndarray or None, optional
        Per-sample weights. ``None`` means uniform weights; the default lets
        the metric be called with only ``(y_true, y_pred)``, which is how
        XGBoost's scikit-learn wrapper invokes it when the evaluation set has
        no weights.

    Returns
    -------
    float
        ``-R2``.
    """
    r2 = 1 - np.average((y_pred - y_true) ** 2, weights=sample_weight) / (
        np.average((y_true) ** 2, weights=sample_weight) + 1e-38
    )
    return -r2


# Custom R2 metric for LightGBM
def r2_lgb(y_true, y_pred, sample_weight=None):
    """Zero-mean weighted R2 score in LightGBM's custom-metric format.

    The score is ``1 - sum(w * (y_pred - y_true)^2) / sum(w * y_true^2)``
    (computed through weighted averages); the denominator gets a tiny epsilon
    to avoid division by zero.

    Parameters
    ----------
    y_true, y_pred : numpy.ndarray
        Targets and predictions of equal shape.
    sample_weight : numpy.ndarray or None, optional
        Per-sample weights; ``None`` (the default) means uniform weights, so
        the metric also works for unweighted evaluation sets and direct calls.

    Returns
    -------
    tuple
        ``("r2", score, True)`` - metric name, value, and the
        higher-is-better flag.
    """
    r2 = 1 - np.average((y_pred - y_true) ** 2, weights=sample_weight) / (
        np.average((y_true) ** 2, weights=sample_weight) + 1e-38
    )
    return "r2", r2, True


# Custom R2 metric for CatBoost
class r2_cbt(object):
    """Zero-mean weighted R2 exposed through CatBoost's custom-metric hooks.

    ``evaluate`` accumulates the weighted squared error and the weighted
    squared target; ``get_final_error`` combines the two into ``1 - err / tot``.
    """

    def get_final_error(self, error, weight):
        """Turn the two sums returned by ``evaluate`` into the R2 value."""
        denominator = weight + 1e-38  # epsilon guards against division by zero
        return 1 - error / denominator

    def is_max_optimal(self):
        """Report that larger metric values are better."""
        return True

    def evaluate(self, approxes, target, weight):
        """Return ``(weighted squared error, weighted squared target)``.

        ``approxes`` must hold exactly one prediction container with the same
        length as ``target``. ``weight`` may be ``None``, in which case every
        sample counts with weight 1.0.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        predictions = approxes[0]
        uniform = weight is None

        squared_error_total = 0.0
        squared_target_total = 0.0

        # Plain index access only: the containers are addressed exactly the
        # way the metric has always addressed them.
        for idx in range(len(predictions)):
            sample_w = 1.0 if uniform else weight[idx]
            truth = target[idx]
            squared_target_total += sample_w * (truth ** 2)
            squared_error_total += sample_w * ((predictions[idx] - truth) ** 2)

        return squared_error_total, squared_target_total

In [4]:
# Name of the response column being predicted.
target = "responder_6"

# Project root that holds the `data/` and `Scaled_Models/` folders. Set the
# JANE_STREET_PROJECT_DIR environment variable to run the notebook on another
# machine; the original author's local path remains the default.
main = os.environ.get(
    "JANE_STREET_PROJECT_DIR",
    "C:/Users/edmun/OneDrive/Desktop/Personal-Projects/Kaggle/Jane Street Real Time Market Data Forecasting",
)

# NOTE(review): this flag is not read by any of the cells visible here -
# confirm whether the training loop is supposed to be gated on it.
TRAINING = False

# Keyword arguments for lgb.LGBMRegressor.
LGB_Params = {
    "n_estimators": 500,
    "device": "gpu",
    "gpu_use_dp": True,
    "objective": "l2",
    "n_jobs": 2,
}

# Keyword arguments for xgb.XGBRegressor. The default evaluation metric is
# disabled so that only the custom (negated) R2 metric is reported and used.
XGB_Params = {
    "learning_rate": 0.05,
    "max_depth": 6,
    "n_estimators": 200,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 1,
    "reg_lambda": 5,
    "eval_metric": r2_xgb,
    "disable_default_eval_metric": True,
    "device": "cuda",
    "early_stopping_rounds": 100,
}

# Keyword arguments for cbt.CatBoostRegressor: RMSE is optimised, the custom
# R2 metric object is used for evaluation.
CBT_Params = {
    "iterations": 1000,
    "learning_rate": 0.05,
    "task_type": "GPU",
    "loss_function": "RMSE",
    "eval_metric": r2_cbt(),
}

# Number of dates to skip from the beginning of the dataset
skip_dates = 500

# Global variable for fitted models for the test set inference
fitted_models = []

# Define the feature names based on the number of features (79 in this case)
feature_names = [f"feature_{i:02d}" for i in range(79)]
exogeneous_features = [
    "sin_time_id",
    "cos_time_id",
    "sin_time_id_halfday",
    "cos_time_id_halfday",
]

# Number of training folds; one `*_train_{fold}.parquet` set is read per fold.
N_fold = 5

In [5]:
# Validation split shared by every fold: the feature frame stays a DataFrame,
# while the target and the sample weights are flattened to 1-D NumPy arrays.
X_valid = pd.read_parquet(f"{main}/data/training_data/X_valid.parquet")
y_valid, w_valid = (
    pd.read_parquet(f"{main}/data/training_data/{prefix}_valid.parquet")
    .to_numpy()
    .flatten()
    for prefix in ("y", "w")
)

In [None]:
# One regressor per boosting library, keyed by the short name that
# `training_models` dispatches on and uses in the saved file names.
model_dict = dict(
    XGB=xgb.XGBRegressor(**XGB_Params),
    LGB=lgb.LGBMRegressor(**LGB_Params),
    CBT=cbt.CatBoostRegressor(**CBT_Params),
)

In [7]:
def _load_fold_frame(prefix, fold):
    """Read ``{prefix}_train_{fold}.parquet`` and downcast it with ``reduce_mem_usage``."""
    return reduce_mem_usage(
        pd.read_parquet(f"{main}/data/training_data/{prefix}_train_{fold}.parquet")
    )


def training_models(model_dict, fold):
    """Fit every model in ``model_dict`` on one training fold and pickle it.

    The fold's features, target and sample weights are read from
    ``{main}/data/training_data`` and downcast with ``reduce_mem_usage``.
    Each model is validated on the module-level ``X_valid`` / ``y_valid`` /
    ``w_valid`` split with early stopping, then written to
    ``{main}/Scaled_Models/{name}/{name}_{fold+1}.pkl``.

    NOTE(review): ``reduce_mem_usage`` may store the target and the weights as
    float16 - confirm that precision is sufficient for training.

    Parameters
    ----------
    model_dict : dict
        Maps ``"XGB"``, ``"LGB"`` and/or ``"CBT"`` to the matching regressor.
    fold : int
        Zero-based fold index used in the input file names; saved files are
        numbered ``fold + 1``.

    Raises
    ------
    ValueError
        If ``model_dict`` contains a key other than ``"XGB"``, ``"LGB"`` or
        ``"CBT"``. Raised before any data is loaded so an unfitted model is
        never written to disk.
    """
    unknown = [name for name in model_dict if name not in ("XGB", "LGB", "CBT")]
    if unknown:
        raise ValueError(f"Unsupported model name(s): {unknown}")

    X_train = _load_fold_frame("X", fold)
    y_train = _load_fold_frame("y", fold).to_numpy().flatten()
    w_train = _load_fold_frame("w", fold).to_numpy().flatten()

    for name, model in model_dict.items():
        if name == "XGB":
            model.fit(
                X_train,
                y_train,
                sample_weight=w_train,
                eval_set=[(X_valid, y_valid)],
                sample_weight_eval_set=[w_valid],
                verbose=10,
            )
        elif name == "LGB":
            # Validation weights go through `eval_sample_weight`; passing them
            # as a third element of the `eval_set` tuple does not apply them.
            model.fit(
                X_train,
                y_train,
                sample_weight=w_train,
                eval_metric=[r2_lgb],
                eval_set=[(X_valid, y_valid)],
                eval_sample_weight=[w_valid],
                callbacks=[lgb.early_stopping(100), lgb.log_evaluation(10)],
            )
        else:  # "CBT" - the only remaining name after the check above
            evalset = cbt.Pool(X_valid, y_valid, weight=w_valid)

            model.fit(
                X_train,
                y_train,
                sample_weight=w_train,
                eval_set=[evalset],
                verbose=10,
                early_stopping_rounds=100,
            )

        # Create the target folder first so a finished fit is never lost to a
        # missing directory.
        model_dir = f"{main}/Scaled_Models/{name}"
        os.makedirs(model_dir, exist_ok=True)
        model_filename = f"{model_dir}/{name}_{fold+1}.pkl"
        joblib.dump(model, model_filename)

        gc.collect()

In [None]:
# Train and persist every model once per fold (fold indices 0 .. N_fold - 1).
for fold in range(N_fold):
    training_models(model_dict, fold)