In [None]:
!pip install koolbox scikit-learn==1.5.2
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from lightgbm import LGBMRegressor
from scipy.stats import pearsonr
from xgboost import XGBRegressor
from sklearn.base import clone
from koolbox import Trainer
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import optuna
import joblib
import gc

# Silence every warning category for the rest of the run (this is process-wide,
# so it also hides warnings raised by the libraries imported above).
warnings.filterwarnings("ignore")

class CFG:
    """Static configuration shared by the whole run (paths, CV and tuning knobs)."""

    # All three input files sit in the same directory; the prefix is only
    # needed while the class body executes and is removed again below so the
    # class exposes exactly the public settings.
    _input_dir = "/kaggle/input/drw-crypto-market-prediction"
    train_path = _input_dir + "/train.parquet"
    test_path = _input_dir + "/test.parquet"
    sample_sub_path = _input_dir + "/sample_submission.csv"
    del _input_dir

    # Name of the column to predict.
    target = "label"
    # Number of cross-validation folds and the global random seed.
    n_folds, seed = 5, 42

    # Whether to tune the Ridge blender with Optuna, and for how many trials.
    run_optuna = True
    n_optuna_trials = 250

def reduce_mem_usage(dataframe, dataset, use_float16=True):
    """Downcast numeric columns in place to the narrowest dtype holding their range.

    Only plain numpy signed-integer, unsigned-integer and floating columns are
    touched. Every other column (bool, object/string, datetime, categorical,
    pandas extension dtypes) is left exactly as it is, and a column is never
    widened.

    Args:
        dataframe: Frame to shrink. It is modified in place and also returned.
        dataset: Label used only in the progress messages.
        use_float16: When True (the default), float columns whose min/max fit
            the float16 range are stored as float16. float16 keeps roughly
            three significant decimal digits, so pass False to stop at float32
            when precision matters.

    Returns:
        The same ``dataframe`` object, with downcast columns.
    """
    print('Reducing memory usage for:', dataset)
    initial_mem_usage = dataframe.memory_usage().sum() / 1024**2

    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)
    # dtype.kind -> (limits lookup, candidate dtypes from narrowest to widest)
    targets_by_kind = {
        'i': (np.iinfo, (np.int8, np.int16, np.int32)),
        'u': (np.iinfo, (np.uint8, np.uint16, np.uint32)),
        'f': (np.finfo, float_targets),
    }

    for col in dataframe.columns:
        col_type = dataframe[col].dtype
        # Dispatch on the real dtype kind rather than on a name prefix, so that
        # unsigned ints are not mistaken for floats and non-numeric columns are
        # skipped instead of being cast to float (or raising on comparison).
        if not isinstance(col_type, np.dtype) or col_type.kind not in targets_by_kind:
            continue

        c_min = dataframe[col].min()
        c_max = dataframe[col].max()
        limits_of, candidates = targets_by_kind[col_type.kind]

        for candidate in candidates:
            # Candidates are ordered by size: once one is as wide as the
            # current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: the extreme representable values do fit.
            # A NaN min/max (empty or all-NaN column) fails both comparisons,
            # so such a column keeps its dtype.
            if limits.min <= c_min and c_max <= limits.max:
                dataframe[col] = dataframe[col].astype(candidate)
                break

    final_mem_usage = dataframe.memory_usage().sum() / 1024**2
    if initial_mem_usage:
        reduction_pct = 100 * (initial_mem_usage - final_mem_usage) / initial_mem_usage
    else:
        # Nothing was allocated to begin with; avoid a 0/0 percentage.
        reduction_pct = 0.0
    print('--- Memory usage before: {:.2f} MB'.format(initial_mem_usage))
    print('--- Memory usage after: {:.2f} MB'.format(final_mem_usage))
    print('--- Decreased memory usage by {:.1f}%\n'.format(reduction_pct))

    return dataframe

# Feature columns excluded from modelling. The names fall into three regular
# runs plus a handful of individual columns, so they are generated rather than
# spelled out; the resulting order matches the hand-written list it replaces.
cols_to_drop = (
    [f"X{i}" for i in range(697, 718)]          # X697 .. X717, consecutive
    + ['X864', 'X867', 'X869', 'X870', 'X871', 'X872']
    + [f"X{i}" for i in range(104, 183, 6)]     # X104 .. X182, every 6th
    + [f"X{i}" for i in range(351, 430, 6)]     # X351 .. X429, every 6th
)

# Read the train and test tables (in that order), discarding whatever index
# the parquet files carry in favour of a fresh 0..n-1 range.
train, test = (
    pd.read_parquet(path).reset_index(drop=True)
    for path in (CFG.train_path, CFG.test_path)
)

# Remove the excluded feature columns; the test table additionally loses its
# "label" column.
train = train.drop(cols_to_drop, axis=1)
test = test.drop(["label", *cols_to_drop], axis=1)

# Shrink numeric dtypes, train first and then test.
train, test = reduce_mem_usage(train, "train"), reduce_mem_usage(test, "test")

# Small constant added to denominators so that a zero quantity does not divide by zero.
_EPS = 1e-8


def _as_float32_or_wider(series):
    """Return *series* as floating point with at least float32 precision."""
    return series.astype(np.result_type(series.dtype, np.float32))


def add_market_features(df):
    """Add the engineered order-book / trade-flow features to *df*.

    The same feature set is used for train and test, so it lives in one place.

    The five raw quantity columns are widened to at least float32 before any
    arithmetic. After ``reduce_mem_usage`` they may be stored as float16, and
    in float16 the ``1e-8`` denominator guard rounds to zero while products
    above 65504 overflow to inf; both cases would then be silently zeroed by
    the cleanup at the end of this function.

    Args:
        df: Frame containing ``bid_qty``, ``ask_qty``, ``buy_qty``,
            ``sell_qty`` and ``volume``. New columns are added to it in place.

    Returns:
        A frame with the new columns appended in a fixed order, and with every
        +/-inf and NaN in the whole frame replaced by 0.
    """
    bid = _as_float32_or_wider(df['bid_qty'])
    ask = _as_float32_or_wider(df['ask_qty'])
    buy = _as_float32_or_wider(df['buy_qty'])
    sell = _as_float32_or_wider(df['sell_qty'])
    volume = _as_float32_or_wider(df['volume'])

    # Building blocks reused by several features below.
    total_liquidity = bid + ask
    traded_qty = buy + sell
    spread_indicator = (ask - bid) / (total_liquidity + _EPS)
    order_flow_imbalance = (buy - sell) / (volume + _EPS)
    buying_pressure = buy / (volume + _EPS)
    depth_ratio = total_liquidity / (volume + _EPS)

    # Insertion order defines the column order of the result.
    features = {
        # Pairwise interactions
        'bid_ask_interaction': bid * ask,
        'bid_buy_interaction': bid * buy,
        'bid_sell_interaction': bid * sell,
        'ask_buy_interaction': ask * buy,
        'ask_sell_interaction': ask * sell,
        'buy_sell_interaction': buy * sell,
        # Spread indicator
        'spread_indicator': spread_indicator,
        # Volume-weighted quantities
        'volume_weighted_buy': buy * volume,
        'volume_weighted_sell': sell * volume,
        'volume_weighted_bid': bid * volume,
        'volume_weighted_ask': ask * volume,
        # Ratios
        'buy_sell_ratio': buy / (sell + _EPS),
        'bid_ask_ratio': bid / (ask + _EPS),
        # Order flow imbalance
        'order_flow_imbalance': order_flow_imbalance,
        # Market pressure
        'buying_pressure': buying_pressure,
        'selling_pressure': sell / (volume + _EPS),
        # Liquidity measures
        'total_liquidity': total_liquidity,
        'liquidity_imbalance': (bid - ask) / (total_liquidity + _EPS),
        'relative_spread': (ask - bid) / (volume + _EPS),
        # Trade intensity
        'trade_intensity': traded_qty / (volume + _EPS),
        'avg_trade_size': volume / (traded_qty + _EPS),
        'net_trade_flow': (buy - sell) / (traded_qty + _EPS),
        # Market depth and activity
        'depth_ratio': depth_ratio,
        'volume_participation': traded_qty / (total_liquidity + _EPS),
        'market_activity': volume * total_liquidity,
        # Execution quality proxies
        'effective_spread_proxy': np.abs(buy - sell) / (volume + _EPS),
        'realized_volatility_proxy': np.abs(order_flow_imbalance) * volume,
        # Normalized volumes
        'normalized_buy_volume': buy / (bid + _EPS),
        'normalized_sell_volume': sell / (ask + _EPS),
        # Composite interactions
        'liquidity_adjusted_imbalance': order_flow_imbalance * depth_ratio,
        'pressure_spread_interaction': buying_pressure * spread_indicator,
    }
    for name, values in features.items():
        df[name] = values

    # Replace any inf or -inf values with NaN, then fill NaN with 0
    df = df.replace([np.inf, -np.inf], np.nan)
    return df.fillna(0)


# Identical feature engineering for both tables.
train = add_market_features(train)
test = add_market_features(test)

# Separate the target from the predictors; the test table is used as-is.
y = train[CFG.target]
X = train.drop(columns=[CFG.target])
X_test = test

# Defensive cleanup: map +/-inf to NaN and then every NaN to 0 in both matrices.
non_finite = [np.inf, -np.inf]
X = X.replace(non_finite, np.nan).fillna(0)
X_test = X_test.replace(non_finite, np.nan).fillna(0)

# Force garbage collection before training
gc.collect()

# Report the matrix sizes that go into training.
n_features = X.shape[1]
print(f"Training data shape: {X.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Number of features: {n_features}")
print("\n")

def _pearsonr(y_true, y_pred):
    return pearsonr(y_true, y_pred)[0]

# Fixed hyper-parameters for the three base learners. Each dict is unpacked
# straight into the corresponding estimator constructor.
# NOTE(review): LightGBM appears to apply `subsample` only when a non-zero
# `subsample_freq` is also set — confirm whether the values below have any effect.

# LightGBM with the standard gradient-boosted-tree booster.
lgbm_params = dict(
    boosting_type="gbdt",
    colsample_bytree=0.5625888953382505,
    learning_rate=0.029312951475451557,
    min_child_samples=63,
    min_child_weight=0.11456572852335424,
    n_estimators=126,
    n_jobs=-1,
    num_leaves=37,
    random_state=42,
    reg_alpha=85.2476527854083,
    reg_lambda=99.38305361388907,
    subsample=0.450669817684892,
    verbose=-1,
)

# LightGBM with the GOSS booster.
lgbm_goss_params = dict(
    boosting_type="goss",
    colsample_bytree=0.34695458228489784,
    learning_rate=0.031023014900595287,
    min_child_samples=30,
    min_child_weight=0.4727729225033618,
    n_estimators=220,
    n_jobs=-1,
    num_leaves=58,
    random_state=42,
    reg_alpha=38.665994901468224,
    reg_lambda=92.76991677464294,
    subsample=0.4810891284493255,
    verbose=-1,
)

# XGBoost regressor.
xgb_params = dict(
    colsample_bylevel=0.4778015829774066,
    colsample_bynode=0.362764358742407,
    colsample_bytree=0.7107423488010493,
    gamma=1.7094857725240398,
    learning_rate=0.02213323588455387,
    max_depth=20,
    max_leaves=12,
    min_child_weight=16,
    n_estimators=1667,
    n_jobs=-1,
    random_state=42,
    reg_alpha=39.352415706891264,
    reg_lambda=75.44843704068275,
    subsample=0.06566669853471274,
    verbosity=0,
)

# Per-model results, keyed by model label. Insertion order matters: it becomes
# the column order of the stacking frames built from these dicts.
scores = {}
oof_preds = {}
test_preds = {}


def _fit_base_model(label, estimator, X_train, y_train, X_holdout):
    """Cross-validate one base model and record its results under *label*.

    Fits ``estimator`` through a ``Trainer`` using an unshuffled K-fold split
    with ``CFG.n_folds`` folds, then stores the fold scores, the out-of-fold
    predictions and the predictions for ``X_holdout`` in the module-level
    ``scores``, ``oof_preds`` and ``test_preds`` dicts, and prints a short
    score summary.

    Args:
        label: Key under which the results are stored.
        estimator: Unfitted regressor handed to ``Trainer``.
        X_train: Training feature matrix.
        y_train: Training target.
        X_holdout: Feature matrix to predict after fitting.
    """
    trainer = Trainer(
        estimator,
        cv=KFold(n_splits=CFG.n_folds, shuffle=False),
        metric=_pearsonr,
        task="regression",
        metric_precision=6
    )
    trainer.fit(X_train, y_train)

    scores[label] = trainer.fold_scores
    oof_preds[label] = trainer.oof_preds
    test_preds[label] = trainer.predict(X_holdout)

    print(f"   Fold scores: {trainer.fold_scores}")
    print(f"   Mean CV score: {np.mean(trainer.fold_scores):.6f}")
    print(f"   Std CV score: {np.std(trainer.fold_scores):.6f}")


print("="*60)
print("TRAINING BASE MODELS")
print("="*60)

# (banner title, result label, estimator class, constructor parameters)
base_models = [
    ("LightGBM (GBDT)", "LightGBM (gbdt)", LGBMRegressor, lgbm_params),
    ("LightGBM (GOSS)", "LightGBM (goss)", LGBMRegressor, lgbm_goss_params),
    ("XGBoost", "XGBoost", XGBRegressor, xgb_params),
]

for step, (title, label, estimator_cls, params) in enumerate(base_models, 1):
    print(f"\n{step}. Training {title}...")
    _fit_base_model(label, estimator_cls(**params), X, y, X_test)
    # The trainer went out of scope with the helper call; reclaim its memory
    # before the next model is fitted.
    gc.collect()

def plot_weights(weights, title):
    """Draw a horizontal bar chart of ensemble coefficients, largest first.

    Args:
        weights: 2-D array; row 0 holds one coefficient per model and
            ``shape[1]`` sets the figure height. The coefficients are matched
            by position to the keys of the module-level ``oof_preds`` dict.
        title: Title placed above the chart.
    """
    descending = np.argsort(weights[0])[::-1]
    ranked_coeffs = np.array(weights[0])[descending]
    ranked_names = np.array(list(oof_preds.keys()))[descending]

    plt.figure(figsize=(10, weights.shape[1] * 0.5))
    ax = sns.barplot(x=ranked_coeffs, y=ranked_names, palette="RdYlGn_r")

    # Label each bar with its value, placed on the outer side of the bar end.
    for row, value in enumerate(ranked_coeffs):
        alignment = "left" if value >= 0 else "right"
        ax.text(value, row, f"{value:.3f}", va="center", ha=alignment, color="black")

    # Widen the x-axis by 10% of each limit so the labels are not clipped.
    left, right = ax.get_xlim()
    ax.set_xlim(left - 0.1 * abs(left), right + 0.1 * abs(right))

    plt.title(title)
    plt.xlabel("")
    plt.ylabel("")
    plt.tight_layout()
    plt.show()

# From here on X / X_test no longer hold raw features: they are rebound to the
# base-model predictions, one column per model label.
X = pd.DataFrame(data=oof_preds)
X_test = pd.DataFrame(data=test_preds)

# Persist both prediction frames to the working directory.
for frame, filename in ((X, "oof_preds.pkl"), (X_test, "test_preds.pkl")):
    joblib.dump(frame, filename)

# Free up memory from original features
del train, test
gc.collect()

def objective(trial):
    """Optuna objective: mean cross-validated Pearson score of a Ridge model.

    Samples ``alpha``, ``tol``, ``fit_intercept`` and ``positive`` from the
    trial, fits a Ridge regressor on the module-level ``X`` / ``y`` through a
    ``Trainer`` with an unshuffled ``CFG.n_folds``-fold split, and returns the
    mean of the fold scores.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        Mean of the per-fold scores for the sampled configuration.
    """
    params = {
        "random_state": CFG.seed,
        "alpha": trial.suggest_float("alpha", 0, 1000),
        "tol": trial.suggest_float("tol", 1e-6, 1e-2),
        "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
        "positive": trial.suggest_categorical("positive", [True, False])
    }

    trainer = Trainer(
        Ridge(**params),
        # Fold count comes from the shared config rather than a literal so the
        # search is scored on the configured split.
        cv=KFold(n_splits=CFG.n_folds, shuffle=False),
        metric=_pearsonr,
        task="regression",
        verbose=False
    )
    trainer.fit(X, y)

    return np.mean(trainer.fold_scores)

print("\n" + "="*60)
print("TRAINING ENSEMBLE MODEL")
print("="*60)

if CFG.run_optuna:
    print("\nOptimizing Ridge hyperparameters with Optuna...")
    sampler = optuna.samplers.TPESampler(seed=CFG.seed, multivariate=True)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    # ValueError raised inside a trial marks that trial as failed instead of
    # aborting the whole search.
    # NOTE(review): with n_jobs=-1 trials presumably run concurrently, which may
    # make the seeded sampler non-reproducible between runs — confirm.
    study.optimize(objective, n_trials=CFG.n_optuna_trials, n_jobs=-1, catch=(ValueError,))
    best_params = study.best_params

    print(f"Best parameters found: {best_params}")
    print(f"Best CV score: {study.best_value:.6f}")

    # Free up memory
    del study
    gc.collect()

    # Only the four searched hyper-parameters are taken from the study.
    tuned_keys = ("alpha", "tol", "fit_intercept", "positive")
    ridge_params = {"random_state": CFG.seed}
    ridge_params.update({key: best_params[key] for key in tuned_keys})
else:
    # No tuning requested: Ridge defaults with a fixed seed.
    ridge_params = {
        "random_state": CFG.seed
    }

print("\nTraining final Ridge ensemble...")
ridge_trainer = Trainer(
    Ridge(**ridge_params),
    # Fold count comes from the shared config rather than a literal.
    cv=KFold(n_splits=CFG.n_folds, shuffle=False),
    metric=_pearsonr,
    task="regression",
    metric_precision=6
)

ridge_trainer.fit(X, y)

scores["Ridge (ensemble)"] = ridge_trainer.fold_scores
ridge_test_preds = ridge_trainer.predict(X_test)

print(f"   Fold scores: {ridge_trainer.fold_scores}")
print(f"   Mean CV score: {np.mean(ridge_trainer.fold_scores):.6f}")
print(f"   Std CV score: {np.std(ridge_trainer.fold_scores):.6f}")

# Average the coefficients of the per-fold models. The result is kept as a
# (1, n_models) row because the plotting helper reads row 0 and shape[1].
ridge_coeffs = np.mean(
    [np.asarray(m.coef_) for m in ridge_trainer.estimators], axis=0
).reshape(1, -1)

# Free up memory
del ridge_trainer
gc.collect()

# Print summary of all models
print("\n" + "="*60)
print("FINAL MODEL PERFORMANCE SUMMARY")
print("="*60)

# One column per model, one row per fold; models ranked by mean fold score.
scores_df = pd.DataFrame(scores)
mean_scores = scores_df.mean().sort_values(ascending=False)

print("\nModel Rankings (by mean CV score):")
print("-" * 40)
for i, (model, score) in enumerate(mean_scores.items(), 1):
    # ddof=0 (population std) so this figure matches the "Std CV score" lines
    # printed with np.std while each model was trained; the pandas default
    # (ddof=1) would report a different spread for the same folds.
    std_score = scores_df[model].std(ddof=0)
    print(f"{i}. {model:20s} - Mean: {score:.6f} (±{std_score:.6f})")

print("\nDetailed Fold Scores:")
print("-" * 40)
for model in mean_scores.index:
    fold_scores = scores[model]
    print(f"\n{model}:")
    for fold, score in enumerate(fold_scores, 1):
        print(f"   Fold {fold}: {score:.6f}")

# Show how much weight the blender gives each base model.
plot_weights(ridge_coeffs, "Ridge Ensemble Coefficients")

# Write the blended predictions into the sample submission layout; the mean
# ensemble CV score is embedded in the file name.
sub = pd.read_csv(CFG.sample_sub_path)
sub["prediction"] = ridge_test_preds
ensemble_cv_score = np.mean(scores['Ridge (ensemble)'])
submission_filename = f"sub_ridge_{ensemble_cv_score:.6f}.csv"
sub.to_csv(submission_filename, index=False)

banner = "="*60
print("\n" + banner)
print(f"Submission saved as: {submission_filename}")
print(f"Final ensemble CV score: {ensemble_cv_score:.6f}")
print(banner)

# Visualization of scores: a box plot of the per-fold scores next to a bar
# chart of the mean score, both listing models from best to worst mean.
order = mean_scores.index.tolist()

# Pad the bar chart's x-range by half the spread between the best and worst
# mean on each side so that small differences between models stay visible.
min_score = mean_scores.min()
max_score = mean_scores.max()
padding = (max_score - min_score) * 0.5
lower_limit = min_score - padding
upper_limit = max_score + padding

# Two panels side by side; figure height grows with the number of models.
fig, axs = plt.subplots(1, 2, figsize=(15, scores_df.shape[1] * 0.5))

# Left panel: distribution of fold scores per model.
boxplot = sns.boxplot(data=scores_df, order=order, ax=axs[0], orient="h", color="grey")
axs[0].set_title(f"Fold Score Distribution")
axs[0].set_xlabel("")
axs[0].set_ylabel("")

# Right panel: mean score per model, zoomed to the padded range computed above.
barplot = sns.barplot(x=mean_scores.values, y=mean_scores.index, ax=axs[1], color="grey")
axs[1].set_title(f"Average Score")
axs[1].set_xlabel("")
axs[1].set_xlim(left=lower_limit, right=upper_limit)
axs[1].set_ylabel("")

# Recolour the ensemble's bar and box in cyan and annotate each bar with its
# mean rounded to 6 decimals.
# NOTE(review): assumes patches[i] on both axes corresponds to the i-th model
# in `order` — confirm for the installed seaborn version.
for i, (score, model) in enumerate(zip(mean_scores.values, mean_scores.index)):
    color = "cyan" if "ensemble" in model.lower() else "grey"
    barplot.patches[i].set_facecolor(color)
    boxplot.patches[i].set_facecolor(color)
    barplot.text(score, i, round(score, 6), va="center")

plt.tight_layout()
plt.show()

print("\nProcess completed successfully!")