In [2]:
# submission_and_analysis.ipynb (Leaderboard-aware version)
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
from catboost import CatBoostRegressor

In [3]:
# === LOAD DATA ===
# Read the train and test splits from their parquet files in a single
# unpacking expression; the order of the paths fixes which name gets which.
train, test = (
    pd.read_parquet(parquet_path)
    for parquet_path in ("data/train.parquet", "data/test.parquet")
)

In [4]:
# === FEATURE ENGINEERING (NO LEAKING) ===
def make_features(df):
    """Return an enriched copy of ``df`` and the names of its raw ``X*`` columns.

    Five columns are appended, in this order: ``liq_imbalance``,
    ``order_pressure``, ``aggressiveness``, ``volatility_5`` and
    ``mean_pressure_3``.  The caller's frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``bid_qty``, ``ask_qty``, ``buy_qty``, ``sell_qty`` and
        ``volume``.  The rolling features walk the rows in their current
        order, so the frame is presumably expected to be sorted by time --
        TODO confirm against the data source.

    Returns
    -------
    tuple
        ``(enriched_frame, X_cols)`` where ``X_cols`` lists every column whose
        name starts with ``"X"``.  No PCA is run here; the list is only
        collected for the caller.
    """
    out = df.copy()

    # Resting quantity on both sides of the book; the small constant keeps the
    # two ratios below finite when both quantities are zero.
    depth = out['bid_qty'] + out['ask_qty'] + 1e-5
    pressure = out['buy_qty'] - out['sell_qty']

    out['liq_imbalance'] = (out['bid_qty'] - out['ask_qty']) / depth
    out['order_pressure'] = pressure
    out['aggressiveness'] = (out['buy_qty'] + out['sell_qty']) / depth

    # Lag by one row before rolling so each window only contains earlier rows.
    # The first row is therefore NaN, and the std is also NaN while the window
    # holds a single observation.
    prev_volume = out['volume'].shift(1)
    prev_pressure = pressure.shift(1)
    out['volatility_5'] = prev_volume.rolling(5, min_periods=1).std()
    out['mean_pressure_3'] = prev_pressure.rolling(3, min_periods=1).mean()

    X_cols = [name for name in out.columns if name.startswith("X")]
    return out, X_cols

In [5]:
# Apply feature engineering
# Replace the raw training frame with its enriched copy and keep the list of
# X* column names that the PCA cells use later.
train, X_cols = make_features(df=train)

In [8]:
# Time-based train/val split using datetime index
# Rows with any missing value are removed first; this includes the leading
# rows whose lagged rolling features are NaN.
train = train.dropna()
if train.empty:
    raise ValueError("No training rows left after dropna(); cannot build a train/val split.")

# 80th percentile of the timestamps themselves. Taking the quantile on the
# datetime Series keeps the index resolution (and any timezone) intact.
# Converting to int64 and back through pd.to_datetime would treat the number
# as nanoseconds, which is only right for a datetime64[ns] index.
cutoff = train.index.to_series().quantile(0.8)

# Rows up to and including the cutoff train the model; later rows are held out.
train_df = train[train.index <= cutoff]
val_df = train[train.index > cutoff]

In [9]:
# PCA: fit on train, transform both
# The components are learned from the training slice only and then applied
# unchanged to the validation slice. random_state pins the result when
# scikit-learn's automatic solver choice uses the randomized SVD, so a
# restart-and-run-all reproduces the same components.
# NOTE(review): the X columns are fed in unscaled and PCA is scale-sensitive --
# confirm that this is intended.
pca = PCA(n_components=10, random_state=42)
pca_train = pca.fit_transform(train_df[X_cols])
pca_val = pca.transform(val_df[X_cols])

In [11]:
# Re-slice with .copy() so the column assignments below write to standalone
# frames rather than to selections of `train`.
train_df = train[train.index <= cutoff].dropna().copy()
val_df = train[train.index > cutoff].dropna().copy()

# One pca_<i> column per principal component. The count is read from the PCA
# output instead of repeating the literal used when the PCA was configured.
# pandas raises if the row counts of the frames and the arrays disagree.
for i in range(pca_train.shape[1]):
    train_df[f'pca_{i}'] = pca_train[:, i]
    val_df[f'pca_{i}'] = pca_val[:, i]

In [12]:
# Combine all features
# Hand-crafted features first, then one pca_<i> column per fitted component.
# The count comes from the fitted PCA so this list cannot drift from it.
used_features = [
    'liq_imbalance', 'order_pressure', 'aggressiveness', 'volatility_5', 'mean_pressure_3',
] + [f'pca_{i}' for i in range(pca.n_components_)]

# Design matrices and targets for the training and validation slices.
X_train = train_df[used_features]
y_train = train_df['label']
X_val = val_df[used_features]
y_val = val_df['label']

In [13]:
# Train CatBoost
# Hyper-parameters are gathered in one mapping so they can be inspected or
# logged alongside the run.
catboost_params = dict(
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    l2_leaf_reg=3,
    early_stopping_rounds=100,  # stop after 100 rounds without improvement on the eval set
    verbose=100,                # print a progress line every 100 iterations
)
model = CatBoostRegressor(**catboost_params)
# The validation slice doubles as the early-stopping set.
model.fit(X_train, y_train, eval_set=(X_val, y_val))

0:	learn: 1.0017618	test: 1.0406042	best: 1.0406042 (0)	total: 195ms	remaining: 3m 14s
100:	learn: 0.9810510	test: 1.0434104	best: 1.0406042 (0)	total: 3.12s	remaining: 27.8s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 1.040604152
bestIteration = 0

Shrink model to first 1 iterations.


<catboost.core.CatBoostRegressor at 0x1b527be06e0>

In [14]:
# Validation
# Score the held-out slice with the Pearson correlation between labels and
# predictions; the p-value is unpacked but not reported.
y_pred = model.predict(X_val)
val_pearson, val_pvalue = pearsonr(y_val, y_pred)
print(f"Validation Pearson Correlation: {val_pearson:.5f}")

Validation Pearson Correlation: -0.02944


In [16]:
# === SUBMISSION ===
# Apply feature pipeline to test

# Same engineered features as for training. The X* column list collected from
# the training frame is reused so the PCA sees the columns it was fitted on.
test, _ = make_features(test)
test_pca = pca.transform(test[X_cols])

# One pca_<i> column per component, sized from the transform output rather
# than a repeated literal.
for i in range(test_pca.shape[1]):
    test[f'pca_{i}'] = test_pca[:, i]

# No rows are dropped here, so the leading rows whose lagged features are NaN
# are passed to the model as they are.
X_test = test[used_features]
test['prediction'] = model.predict(X_test)

# IDs are assigned 1..N following the current row order of `test`.
# NOTE(review): presumably this matches the ID order the leaderboard expects -- confirm.
submission = pd.DataFrame({
    'ID': np.arange(1, len(test) + 1),
    'prediction': test['prediction']
})
submission.to_csv("submission_test.csv", index=False)
print("✅ submission_test.csv saved.")

✅ submission_test.csv saved.
