# Crypto Market Price Movement Prediction

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

## CONFIGURATION

In [None]:
# ======================= CONFIGURATION =======================
# --- Paths ---
# Kaggle input files for the competition and the name of the file to write.
TRAIN_PATH = "/kaggle/input/drw-crypto-market-prediction/train.parquet"
TEST_PATH = "/kaggle/input/drw-crypto-market-prediction/test.parquet"
SUB_PATH = "/kaggle/input/drw-crypto-market-prediction/sample_submission.csv"
OUT_PATH = "submission.csv"

# --- Feature config ---
# Raw columns kept as model inputs. The anonymised "X..." columns were picked
# by SHAP importance from an XGBoost model trained on all features and tuned
# with Optuna.
# Ref: https://www.kaggle.com/code/sadettinamilverdil/yat-r-m-tavsiyesi-de-ildir/comments
BASE_FEATURES = [
    "X863",
    "X856",
    "X598",
    "X862",
    "X385",
    "X852",
    "X603",
    "X860",
    "X674",
    "X415",
    "X345",
    "X855",
    "X174",
    "X302",
    "X178",
    "X168",
    "X612",
    "X888",
    "X421",
    "X333",
    "buy_qty",
    "sell_qty",
    "volume",
    "bid_qty",
    "ask_qty",
]

# Columns produced by feature_engineering() that are fed to the model.
ENGINEERED_FEATURES = [
    "liquidity_ratio",
    "buy_sell_interaction",
    "log_volume",
    "bid_buy_interaction",
    "volume_weighted_ask",
    "market_activity",
    "volume_participation",
    "total_liquidity",
    "bid_sell_interaction",
    "normalized_sell_volume",
    "realized_volatility_proxy",
    "effective_spread_proxy",
    "bid_ask_imbalance",
]

# Full model input: raw columns first, engineered columns after.
FEATURES = [*BASE_FEATURES, *ENGINEERED_FEATURES]

RANDOM_STATE = 42
FOLDS = 5

# XGBoost parameters taken from an Optuna hyperparameter search.
# Ref: https://www.kaggle.com/code/sadettinamilverdil/yat-r-m-tavsiyesi-de-ildir/comments
# NOTE(review): "gpu_hist" is deprecated in XGBoost >= 2.0 in favour of
# tree_method="hist" with device="cuda" -- confirm against the installed version.
XGB_PARAMS = dict(
    tree_method="gpu_hist",
    colsample_bylevel=0.4778,
    colsample_bynode=0.3628,
    colsample_bytree=0.7107,
    gamma=1.7095,
    learning_rate=0.02213,
    max_depth=20,
    max_leaves=12,
    min_child_weight=16,
    n_estimators=1667,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    reg_alpha=39.3524,
    reg_lambda=75.4484,
    subsample=0.06567,
    verbosity=0,
)


## FUNCTIONS

In [None]:
# ======================= FUNCTIONS =======================
# This function reduces memory usage of a DataFrame by downcasting numeric types and converting object types to categories where appropriate.
def reduce_mem_usage(df, verbose=True):
    """Shrink the columns of ``df`` to smaller dtypes, in place, and return ``df``.

    Rules applied per column:

    * NumPy integer columns (signed or unsigned) are cast to the narrowest
      integer type whose range contains the column's min and max. Columns
      without negative values use unsigned types, others use signed types.
      Range limits are inclusive, so e.g. a max of 255 still fits ``uint8``.
    * Every other numeric column is passed through
      ``pd.to_numeric(..., downcast='float')``.
    * ``object`` columns whose ratio of distinct values to rows is below 0.5
      become ``category``.

    Args:
        df: DataFrame to compact. It is modified in place.
        verbose: When true, print the memory footprint before and after.

    Returns:
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype
        if pd.api.types.is_numeric_dtype(col_type):
            # Check the dtype kind rather than the dtype name so that unsigned
            # columns ("uint8", ...) are narrowed as integers too.
            if isinstance(col_type, np.dtype) and col_type.kind in "iu":
                if len(df[col]) == 0:
                    # min()/max() are undefined for an empty column; keep its dtype.
                    continue
                # Plain Python ints avoid any fixed-width comparison surprises.
                c_min = int(df[col].min())
                c_max = int(df[col].max())
                candidates = unsigned_types if c_min >= 0 else signed_types
                for candidate in candidates:
                    limits = np.iinfo(candidate)
                    if limits.min <= c_min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                df[col] = pd.to_numeric(df[col], downcast='float')
        elif col_type == object:
            num_unique_values = df[col].nunique()
            num_total_values = len(df[col])
            # Guard the ratio against a zero-row frame.
            if num_total_values and num_unique_values / num_total_values < 0.5:
                df[col] = df[col].astype('category')

    end_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
    if verbose:
        # Avoid dividing by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f"Mem. usage decreased from {start_mem:.2f} MB to {end_mem:.2f} MB "
              f"({reduction:.1f}% reduction)")
    return df

# This function fills missing values in a DataFrame using forward and backward fill methods, and replaces infinite values with NaN.
def fill_and_clean(df):
    """Return a copy of ``df`` with NaN and +/-inf values filled from neighbouring rows.

    Steps, in order:

    1. Forward-fill, then backward-fill, existing NaN values.
    2. Turn ``+inf`` / ``-inf`` into NaN.
    3. Forward-fill, then backward-fill again to close the gaps from step 2.

    ``DataFrame.ffill`` / ``DataFrame.bfill`` are used instead of
    ``fillna(method=...)``, which pandas has deprecated.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame. A column with no finite value at all stays NaN.
    """
    filled = df.ffill().bfill()
    # replace() without inplace returns a new frame, so the caller's data is untouched.
    filled = filled.replace([np.inf, -np.inf], np.nan)
    return filled.ffill().bfill()

# Ref1: https://www.kaggle.com/code/taylorsamarel/low-signal-to-noise/notebook
# Ref2: https://www.kaggle.com/code/yangq369/drw-lightgbm-fold
def feature_engineering(df):
    """Append derived order-book and trade-flow columns to ``df`` and return it.

    Reads the ``bid_qty``, ``ask_qty``, ``buy_qty``, ``sell_qty`` and
    ``volume`` columns and writes the new columns directly onto ``df``
    (the input is mutated). Every ratio adds ``1e-8`` to its denominator
    so that a zero denominator does not divide by zero.

    ``order_flow_imbalance`` is an intermediate for
    ``realized_volatility_proxy``; it is also stored as a column.
    """
    eps = 1e-8
    bid, ask = df['bid_qty'], df['ask_qty']
    buy, sell = df['buy_qty'], df['sell_qty']
    volume = df['volume']

    # Sub-expressions shared by several features.
    book_depth = bid + ask
    trade_gap = buy - sell
    flow_imbalance = trade_gap / (buy + sell + eps)

    # Insertion order of this mapping is the order the columns are added in.
    derived = {
        'liquidity_ratio': book_depth / (volume + eps),
        'buy_sell_interaction': buy * sell,
        'log_volume': np.log1p(volume),
        'bid_buy_interaction': bid * buy,
        'volume_weighted_ask': ask * volume,
        'market_activity': volume * book_depth,
        'volume_participation': (buy + sell) / (book_depth + eps),
        'total_liquidity': book_depth,
        'bid_sell_interaction': bid * sell,
        'normalized_sell_volume': sell / (ask + eps),
        'order_flow_imbalance': flow_imbalance,
        'realized_volatility_proxy': np.abs(flow_imbalance) * volume,
        'effective_spread_proxy': np.abs(trade_gap) / (volume + eps),
        'bid_ask_imbalance': (bid - ask) / (book_depth + eps),
    }
    for column_name, values in derived.items():
        df[column_name] = values
    return df

def load_data(train_path, test_path):
    """Read the training and test parquet files and return them as ``(train, test)``."""
    frames = [pd.read_parquet(path) for path in (train_path, test_path)]
    return frames[0], frames[1]

def prepare_data(train, test):
    """Run the preprocessing pipeline on both frames and return ``(train, test)``.

    Each stage handles ``train`` first and ``test`` second:

    1. ``reduce_mem_usage`` compacts the dtypes.
    2. ``reset_index`` moves the index into a column; a resulting column
       named ``index`` is renamed to ``timestamp`` (train) or ``ID`` (test).
    3. ``fill_and_clean`` fills NaN / infinite values.
    4. ``feature_engineering`` adds the derived columns.
    """
    train, test = reduce_mem_usage(train), reduce_mem_usage(test)

    train = train.reset_index().rename(columns={'index': 'timestamp'})
    test = test.reset_index().rename(columns={'index': 'ID'})

    train, test = fill_and_clean(train), fill_and_clean(test)
    train, test = feature_engineering(train), feature_engineering(test)
    return train, test

# This function trains an XGBoost model using cross-validation and returns predictions for the test set.
def train_xgboost_cv(X, y, X_test, xgb_params, folds=5, weighted=True):
    """Train one XGBoost regressor per K-fold split and blend the test predictions.

    Both blends are always computed, so both returned arrays are valid
    whatever the value of ``weighted``. Previously only one of them was
    populated, which left the other as zeros or raised ``NameError``.

    Args:
        X: Training features (DataFrame; rows are selected with ``.iloc``).
        y: Training target (Series; rows are selected with ``.iloc``).
        X_test: Test features to predict for.
        xgb_params: Keyword arguments forwarded to ``xgb.XGBRegressor``.
        folds: Number of K-fold splits. Splits are contiguous and unshuffled.
        weighted: If true, folds are weighted by their validation Pearson
            correlation. Negative scores count as zero; if no usable positive
            weight remains, the weights fall back to uniform. If false, the
            weights are uniform, so both returned arrays hold the simple average.

    Returns:
        ``(test_preds, weighted_test_preds)``: the simple average of the
        per-fold test predictions, and their weighted average.
    """
    kf = KFold(n_splits=folds)  # , shuffle=True, random_state=RANDOM_STATE)
    oof_preds = np.zeros(len(X))
    fold_scores = []
    fold_test_preds = []

    for i, (train_idx, valid_idx) in enumerate(kf.split(X)):
        print(f"Fold: {i + 1}")
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        model = xgb.XGBRegressor(**xgb_params)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)
        val_pred = model.predict(X_valid)
        oof_preds[valid_idx] = val_pred
        fold_test_preds.append(model.predict(X_test))
        fold_score = pearsonr(y_valid, val_pred)[0]
        fold_scores.append(fold_score)
        print(f"Fold {i + 1} RMSE: {np.sqrt(mean_squared_error(y_valid, val_pred)):.4f} | Pearson: {fold_score:.4f}")

    # Simple average across folds -- always available as the first return value.
    test_preds = np.mean(fold_test_preds, axis=0)

    # Fold weights: uniform by default, score-proportional when requested and usable.
    n_folds = len(fold_scores)
    weights = np.full(n_folds, 1.0 / n_folds)
    if weighted:
        # A negative correlation must not become a negative blending weight.
        clipped_scores = np.clip(np.asarray(fold_scores, dtype=float), 0.0, None)
        score_total = clipped_scores.sum()
        # NaN scores (e.g. constant predictions) or an all-zero total cannot be normalised.
        if np.isfinite(score_total) and score_total > 0:
            weights = clipped_scores / score_total

    weighted_test_preds = np.zeros(len(X_test))
    for w, preds in zip(weights, fold_test_preds):
        weighted_test_preds += w * preds

    print("Final RMSE on OOF:", np.sqrt(mean_squared_error(y, oof_preds)))
    pearson_score = pearsonr(y, oof_preds)[0]
    print("Final Pearson Correlation =", pearson_score)
    print("Test predictions shape:", test_preds.shape)
    print("Weighted test predictions shape:", weighted_test_preds.shape)
    print("Weights used for each fold:", weights)
    return test_preds, weighted_test_preds

def save_submission(test_preds, out_path, sub_path=SUB_PATH):
    """Write ``test_preds`` into the ``prediction`` column of the submission template.

    The template is read from ``sub_path``, its ``prediction`` column is set
    to ``test_preds``, and the result is written to ``out_path`` as CSV
    without the index. A confirmation line is printed afterwards.
    """
    template = pd.read_csv(sub_path)
    template.assign(prediction=test_preds).to_csv(out_path, index=False)
    print(f"Submission saved to {out_path}")

## MAIN SCRIPT

In [None]:
# ======================= MAIN SCRIPT =======================
# Load -> preprocess -> cross-validated training -> write the submission file.

if __name__ == "__main__":
    train, test = load_data(TRAIN_PATH, TEST_PATH)
    train, test = prepare_data(train, test)

    # Select the model inputs directly. The test frame is indexed with FEATURES
    # only, so it does not need a 'label' column.
    X = train[FEATURES]
    y = train['label']
    X_test = test[FEATURES]

    # The first return value is the simple fold average, the second the weighted one.
    test_preds, weighted_test_preds = train_xgboost_cv(X, y, X_test, XGB_PARAMS, folds=FOLDS, weighted=False) # Choose weighted=False for simple average
    save_submission(test_preds, OUT_PATH, SUB_PATH)

NameError: name 'load_data' is not defined

In [5]:
# !kaggle competitions submit -c drw-crypto-market-prediction -f submission.csv -m "XGBoost with Weighted KFold + Feature Engineering" --quiet