In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.metrics import make_scorer
from scipy.stats import pearsonr
import gc
import datetime

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import ConvLSTM2D, Dense, Flatten, TimeDistributed, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

import lightgbm as lgb

# --- Configuration ---
# Seed TensorFlow's and NumPy's global random generators so that repeated runs
# draw the same random numbers from those two libraries.
tf.random.set_seed(42)
np.random.seed(42)

# Global configuration variables, read by the preprocessing, sequence-building,
# training and prediction sections further down.
N_FEATURES_SELECT = 150  # `k` passed to SelectKBest during feature selection
SEQUENCE_LENGTH = 60    # Rows per ConvLSTM input window; also the row offset used to align LightGBM data
BATCH_SIZE = 2048       # Batch size for ConvLSTM fit() and predict()
EPOCHS = 50             # Upper bound on ConvLSTM epochs (early stopping may end training sooner)
VALIDATION_SPLIT_DATE = '2024-01-01' # Rows with timestamp < this date train; rows with timestamp >= this date validate

# --- 1. Data Loading ---
# Reads the competition files, makes sure 'timestamp' (train) and 'ID' (test)
# are ordinary columns, and separates the test IDs from the test features.
#
# Failure paths raise SystemExit(1) rather than calling exit(): exit() is the
# helper that `site` installs for interactive sessions, it is not guaranteed
# to exist in every interpreter, and it terminates with status 0, which would
# report a failed run as a success.
print("Loading data...")
try:
    train_df = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/train.parquet')
    test_df = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/test.parquet')
    sample_submission_df = pd.read_csv('/kaggle/input/drw-crypto-market-prediction/sample_submission.csv')
except FileNotFoundError as e:
    print(f"Error loading data: {e}. Make sure the parquet files are in the specified path.")
    raise SystemExit(1) from e
else:
    print("Data loaded successfully.")

# 'timestamp' may arrive as the DataFrame index instead of a column.
if 'timestamp' not in train_df.columns:
    if train_df.index.name != 'timestamp':
        print("Error: 'timestamp' column not found in train_df. Available columns:", train_df.columns.tolist())
        raise SystemExit(1)  # Nothing downstream can work without timestamps
    print("Train: 'timestamp' is DataFrame index, resetting index...")
    train_df = train_df.reset_index()

# Convert timestamp to datetime objects for easier time-based filtering
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])

# Same treatment for the test identifier.
if 'ID' not in test_df.columns:
    if test_df.index.name != 'ID':
        print("Error: 'ID' column not found in test_df. Available columns:", test_df.columns.tolist())
        raise SystemExit(1)  # The submission cannot be built without IDs
    print("Test: 'ID' is DataFrame index, resetting index...")
    test_df = test_df.reset_index()

# Store original IDs for submission
test_ids = test_df['ID']

# 'ID' is an identifier, not a feature, so it is removed from the test frame.
test_df = test_df.drop(columns='ID')

# --- Memory Optimization: Downcast numerical columns to float32 ---
# Identify all numerical columns excluding 'label' and 'ID' which will be handled separately or dropped.
def downcast_numerical_cols(df):
    """Convert float64/int64 columns of ``df`` to float32, in place.

    Columns named 'label' or 'ID' are skipped and keep their dtype. Columns of
    any other dtype are not touched.

    Args:
        df: DataFrame to modify.

    Returns:
        The same DataFrame object that was passed in.
    """
    protected = ('label', 'ID')
    wide_columns = df.select_dtypes(include=[np.float64, np.int64]).columns
    targets = [name for name in wide_columns if name not in protected]
    for name in targets:
        df[name] = df[name].astype(np.float32)
    return df

print("Downcasting numerical columns to float32...")
# Train is converted first, then test; each call returns the frame it was given.
train_df, test_df = (downcast_numerical_cols(frame) for frame in (train_df, test_df))
print("Numerical columns downcasted.")


# Report the frame sizes after downcasting.
print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")


# --- 2. Data Preprocessing and Feature Engineering ---

def _engineer_features(df):
    """Build the engineered quantity, rolling and lag columns for ``df``.

    All new columns are collected first and returned as one frame so the
    caller can attach them with a single concat. Inserting them one at a time
    fragments a wide frame and makes pandas emit a PerformanceWarning per
    insert.

    Args:
        df: DataFrame containing 'bid_qty', 'ask_qty', 'volume', 'buy_qty'
            and 'sell_qty'.

    Returns:
        DataFrame indexed like ``df`` holding only the new columns, in this
        order: 'mid_qty_proxy', 'qty_imbalance', rolling mean/std for each
        base column and window, then lags for each base column and lag.
    """
    book_qty = df['bid_qty'] + df['ask_qty']
    # Average of the two quoted quantities; a quantity-based proxy, not a price.
    mid_qty_proxy = book_qty / 2
    # Floor the denominator so an empty book cannot produce a division by zero.
    qty_imbalance = (df['bid_qty'] - df['ask_qty']) / np.maximum(book_qty, 1e-6)

    sources = {
        'volume': df['volume'],
        'buy_qty': df['buy_qty'],
        'sell_qty': df['sell_qty'],
        'mid_qty_proxy': mid_qty_proxy,
        'qty_imbalance': qty_imbalance,
    }
    new_cols = {'mid_qty_proxy': mid_qty_proxy, 'qty_imbalance': qty_imbalance}

    # Rolling statistics over the last 10 / 30 / 60 rows.
    for col in ('volume', 'buy_qty', 'sell_qty', 'mid_qty_proxy'):
        for window in (10, 30, 60):
            roller = sources[col].rolling(window=window, min_periods=1)
            new_cols[f'{col}_roll_mean_{window}'] = roller.mean()
            # std is NaN when the window holds a single observation.
            new_cols[f'{col}_roll_std_{window}'] = roller.std().fillna(0)

    # Values from 1, 2 and 5 rows earlier; leading rows are NaN until imputed.
    for col in ('volume', 'buy_qty', 'sell_qty', 'mid_qty_proxy', 'qty_imbalance'):
        for lag in (1, 2, 5):
            new_cols[f'{col}_lag_{lag}'] = sources[col].shift(lag)

    return pd.DataFrame(new_cols)


def create_and_preprocess_features(df, is_train=True, scaler=None, selector=None):
    """
    Impute, engineer, scale and select features for one data split.

    Steps, in order: forward/backward-fill numeric columns; add engineered
    columns; turn +/-inf into NaN and re-impute; zero-fill whatever is still
    NaN; standardize; keep only the selector's columns.

    Args:
        df: Input frame. Its numeric columns are imputed IN PLACE (for test
            data 'label' is left out of that fill); callers that read columns
            such as 'label' from the same frame afterwards see the filled
            values. Engineered columns are not written back to ``df``.
        is_train: When True a new StandardScaler and SelectKBest are fitted
            on this frame (which must then contain 'label'). When False the
            supplied ``scaler`` and ``selector`` are applied.
        scaler: Fitted StandardScaler; required when ``is_train`` is False.
        selector: Fitted SelectKBest; required when ``is_train`` is False.

    Returns:
        Tuple ``(X_selected, scaler, selector, selected_features)`` where
        ``X_selected`` is a float32 DataFrame restricted to the selected
        columns and ``selected_features`` is the list of their names.

    Raises:
        ValueError: If ``scaler`` or ``selector`` is missing for test data, or
            if the selector was fitted on a different number of columns than
            this frame produces.
    """
    print(f"Starting feature engineering and preprocessing for {'train' if is_train else 'test'} data...")

    # --- Handle Missing Values (forward-fill, then back-fill leading gaps) ---
    numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
    if 'label' in numerical_cols and not is_train:
        numerical_cols.remove('label')

    df[numerical_cols] = df[numerical_cols].ffill().bfill()
    print(f"Missing values handled: {df.isnull().sum().sum()} remaining NaNs.")

    # --- Feature Engineering (attached with one concat, see helper) ---
    engineered = _engineer_features(df)
    # If the frame already carries columns with these names, replace them
    # instead of creating duplicate column labels.
    stale = df.columns.intersection(engineered.columns)
    base = df.drop(columns=stale) if len(stale) else df
    df = pd.concat([base, engineered], axis=1)

    # Fill the NaNs that shift() leaves at the start of each lag column.
    df = df.ffill().bfill()
    print(f"Engineered features added. Current shape: {df.shape}")

    # --- Explicitly handle inf values before scaling ---
    df = df.replace([np.inf, -np.inf], np.nan)
    print(f"Inf values replaced with NaN. Remaining NaNs after inf replacement: {df.isnull().sum().sum()}")
    df = df.ffill().bfill()

    features_to_process = [col for col in df.columns if col not in ('timestamp', 'label', 'ID')]
    X_processed = df[features_to_process]

    # ffill/bfill cannot repair a column that has no finite value at all, so
    # such columns are still entirely NaN here. The feature selector rejects
    # NaN input, so fall back to a constant 0 for anything left over.
    still_missing = X_processed.columns[X_processed.isnull().any()]
    if len(still_missing):
        print(f"Zero-filling {len(still_missing)} feature column(s) that ffill/bfill could not impute.")
        X_processed = X_processed.fillna(0)
    print(f"NaNs re-imputed after inf replacement. Total NaNs: {X_processed.isnull().sum().sum()}")

    # --- Normalization/Standardization ---
    if is_train:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_processed)
        print("Scaler fitted on training data.")
    else:
        if scaler is None:
            raise ValueError("Scaler must be provided for test data preprocessing.")
        X_scaled = scaler.transform(X_processed)
        print("Scaler transformed test data.")

    # Convert X_scaled back to DataFrame with float32
    X_scaled_df = pd.DataFrame(X_scaled, columns=features_to_process, index=df.index).astype(np.float32)

    # --- Feature Selection (fit on training data only, reuse elsewhere) ---
    if is_train:
        y_label = df['label'].astype(np.float32)
        # Never ask for more features than exist.
        k = min(N_FEATURES_SELECT, X_scaled_df.shape[1])
        selector = SelectKBest(mutual_info_regression, k=k)
        selector.fit(X_scaled_df, y_label)
        selected_features = X_scaled_df.columns[selector.get_support()].tolist()
        print(f"Selected {len(selected_features)} features for modeling.")
    else:
        if selector is None:
            raise ValueError("Selector must be provided for test data preprocessing.")
        support = selector.get_support()
        if len(support) != X_scaled_df.shape[1]:
            raise ValueError(
                f"Selector was fitted on {len(support)} columns but this frame "
                f"has {X_scaled_df.shape[1]} feature columns."
            )
        selected_features = X_scaled_df.columns[support].tolist()
        print(f"Applying feature selection to test data, using {len(selected_features)} features.")

    # Return the processed dataframe with only selected features
    return X_scaled_df[selected_features], scaler, selector, selected_features


# --- Apply preprocessing and feature engineering to train and test data ---
# Chronological split: rows strictly before the split date train, rows on or
# after it validate.
before_split = train_df['timestamp'] < VALIDATION_SPLIT_DATE
from_split_on = train_df['timestamp'] >= VALIDATION_SPLIT_DATE
train_data = train_df.loc[before_split].copy()
val_data = train_df.loc[from_split_on].copy()

print(f"Training data size (before feature eng): {train_data.shape}")
print(f"Validation data size (before feature eng): {val_data.shape}")

# The training pass is the only one that fits the scaler and the selector.
X_train_processed, scaler, selector, selected_features = create_and_preprocess_features(
    train_data, is_train=True
)
y_train = train_data['label'].astype(np.float32)  # targets as float32

# Validation reuses the fitted scaler/selector; only the feature frame is kept.
X_val_processed, _, _, _ = create_and_preprocess_features(
    val_data, is_train=False, scaler=scaler, selector=selector
)
y_val = val_data['label'].astype(np.float32)  # targets as float32

# Test data goes through the same fitted transformers.
X_test_processed, _, _, _ = create_and_preprocess_features(
    test_df, is_train=False, scaler=scaler, selector=selector
)

# The raw frames are no longer needed once the processed ones exist.
del train_df, train_data, val_data
gc.collect()


# --- 3. ConvLSTM Model Input Preparation ---

def create_sequences(X, y=None, sequence_length=SEQUENCE_LENGTH):
    """
    Build overlapping fixed-length windows for the ConvLSTM.

    Window ``i`` covers rows ``i .. i + sequence_length - 1`` of ``X`` and is
    paired with the target at row ``i + sequence_length`` (the row right after
    the window), so ``len(X) - sequence_length`` windows are produced.

    The windows are taken with a strided view and copied exactly once. A
    Python loop that appends one slice per row and then converts the list
    keeps several full copies of the (very large) result alive at once.

    Args:
        X: DataFrame (or 2-D array-like) of shape (rows, features).
        y: Optional targets aligned row-for-row with ``X``.
        sequence_length: Number of rows per window; must be at least 1.

    Returns:
        Tuple ``(xs, ys)``. ``xs`` is a float32 array of shape
        (n_sequences, sequence_length, 1, features, 1); when ``X`` has too few
        rows it is empty with the same trailing dimensions. ``ys`` is a
        float32 array of length n_sequences, or None when ``y`` is None.

    Raises:
        ValueError: If ``sequence_length`` < 1, ``X`` is not 2-D, or ``y`` has
            fewer rows than ``X``.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be at least 1.")

    values = np.asarray(X, dtype=np.float32)
    if values.ndim != 2:
        raise ValueError("X must be two-dimensional (rows, features).")

    n_rows, num_features = values.shape
    n_sequences = max(n_rows - sequence_length, 0)
    # (timesteps, rows=1, cols=num_features, channels=1) per sample.
    window_shape = (sequence_length, 1, num_features, 1)

    if n_sequences:
        # View of shape (n_rows - sequence_length + 1, features, sequence_length).
        # The final window is dropped because no target row follows it.
        windows = np.lib.stride_tricks.sliding_window_view(
            values, sequence_length, axis=0
        )[:n_sequences]
        # Reorder to (sample, timestep, feature) and materialize one copy.
        xs = np.ascontiguousarray(windows.transpose(0, 2, 1)).reshape(
            (n_sequences,) + window_shape
        )
    else:
        xs = np.empty((0,) + window_shape, dtype=np.float32)

    print(f"Created {n_sequences} sequences with shape {window_shape if n_sequences else 'N/A'}")

    if y is None:
        return xs, None

    targets = np.asarray(y, dtype=np.float32)
    if len(targets) < n_rows:
        raise ValueError("y must contain at least as many rows as X.")
    return xs, targets[sequence_length:n_rows].copy()

print("Preparing ConvLSTM sequences for training...")
X_train_lstm, y_train_lstm = create_sequences(X_train_processed, y_train, SEQUENCE_LENGTH)

print("Preparing ConvLSTM sequences for validation...")
X_val_lstm, y_val_lstm = create_sequences(X_val_processed, y_val, SEQUENCE_LENGTH)

# NOTE: X_train_processed, y_train, X_val_processed and y_val are deliberately
# NOT deleted here. The LightGBM section later in this script slices all four,
# so deleting them at this point makes that section fail with NameError.
gc.collect()


# --- 4. Model Selection and Training ---

# Pearson correlation as a metric for evaluation
def pearson_correlation_metric(y_true, y_pred):
    """Pearson correlation between targets and predictions, as a Keras metric.

    Both inputs are squeezed when they have more than one dimension and cast
    to float32. The result is clamped to [-1, 1]; a small epsilon in the
    denominator avoids division by zero when either input is constant.
    """
    def _squeeze_to_float(tensor):
        # Drop size-1 dimensions (only when rank > 1), then cast.
        if len(tensor.shape) > 1:
            tensor = tf.squeeze(tensor)
        return tf.cast(tensor, tf.float32)

    actual = _squeeze_to_float(y_true)
    predicted = _squeeze_to_float(y_pred)

    # Deviations from the respective means.
    actual_dev = actual - tf.reduce_mean(actual)
    predicted_dev = predicted - tf.reduce_mean(predicted)

    covariance_sum = tf.reduce_sum(actual_dev * predicted_dev)
    spread = tf.sqrt(
        tf.reduce_sum(tf.square(actual_dev)) * tf.reduce_sum(tf.square(predicted_dev))
    )
    correlation = covariance_sum / (spread + 1e-8)

    # Clamp to the valid correlation range.
    return tf.maximum(tf.minimum(correlation, 1.0), -1.0)

# --- ConvLSTM Model ---
print("Building ConvLSTM model...")
# Feature count is the second-to-last axis of the sequence tensor.
num_features_lstm = X_train_lstm.shape[-2]

# The network is assembled layer by layer. Each sample has shape
# (SEQUENCE_LENGTH, 1, num_features_lstm, 1).
convlstm_model = Sequential()
convlstm_model.add(
    ConvLSTM2D(filters=64, kernel_size=(1, 3), activation='relu',
               input_shape=(SEQUENCE_LENGTH, 1, num_features_lstm, 1),
               padding='same', return_sequences=True, kernel_regularizer=l2(0.001))
)
convlstm_model.add(Dropout(0.3))
convlstm_model.add(
    ConvLSTM2D(filters=32, kernel_size=(1, 3), activation='relu',
               padding='same', return_sequences=False, kernel_regularizer=l2(0.001))
)
convlstm_model.add(Dropout(0.3))
convlstm_model.add(Flatten())  # Collapse the ConvLSTM output for the dense head
convlstm_model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
convlstm_model.add(Dropout(0.2))
convlstm_model.add(Dense(1, activation='linear'))  # Single regression output

convlstm_model.compile(
    loss='mae',
    optimizer=Adam(learning_rate=0.001),
    metrics=[pearson_correlation_metric],
)
convlstm_model.summary()

print("Training ConvLSTM model...")
# Both callbacks watch the validation Pearson metric, where higher is better.
early_stopping = EarlyStopping(
    monitor='val_pearson_correlation_metric',
    patience=10,
    mode='max',
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_pearson_correlation_metric',
    factor=0.5,
    patience=5,
    min_lr=1e-6,
    mode='max',
    verbose=1,
)

history = convlstm_model.fit(
    X_train_lstm,
    y_train_lstm,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_val_lstm, y_val_lstm),
    callbacks=[early_stopping, reduce_lr],
)
print("ConvLSTM model training complete.")

# The sequence tensors are only needed for fitting; release them.
del X_train_lstm, y_train_lstm, X_val_lstm, y_val_lstm
gc.collect()


# --- LightGBM Model ---
print("Building and training LightGBM model...")
# LightGBM consumes the scaled, selected feature rows directly (no windows).
# The first SEQUENCE_LENGTH rows are skipped so the rows used here are the
# same rows the ConvLSTM has targets for.
after_warmup = slice(SEQUENCE_LENGTH, None)
lgbm_train_X = X_train_processed.iloc[after_warmup].copy()
lgbm_train_y = y_train.iloc[after_warmup].copy()

lgbm_val_X = X_val_processed.iloc[after_warmup].copy()
lgbm_val_y = y_val.iloc[after_warmup].copy()

lgbm_params = {
    'objective': 'mae',
    'metric': 'mae',  # Optimized and reported as MAE; no Pearson metric is configured here
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'min_child_samples': 20,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1,
}
lgbm_model = lgb.LGBMRegressor(**lgbm_params)

# Stop adding trees once validation MAE has not improved for 100 rounds.
lgbm_model.fit(
    lgbm_train_X,
    lgbm_train_y,
    eval_set=[(lgbm_val_X, lgbm_val_y)],
    eval_metric='mae',
    callbacks=[lgb.early_stopping(100, verbose=False)],
)

print("LightGBM model training complete.")

# The aligned copies are only needed for fitting; release them.
del lgbm_train_X, lgbm_train_y, lgbm_val_X, lgbm_val_y
gc.collect()


# --- 5. Prediction and Ensemble ---

# Test rows are treated as one continuous series when building windows.
print("Preparing ConvLSTM sequences for test data...")
X_test_lstm, _ = create_sequences(X_test_processed, None, SEQUENCE_LENGTH)

# ConvLSTM can only predict for rows that have a full window before them,
# i.e. every row from index SEQUENCE_LENGTH onwards.
if X_test_lstm.shape[0] > 0:
    print("Making ConvLSTM predictions...")
    convlstm_preds = convlstm_model.predict(X_test_lstm, batch_size=BATCH_SIZE).flatten()
else:
    print("Test data is too short to create LSTM sequences. Skipping ConvLSTM prediction.")
    convlstm_preds = None

# The windowed test tensor is the largest object at this stage and is not
# needed once predictions exist.
del X_test_lstm
gc.collect()

# LightGBM predicts for every test row.
print("Making LightGBM predictions...")
lgbm_preds = lgbm_model.predict(X_test_processed[selected_features])

# --- Ensemble Predictions ---
# Simple weighted average; the weights can be tuned on validation performance.
print("Ensembling predictions...")
convlstm_weight = 0.6
lgbm_weight = 0.4

# Start from LightGBM everywhere, then blend in ConvLSTM for the rows it
# covers. The first SEQUENCE_LENGTH rows therefore stay LightGBM-only.
final_predictions = np.array(lgbm_preds, dtype=np.float64)
if convlstm_preds is not None:
    final_predictions[SEQUENCE_LENGTH:] = (convlstm_weight * convlstm_preds +
                                           lgbm_weight * lgbm_preds[SEQUENCE_LENGTH:])

# Ensure predictions are float32
final_predictions = final_predictions.astype(np.float32)

print("Predictions ensembled.")

# --- 6. Submission ---
print("Creating submission file...")
# Take the header from the sample submission that was loaded with the data so
# the output columns match that file exactly (including letter case); fall
# back to the previous hard-coded names if it does not have two columns.
# NOTE(review): assumes the sample file's first column is the ID and the
# second is the prediction - confirm against the competition's format.
sample_header = list(sample_submission_df.columns[:2])
id_col, pred_col = sample_header if len(sample_header) == 2 else ('ID', 'Prediction')
submission_df = pd.DataFrame({id_col: np.asarray(test_ids), pred_col: final_predictions})
submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully.")
print(submission_df.head())


2025-06-21 14:16:15.041310: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750515375.240846      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750515375.300820      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading data...
Data loaded successfully.
Train: 'timestamp' is DataFrame index, resetting index...
Test: 'ID' is DataFrame index, resetting index...
Downcasting numerical columns to float32...
Numerical columns downcasted.
Train data shape: (525887, 897)
Test data shape: (538150, 896)
Training data size (before feature eng): (439668, 897)
Validation data size (before feature eng): (86219, 897)
Starting feature engineering and preprocessing for train data...
Missing values handled: 0 remaining NaNs.


  df['mid_qty_proxy'] = (df['bid_qty'] + df['ask_qty']) / 2
  df['qty_imbalance'] = (df['bid_qty'] - df['ask_qty']) / denominator
  df[f'{col}_roll_mean_{window}'] = df[col].rolling(window=window, min_periods=1).mean()
  df[f'{col}_roll_std_{window}'] = df[col].rolling(window=window, min_periods=1).std()
  df[f'{col}_roll_mean_{window}'] = df[col].rolling(window=window, min_periods=1).mean()
  df[f'{col}_roll_std_{window}'] = df[col].rolling(window=window, min_periods=1).std()
  df[f'{col}_roll_mean_{window}'] = df[col].rolling(window=window, min_periods=1).mean()
  df[f'{col}_roll_std_{window}'] = df[col].rolling(window=window, min_periods=1).std()
  df[f'{col}_roll_mean_{window}'] = df[col].rolling(window=window, min_periods=1).mean()
  df[f'{col}_roll_std_{window}'] = df[col].rolling(window=window, min_periods=1).std()
  df[f'{col}_roll_mean_{window}'] = df[col].rolling(window=window, min_periods=1).mean()
  df[f'{col}_roll_std_{window}'] = df[col].rolling(window=window, min_periods

Engineered features added. Current shape: (439668, 938)
Inf values replaced with NaN. Remaining NaNs after inf replacement: 9233028
NaNs re-imputed after inf replacement. Total NaNs: 9233028
