<a href="https://www.kaggle.com/code/nicholas33/drw-crypto-market-prediction-nb153?scriptVersionId=248907206" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# DRW Crypto Market Prediction Competition Pipeline
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os # Import os for path checking
import gc # Import garbage collector
import math # For ceil in data generator

# Memory optimization and data processing
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

# Deep learning and modeling (TensorFlow import remains for GPU memory setting)
import tensorflow as tf 
# Removed: from tensorflow.keras.models import Sequential, Model
# Removed: from tensorflow.keras.layers import (BatchNormalization, Input, Conv1D, MaxPooling1D, Flatten, Reshape, TimeDistributed)
# Removed: from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Removed: from tensorflow.keras.optimizers import Adam
# Removed: from tensorflow.keras.utils import Sequence

# Tree-based models for ensemble
import lightgbm as lgb
import xgboost as xgb # Import XGBoost
from scipy.stats import pearsonr

# Ask TensorFlow to enable memory growth on every visible GPU.
# With no GPU present the device list is empty and the loop does nothing.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Best effort only: report the problem and carry on.
    print(err)


# Removed TimeSeriesSequence class as it's no longer needed for Keras DL models
# class TimeSeriesSequence(Sequence):
#     ...


class CryptoMarketPredictor:
    def __init__(self, sequence_length=30, top_features=100, top_X_features_to_preselect=30):
        """Store the hyper-parameters and create empty containers for fitted state.

        Args:
            sequence_length: Stored on the instance; no method visible in this
                class reads it.
            top_features: Upper bound on the number of columns kept by
                ``select_features``.
            top_X_features_to_preselect: How many raw ``X*`` columns (taken in
                column order) enter feature engineering.
        """
        # Fitted state, filled in by fit() / select_features().
        self.models = {}
        self.selected_features = None  # final selected feature names
        self.feature_selector = None
        self.scaler = RobustScaler()

        # Hyper-parameters.
        self.top_X_features_to_preselect = top_X_features_to_preselect
        self.top_features = top_features
        self.sequence_length = sequence_length

        # Parquet checkpoint holding the training frame after feature engineering.
        # Scaled / selected data is kept in memory only; it has no checkpoint.
        self._engineered_data_checkpoint_path = './engineered_train_data_checkpoint.parquet'


    def optimize_memory(self, df):
        """
        Optimize Pandas DataFrame memory usage and clean data.
        """
        print(f"Memory usage before optimization: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

        # Clean data first
        df = self.clean_data(df)

        # Optimize numeric columns (Pandas directly)
        for col in df.select_dtypes(include=[np.number]).columns:
            if col == 'timestamp' or col == 'ID' or col == 'label':
                continue # Don't downcast timestamp, ID, or label
            df[col] = pd.to_numeric(df[col], downcast='float')

        print(f"Memory usage after optimization: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        return df

    def clean_data(self, df):
        """
        Clean Pandas DataFrame by handling inf, -inf, and extreme values.
        """
        print("Cleaning data...")

        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        numeric_cols = [col for col in numeric_cols if col not in ['timestamp', 'ID', 'label']]

        # Replace inf and -inf with NaN first
        for col in numeric_cols:
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)

        # Fill NaN values with forward fill, then backward fill, then 0
        for col in numeric_cols:
            df[col] = df[col].ffill().bfill().fillna(0)

        # Handle extreme outliers (values beyond 3*IQR)
        for col in df.select_dtypes(include=[np.float32, np.float64]).columns:
            if col in ['timestamp', 'ID', 'label']: continue
            if df[col].nunique() > 1:
                q25 = df[col].quantile(0.25)
                q75 = df[col].quantile(0.75)
                iqr = q75 - q25

                if iqr != 0 and not pd.isna(iqr):
                    lower_bound = q25 - 3 * iqr
                    upper_bound = q75 + 3 * iqr
                    df[col] = df[col].clip(lower_bound, upper_bound)
        print("Data cleaning applied.")
        return df

    def create_time_features(self, df):
        """
        Create time-based features with robust calculations using Pandas.
        Significantly reduced complexity for faster execution.
        """
        print("Creating time-based features...")

        # Basic market features
        df['mid_price'] = (df['bid_qty'] + df['ask_qty']) / 2
        df['spread'] = df['ask_qty'] - df['bid_qty']

        # Safe division for imbalance
        denominator = df['bid_qty'] + df['ask_qty'] + 1e-10
        df['imbalance'] = (df['bid_qty'] - df['ask_qty']) / denominator

        # Safe division for buy/sell ratio
        df['buy_sell_ratio'] = df['buy_qty'] / (df['sell_qty'] + 1e-10)

        # Rolling statistics - significantly reduced windows for speed
        windows = [10, 30]
        base_cols_for_rolling = ['volume', 'mid_price', 'buy_qty', 'sell_qty', 'imbalance']

        for col in base_cols_for_rolling:
            for window in windows:
                df[f'{col}_ma_{window}'] = df[col].rolling(window, min_periods=1).mean()
                df[f'{col}_std_{window}'] = df[col].rolling(window, min_periods=1).std().fillna(0)

        # Lagged features - significantly reduced lags for speed
        lags = [1, 5]
        base_cols_for_lag = ['mid_price', 'imbalance']

        for col in base_cols_for_lag:
            for lag in lags:
                df[f'{col}_lag_{lag}'] = df[col].shift(lag)

        # Technical indicators - reduced
        df['rsi_proxy'] = self.calculate_rsi_proxy(df['mid_price'], window=10)
        df['momentum'] = df['mid_price'] - df['mid_price'].shift(5)

        # Final check for any inf/nan values that might have been introduced
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        numeric_cols = [col for col in numeric_cols if col not in ['timestamp', 'ID', 'label']]

        for col in numeric_cols:
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)
            df[col] = df[col].ffill().bfill().fillna(0)

        print(f"Time-based features created. Current shape: {df.shape[0]} rows, {df.shape[1]} columns")
        return df

    def calculate_rsi_proxy(self, prices_series, window=14):
        """Calculate RSI-like indicator with safe operations for Pandas Series."""
        delta = prices_series.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=window, min_periods=1).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=window, min_periods=1).mean()

        rs = gain / (loss + 1e-10)
        rsi = 100 - (100 / (1 + rs))

        rsi = rsi.replace([np.inf, -np.inf], np.nan).fillna(50)

        return rsi

    def select_features(self, X_df, y_df, method='mutual_info'):
        """
        Keep the best-scoring columns of ``X_df`` using ``SelectKBest``.

        Rows containing inf/NaN are dropped from both inputs, and columns whose
        standard deviation is below 1e-8 are dropped unless every column is
        constant. At most ``self.top_features`` columns are then kept, scored
        with ``mutual_info_regression`` when ``method == 'mutual_info'`` and
        ``f_regression`` otherwise. The fitted selector and the chosen column
        names are stored on the instance.

        Returns:
            (X_selected as a float32 NumPy array, index of the rows that were kept)
        """
        print(f"Selecting top {self.top_features} features from {X_df.shape[1]} features...")

        print("Validating data before final feature selection...")

        # Drop every row that still holds an inf or NaN.
        bad_rows = np.isinf(X_df).any(axis=1) | np.isnan(X_df).any(axis=1)
        if bad_rows.sum() > 0:
            print(f"Removing {bad_rows.sum()} rows with invalid values before final selection.")
            good_rows = ~bad_rows
            X_df = X_df[good_rows]
            y_df = y_df[good_rows]

        # Constant columns carry no signal and can upset some score functions.
        is_constant = X_df.std() < 1e-8
        if is_constant.all():
            print("Warning: All features are constant. Cannot perform feature selection.")
        else:
            n_constant = is_constant.sum()
            if n_constant > 0:
                print(f"Removing {n_constant} constant features.")
            X_df = X_df[X_df.columns[~is_constant].tolist()]

        print(f"Final data shape for feature selection: {X_df.shape}")

        k_best = min(self.top_features, X_df.shape[1])
        scorer = mutual_info_regression if method == 'mutual_info' else f_regression
        selector = SelectKBest(score_func=scorer, k=k_best)

        X_selected = selector.fit_transform(X_df, y_df)
        self.feature_selector = selector
        self.selected_features = X_df.columns[selector.get_support()].tolist()

        print(f"\n--- Selected Features ({len(self.selected_features)}) ---")
        for name in self.selected_features:
            print(f"- {name}")
        print("---------------------------------------\n")

        # The index is returned so the caller can realign y with the kept rows.
        return X_selected.astype(np.float32), X_df.index

    # Removed build_convlstm_model, build_lstm_model, build_conv1d_model
    # All deep learning models are replaced by XGBoost
    # def build_convlstm_model(self, input_shape): ...
    # def build_lstm_model(self, input_shape): ...
    # def build_conv1d_model(self, input_shape): ...

    def build_cnn_lstm_model(self, input_shape):
        print("CNN-LSTM model building skipped for current speed optimization, but can be reinstated later.")
        return None

    def train_lightgbm(self, X_train, y_train, X_val, y_val):
        """Train a LightGBM regressor, early-stopped on the validation set.

        Any of the four inputs that contains NaN or infinite values is passed
        through ``np.nan_to_num`` (NaN -> 0, +inf -> 1e6, -inf -> -1e6) first.
        Returns the model produced by ``lgb.train``.
        """
        print("Training LightGBM model...")

        # Last-resort sanitising; a warning is printed for each affected input.
        inputs = {'X_train': X_train, 'X_val': X_val, 'y_train': y_train, 'y_val': y_val}
        for name in ('X_train', 'X_val', 'y_train', 'y_val'):
            values = inputs[name]
            if np.any(np.isnan(values)) or np.any(np.isinf(values)):
                print(f"WARNING: NaNs/Infs found in {name} for LightGBM. Applying emergency cleanup.")
                inputs[name] = np.nan_to_num(values, nan=0.0, posinf=1e6, neginf=-1e6)

        train_set = lgb.Dataset(inputs['X_train'], label=inputs['y_train'])
        valid_set = lgb.Dataset(inputs['X_val'], label=inputs['y_val'], reference=train_set)

        boosting_rounds = 1000
        params = dict(
            objective='regression',
            metric='mae',
            boosting_type='gbdt',
            num_leaves=31,
            learning_rate=0.05,
            feature_fraction=0.9,
            bagging_fraction=0.8,
            bagging_freq=5,
            verbose=-1,
            random_state=42,
            n_estimators=boosting_rounds,
        )

        # Stop after 50 rounds without improvement; per-round logging is disabled.
        return lgb.train(
            params,
            train_set,
            valid_sets=[valid_set],
            num_boost_round=boosting_rounds,
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
        )

    def train_xgboost(self, X_train, y_train, X_val, y_val):
        """Train an XGBoost regressor, early-stopped on the validation set.

        Any of the four inputs that contains NaN or infinite values is passed
        through ``np.nan_to_num`` (NaN -> 0, +inf -> 1e6, -inf -> -1e6) first.

        Returns:
            The fitted ``xgb.XGBRegressor``.
        """
        print("Training XGBoost model...")

        def _sanitize(name, values):
            # Last-resort cleanup; warn so the upstream problem stays visible.
            if np.any(np.isnan(values)) or np.any(np.isinf(values)):
                print(f"WARNING: NaNs/Infs found in {name} for XGBoost. Applying emergency cleanup.")
                return np.nan_to_num(values, nan=0.0, posinf=1e6, neginf=-1e6)
            return values

        X_train = _sanitize('X_train', X_train)
        X_val = _sanitize('X_val', X_val)
        y_train = _sanitize('y_train', y_train)
        y_val = _sanitize('y_val', y_val)

        # early_stopping_rounds is an estimator parameter. Passing it to fit()
        # was deprecated in XGBoost 1.6 and removed in 2.0, where it raises a
        # TypeError that fit()'s broad except would swallow.
        model = xgb.XGBRegressor(
            objective='reg:squarederror',  # Regression objective
            eval_metric='mae',  # Mean Absolute Error
            n_estimators=1000,  # Number of boosting rounds
            learning_rate=0.05,
            max_depth=6,  # Maximum depth of a tree
            subsample=0.8,  # Subsample ratio of the training instance
            colsample_bytree=0.8,  # Subsample ratio of columns when constructing each tree
            random_state=42,
            n_jobs=-1,  # Use all available CPU cores
            early_stopping_rounds=50,  # Stop if no improvement for 50 rounds
        )

        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  verbose=False)  # Suppress verbose output during training

        return model

    def evaluate_model(self, y_true, y_pred, model_name):
        """Print MAE and Pearson correlation for a model; return the correlation.

        Both inputs are sanitised with ``np.nan_to_num`` first. The correlation
        is reported as 0.0 when either input has a single distinct value or
        when ``pearsonr`` raises ``ValueError``.
        """
        truth = np.nan_to_num(y_true, nan=0.0, posinf=1e6, neginf=-1e6)
        preds = np.nan_to_num(y_pred, nan=0.0, posinf=1e6, neginf=-1e6)

        mae = mean_absolute_error(truth, preds)

        # Pearson correlation is undefined for a constant input.
        if len(np.unique(truth)) <= 1 or len(np.unique(preds)) <= 1:
            correlation = 0.0
        else:
            try:
                correlation, _ = pearsonr(truth, preds)
            except ValueError:
                # Zero-variance input after cleaning.
                correlation = 0.0

        print(f"{model_name} - MAE: {mae:.4f}, Pearson Correlation: {correlation:.4f}")
        return correlation

    def fit(self, train_data_raw_initial_load, validation_split_date='2024-01-01'):
        """Main training pipeline: engineer, select and scale features, train models.

        Steps: load the engineered-data checkpoint if it exists (otherwise
        build it from the raw frame and save it), run ``select_features``, fit
        ``self.scaler``, split by time, train LightGBM and XGBoost, and print
        validation metrics for each model and for their equal-weight average.
        A model that fails to train is reported and left out of ``self.models``.

        Args:
            train_data_raw_initial_load: Raw training frame with 'timestamp'
                (as a column, or as an index named 'timestamp'), 'bid_qty',
                'ask_qty', 'buy_qty', 'sell_qty', 'volume', the ``X*`` columns
                and 'label'.
            validation_split_date: Rows with ``timestamp`` earlier than this
                are used for training, the rest for validation. When that
                leaves either side empty, the earliest 80% of rows train and
                the latest 20% validate.

        Returns:
            self
        """
        print("Starting training pipeline...")

        # Forget models from an earlier fit: they were trained on a different
        # feature selection and must not be used by predict() after this run.
        self.models = {}

        raw = train_data_raw_initial_load
        if 'timestamp' not in raw.columns and raw.index.name == 'timestamp':
            # Accept frames that still carry the timestamp as their index.
            raw = raw.reset_index()

        # Determine raw X_n columns from the initial load structure
        X_n_cols_raw = [col for col in raw.columns if col.startswith('X')]
        basic_features_cols = ['timestamp', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume']

        preselected_X_n_features = X_n_cols_raw[:self.top_X_features_to_preselect]
        print(f"Initially selected {len(preselected_X_n_features)} X_n features for direct Pandas load.")

        columns_to_process_raw_for_fit = basic_features_cols + preselected_X_n_features + ['label']

        if os.path.exists(self._engineered_data_checkpoint_path):
            print(f"Checkpoint found. Loading processed data from {self._engineered_data_checkpoint_path}...")
            train_df = pd.read_parquet(self._engineered_data_checkpoint_path)
            if 'timestamp' in train_df.columns:
                train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
        else:
            print(f"Selecting columns from initial raw training data...")
            # A fresh positional index keeps every later lookup unambiguous,
            # whatever index the caller's frame carried.
            train_df = raw[columns_to_process_raw_for_fit].reset_index(drop=True)
            train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
            gc.collect()
            train_df = self.optimize_memory(train_df)
            train_df = self.create_time_features(train_df)
            print(f"Saving engineered data to checkpoint: {self._engineered_data_checkpoint_path}...")
            train_df.to_parquet(self._engineered_data_checkpoint_path, index=False)
            print("Engineered data checkpoint saved.")

        print(f"Data shape after feature engineering: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
        feature_cols_final = [col for col in train_df.columns
                              if col not in ['timestamp', 'label']]
        X_df = train_df[feature_cols_final]
        y_df = train_df['label']
        print(f"Features shape before final selection: {X_df.shape[0]} rows, {X_df.shape[1]} columns")
        print(f"Target shape: {y_df.shape[0]} rows")

        # NOTE(review): the selector and the scaler are fitted on ALL rows,
        # including the validation period, so the validation scores printed
        # below are optimistic. Behaviour kept as-is here.
        X_selected, valid_idx = self.select_features(X_df, y_df)

        # Labels and timestamps of exactly the rows that survived selection,
        # in the same order as the rows of X_selected.
        labels = train_df.loc[valid_idx, 'label'].to_numpy(dtype=np.float32)
        timestamps = train_df.loc[valid_idx, 'timestamp']

        del X_df, y_df, train_df, valid_idx
        gc.collect()

        X_scaled = self.scaler.fit_transform(X_selected)
        print("Final data validation (after scaling)...")
        if np.any(np.isnan(X_scaled)) or np.any(np.isinf(X_scaled)):
            print("ERROR: Still have invalid values after preprocessing! Applying emergency cleanup.")
            X_scaled = np.nan_to_num(X_scaled, nan=0.0, posinf=1e6, neginf=-1e6)
        print(f"Final training data shape: {X_scaled.shape}")

        del X_selected
        gc.collect()

        # Time-based split, done positionally on the engineered frame's own
        # timestamps so it cannot be thrown off by index labels.
        train_mask = (timestamps < validation_split_date).to_numpy()
        val_mask = (timestamps >= validation_split_date).to_numpy()
        if not train_mask.any() or not val_mask.any():
            print(f"WARNING: split date {validation_split_date} leaves an empty training or "
                  f"validation set. Falling back to a chronological 80/20 split.")
            n_rows = len(train_mask)
            chronological = np.argsort(timestamps.to_numpy(), kind='stable')
            train_mask = np.zeros(n_rows, dtype=bool)
            train_mask[chronological[:int(n_rows * 0.8)]] = True
            val_mask = ~train_mask

        # NumPy arrays for direct use by LGBM/XGBoost
        X_train_final = X_scaled[train_mask].astype(np.float32, copy=False)
        y_train_final = labels[train_mask]
        X_val_final = X_scaled[val_mask].astype(np.float32, copy=False)
        y_val_final = labels[val_mask]

        del X_scaled, labels, timestamps, train_mask, val_mask
        gc.collect()

        print(f"Training set shapes: X={X_train_final.shape}, Y={y_train_final.shape}")
        print(f"Validation set shapes: X={X_val_final.shape}, Y={y_val_final.shape}")

        # Train LightGBM model (best effort: a failure is reported, not raised)
        try:
            lgb_model = self.train_lightgbm(X_train_final, y_train_final, X_val_final, y_val_final)
            lgb_pred = lgb_model.predict(X_val_final)
            lgb_score = self.evaluate_model(y_val_final, lgb_pred, "LightGBM")
            self.models['lightgbm'] = lgb_model
        except Exception as e:
            print(f"LightGBM training or prediction failed: {e}")
            lgb_score = 0
            lgb_pred = np.zeros_like(y_val_final)

        # Train XGBoost model (best effort: a failure is reported, not raised)
        try:
            xgb_model = self.train_xgboost(X_train_final, y_train_final, X_val_final, y_val_final)
            xgb_pred = xgb_model.predict(X_val_final)
            xgb_score = self.evaluate_model(y_val_final, xgb_pred, "XGBoost")
            self.models['xgboost'] = xgb_model
        except Exception as e:
            print(f"XGBoost training or prediction failed: {e}")
            xgb_score = 0
            xgb_pred = np.zeros_like(y_val_final)

        # Validation ensemble: equal-weight mean of the models that trained
        # and scored a positive correlation.
        ensemble_members = []
        if 'lightgbm' in self.models and lgb_score > 0:
            ensemble_members.append(lgb_pred)
        if 'xgboost' in self.models and xgb_score > 0:
            ensemble_members.append(xgb_pred)

        if ensemble_members:
            ensemble_pred = np.mean(ensemble_members, axis=0)
            ensemble_score = self.evaluate_model(y_val_final, ensemble_pred, "Ensemble (LGBM + XGBoost)")
            print(f"\nEnsemble score: {ensemble_score:.4f}")
        else:
            print("No successful models to include in ensemble. Ensemble score defaulting to 0.")
            ensemble_score = 0

        print(f"\nBest individual model score: {max(lgb_score, xgb_score):.4f}")
        print(f"Final overall ensemble score: {ensemble_score:.4f}")

        # Clean up memory after training
        del X_train_final, y_train_final, X_val_final, y_val_final
        gc.collect()
        return self

    def predict(self, test_data_raw_initial_load):
        """Generate predictions for test data with robust error handling"""
        print("Generating predictions...")

        temp_df_for_id_map = test_data_raw_initial_load.copy()
        if 'ID' not in temp_df_for_id_map.columns and temp_df_for_id_map.index.name == 'ID':
            temp_df_for_id_map = temp_df_for_id_map.reset_index(names=['ID'])
        elif 'ID' not in temp_df_for_id_map.columns and temp_df_for_id_map.index is not None and temp_df_for_id_map.index.name is None and len(temp_df_for_id_map.index) == len(temp_df_for_id_map):
            temp_df_for_id_map = temp_df_for_id_map.reset_index()
            temp_df_for_id_map.rename(columns={'index': 'ID'}, inplace=True)

        id_to_original_index_map = pd.Series(temp_df_for_id_map.index.values, index=temp_df_for_id_map['ID'])
        original_test_ids = temp_df_for_id_map['ID'].copy()

        del temp_df_for_id_map
        gc.collect()

        basic_features_cols_test = ['timestamp', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume', 'ID']
        X_n_cols_raw_test = [col for col in test_data_raw_initial_load.columns if col.startswith('X') and col != 'label']
        preselected_X_n_features_test = X_n_cols_raw_test[:min(self.top_X_features_to_preselect, len(X_n_cols_raw_test))]

        columns_to_process_raw_for_predict = basic_features_cols_test + preselected_X_n_features_test

        missing_columns = [col for col in columns_to_process_raw_for_predict if col not in test_data_raw_initial_load.columns]
        if missing_columns:
            raise KeyError(f"The following required columns are missing from the test data: {missing_columns}. Please ensure your test.parquet file contains these columns.")

        print(f"Selecting columns from initial raw test data (only {len(columns_to_process_raw_for_predict)} columns)...")
        test_df = test_data_raw_initial_load[columns_to_process_raw_for_predict].copy()

        del test_data_raw_initial_load
        gc.collect()

        if 'ID' not in test_df.columns and test_df.index.name == 'ID':
            test_df = test_df.reset_index()

        if 'timestamp' in test_df.columns:
            test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])

        test_df = self.optimize_memory(test_df)
        test_df = self.create_time_features(test_df)

        print(f"Test data shape after feature engineering: {test_df.shape[0]} rows, {test_df.shape[1]} columns")

        if self.selected_features is None:
            raise ValueError("Model must be fitted before making predictions (selected_features is None)")

        X_test_df_final = test_df[self.selected_features]

        X_test_df_final = X_test_df_final.replace([np.inf, -np.inf], np.nan)
        X_test_df_final = X_test_df_final.ffill().bfill().fillna(0)

        X_test_scaled = self.scaler.transform(X_test_df_final)

        if np.any(np.isnan(X_test_scaled)) or np.any(np.isinf(X_test_scaled)):
            print("WARNING: Invalid values in test data after scaling, applying emergency cleanup.")
            X_test_scaled = np.nan_to_num(X_test_scaled, nan=0.0, posinf=1e6, neginf=-1e6)

        X_test_scaled_df = pd.DataFrame(X_test_scaled, index=X_test_df_final.index, columns=self.selected_features)

        del X_test_df_final, test_df
        gc.collect()

        predictions = np.zeros(len(original_test_ids), dtype=np.float32)
        indexed_predictions_by_internal_idx = {}

        # LightGBM prediction
        if 'lightgbm' in self.models:
            try:
                X_test_scaled_for_lgbm = X_test_scaled_df.copy()
                if np.any(np.isnan(X_test_scaled_for_lgbm.values)) or np.any(np.isinf(X_test_scaled_for_lgbm.values)):
                    print("WARNING: NaNs/Infs found in X_test_scaled_for_lgbm. Applying emergency cleanup.")
                    X_test_scaled_for_lgbm = pd.DataFrame(
                        np.nan_to_num(X_test_scaled_for_lgbm.values, nan=0.0, posinf=1e6, neginf=-1e6),
                        columns=X_test_scaled_for_lgbm.columns, index=X_test_scaled_for_lgbm.index
                    )
                lgb_pred_full = self.models['lightgbm'].predict(X_test_scaled_for_lgbm)
                for i, idx in enumerate(X_test_scaled_for_lgbm.index):
                    indexed_predictions_by_internal_idx[idx] = lgb_pred_full[i]
                print("LightGBM predictions generated.")
                del X_test_scaled_for_lgbm
                gc.collect()
            except Exception as e:
                print(f"LightGBM prediction failed: {e}")

        # XGBoost prediction
        if 'xgboost' in self.models:
            try:
                X_test_scaled_for_xgb = X_test_scaled_df.copy()
                if np.any(np.isnan(X_test_scaled_for_xgb.values)) or np.any(np.isinf(X_test_scaled_for_xgb.values)):
                    print("WARNING: NaNs/Infs found in X_test_scaled_for_xgb. Applying emergency cleanup.")
                    X_test_scaled_for_xgb = pd.DataFrame(
                        np.nan_to_num(X_test_scaled_for_xgb.values, nan=0.0, posinf=1e6, neginf=-1e6),
                        columns=X_test_scaled_for_xgb.columns, index=X_test_scaled_for_xgb.index
                    )
                xgb_pred_full = self.models['xgboost'].predict(X_test_scaled_for_xgb)
                # If LGBM already populated, we need to store both for ensemble
                for i, idx in enumerate(X_test_scaled_for_xgb.index):
                    # Store XGBoost predictions separately for ensemble calculation
                    # Or update if only one model is used
                    if idx in indexed_predictions_by_internal_idx:
                        indexed_predictions_by_internal_idx[idx] = (indexed_predictions_by_internal_idx[idx] * 0.5) + (xgb_pred_full[i] * 0.5)
                    else:
                        indexed_predictions_by_internal_idx[idx] = xgb_pred_full[i]
                print("XGBoost predictions generated.")
                del X_test_scaled_for_xgb
                gc.collect()
            except Exception as e:
                print(f"XGBoost prediction failed: {e}")

        del X_test_scaled_df
        gc.collect()

        for i, current_id in enumerate(original_test_ids):
            original_idx_in_raw_df = id_to_original_index_map.get(current_id)

            if original_idx_in_raw_df is not None and original_idx_in_raw_df in indexed_predictions_by_internal_idx:
                predictions[i] = indexed_predictions_by_internal_idx[original_idx_in_raw_df]
            else:
                predictions[i] = 0.0

        if np.any(np.isnan(predictions)) or np.any(np.isinf(predictions)):
            print("WARNING: Invalid predictions detected (NaN/Inf), cleaning...")
            predictions = np.nan_to_num(predictions, nan=0.0, posinf=1.0, neginf=-1.0)

        print(f"Generated {len(predictions)} predictions")
        print(f"Prediction range: [{predictions.min():.4f}, {predictions.max():.4f}]")

        return predictions
    
    # Removed _prepare_sequences_for_inference as it's no longer needed
    # def _prepare_sequences_for_inference(self, data): ...


def _column_from_index(df, column, frame_name):
    """Return ``df`` with ``column`` present, promoting the index if needed."""
    if column in df.columns:
        return df

    if df.index.name == column:
        df = df.reset_index(names=[column])
        print(f"DEBUG: Resetting index '{column}' for {frame_name}. New columns: {df.columns.tolist()}")
    else:
        # Unnamed index: reset_index() exposes it as 'index', which is then
        # relabelled. The resulting values are a stand-in, not real data.
        df = df.reset_index()
        if 'index' in df.columns and column not in df.columns:
            df.rename(columns={'index': column}, inplace=True)
            print(f"DEBUG: Renamed 'index' to '{column}' for {frame_name}. New columns: {df.columns.tolist()}")
    return df


def _parse_timestamp_column(df, frame_name):
    """Convert ``df['timestamp']`` to datetime in place, or warn if it is absent."""
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        print(f"DEBUG: 'timestamp' column found and converted to datetime in {frame_name}.")
    else:
        print(f"CRITICAL WARNING: 'timestamp' column still not found in {frame_name} after all attempts. This will likely cause issues.")
        print(f"DEBUG: {frame_name} columns are: {df.columns.tolist()}")
    return df


# Main execution function
def run_competition_pipeline(
    train_path='/kaggle/input/drw-crypto-market-prediction/train.parquet',
    test_path='/kaggle/input/drw-crypto-market-prediction/test.parquet',
    submission_path='/kaggle/working/submission.csv',
):
    """Run the complete competition pipeline.

    Loads the train and test parquet files, makes sure 'timestamp' (and, for
    the test set, 'ID') exist as columns, fits a ``CryptoMarketPredictor``,
    predicts on the test set and writes the submission CSV.

    Args:
        train_path: Parquet file with the training data.
        test_path: Parquet file with the test data.
        submission_path: Where the submission CSV is written.

    Returns:
        The submission DataFrame with columns 'ID' and 'Prediction'.
    """
    print("Loading data...")
    train_full_raw = pd.read_parquet(train_path)
    train_full_raw = _column_from_index(train_full_raw, 'timestamp', 'train_full_raw')
    train_full_raw = _parse_timestamp_column(train_full_raw, 'train_full_raw')

    test_full_raw = pd.read_parquet(test_path)
    # 'ID' is resolved first so that a later reset for 'timestamp' cannot
    # consume the index that carries the IDs.
    test_full_raw = _column_from_index(test_full_raw, 'ID', 'test_full_raw')
    test_full_raw = _column_from_index(test_full_raw, 'timestamp', 'test_full_raw')
    test_full_raw = _parse_timestamp_column(test_full_raw, 'test_full_raw')

    print(f"\nTrain shape: {train_full_raw.shape}")
    print(f"Test shape: {test_full_raw.shape}")

    # Initialize and train model
    predictor = CryptoMarketPredictor(
        sequence_length=30,  # stored on the predictor; not read by the tree-model pipeline
        top_features=100,
        top_X_features_to_preselect=30
    )
    predictor.fit(train_full_raw)

    predictions = predictor.predict(test_full_raw)

    # Create submission
    submission = pd.DataFrame({
        'ID': test_full_raw['ID'],
        'Prediction': predictions
    })

    # Save submission
    submission.to_csv(submission_path, index=False)
    print(f"Submission saved with {len(submission)} predictions")
    print(f"Prediction statistics - Mean: {predictions.mean():.4f}, Std: {predictions.std():.4f}")

    return submission

# Script / notebook entry point: run the full pipeline only when this file is
# executed directly, and keep the resulting submission frame in `submission`.
if __name__ == "__main__":
    submission = run_competition_pipeline()


2025-07-05 07:29:27.516410: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751700567.757777      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751700567.821380      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading data...
DEBUG: Resetting index 'timestamp' for train_full_raw. New columns: ['timestamp', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40', 'X41', 'X42', 'X43', 'X44', 'X45', 'X46', 'X47', 'X48', 'X49', 'X50', 'X51', 'X52', 'X53', 'X54', 'X55', 'X56', 'X57', 'X58', 'X59', 'X60', 'X61', 'X62', 'X63', 'X64', 'X65', 'X66', 'X67', 'X68', 'X69', 'X70', 'X71', 'X72', 'X73', 'X74', 'X75', 'X76', 'X77', 'X78', 'X79', 'X80', 'X81', 'X82', 'X83', 'X84', 'X85', 'X86', 'X87', 'X88', 'X89', 'X90', 'X91', 'X92', 'X93', 'X94', 'X95', 'X96', 'X97', 'X98', 'X99', 'X100', 'X101', 'X102', 'X103', 'X104', 'X105', 'X106', 'X107', 'X108', 'X109', 'X110', 'X111', 'X112', 'X113', 'X114', 'X115', 'X116', 'X117', 'X118', 'X119', 'X1