In [None]:
# DRW Crypto Market Prediction Competition Pipeline
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os # Import os for path checking

# Memory optimization and data processing
import gc
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

# Removed Dask: import dask.dataframe as dd 

# Deep learning and modeling 
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (LSTM, ConvLSTM2D, Dense, Dropout, 
                                     BatchNormalization, Input, Conv1D, MaxPooling1D,
                                     Flatten, Reshape, TimeDistributed)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Tree-based models for ensemble
import lightgbm as lgb
from scipy.stats import pearsonr

# Turn on memory growth for every visible GPU. With no GPU the loop body never
# runs; a RuntimeError raised by TensorFlow is printed instead of propagated.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    print(e)

class CryptoMarketPredictor:
    def __init__(self, sequence_length=30, top_features=100, top_X_features_to_preselect=30):
        """Record the pipeline hyper-parameters and create empty model state.

        Args:
            sequence_length: number of consecutive rows in each window built
                by ``prepare_sequences``.
            top_features: upper bound on the number of features kept by
                ``select_features``.
            top_X_features_to_preselect: how many raw ``X*`` columns ``fit``
                and ``predict`` take from the input frame.
        """
        # Hyper-parameters.
        self.top_X_features_to_preselect = top_X_features_to_preselect
        self.top_features = top_features
        self.sequence_length = sequence_length

        # State filled in by fit().
        self.models = {}
        self.selected_features = None  # names of the finally selected features
        self.feature_selector = None
        self.scaler = RobustScaler()

        # Location of the engineered-feature checkpoint (a parquet file).
        self._checkpoint_path = './processed_train_data_checkpoint.parquet'
        
    def optimize_memory(self, df):
        """
        Optimize Pandas DataFrame memory usage and clean data.
        """
        print(f"Memory usage before optimization: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        
        # Clean data first
        df = self.clean_data(df)
        
        # Optimize numeric columns (Pandas directly)
        for col in df.select_dtypes(include=[np.number]).columns:
            if col == 'timestamp' or col == 'ID' or col == 'label':
                continue # Don't downcast timestamp, ID, or label
            df[col] = pd.to_numeric(df[col], downcast='float')
            
        print(f"Memory usage after optimization: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        return df
    
    def clean_data(self, df):
        """
        Clean Pandas DataFrame by handling inf, -inf, and extreme values.
        """
        print("Cleaning data...")
        
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        numeric_cols = [col for col in numeric_cols if col not in ['timestamp', 'ID', 'label']]

        # Replace inf and -inf with NaN first
        for col in numeric_cols:
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)

        # Fill NaN values with forward fill, then backward fill, then 0
        for col in numeric_cols:
            df[col] = df[col].ffill().bfill().fillna(0)

        # Handle extreme outliers (values beyond 3*IQR)
        for col in df.select_dtypes(include=[np.float32, np.float64]).columns:
            if col in ['timestamp', 'ID', 'label']: continue 
            if df[col].nunique() > 1:
                q25 = df[col].quantile(0.25)
                q75 = df[col].quantile(0.75)
                iqr = q75 - q25
                
                if iqr != 0 and not pd.isna(iqr):
                    lower_bound = q25 - 3 * iqr
                    upper_bound = q75 + 3 * iqr
                    df[col] = df[col].clip(lower_bound, upper_bound)
        print("Data cleaning applied.")
        return df
    
    def create_time_features(self, df):
        """
        Create time-based features with robust calculations using Pandas.
        Significantly reduced complexity for faster execution.
        """
        print("Creating time-based features...")
        
        # Basic market features
        df['mid_price'] = (df['bid_qty'] + df['ask_qty']) / 2
        df['spread'] = df['ask_qty'] - df['bid_qty']
        
        # Safe division for imbalance
        denominator = df['bid_qty'] + df['ask_qty'] + 1e-10
        df['imbalance'] = (df['bid_qty'] - df['ask_qty']) / denominator
        
        # Safe division for buy/sell ratio
        df['buy_sell_ratio'] = df['buy_qty'] / (df['sell_qty'] + 1e-10)
        
        # Rolling statistics - significantly reduced windows for speed
        windows = [10, 30] 
        base_cols_for_rolling = ['volume', 'mid_price', 'buy_qty', 'sell_qty', 'imbalance'] 

        for col in base_cols_for_rolling:
            for window in windows:
                df[f'{col}_ma_{window}'] = df[col].rolling(window, min_periods=1).mean()
                df[f'{col}_std_{window}'] = df[col].rolling(window, min_periods=1).std().fillna(0) 
        
        # Lagged features - significantly reduced lags for speed
        lags = [1, 5] 
        base_cols_for_lag = ['mid_price', 'imbalance'] 

        for col in base_cols_for_lag:
            for lag in lags:
                df[f'{col}_lag_{lag}'] = df[col].shift(lag)
        
        # Technical indicators - reduced
        df['rsi_proxy'] = self.calculate_rsi_proxy(df['mid_price'], window=10) 
        df['momentum'] = df['mid_price'] - df['mid_price'].shift(5) 
        
        # Final check for any inf/nan values that might have been introduced
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        numeric_cols = [col for col in numeric_cols if col not in ['timestamp', 'ID', 'label']]

        for col in numeric_cols:
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)
            df[col] = df[col].ffill().bfill().fillna(0)

        print(f"Time-based features created. Current shape: {df.shape[0]} rows, {df.shape[1]} columns")
        return df
    
    def calculate_rsi_proxy(self, prices_series, window=14):
        """Calculate RSI-like indicator with safe operations for Pandas Series."""
        delta = prices_series.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=window, min_periods=1).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=window, min_periods=1).mean()
        
        rs = gain / (loss + 1e-10)
        rsi = 100 - (100 / (1 + rs))
        
        rsi = rsi.replace([np.inf, -np.inf], np.nan).fillna(50)
        
        return rsi
    
    def select_features(self, X_df, y_df, method='mutual_info'):
        """Select up to ``self.top_features`` columns of ``X_df`` for ``y_df``.

        Rows containing NaN/inf are dropped from both inputs, (near-)constant
        columns (std < 1e-8) are dropped unless *every* column is constant,
        and a ``SelectKBest`` selector is fitted on what remains. The fitted
        selector is stored in ``self.feature_selector`` and the chosen column
        names in ``self.selected_features``.

        Args:
            X_df: numeric feature DataFrame.
            y_df: target values aligned with ``X_df`` (indexed by the same
                boolean row mask).
            method: ``'mutual_info'`` scores with ``mutual_info_regression``;
                any other value scores with ``f_regression``.

        Returns:
            tuple: ``(X_selected, index)`` where ``X_selected`` is a float32
            NumPy array of the selected columns and ``index`` is the pandas
            Index of the rows that survived validation, so the caller can
            re-align the target.
        """
        # Local import keeps this method self-contained.
        from functools import partial

        print(f"Selecting top {self.top_features} features from {X_df.shape[1]} features...")

        print("Validating data before final feature selection...")

        # Drop rows that still contain inf/nan values.
        inf_mask = np.isinf(X_df).any(axis=1)
        nan_mask = np.isnan(X_df).any(axis=1)
        invalid_mask = inf_mask | nan_mask

        n_invalid = invalid_mask.sum()
        if n_invalid > 0:
            print(f"Removing {n_invalid} rows with invalid values before final selection.")
            X_df = X_df[~invalid_mask]
            y_df = y_df[~invalid_mask]

        # Constant or near-constant features carry no signal and can cause
        # issues for some selectors.
        constant_features_mask = X_df.std() < 1e-8

        if constant_features_mask.all():
            # Nothing would be left after filtering, so keep everything.
            print("Warning: All features are constant. Cannot perform feature selection.")
        else:
            n_constant = constant_features_mask.sum()
            if n_constant > 0:
                print(f"Removing {n_constant} constant features.")
            X_df = X_df[X_df.columns[~constant_features_mask].tolist()]

        print(f"Final data shape for feature selection: {X_df.shape}")

        n_features_to_select = min(self.top_features, X_df.shape[1])

        if method == 'mutual_info':
            # mutual_info_regression is randomised; without a seed the set of
            # selected features can change from run to run. 42 matches the
            # seed used for LightGBM in this class.
            score_func = partial(mutual_info_regression, random_state=42)
        else:
            score_func = f_regression
        selector = SelectKBest(score_func=score_func, k=n_features_to_select)

        X_selected = selector.fit_transform(X_df, y_df)
        self.feature_selector = selector
        self.selected_features = X_df.columns[selector.get_support()].tolist()

        print(f"\n--- Selected Features ({len(self.selected_features)}) ---")
        for feature in self.selected_features:
            print(f"- {feature}")
        print("---------------------------------------\n")

        # Return as NumPy array and Pandas Index to maintain alignment with y
        return X_selected.astype(np.float32), X_df.index
    
    def prepare_sequences(self, data, target=None):
        """Prepare sequences for time series models"""
        sequences = []
        targets = []
        
        for i in range(self.sequence_length, len(data)):
            sequences.append(data[i-self.sequence_length:i])
            if target is not None:
                targets.append(target[i])
        
        # Ensure outputs are float32 for TensorFlow models to save memory
        return np.array(sequences).astype(np.float32), np.array(targets).astype(np.float32) if target is not None else None
    
    def build_convlstm_model(self, input_shape):
        """Build and compile the ConvLSTM regressor.

        Args:
            input_shape: ``(sequence_length, num_features)`` of one sample.

        Returns:
            A compiled ``Sequential`` model (Adam, lr=0.001, MAE loss and
            metric) with a single linear output unit.
        """
        feature_count = input_shape[1]

        # Settings shared by both ConvLSTM2D layers.
        conv_kwargs = dict(
            kernel_size=(1, 3),
            activation='tanh',
            recurrent_activation='sigmoid',
            dropout=0.2,
            padding='same',
        )

        layer_stack = [
            # ConvLSTM expects (samples, time_steps, rows, cols, channels), so
            # each (sequence_length, num_features) sample is reshaped to
            # (sequence_length, 1, num_features, 1).
            Reshape((self.sequence_length, 1, feature_count, 1), input_shape=input_shape),
            ConvLSTM2D(filters=64, return_sequences=True, **conv_kwargs),
            BatchNormalization(),
            ConvLSTM2D(filters=32, return_sequences=False, **conv_kwargs),
            BatchNormalization(),
            Flatten(),
            Dense(50, activation='relu'),
            Dropout(0.3),
            Dense(1, activation='linear'),
        ]

        network = Sequential(layer_stack)
        network.compile(optimizer=Adam(learning_rate=0.001), loss='mae', metrics=['mae'])
        return network
    
    def build_lstm_model(self, input_shape):
        """Build and compile the two-layer LSTM regressor.

        Args:
            input_shape: shape of one input sample, passed to the first LSTM
                layer as ``input_shape``.

        Returns:
            A compiled ``Sequential`` model (Adam, lr=0.001, MAE loss and
            metric) with a single linear output unit.
        """
        layer_stack = [
            LSTM(100, return_sequences=True, input_shape=input_shape, dropout=0.2),
            BatchNormalization(),
            LSTM(50, return_sequences=False, dropout=0.2),
            BatchNormalization(),
            Dense(50, activation='relu'),
            Dropout(0.3),
            Dense(1, activation='linear'),
        ]

        network = Sequential(layer_stack)
        network.compile(optimizer=Adam(learning_rate=0.001), loss='mae', metrics=['mae'])
        return network

    def build_cnn_lstm_model(self, input_shape):
        print("CNN-LSTM model building skipped for current speed optimization, but can be reinstated later.")
        return None 
    
    def train_lightgbm(self, X_train, y_train, X_val, y_val):
        """Train a LightGBM regressor and return the fitted booster.

        ``(X_val, y_val)`` is the only validation set passed to ``lgb.train``;
        it drives the ``lgb.early_stopping(50)`` callback. Per-iteration
        logging is disabled via ``lgb.log_evaluation(0)``.
        """
        print("Training LightGBM model...")

        booster_params = dict(
            objective='regression',
            metric='mae',
            boosting_type='gbdt',
            num_leaves=31,
            learning_rate=0.05,
            feature_fraction=0.9,
            bagging_fraction=0.8,
            bagging_freq=5,
            verbose=-1,
            random_state=42,
            n_estimators=1000,  # upper bound on boosting rounds
        )

        training_set = lgb.Dataset(X_train, label=y_train)
        validation_set = lgb.Dataset(X_val, label=y_val, reference=training_set)

        stop_early = lgb.early_stopping(50)
        silence_log = lgb.log_evaluation(0)

        return lgb.train(
            booster_params,
            training_set,
            valid_sets=[validation_set],
            num_boost_round=booster_params['n_estimators'],
            callbacks=[stop_early, silence_log],
        )
    
    def evaluate_model(self, y_true, y_pred, model_name):
        """Print MAE and Pearson correlation for a model; return the correlation.

        Args:
            y_true: observed target values.
            y_pred: predicted values, same length as ``y_true``.
            model_name: label used in the printed summary line.
        """
        pearson_result = pearsonr(y_true, y_pred)
        correlation = pearson_result[0]  # first element is the coefficient
        mae = mean_absolute_error(y_true, y_pred)

        print(f"{model_name} - MAE: {mae:.4f}, Pearson Correlation: {correlation:.4f}")
        return correlation
    
    def fit(self, train_data_raw_initial_load):
        """Run the full training pipeline and return ``self``.

        Steps:
            1. Load the engineered-feature checkpoint if it exists; otherwise
               build it from ``train_data_raw_initial_load`` (column subset,
               ``optimize_memory``, ``create_time_features``) and save it.
            2. Select features (``select_features``) and fit/apply the scaler.
            3. Split by time: rows before 2024-01-01 train, the rest validate.
               The split uses the timestamps carried in the processed frame.
            4. Train LightGBM, then the ConvLSTM and LSTM sequence models, and
               print individual and blended validation scores.

        Every model is trained inside its own ``try`` block: a failure is
        printed and that model is left out of ``self.models``.

        Args:
            train_data_raw_initial_load: raw training DataFrame with
                ``timestamp`` (as a column, or as the index name), the columns
                ``bid_qty``, ``ask_qty``, ``buy_qty``, ``sell_qty``,
                ``volume``, the ``X*`` feature columns and ``label``.

        Returns:
            The predictor itself, with ``selected_features``, ``scaler`` and
            ``models`` populated.

        Raises:
            KeyError: if a required column (including ``timestamp``) is
                missing from the data that ends up being processed.
        """
        print("Starting training pipeline...")

        raw = train_data_raw_initial_load
        # Promote a 'timestamp' index to a column *before* selecting columns;
        # selecting first would raise KeyError and never reach this fallback.
        if 'timestamp' not in raw.columns and raw.index.name == 'timestamp':
            raw = raw.reset_index()

        # Determine raw X_n columns from the initial load structure.
        X_n_cols_raw = [col for col in raw.columns if col.startswith('X')]
        basic_features_cols = ['timestamp', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume']

        # --- Aggressive initial column selection for Pandas memory management ---
        preselected_X_n_features = X_n_cols_raw[:self.top_X_features_to_preselect]
        print(f"Initially selected {len(preselected_X_n_features)} X_n features for direct Pandas load (due to memory constraints without Dask).")

        columns_to_process_raw_for_fit = basic_features_cols + preselected_X_n_features + ['label']

        if os.path.exists(self._checkpoint_path):
            # NOTE(review): the checkpoint is reused as-is; it is not checked
            # against the current data or hyper-parameters.
            print(f"Checkpoint found. Loading processed data from {self._checkpoint_path}...")
            train_df = pd.read_parquet(self._checkpoint_path)
            if 'timestamp' not in train_df.columns:
                raise KeyError("'timestamp' column is required for the time-based validation split")
            train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
        else:
            print(f"Selecting columns from initial raw training data...")
            train_df = raw[columns_to_process_raw_for_fit].copy()
            train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])

            # Memory optimization and data cleaning, then feature engineering.
            train_df = self.optimize_memory(train_df)
            train_df = self.create_time_features(train_df)

            print(f"Saving processed data to checkpoint: {self._checkpoint_path}...")
            train_df.to_parquet(self._checkpoint_path, index=False)  # index=False to avoid writing Pandas index
            print("Checkpoint saved.")

        del raw
        gc.collect()

        # A unique positional index makes the .loc re-alignment below safe
        # regardless of the index the caller's frame happened to carry.
        train_df = train_df.reset_index(drop=True)

        print(f"Data shape after feature engineering: {train_df.shape[0]} rows, {train_df.shape[1]} columns")

        # Keep the timestamps: they drive the train/validation split later, so
        # there is no need to re-read the raw file from disk.
        timestamps = train_df['timestamp']

        # Prepare features and target for final selection and modeling.
        feature_cols_final = [col for col in train_df.columns
                              if col not in ('timestamp', 'label')]
        X_df = train_df[feature_cols_final]
        y_df = train_df['label']

        print(f"Features shape before final selection: {X_df.shape[0]} rows, {X_df.shape[1]} columns")
        print(f"Target shape: {y_df.shape[0]} rows")

        del train_df
        gc.collect()

        # Feature selection may drop invalid rows; valid_idx says which stayed.
        X_selected, valid_idx = self.select_features(X_df, y_df)
        y_for_training = y_df.loc[valid_idx].to_numpy(dtype=np.float32)
        timestamps = timestamps.loc[valid_idx]

        del X_df, y_df
        gc.collect()

        # Scale features.
        X_scaled = self.scaler.fit_transform(X_selected)

        print("Final data validation (after scaling)...")
        if np.any(np.isnan(X_scaled)) or np.any(np.isinf(X_scaled)):
            print("ERROR: Still have invalid values after preprocessing! Applying emergency cleanup.")
            X_scaled = np.nan_to_num(X_scaled, nan=0.0, posinf=1e6, neginf=-1e6)

        print(f"Final training data shape: {X_scaled.shape}")

        # Time-based split. Rows of X_scaled, y_for_training and timestamps
        # are all in valid_idx order, so boolean masks line up positionally.
        VALIDATION_SPLIT_DATE = '2024-01-01'
        train_mask = (timestamps < VALIDATION_SPLIT_DATE).to_numpy()
        val_mask = (timestamps >= VALIDATION_SPLIT_DATE).to_numpy()

        X_train = X_scaled[train_mask].astype(np.float32)
        y_train = y_for_training[train_mask]
        X_val = X_scaled[val_mask].astype(np.float32)
        y_val = y_for_training[val_mask]

        del timestamps, X_scaled, y_for_training
        gc.collect()

        print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}")

        # --- LightGBM (baseline) -------------------------------------------
        lgb_pred = None  # stays None if training fails; checked before blending
        lgb_score = 0
        try:
            lgb_model = self.train_lightgbm(X_train, y_train, X_val, y_val)
            lgb_pred = lgb_model.predict(X_val)
            lgb_score = self.evaluate_model(y_val, lgb_pred, "LightGBM")
            self.models['lightgbm'] = lgb_model
        except Exception as e:
            print(f"LightGBM training failed: {e}")
            lgb_pred = None
            lgb_score = 0

        # --- Sequence models and validation blend --------------------------
        # (key in self.models, display label, builder, blend weight)
        # NOTE(review): predict() blends these two models with weights
        # 0.6/0.4, which is a slightly different ratio than 0.35/0.25.
        dl_specs = (
            ('convlstm', 'ConvLSTM', self.build_convlstm_model, 0.35),
            ('lstm', 'LSTM', self.build_lstm_model, 0.25),
        )
        dl_scores = {key: 0 for key, _, _, _ in dl_specs}
        dl_preds = {}
        # Defined up front so every path below leaves it bound.
        ensemble_score = lgb_score

        try:
            X_train_seq, y_train_seq = self.prepare_sequences(X_train, y_train)
            X_val_seq, y_val_seq = self.prepare_sequences(X_val, y_val)

            if len(X_train_seq) > 0 and len(X_val_seq) > 0:
                print(f"Sequence data - Train: {X_train_seq.shape}, Val: {X_val_seq.shape}")

                for key, label, builder, _ in dl_specs:
                    try:
                        print(f"Training {label} model...")
                        model = builder(X_train_seq.shape[1:])
                        if model is None:
                            continue
                        # Fresh callbacks per model so no state is shared.
                        callbacks = [
                            EarlyStopping(patience=10, restore_best_weights=True, monitor='val_mae'),
                            ReduceLROnPlateau(patience=5, factor=0.5, min_lr=1e-6, monitor='val_mae'),
                        ]
                        model.fit(
                            X_train_seq, y_train_seq,
                            validation_data=(X_val_seq, y_val_seq),
                            epochs=50, batch_size=64,
                            callbacks=callbacks, verbose=1,
                        )
                        pred = model.predict(X_val_seq).flatten()
                        dl_scores[key] = self.evaluate_model(y_val_seq, pred, label)
                        dl_preds[key] = pred
                        self.models[key] = model
                    except Exception as e:
                        print(f"{label} training failed: {e}")
                        dl_scores[key] = 0
                        dl_preds.pop(key, None)

                # Only models with a positive validation correlation take part
                # in the validation blend (NaN compares False and is excluded).
                usable = [(key, weight) for key, _, _, weight in dl_specs
                          if key in dl_preds and dl_scores[key] > 0]
                if usable:
                    dl_ensemble_weighted = np.average(
                        [dl_preds[key] for key, _ in usable],
                        axis=0,
                        weights=[weight for _, weight in usable],
                    )
                    if lgb_pred is not None:
                        # The first sequence_length rows have no window, so
                        # trim LightGBM's predictions to the same rows.
                        lgb_pred_aligned = lgb_pred[self.sequence_length:]
                        ensemble_pred = (lgb_pred_aligned * 0.4) + (dl_ensemble_weighted * 0.6)
                    else:
                        ensemble_pred = dl_ensemble_weighted
                    ensemble_score = self.evaluate_model(y_val_seq, ensemble_pred, "Ensemble")
                else:
                    print("No successful deep learning models to include in ensemble.")
            else:
                print("Skipping deep learning models as no sequences could be prepared.")
        except Exception as e:
            print(f"Deep learning training failed during ensemble: {e}")
            ensemble_score = lgb_score  # fall back to the LightGBM score

        best_individual = np.nanmax([lgb_score] + list(dl_scores.values()))
        print(f"\nBest individual model score: {best_individual:.4f}")
        print(f"Ensemble score: {ensemble_score:.4f}")

        # Release the large arrays before returning.
        del X_train, y_train, X_val, y_val, dl_preds
        gc.collect()

        return self
    
    def predict(self, test_data_raw_initial_load): 
        """Generate predictions for test data with robust error handling"""
        print("Generating predictions...")
        
        # --- Aggressive initial column selection for Pandas memory management ---
        # We will ONLY select these columns from the already loaded raw DataFrame
        # self.selected_features should be populated after fit()
        
        basic_features_cols_test = ['timestamp', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume', 'ID']
        # The test set does NOT have a 'label' column during prediction
        # Get the original X_n columns from the raw test data (matching what was selected in training)
        X_n_cols_raw_test = [col for col in test_data_raw_initial_load.columns if col.startswith('X') and col != 'label']
        preselected_X_n_features_test = X_n_cols_raw_test[:min(self.top_X_features_to_preselect, len(X_n_cols_raw_test))]

        # Select only the raw columns needed for feature engineering from the test set
        columns_to_process_raw_for_predict = basic_features_cols_test + preselected_X_n_features_test
        
        print(f"Selecting columns from initial raw test data (only {len(columns_to_process_raw_for_predict)} columns)...")
        # Select columns directly from the already loaded test_data_raw_initial_load
        test_df = test_data_initial_load[columns_to_process_raw_for_predict].copy() # FIX: Use _initial_load
        
        # Ensure 'ID' is a column if it was index
        if 'ID' not in test_df.columns and test_df.index.name == 'ID':
            test_df = test_df.reset_index()

        # Store original test IDs for final submission mapping
        original_test_ids = test_df['ID'].copy()    
        
        # Ensure timestamp is datetime
        if 'timestamp' in test_df.columns:
            test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])

        del test_data_raw_initial_load # Free up memory from initial test data load
        gc.collect()

        # Memory optimization and data cleaning (now on Pandas DataFrame)
        test_df = self.optimize_memory(test_df)
        
        # Feature engineering (same as training, now on Pandas DataFrame)
        test_df = self.create_time_features(test_df)
        
        print(f"Test data shape after feature engineering: {test_df.shape[0]} rows, {test_df.shape[1]} columns")
        
        # Prepare features for prediction
        if self.selected_features is None:
            raise ValueError("Model must be fitted before making predictions (selected_features is None)")
        
        # Filter test_data to include only the final selected features (from training)
        # This is now safe as test_df has all engineered features
        X_test_df_final = test_df[self.selected_features] # Ensure order matches training features

        # Handle any remaining invalid values before scaling
        X_test_df_final = X_test_df_final.replace([np.inf, -np.inf], np.nan)
        X_test_df_final = X_test_df_final.fillna(method='ffill').fillna(method='bfill').fillna(0)
        
        # Scale features
        X_test_scaled = self.scaler.transform(X_test_df_final)
        
        # Final validation
        if np.any(np.isnan(X_test_scaled)) or np.any(np.isinf(X_test_scaled)):
            print("WARNING: Invalid values in test data after scaling, applying emergency cleanup.")
            X_test_scaled = np.nan_to_num(X_test_scaled, nan=0.0, posinf=1e6, neginf=-1e6)
        
        # Create a DataFrame from X_test_scaled to maintain original indices for prediction mapping
        X_test_scaled_df = pd.DataFrame(X_test_scaled, index=X_test_df_final.index, columns=self.selected_features)
        
        del X_test_df_final, test_df # Clean up
        gc.collect()

        predictions = np.zeros(len(original_test_ids), dtype=np.float32) # Initialize full predictions

        # Dictionary to store predictions mapped to original DataFrame indices
        indexed_predictions = {}
        
        # LightGBM predictions (baseline/fallback)
        if 'lightgbm' in self.models:
            try:
                lgb_pred_full = self.models['lightgbm'].predict(X_test_scaled_df)
                for i, idx in enumerate(X_test_scaled_df.index):
                    indexed_predictions[idx] = lgb_pred_full[i]
                print("LightGBM predictions generated.")
            except Exception as e:
                print(f"LightGBM prediction failed: {e}")
                
        # Deep learning prediction logic for ensemble
        if ('convlstm' in self.models and self.models['convlstm'] is not None) or \
           ('lstm' in self.models and self.models['lstm'] is not None):
            try:
                X_test_seq, _ = self.prepare_sequences(X_test_scaled) 
                
                if len(X_test_seq) > 0:
                    dl_predictions_list = []
                    dl_weights_list = []
                    
                    if 'convlstm' in self.models and self.models['convlstm'] is not None:
                        try:
                            convlstm_pred = self.models['convlstm'].predict(X_test_seq).flatten()
                            dl_predictions_list.append(convlstm_pred)
                            dl_weights_list.append(0.6) # Weight for ConvLSTM in DL ensemble
                        except Exception as e:
                            print(f"ConvLSTM prediction during test failed: {e}")
                    
                    if 'lstm' in self.models and self.models['lstm'] is not None:
                        try:
                            lstm_pred = self.models['lstm'].predict(X_test_seq).flatten()
                            dl_predictions_list.append(lstm_pred)
                            dl_weights_list.append(0.4) # Weight for LSTM in DL ensemble
                        except Exception as e:
                            print(f"LSTM prediction during test failed: {e}")
                    
                    # Combine deep learning predictions
                    if dl_predictions_list:
                        dl_ensemble_weighted = np.average(dl_predictions_list, axis=0, weights=dl_weights_list)
                        
                        # Apply ensemble to the part of the test data covered by sequences
                        # These predictions map to indices starting from `self.sequence_length`
                        sequence_covered_indices = X_test_scaled_df.index[self.sequence_length:].tolist()
                        
                        for i, original_idx in enumerate(sequence_covered_indices):
                            # Combine LGBM's initial prediction for this index with DL ensemble
                            # If LGBM didn't run or failed, its value might be missing,
                            # so get it safely or default to 0.0
                            lgbm_part = indexed_predictions.get(original_idx, 0.0)    
                            indexed_predictions[original_idx] = (lgbm_part * 0.4) + (dl_ensemble_weighted[i] * 0.6)
                        
                        print("Deep learning models included in predictions.")
                    else:
                        print("No successful deep learning models to include in test predictions.")
                else:
                    print("Test data too short for deep learning sequences. Using LightGBM only.")
            except Exception as e:
                print(f"Deep learning prediction setup failed: {e}")
                print("Falling back to LightGBM predictions only for test set.")
        
        # Populate the final predictions array based on original_test_ids order
        # Get the original IDs and their corresponding internal DataFrame indices from the full raw test data
        temp_test_raw_full = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/test.parquet', columns=['ID'])
        if 'ID' not in temp_test_raw_full.columns and temp_test_raw_full.index.name == 'ID':
            temp_test_raw_full = temp_test_raw_full.reset_index()

        id_to_original_index_map = pd.Series(temp_test_raw_full.index.values, index=temp_test_raw_full['ID'])

        # Iterate through the original_test_ids to populate predictions
        for i, current_id in enumerate(original_test_ids):
            original_idx_in_raw_df = id_to_original_index_map.get(current_id)
            
            if original_idx_in_raw_df is not None and original_idx_in_raw_df in indexed_predictions:
                predictions[i] = indexed_predictions[original_idx_in_raw_df]
            else:
                predictions[i] = 0.0    

        del temp_test_raw_full, id_to_original_index_map
        gc.collect()

        # Final validation of predictions
        if np.any(np.isnan(predictions)) or np.any(np.isinf(predictions)):
            print("WARNING: Invalid predictions detected (NaN/Inf), cleaning...")
            predictions = np.nan_to_num(predictions, nan=0.0, posinf=1.0, neginf=-1.0)
        
        print(f"Generated {len(predictions)} predictions")
        print(f"Prediction range: [{predictions.min():.4f}, {predictions.max():.4f}]")
        
        return predictions

# Main execution function
def run_competition_pipeline():
    """Run the complete competition pipeline end to end.

    Steps:
      1. Load the raw train and test parquet files from the Kaggle input
         directory and move named index levels ('timestamp' / 'ID') into
         ordinary columns.
      2. Fit a ``CryptoMarketPredictor`` on the full raw training frame.
      3. Predict on the full raw test frame.
      4. Write ``submission.csv`` (columns ``ID`` and ``Prediction``) to the
         current working directory.

    Returns:
        pandas.DataFrame: The submission frame that was written to disk.

    Raises:
        KeyError: If the training data exposes no 'timestamp' column/index
            level, or the test data exposes no 'ID' column/index level.
            Both are checked before training starts so that a malformed
            input fails immediately instead of after the full fit/predict.
    """
    # Load data (initial raw load to get shape and display head)
    print("Loading data...")
    # Loading the full dataset up front can be memory intensive.
    train_full_raw = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/train.parquet')
    # Move the index into columns when it carries the timestamp. A plain
    # reset_index() keeps the existing level names and, unlike passing a
    # one-element `names=` list, also works when the index is a MultiIndex.
    if 'timestamp' in train_full_raw.index.names:
        train_full_raw = train_full_raw.reset_index()
    if 'timestamp' not in train_full_raw.columns:
        raise KeyError("Training data has no 'timestamp' column or index level")
    # Ensure it's datetime type
    train_full_raw['timestamp'] = pd.to_datetime(train_full_raw['timestamp'])

    test_full_raw = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/test.parquet')
    # Same treatment for the test frame: 'ID' and/or 'timestamp' may live in
    # the index and must become named columns.
    test_index_names = test_full_raw.index.names
    if 'ID' in test_index_names or 'timestamp' in test_index_names:
        test_full_raw = test_full_raw.reset_index()
    # Fail fast: the submission below needs 'ID', and discovering that it is
    # missing only after training and prediction would waste the whole run.
    if 'ID' not in test_full_raw.columns:
        raise KeyError("Test data has no 'ID' column or index level; cannot build submission")
    # Ensure timestamp is datetime if it exists (for features)
    if 'timestamp' in test_full_raw.columns:
        test_full_raw['timestamp'] = pd.to_datetime(test_full_raw['timestamp'])

    print(f"\nTrain shape: {train_full_raw.shape}")
    print(f"Test shape: {test_full_raw.shape}")

    # Initialize and train model
    predictor = CryptoMarketPredictor(
        sequence_length=30,
        top_features=100,
        top_X_features_to_preselect=30,
    )
    # Pass the full raw DataFrame to allow determining X_n_cols_raw
    predictor.fit(train_full_raw)

    # The raw training frame is not needed past this point; drop this
    # reference before prediction to keep peak memory down.
    del train_full_raw
    gc.collect()

    # Generate predictions
    # Pass the full raw DataFrame to allow initial column selection
    predictions = predictor.predict(test_full_raw)

    # Create submission
    # NOTE(review): column names must match the competition's
    # sample_submission.csv exactly -- confirm the 'Prediction' casing.
    submission = pd.DataFrame({
        'ID': test_full_raw['ID'],  # Use the 'ID' column from the original test DataFrame
        'Prediction': predictions,
    })

    # Save submission
    submission.to_csv('submission.csv', index=False)
    print(f"Submission saved with {len(submission)} predictions")
    print(f"Prediction statistics - Mean: {predictions.mean():.4f}, Std: {predictions.std():.4f}")

    return submission

# Run the pipeline
if __name__ == "__main__":
    # Entry point: runs only when this file is executed directly (not when it
    # is imported). The returned submission DataFrame is bound to the
    # module-level name `submission`.
    submission = run_competition_pipeline()
