In [None]:
# DRW Crypto Market Prediction Competition Pipeline
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os # Import os for path checking

# Memory optimization and data processing
import gc
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

# Deep learning and modeling 
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (LSTM, ConvLSTM2D, Dense, Dropout, 
                                     BatchNormalization, Input, Conv1D, MaxPooling1D,
                                     Flatten, Reshape, TimeDistributed)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Tree-based models for ensemble
import lightgbm as lgb
from scipy.stats import pearsonr

# Set memory growth for GPU (if available) - still good practice
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # Looping over an empty device list does nothing, so no separate
    # "is there a GPU?" check is required before the loop.
    for gpu in gpus:
        # Let TensorFlow grow its GPU allocation on demand.
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Report the failure and continue with the default configuration.
    print(e)

class CryptoMarketPredictor:
    def __init__(self, sequence_length=30, top_features=100, top_X_features_to_preselect=30):
        """Store the pipeline settings and create empty fitted-state holders.

        Args:
            sequence_length: Number of consecutive rows per window built by
                ``prepare_sequences``.
            top_features: Upper bound on the number of columns kept by
                ``select_features``.
            top_X_features_to_preselect: How many raw ``X*`` columns ``fit``
                and ``predict`` take from the input frame.
        """
        # State filled in while fitting.
        self.models = {}
        self.selected_features = None  # names of the finally selected feature columns
        self.feature_selector = None
        self.scaler = RobustScaler()

        # Settings supplied by the caller.
        self.top_X_features_to_preselect = top_X_features_to_preselect
        self.top_features = top_features
        self.sequence_length = sequence_length

        # Relative path of the cached, feature-engineered training frame.
        self._checkpoint_path = './processed_train_data_checkpoint.parquet'
        
    def optimize_memory(self, df):
        """
        Optimize Pandas DataFrame memory usage and clean data.
        """
        print(f"Memory usage before optimization: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        
        # Clean data first
        df = self.clean_data(df)
        
        # Optimize numeric columns (Pandas directly)
        for col in df.select_dtypes(include=[np.number]).columns:
            if col == 'timestamp' or col == 'ID' or col == 'label':
                continue # Don't downcast timestamp, ID, or label
            df[col] = pd.to_numeric(df[col], downcast='float')
            
        print(f"Memory usage after optimization: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        return df
    
    def clean_data(self, df):
        """
        Clean Pandas DataFrame by handling inf, -inf, and extreme values.
        """
        print("Cleaning data...")
        
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        numeric_cols = [col for col in numeric_cols if col not in ['timestamp', 'ID', 'label']]

        # Replace inf and -inf with NaN first
        for col in numeric_cols:
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)

        # Fill NaN values with forward fill, then backward fill, then 0
        for col in numeric_cols:
            df[col] = df[col].ffill().bfill().fillna(0)

        # Handle extreme outliers (values beyond 3*IQR)
        for col in df.select_dtypes(include=[np.float32, np.float64]).columns:
            if col in ['timestamp', 'ID', 'label']: continue 
            if df[col].nunique() > 1:
                q25 = df[col].quantile(0.25)
                q75 = df[col].quantile(0.75)
                iqr = q75 - q25
                
                if iqr != 0 and not pd.isna(iqr):
                    lower_bound = q25 - 3 * iqr
                    upper_bound = q75 + 3 * iqr
                    df[col] = df[col].clip(lower_bound, upper_bound)
        print("Data cleaning applied.")
        return df
    
    def create_time_features(self, df):
        """
        Create time-based features with robust calculations using Pandas.
        Significantly reduced complexity for faster execution.
        """
        print("Creating time-based features...")
        
        # Basic market features
        df['mid_price'] = (df['bid_qty'] + df['ask_qty']) / 2
        df['spread'] = df['ask_qty'] - df['bid_qty']
        
        # Safe division for imbalance
        denominator = df['bid_qty'] + df['ask_qty'] + 1e-10
        df['imbalance'] = (df['bid_qty'] - df['ask_qty']) / denominator
        
        # Safe division for buy/sell ratio
        df['buy_sell_ratio'] = df['buy_qty'] / (df['sell_qty'] + 1e-10)
        
        # Rolling statistics - significantly reduced windows for speed
        windows = [10, 30] # Reverted windows
        base_cols_for_rolling = ['volume', 'mid_price', 'buy_qty', 'sell_qty', 'imbalance'] 

        for col in base_cols_for_rolling:
            for window in windows:
                df[f'{col}_ma_{window}'] = df[col].rolling(window, min_periods=1).mean()
                df[f'{col}_std_{window}'] = df[col].rolling(window, min_periods=1).std().fillna(0) 
        
        # Lagged features - significantly reduced lags for speed
        lags = [1, 5] # Reverted lags
        base_cols_for_lag = ['mid_price', 'imbalance'] 

        for col in base_cols_for_lag:
            for lag in lags:
                df[f'{col}_lag_{lag}'] = df[col].shift(lag)
        
        # Technical indicators - reduced
        df['rsi_proxy'] = self.calculate_rsi_proxy(df['mid_price'], window=10) # Reverted window
        df['momentum'] = df['mid_price'] - df['mid_price'].shift(5) # Reverted shift
        
        # Final check for any inf/nan values that might have been introduced
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        numeric_cols = [col for col in numeric_cols if col not in ['timestamp', 'ID', 'label']]

        for col in numeric_cols:
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)
            df[col] = df[col].ffill().bfill().fillna(0)

        print(f"Time-based features created. Current shape: {df.shape[0]} rows, {df.shape[1]} columns")
        return df
    
    def calculate_rsi_proxy(self, prices_series, window=14):
        """Calculate RSI-like indicator with safe operations for Pandas Series."""
        delta = prices_series.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=window, min_periods=1).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=window, min_periods=1).mean()
        
        rs = gain / (loss + 1e-10)
        rsi = 100 - (100 / (1 + rs))
        
        rsi = rsi.replace([np.inf, -np.inf], np.nan).fillna(50)
        
        return rsi
    
    def select_features(self, X_df, y_df, method='mutual_info'):
        """Keep the ``self.top_features`` highest-scoring columns of ``X_df``.

        Rows containing NaN/inf are dropped, near-constant columns
        (std < 1e-8) are removed unless every column is constant, and the
        remaining columns are ranked with ``SelectKBest``. The fitted selector
        and the chosen column names are stored on ``self.feature_selector``
        and ``self.selected_features``.

        Args:
            X_df: Numeric feature DataFrame.
            y_df: Target Series sharing ``X_df``'s index.
            method: 'mutual_info' ranks with ``mutual_info_regression``; any
                other value ranks with ``f_regression``.

        Returns:
            tuple: ``(X_selected, index)`` where ``X_selected`` is a float32
            array holding the selected columns and ``index`` is the pandas
            Index of the rows that were kept, for re-aligning the target.
        """
        # Imported locally so this method does not rely on a module-level import.
        from functools import partial

        # Same seed as the LightGBM parameters, so repeated runs pick the same columns.
        selection_seed = 42

        print(f"Selecting top {self.top_features} features from {X_df.shape[1]} features...")

        print("Validating data before final feature selection...")

        # One pass: a row is invalid if any value is NaN or +/-inf.
        invalid_mask = ~np.isfinite(X_df).all(axis=1)

        if invalid_mask.sum() > 0:
            print(f"Removing {invalid_mask.sum()} rows with invalid values before final selection.")
            X_df = X_df[~invalid_mask]
            y_df = y_df[~invalid_mask]

        # Constant or near-constant features carry no signal and can upset some selectors.
        feature_std = X_df.std()
        constant_features_mask = feature_std < 1e-8

        if constant_features_mask.all():
            # Nothing sensible to drop; keep all columns and let the selector decide.
            print("Warning: All features are constant. Cannot perform feature selection.")
        else:
            non_constant_features = X_df.columns[~constant_features_mask].tolist()
            if constant_features_mask.sum() > 0:
                print(f"Removing {constant_features_mask.sum()} constant features.")
            X_df = X_df[non_constant_features]

        print(f"Final data shape for feature selection: {X_df.shape}")

        n_features_to_select = min(self.top_features, X_df.shape[1])

        if method == 'mutual_info':
            # mutual_info_regression uses random numbers internally; pinning
            # random_state makes the ranking reproducible.
            score_func = partial(mutual_info_regression, random_state=selection_seed)
        else:
            score_func = f_regression
        selector = SelectKBest(score_func=score_func, k=n_features_to_select)

        X_selected = selector.fit_transform(X_df, y_df)
        self.feature_selector = selector
        self.selected_features = X_df.columns[selector.get_support()].tolist()

        print(f"\n--- Selected Features ({len(self.selected_features)}) ---")
        for feature in self.selected_features:
            print(f"- {feature}")
        print("---------------------------------------\n")

        # Return as NumPy array and Pandas Index to maintain alignment with y
        return X_selected.astype(np.float32), X_df.index
    
    def prepare_sequences(self, data, target=None):
        """Prepare sequences for time series models"""
        sequences = []
        targets = []
        
        for i in range(self.sequence_length, len(data)):
            sequences.append(data[i-self.sequence_length:i])
            if target is not None:
                targets.append(target[i])
        
        # Ensure outputs are float32 for TensorFlow models to save memory
        return np.array(sequences).astype(np.float32), np.array(targets).astype(np.float32) if target is not None else None
    
    def build_convlstm_model(self, input_shape):
        """Assemble and compile the two-layer ConvLSTM regressor.

        Args:
            input_shape: ``(sequence_length, num_features)`` of one sample.

        Returns:
            A compiled Keras ``Sequential`` model (Adam, lr 0.001, MAE loss).
        """
        num_features = input_shape[1]
        model = Sequential()

        # ConvLSTM2D expects (time_steps, rows, cols, channels) per sample, so
        # each time step is presented as a 1 x num_features single-channel grid.
        model.add(Reshape((self.sequence_length, 1, num_features, 1), input_shape=input_shape))

        model.add(ConvLSTM2D(filters=64, kernel_size=(1, 3),
                             activation='tanh', recurrent_activation='sigmoid',
                             return_sequences=True, dropout=0.2, padding='same'))
        model.add(BatchNormalization())

        model.add(ConvLSTM2D(filters=32, kernel_size=(1, 3),
                             activation='tanh', recurrent_activation='sigmoid',
                             return_sequences=False, dropout=0.2, padding='same'))
        model.add(BatchNormalization())

        # Regression head.
        model.add(Flatten())
        model.add(Dense(50, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(1, activation='linear'))

        model.compile(loss='mae', metrics=['mae'],
                      optimizer=Adam(learning_rate=0.001))
        return model
    
    def build_lstm_model(self, input_shape):
        """Assemble and compile the stacked-LSTM regressor.

        Args:
            input_shape: ``(sequence_length, num_features)`` of one sample.

        Returns:
            A compiled Keras ``Sequential`` model (Adam, lr 0.001, MAE loss).
        """
        model = Sequential()

        # The first recurrent layer emits the full sequence for the second one.
        model.add(LSTM(100, return_sequences=True, input_shape=input_shape, dropout=0.2))
        model.add(BatchNormalization())
        model.add(LSTM(50, return_sequences=False, dropout=0.2))
        model.add(BatchNormalization())

        # Regression head.
        model.add(Dense(50, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(1, activation='linear'))

        model.compile(loss='mae', metrics=['mae'],
                      optimizer=Adam(learning_rate=0.001))
        return model

    def build_cnn_lstm_model(self, input_shape):
        print("CNN-LSTM model building skipped for current speed optimization, but can be reinstated later.")
        return None 
    
    def train_lightgbm(self, X_train, y_train, X_val, y_val):
        """Fit a LightGBM regressor with early stopping on the validation set.

        Args:
            X_train: Training feature matrix.
            y_train: Training targets.
            X_val: Validation feature matrix used for early stopping.
            y_val: Validation targets.

        Returns:
            The booster returned by ``lgb.train``.
        """
        print("Training LightGBM model...")

        max_rounds = 1000
        params = dict(
            objective='regression',
            metric='mae',
            boosting_type='gbdt',
            num_leaves=31,
            learning_rate=0.05,
            feature_fraction=0.9,
            bagging_fraction=0.8,
            bagging_freq=5,
            verbose=-1,
            random_state=42,
            n_estimators=max_rounds,
        )

        train_set = lgb.Dataset(X_train, label=y_train)
        # The validation set references the training set.
        valid_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

        # Early stopping with 50 rounds; log_evaluation period 0.
        training_callbacks = [lgb.early_stopping(50), lgb.log_evaluation(0)]

        return lgb.train(
            params,
            train_set,
            num_boost_round=params['n_estimators'],
            valid_sets=[valid_set],
            callbacks=training_callbacks,
        )
    
    def evaluate_model(self, y_true, y_pred, model_name):
        """Print MAE and Pearson correlation for a set of predictions.

        Args:
            y_true: Ground-truth targets.
            y_pred: Predicted values, same length as ``y_true``.
            model_name: Label used in the printed report line.

        Returns:
            The Pearson correlation between ``y_true`` and ``y_pred``.
        """
        abs_error = mean_absolute_error(y_true, y_pred)
        # Only the coefficient is reported; the p-value is discarded.
        pearson_r, _p_value = pearsonr(y_true, y_pred)

        report = f"{model_name} - MAE: {abs_error:.4f}, Pearson Correlation: {pearson_r:.4f}"
        print(report)
        return pearson_r
    
    def fit(self, train_data_raw_initial_load):
        """Train LightGBM and the sequence models, then report validation scores.

        Steps:
          1. Load the cached feature frame from ``self._checkpoint_path`` or
             build it from the raw frame (column subset -> ``optimize_memory``
             -> ``create_time_features``) and cache it as parquet.
          2. Select features (``select_features``) and fit ``self.scaler``.
          3. Split chronologically at ``VALIDATION_SPLIT_DATE`` using the
             feature frame's own 'timestamp' column.
          4. Train LightGBM, ConvLSTM and LSTM. Each runs in its own ``try``
             so that one failure does not abort the others; successfully
             trained models are stored in ``self.models``.
          5. Score a weighted blend on the validation rows covered by sequences.

        Args:
            train_data_raw_initial_load: Raw training DataFrame. Must provide
                'label', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume'
                and 'timestamp' (as a column, or as the index named
                'timestamp').

        Returns:
            ``self``, with ``selected_features``, ``scaler`` and ``models``
            populated.
        """
        print("Starting training pipeline...")
        raw = train_data_raw_initial_load

        # Determine raw X_n columns from the initial load structure
        X_n_cols_raw = [col for col in raw.columns if col.startswith('X') and col != 'label']
        basic_features_cols = ['timestamp', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume']

        # Only a subset of the X_n columns is processed to keep memory bounded.
        preselected_X_n_features = X_n_cols_raw[:min(self.top_X_features_to_preselect, len(X_n_cols_raw))]
        print(f"Initially selected {len(preselected_X_n_features)} X_n features for direct Pandas load (due to memory constraints without Dask).")

        columns_to_process_raw_for_fit = basic_features_cols + preselected_X_n_features + ['label']

        train_df = None
        if os.path.exists(self._checkpoint_path):
            print(f"Checkpoint found. Loading processed data from {self._checkpoint_path}...")
            cached_df = pd.read_parquet(self._checkpoint_path)
            # A checkpoint without these columns cannot be split or trained on.
            if 'timestamp' in cached_df.columns and 'label' in cached_df.columns:
                cached_df['timestamp'] = pd.to_datetime(cached_df['timestamp'])
                train_df = cached_df
            else:
                print("Checkpoint lacks 'timestamp'/'label' columns; rebuilding it from the raw data.")

        if train_df is None:
            print(f"Selecting columns from initial raw training data...")
            if 'timestamp' not in raw.columns and raw.index.name == 'timestamp':
                # Timestamp lives in the index: select the other columns first,
                # then promote the index to a regular 'timestamp' column.
                non_index_cols = [col for col in columns_to_process_raw_for_fit if col != 'timestamp']
                train_df = raw[non_index_cols].reset_index()
            else:
                train_df = raw[columns_to_process_raw_for_fit].copy()
            train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
            gc.collect()
            train_df = self.optimize_memory(train_df)
            train_df = self.create_time_features(train_df)
            print(f"Saving processed data to checkpoint: {self._checkpoint_path}...")
            train_df.to_parquet(self._checkpoint_path, index=False)
            print("Checkpoint saved.")

        print(f"Data shape after feature engineering: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
        feature_cols_final = [col for col in train_df.columns
                              if col not in ['timestamp', 'label']]
        X_df = train_df[feature_cols_final]
        y_df = train_df['label']
        # Keep the timestamps that belong to exactly these rows, so the
        # chronological split never depends on the raw frame's index.
        timestamps = train_df['timestamp']
        print(f"Features shape before final selection: {X_df.shape[0]} rows, {X_df.shape[1]} columns")
        print(f"Target shape: {y_df.shape[0]} rows")
        del train_df
        gc.collect()

        X_selected, valid_idx = self.select_features(X_df, y_df)
        # select_features may drop rows; re-align target and timestamps to what is left.
        y_for_training = y_df.loc[valid_idx].astype(np.float32)
        split_timestamps = timestamps.loc[valid_idx]
        del X_df, y_df, timestamps
        gc.collect()

        X_scaled = self.scaler.fit_transform(X_selected)
        print("Final data validation (after scaling)...")
        if np.any(np.isnan(X_scaled)) or np.any(np.isinf(X_scaled)):
            print("ERROR: Still have invalid values after preprocessing! Applying emergency cleanup.")
            X_scaled = np.nan_to_num(X_scaled, nan=0.0, posinf=1e6, neginf=-1e6)
        print(f"Final training data shape: {X_scaled.shape}")

        # Chronological hold-out: rows on/after the split date form the validation set.
        VALIDATION_SPLIT_DATE = '2024-01-01'
        train_mask = (split_timestamps < VALIDATION_SPLIT_DATE).values
        val_mask = (split_timestamps >= VALIDATION_SPLIT_DATE).values
        y_values = y_for_training.values
        X_train = X_scaled[train_mask].astype(np.float32)
        y_train = y_values[train_mask].astype(np.float32)
        X_val = X_scaled[val_mask].astype(np.float32)
        y_val = y_values[val_mask].astype(np.float32)
        del split_timestamps, y_for_training, y_values
        gc.collect()
        print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}")

        # lgb_pred stays None when LightGBM fails so the ensemble step can tell.
        lgb_pred = None
        lgb_score = 0
        try:
            lgb_model = self.train_lightgbm(X_train, y_train, X_val, y_val)
            lgb_pred = lgb_model.predict(X_val)
            lgb_score = self.evaluate_model(y_val, lgb_pred, "LightGBM")
            self.models['lightgbm'] = lgb_model
        except Exception as e:
            print(f"LightGBM training failed: {e}")
            lgb_pred = None
            lgb_score = 0

        # Defaults so the summary and cleanup below work on every code path.
        dl_scores = {'convlstm': 0, 'lstm': 0}
        ensemble_score = lgb_score
        X_train_seq = y_train_seq = X_val_seq = y_val_seq = None

        try:
            X_train_seq, y_train_seq = self.prepare_sequences(X_train, y_train)
            X_val_seq, y_val_seq = self.prepare_sequences(X_val, y_val)
            if len(X_train_seq) > 0:
                print(f"Sequence data - Train: {X_train_seq.shape}, Val: {X_val_seq.shape}")

                # (models key, display name, builder, weight inside the DL blend)
                dl_specs = (
                    ('convlstm', "ConvLSTM", self.build_convlstm_model, 0.35),
                    ('lstm', "LSTM", self.build_lstm_model, 0.25),
                )
                dl_predictions_list = []
                dl_weights = []

                for model_key, display_name, build_model, blend_weight in dl_specs:
                    try:
                        print(f"Training {display_name} model...")
                        dl_model = build_model(X_train_seq.shape[1:])
                        if dl_model is None:
                            continue
                        dl_model.fit(
                            X_train_seq, y_train_seq,
                            validation_data=(X_val_seq, y_val_seq),
                            epochs=50, batch_size=64,
                            # Fresh callback objects per model so no state is shared.
                            callbacks=[
                                EarlyStopping(patience=10, restore_best_weights=True, monitor='val_mae'),
                                ReduceLROnPlateau(patience=5, factor=0.5, min_lr=1e-6, monitor='val_mae'),
                            ],
                            verbose=1
                        )
                        val_pred = dl_model.predict(X_val_seq).flatten()
                        val_score = self.evaluate_model(y_val_seq, val_pred, display_name)
                        self.models[model_key] = dl_model
                    except Exception as e:
                        print(f"{display_name} training failed: {e}")
                        continue

                    dl_scores[model_key] = val_score
                    # Only positively correlated models enter the validation blend.
                    if val_score > 0:
                        dl_predictions_list.append(val_pred)
                        dl_weights.append(blend_weight)

                if dl_predictions_list:
                    dl_ensemble_weighted = np.average(dl_predictions_list, axis=0, weights=dl_weights)
                    # Sequences start at row `sequence_length`, so the LightGBM
                    # predictions are trimmed to the same rows. Without LightGBM
                    # its share contributes zero.
                    lgb_part = lgb_pred[self.sequence_length:] if lgb_pred is not None else 0.0
                    ensemble_pred = (lgb_part * 0.4) + (dl_ensemble_weighted * 0.6)
                    ensemble_score = self.evaluate_model(y_val_seq, ensemble_pred, "Ensemble")
                    print(f"\nEnsemble score: {ensemble_score:.4f}")
                else:
                    print("No successful deep learning models to include in ensemble.")
            else:
                print("Skipping deep learning models as no sequences could be prepared.")
        except Exception as e:
            print(f"Deep learning training failed during ensemble: {e}")
            ensemble_score = lgb_score

        best_individual_score = max(lgb_score, dl_scores['convlstm'], dl_scores['lstm'])
        print(f"\nBest individual model score: {best_individual_score:.4f}")
        print(f"Ensemble score: {ensemble_score:.4f}")

        del X_train, y_train, X_val, y_val
        del X_train_seq, y_train_seq, X_val_seq, y_val_seq
        gc.collect()
        return self
    
    def predict(self, test_data_raw_initial_load): 
        """Generate predictions for test data with robust error handling"""
        print("Generating predictions...")
        
        # --- Capture original ID to index mapping early ---
        # Ensure test_data_raw_initial_load has 'ID' and is ready for mapping
        temp_df_for_id_map = test_data_raw_initial_load.copy()
        # Handle cases where 'ID' might be the index but not explicitly a column name
        if 'ID' not in temp_df_for_id_map.columns and temp_df_for_id_map.index.name == 'ID':
            temp_df_for_id_map = temp_df_for_id_map.reset_index(names=['ID'])
        elif 'ID' not in temp_df_for_id_map.columns and temp_df_for_id_map.index is not None and temp_df_for_id_map.index.name is None and len(temp_df_for_id_map.index) == len(temp_df_for_id_map):
            # Fallback if ID is an unnamed index that matches length
            temp_df_for_id_map = temp_df_for_id_map.reset_index()
            temp_df_for_id_map.rename(columns={'index': 'ID'}, inplace=True) # Rename the default 'index' column

        # Create the series mapping 'ID' values to their corresponding *original DataFrame indices*
        id_to_original_index_map = pd.Series(temp_df_for_id_map.index.values, index=temp_df_for_id_map['ID'])

        # Store original test IDs for final submission mapping (these are the IDs from the input order)
        original_test_ids = temp_df_for_id_map['ID'].copy()
        
        # Free up temporary mapping dataframe memory (now that we have the series and original IDs)
        del temp_df_for_id_map
        gc.collect()

        # --- Aggressive initial column selection for Pandas memory management ---
        basic_features_cols_test = ['timestamp', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume', 'ID']
        X_n_cols_raw_test = [col for col in test_data_raw_initial_load.columns if col.startswith('X') and col != 'label']
        preselected_X_n_features_test = X_n_cols_raw_test[:min(self.top_X_features_to_preselect, len(X_n_cols_raw_test))]

        columns_to_process_raw_for_predict = basic_features_cols_test + preselected_X_n_features_test
        
        # --- Crucial Check before subsetting ---
        missing_columns = [col for col in columns_to_process_raw_for_predict if col not in test_data_raw_initial_load.columns]
        if missing_columns:
            raise KeyError(f"The following required columns are missing from the test data: {missing_columns}. Please ensure your test.parquet file contains these columns.")

        print(f"Selecting columns from initial raw test data (only {len(columns_to_process_raw_for_predict)} columns)...")
        # Select columns directly from the already loaded test_data_raw_initial_load
        test_df = test_data_raw_initial_load[columns_to_process_raw_for_predict].copy()
        
        # Now it's safe to delete the original raw load as its necessary info has been extracted
        del test_data_raw_initial_load 
        gc.collect()

        # Ensure 'ID' is a column if it was index (this should be handled by temp_df_for_id_map now but a safety check)
        if 'ID' not in test_df.columns and test_df.index.name == 'ID':
            test_df = test_df.reset_index()
            
        # Ensure timestamp is datetime
        if 'timestamp' in test_df.columns:
            test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])

        # Memory optimization and data cleaning (now on Pandas DataFrame)
        test_df = self.optimize_memory(test_df)
        
        # Feature engineering (same as training, now on Pandas DataFrame)
        test_df = self.create_time_features(test_df)
        
        print(f"Test data shape after feature engineering: {test_df.shape[0]} rows, {test_df.shape[1]} columns")
        
        # Prepare features for prediction
        if self.selected_features is None:
            raise ValueError("Model must be fitted before making predictions (selected_features is None)")
        
        # Filter test_data to include only the final selected features (from training)
        # This is now safe as test_df has all engineered features
        X_test_df_final = test_df[self.selected_features] # Ensure order matches training features

        # Handle any remaining invalid values before scaling
        X_test_df_final = X_test_df_final.replace([np.inf, -np.inf], np.nan)
        X_test_df_final = X_test_df_final.fillna(method='ffill').fillna(method='bfill').fillna(0)
        
        # Scale features
        X_test_scaled = self.scaler.transform(X_test_df_final)
        
        # Final validation
        if np.any(np.isnan(X_test_scaled)) or np.any(np.isinf(X_test_scaled)):
            print("WARNING: Invalid values in test data after scaling, applying emergency cleanup.")
            X_test_scaled = np.nan_to_num(X_test_scaled, nan=0.0, posinf=1e6, neginf=-1e6)
        
        # Create a DataFrame from X_test_scaled to maintain original indices for prediction mapping
        X_test_scaled_df = pd.DataFrame(X_test_scaled, index=X_test_df_final.index, columns=self.selected_features)
        
        del X_test_df_final, test_df # Clean up
        gc.collect()

        # Initialize full predictions array based on the number of original IDs
        predictions = np.zeros(len(original_test_ids), dtype=np.float32) 

        # Dictionary to store predictions mapped by the internal DataFrame index after sequence preparation
        indexed_predictions_by_internal_idx = {}
        
        # LightGBM predictions (baseline/fallback)
        if 'lightgbm' in self.models:
            try:
                lgb_pred_full = self.models['lightgbm'].predict(X_test_scaled_df)
                # Map LGBM predictions to their internal DataFrame indices
                for i, idx in enumerate(X_test_scaled_df.index):
                    indexed_predictions_by_internal_idx[idx] = lgb_pred_full[i]
                print("LightGBM predictions generated.")
            except Exception as e:
                print(f"LightGBM prediction failed: {e}")
                
        # Deep learning prediction logic for ensemble
        if ('convlstm' in self.models and self.models['convlstm'] is not None) or \
           ('lstm' in self.models and self.models['lstm'] is not None):
            try:
                X_test_seq, _ = self.prepare_sequences(X_test_scaled) 
                
                if len(X_test_seq) > 0:
                    dl_predictions_list_test = [] # Use distinct name for clarity in predict method
                    dl_weights_test = [] # Use distinct name for clarity in predict method
                    
                    if 'convlstm' in self.models and self.models['convlstm'] is not None:
                        try:
                            convlstm_pred = self.models['convlstm'].predict(X_test_seq).flatten()
                            dl_predictions_list_test.append(convlstm_pred)
                            dl_weights_test.append(0.6) # Weight for ConvLSTM in DL ensemble
                        except Exception as e:
                            print(f"ConvLSTM prediction during test failed: {e}")
                    
                    if 'lstm' in self.models and self.models['lstm'] is not None:
                        try:
                            lstm_pred = self.models['lstm'].predict(X_test_seq).flatten()
                            dl_predictions_list_test.append(lstm_pred)
                            dl_weights_test.append(0.4) # Weight for LSTM in DL ensemble
                        except Exception as e:
                            print(f"LSTM prediction during test failed: {e}")
                    
                    # Combine deep learning predictions
                    if dl_predictions_list_test: # Check if any DL predictions were added
                        dl_ensemble_weighted = np.average(dl_predictions_list_test, axis=0, weights=dl_weights_test)
                        
                        # Apply ensemble to the part of the test data covered by sequences
                        # These predictions map to indices starting from `self.sequence_length`
                        sequence_covered_indices_in_X_scaled_df = X_test_scaled_df.index[self.sequence_length:].tolist()
                        
                        for i, internal_idx in enumerate(sequence_covered_indices_in_X_scaled_df):
                            # Combine LGBM's initial prediction for this internal_idx with DL ensemble
                            lgbm_part = indexed_predictions_by_internal_idx.get(internal_idx, 0.0)    
                            indexed_predictions_by_internal_idx[internal_idx] = (lgbm_part * 0.4) + (dl_ensemble_weighted[i] * 0.6)
                        
                        print("Deep learning models included in predictions.")
                    else:
                        print("No successful deep learning models to include in test predictions.")
                else:
                    print("Test data too short for deep learning sequences. Using LightGBM only.")
            except Exception as e:
                print(f"Deep learning prediction setup failed: {e}")
                print("Falling back to LightGBM predictions only for test set.")
        
        # Populate the final predictions array using the original_test_ids order
        # Iterate through the original_test_ids to place predictions in the correct overall submission order
        for i, current_id in enumerate(original_test_ids):
            # Find the internal DataFrame index that corresponds to this 'ID'
            # We created id_to_original_index_map earlier, which maps ID to its original dataframe index
            # And then indexed_predictions_by_internal_idx stores predictions keyed by that original dataframe index
            original_idx_in_raw_df = id_to_original_index_map.get(current_id)
            
            if original_idx_in_raw_df is not None and original_idx_in_raw_df in indexed_predictions_by_internal_idx:
                predictions[i] = indexed_predictions_by_internal_idx[original_idx_in_raw_df]
            else:
                predictions[i] = 0.0 # Default to 0.0 if no prediction found (e.g., due to data cleaning/filtering)
        
        # Final validation of predictions
        if np.any(np.isnan(predictions)) or np.any(np.isinf(predictions)):
            print("WARNING: Invalid predictions detected (NaN/Inf), cleaning...")
            predictions = np.nan_to_num(predictions, nan=0.0, posinf=1.0, neginf=-1.0)
        
        print(f"Generated {len(predictions)} predictions")
        print(f"Prediction range: [{predictions.min():.4f}, {predictions.max():.4f}]")
        
        return predictions

# Helpers for the main execution function
def _ensure_column_from_index(frame, column, frame_label):
    """Return *frame* with *column* present as a regular column where possible.

    Args:
        frame: DataFrame to normalise.
        column: Name of the column that must exist (e.g. 'timestamp' or 'ID').
        frame_label: Name used for the frame in the DEBUG messages.

    Returns:
        The same frame if *column* already exists; otherwise a frame whose
        index has been reset, with the former index exposed as *column* when
        it was named *column* or was unnamed (pandas calls it 'index').
    """
    if column in frame.columns:
        return frame

    if frame.index.name == column:
        frame = frame.reset_index(names=[column])
        print(f"DEBUG: Resetting index '{column}' for {frame_label}. New columns: {frame.columns.tolist()}")
        return frame

    # Index is unnamed or carries another name: reset it, and if pandas
    # produced a generic 'index' column, adopt that as the wanted column.
    frame = frame.reset_index()
    if 'index' in frame.columns and column not in frame.columns:
        frame = frame.rename(columns={'index': column})
        print(f"DEBUG: Renamed 'index' to '{column}' for {frame_label}. New columns: {frame.columns.tolist()}")
    return frame


def _convert_timestamp_column(frame, frame_label):
    """Convert the 'timestamp' column of *frame* to datetime, or warn if absent.

    The frame is modified in place and also returned. A missing column is
    reported but deliberately not treated as fatal (best-effort behaviour).
    """
    if 'timestamp' in frame.columns:
        # NOTE(review): when the column was derived from a former row index
        # (see _ensure_column_from_index) the converted values may not be real
        # timestamps -- confirm the downstream code tolerates this.
        frame['timestamp'] = pd.to_datetime(frame['timestamp'])
        print(f"DEBUG: 'timestamp' column found and converted to datetime in {frame_label}.")
    else:
        print(f"CRITICAL WARNING: 'timestamp' column still not found in {frame_label} after all attempts. This will likely cause issues.")
        print(f"DEBUG: {frame_label} columns are: {frame.columns.tolist()}")
    return frame


# Main execution function
def run_competition_pipeline(
    train_path='/kaggle/input/drw-crypto-market-prediction/train.parquet',
    test_path='/kaggle/input/drw-crypto-market-prediction/test.parquet',
    submission_path='/kaggle/working/submission.csv',
):
    """Run the complete competition pipeline.

    Loads the train and test parquet files, makes sure 'timestamp' (both
    frames) and 'ID' (test frame) are regular columns, fits a
    CryptoMarketPredictor on the training data, predicts on the test data and
    writes the submission CSV.

    Args:
        train_path: Location of the training parquet file. Defaults to the
            Kaggle input path; pass another path (e.g. a mounted Drive folder)
            to run elsewhere.
        test_path: Location of the test parquet file.
        submission_path: Where the submission CSV is written.

    Returns:
        The submission DataFrame with columns 'ID' and 'Prediction'.
    """
    # Load data (initial raw load to get shape and display head)
    print("Loading data...")
    train_full_raw = pd.read_parquet(train_path)
    train_full_raw = _ensure_column_from_index(train_full_raw, 'timestamp', 'train_full_raw')
    train_full_raw = _convert_timestamp_column(train_full_raw, 'train_full_raw')

    test_full_raw = pd.read_parquet(test_path)
    # 'ID' is handled first so that a subsequent index reset for 'timestamp'
    # cannot consume an index that actually holds the IDs.
    test_full_raw = _ensure_column_from_index(test_full_raw, 'ID', 'test_full_raw')
    test_full_raw = _ensure_column_from_index(test_full_raw, 'timestamp', 'test_full_raw')
    test_full_raw = _convert_timestamp_column(test_full_raw, 'test_full_raw')

    print(f"\nTrain shape: {train_full_raw.shape}")
    print(f"Test shape: {test_full_raw.shape}")

    # Initialize and train model
    predictor = CryptoMarketPredictor(
        sequence_length=30,
        top_features=100,
        top_X_features_to_preselect=30,
    )
    predictor.fit(train_full_raw)

    predictions = predictor.predict(test_full_raw)

    # Create submission, using the 'ID' column from the original test DataFrame
    submission = pd.DataFrame({
        'ID': test_full_raw['ID'],
        'Prediction': predictions,
    })

    # Save submission
    submission.to_csv(submission_path, index=False)
    print(f"Submission saved with {len(submission)} predictions")
    print(f"Prediction statistics - Mean: {predictions.mean():.4f}, Std: {predictions.std():.4f}")

    return submission

# Script entry point: execute the full pipeline only when this file is run
# directly, and keep the resulting submission frame in a module-level name.
if __name__ == '__main__':
    submission = run_competition_pipeline()


2025-06-27 00:30:35.676004: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750984235.865805      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750984235.925563      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-27 00:30:54.104321: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Loading data...
DEBUG: Resetting index 'timestamp' for train_full_raw. New columns: ['timestamp', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40', 'X41', 'X42', 'X43', 'X44', 'X45', 'X46', 'X47', 'X48', 'X49', 'X50', 'X51', 'X52', 'X53', 'X54', 'X55', 'X56', 'X57', 'X58', 'X59', 'X60', 'X61', 'X62', 'X63', 'X64', 'X65', 'X66', 'X67', 'X68', 'X69', 'X70', 'X71', 'X72', 'X73', 'X74', 'X75', 'X76', 'X77', 'X78', 'X79', 'X80', 'X81', 'X82', 'X83', 'X84', 'X85', 'X86', 'X87', 'X88', 'X89', 'X90', 'X91', 'X92', 'X93', 'X94', 'X95', 'X96', 'X97', 'X98', 'X99', 'X100', 'X101', 'X102', 'X103', 'X104', 'X105', 'X106', 'X107', 'X108', 'X109', 'X110', 'X111', 'X112', 'X113', 'X114', 'X115', 'X116', 'X117', 'X118', 'X119', 'X1