<a href="https://www.kaggle.com/code/nicholas33/drw-crypto-market-prediction-nb153?scriptVersionId=248751774" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# DRW Crypto Market Prediction Competition Pipeline
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os # Import os for path checking
import gc # Import garbage collector
import math # For ceil in data generator

# Memory optimization and data processing
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

# Deep learning and modeling
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (LSTM, ConvLSTM2D, Dense, Dropout,
                                     BatchNormalization, Input, Conv1D, MaxPooling1D,
                                     Flatten, Reshape, TimeDistributed)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import Sequence # Import Keras Sequence for data generation

# Tree-based models for ensemble
import lightgbm as lgb
from scipy.stats import pearsonr

# Enable memory growth on every visible GPU. With no GPU the list is empty and
# the loop simply does nothing.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Raised by TensorFlow when the setting cannot be applied; report and carry on.
    print(e)


class TimeSeriesSequence(Sequence):
    """
    Keras Sequence that streams sliding-window samples from Parquet files.

    Each sample is a window of ``sequence_length`` consecutive feature rows and
    its target is the label stored on the window's last row. Rows are located
    through the ``original_row_index`` column that both files must contain.

    NOTE: batch boundaries are computed arithmetically from
    ``original_indices_min``, so the ``original_row_index`` values of a split
    are assumed to be consecutive integers. Gaps produce shorter batches.
    """
    def __init__(self, X_filepath, y_filepath, sequence_length, batch_size, feature_cols, total_samples, original_indices_min, original_indices_max):
        """
        Args:
            X_filepath: Parquet file with the feature columns + 'original_row_index'.
            y_filepath: Parquet file with 'label' + 'original_row_index'.
            sequence_length: Number of consecutive rows per window.
            batch_size: Number of windows (labels) per batch.
            feature_cols: List of feature column names to read.
            total_samples: Row count of the underlying split.
            original_indices_min: Smallest 'original_row_index' in the split.
            original_indices_max: Largest 'original_row_index' in the split.
        """
        # Keras 3 expects the base-class initializer to run; harmless on older versions.
        super().__init__()

        self.X_filepath = X_filepath
        self.y_filepath = y_filepath
        self.sequence_length = sequence_length
        self.batch_size = batch_size
        self.feature_cols = feature_cols # List of feature column names
        self.original_indices_min = original_indices_min # Min value of 'original_row_index' for this split
        self.original_indices_max = original_indices_max # Max value of 'original_row_index' for this split

        # total_samples is the count of rows in the underlying file for this split
        self.total_samples = total_samples

        # Number of rows that have a full window behind them: the first
        # sequence_length - 1 rows of the split cannot be predicted.
        self.num_predictable_samples = max(0, self.total_samples - self.sequence_length + 1)

        print(f"TimeSeriesSequence initialized for {X_filepath}: "
              f"Total rows in file: {total_samples}, Predictable samples: {self.num_predictable_samples}, "
              f"Batch size: {self.batch_size}, Sequence length: {self.sequence_length}, "
              f"Original index range: [{self.original_indices_min}, {self.original_indices_max}]")

    def __len__(self):
        """Denotes the number of batches per epoch."""
        if self.num_predictable_samples == 0:
            return 0
        return math.ceil(self.num_predictable_samples / self.batch_size)

    def _empty_batch(self):
        """Return a correctly shaped (X, y) pair containing zero samples."""
        empty_X = np.empty((0, self.sequence_length, len(self.feature_cols)), dtype=np.float32)
        empty_y = np.empty((0,), dtype=np.float32)
        return empty_X, empty_y

    def __getitem__(self, idx):
        """
        Generate batch ``idx``.

        Returns:
            (X, y) where X has shape (n, sequence_length, n_features) and y has
            shape (n,), both float32. ``n`` equals ``batch_size`` except for the
            final batch, and is 0 if no full window can be formed.
        """
        window_lookback = self.sequence_length - 1

        # The first predictable label sits `window_lookback` rows after the start
        # of the split. Offsetting here guarantees every batch has a full window
        # behind each of its labels, and that the last rows of the split are
        # reached by the final batch counted in __len__.
        label_start = self.original_indices_min + window_lookback + idx * self.batch_size
        label_end = min(self.original_indices_max, label_start + self.batch_size - 1)

        if label_start > label_end:
            print(f"Warning: Calculated read range [{label_start}, {label_end}] is invalid for batch {idx}. Returning empty.")
            return self._empty_batch()

        # Features needed: from the first row of the first window up to the last label row.
        # By construction feature_start >= original_indices_min.
        feature_start = label_start - window_lookback

        X_batch_df = pd.read_parquet(self.X_filepath, columns=list(self.feature_cols) + ['original_row_index'],
                                     engine='pyarrow',
                                     filters=[('original_row_index', '>=', feature_start),
                                              ('original_row_index', '<=', label_end)])
        y_batch_df = pd.read_parquet(self.y_filepath, columns=['label', 'original_row_index'],
                                     engine='pyarrow',
                                     filters=[('original_row_index', '>=', label_start),
                                              ('original_row_index', '<=', label_end)])

        # Row-group filtering does not guarantee ordering, so sort by the row index.
        X_batch_np = X_batch_df.set_index('original_row_index').sort_index().values.astype(np.float32)
        y_batch_np = (y_batch_df.set_index('original_row_index').sort_index()['label']
                      .values.flatten().astype(np.float32)) # Ensure y is 1D
        del X_batch_df, y_batch_df # Released immediately via refcounting

        num_windows = len(X_batch_np) - self.sequence_length + 1
        num_samples = min(num_windows, len(y_batch_np))
        if num_samples <= 0:
            # Not enough data in this chunk to form any sequences
            return self._empty_batch()

        # Both slices end on `label_end`, so align windows and labels from the
        # tail: the last window ends on the row that carries the last label.
        first_window = num_windows - num_samples
        X_sequences = np.stack([X_batch_np[i : i + self.sequence_length]
                                for i in range(first_window, num_windows)])
        y_sequences = y_batch_np[-num_samples:]

        return X_sequences, y_sequences


class CryptoMarketPredictor:
    def __init__(self, sequence_length=30, top_features=100, top_X_features_to_preselect=30):
        """
        Store the hyper-parameters and initialise empty model state.

        Args:
            sequence_length: Window length handed to the sequence generators in ``fit``.
            top_features: Upper bound on the number of features kept by ``select_features``.
            top_X_features_to_preselect: How many leading ``X*`` columns ``fit`` keeps
                from the raw frame before feature engineering.
        """
        # --- hyper-parameters ---
        self.top_X_features_to_preselect = top_X_features_to_preselect
        self.top_features = top_features
        self.sequence_length = sequence_length

        # --- fitted state (populated by fit / select_features) ---
        self.models = {}
        self.selected_features = None  # names of the finally selected features
        self.feature_selector = None
        self.scaler = RobustScaler()

        # --- on-disk artefacts ---
        # Cache of the frame produced by feature engineering
        self._engineered_data_checkpoint_path = './engineered_train_data_checkpoint.parquet'
        # Scaled, feature-selected splits that the batch generators read from
        self._scaled_val_y_path = './scaled_val_y.parquet'
        self._scaled_val_X_path = './scaled_val_X.parquet'
        self._scaled_y_path = './scaled_train_y.parquet'
        self._scaled_X_path = './scaled_train_X.parquet'


    def optimize_memory(self, df):
        """
        Optimize Pandas DataFrame memory usage and clean data.
        """
        print(f"Memory usage before optimization: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

        # Clean data first
        df = self.clean_data(df)

        # Optimize numeric columns (Pandas directly)
        for col in df.select_dtypes(include=[np.number]).columns:
            if col == 'timestamp' or col == 'ID' or col == 'label':
                continue # Don't downcast timestamp, ID, or label
            df[col] = pd.to_numeric(df[col], downcast='float')

        print(f"Memory usage after optimization: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        return df

    def clean_data(self, df):
        """
        Clean Pandas DataFrame by handling inf, -inf, and extreme values.
        """
        print("Cleaning data...")

        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        numeric_cols = [col for col in numeric_cols if col not in ['timestamp', 'ID', 'label']]

        # Replace inf and -inf with NaN first
        for col in numeric_cols:
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)

        # Fill NaN values with forward fill, then backward fill, then 0
        for col in numeric_cols:
            df[col] = df[col].ffill().bfill().fillna(0)

        # Handle extreme outliers (values beyond 3*IQR)
        for col in df.select_dtypes(include=[np.float32, np.float64]).columns:
            if col in ['timestamp', 'ID', 'label']: continue
            if df[col].nunique() > 1:
                q25 = df[col].quantile(0.25)
                q75 = df[col].quantile(0.75)
                iqr = q75 - q25

                if iqr != 0 and not pd.isna(iqr):
                    lower_bound = q25 - 3 * iqr
                    upper_bound = q75 + 3 * iqr
                    df[col] = df[col].clip(lower_bound, upper_bound)
        print("Data cleaning applied.")
        return df

    def create_time_features(self, df):
        """
        Create time-based features with robust calculations using Pandas.
        Significantly reduced complexity for faster execution.
        """
        print("Creating time-based features...")

        # Basic market features
        df['mid_price'] = (df['bid_qty'] + df['ask_qty']) / 2
        df['spread'] = df['ask_qty'] - df['bid_qty']

        # Safe division for imbalance
        denominator = df['bid_qty'] + df['ask_qty'] + 1e-10
        df['imbalance'] = (df['bid_qty'] - df['ask_qty']) / denominator

        # Safe division for buy/sell ratio
        df['buy_sell_ratio'] = df['buy_qty'] / (df['sell_qty'] + 1e-10)

        # Rolling statistics - significantly reduced windows for speed
        windows = [10, 30]
        base_cols_for_rolling = ['volume', 'mid_price', 'buy_qty', 'sell_qty', 'imbalance']

        for col in base_cols_for_rolling:
            for window in windows:
                df[f'{col}_ma_{window}'] = df[col].rolling(window, min_periods=1).mean()
                df[f'{col}_std_{window}'] = df[col].rolling(window, min_periods=1).std().fillna(0)

        # Lagged features - significantly reduced lags for speed
        lags = [1, 5]
        base_cols_for_lag = ['mid_price', 'imbalance']

        for col in base_cols_for_lag:
            for lag in lags:
                df[f'{col}_lag_{lag}'] = df[col].shift(lag)

        # Technical indicators - reduced
        df['rsi_proxy'] = self.calculate_rsi_proxy(df['mid_price'], window=10)
        df['momentum'] = df['mid_price'] - df['mid_price'].shift(5)

        # Final check for any inf/nan values that might have been introduced
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        numeric_cols = [col for col in numeric_cols if col not in ['timestamp', 'ID', 'label']]

        for col in numeric_cols:
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)
            df[col] = df[col].ffill().bfill().fillna(0)

        print(f"Time-based features created. Current shape: {df.shape[0]} rows, {df.shape[1]} columns")
        return df

    def calculate_rsi_proxy(self, prices_series, window=14):
        """Calculate RSI-like indicator with safe operations for Pandas Series."""
        delta = prices_series.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=window, min_periods=1).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=window, min_periods=1).mean()

        rs = gain / (loss + 1e-10)
        rsi = 100 - (100 / (1 + rs))

        rsi = rsi.replace([np.inf, -np.inf], np.nan).fillna(50)

        return rsi

    def select_features(self, X_df, y_df, method='mutual_info'):
        """
        Keep the ``top_features`` best-scoring columns of ``X_df``.

        Rows containing inf/NaN are dropped from both inputs, columns whose
        standard deviation is below 1e-8 are dropped (unless all are), and a
        ``SelectKBest`` selector is fitted using ``mutual_info_regression``
        (``method='mutual_info'``) or ``f_regression`` (anything else).

        Side effects: sets ``self.feature_selector`` and ``self.selected_features``.

        Returns:
            (float32 ndarray of the selected columns, index of the surviving rows)
        """
        print(f"Selecting top {self.top_features} features from {X_df.shape[1]} features...")

        print("Validating data before final feature selection...")

        # A row is invalid as soon as one of its cells is inf or NaN
        invalid_mask = ~np.isfinite(X_df).all(axis=1)
        if invalid_mask.sum() > 0:
            print(f"Removing {invalid_mask.sum()} rows with invalid values before final selection.")
            keep_rows = ~invalid_mask
            X_df = X_df[keep_rows]
            y_df = y_df[keep_rows]

        # (Near-)constant columns carry no signal and can upset some selectors
        constant_features_mask = X_df.std() < 1e-8
        if constant_features_mask.all():
            print("Warning: All features are constant. Cannot perform feature selection.")
        else:
            if constant_features_mask.sum() > 0:
                print(f"Removing {constant_features_mask.sum()} constant features.")
            X_df = X_df[X_df.columns[~constant_features_mask].tolist()]

        print(f"Final data shape for feature selection: {X_df.shape}")

        score_func = mutual_info_regression if method == 'mutual_info' else f_regression
        selector = SelectKBest(score_func=score_func, k=min(self.top_features, X_df.shape[1]))

        X_selected = selector.fit_transform(X_df, y_df)
        self.feature_selector = selector
        self.selected_features = X_df.columns[selector.get_support()].tolist()

        print(f"\n--- Selected Features ({len(self.selected_features)}) ---")
        for feature in self.selected_features:
            print(f"- {feature}")
        print("---------------------------------------\n")

        # The surviving index lets the caller realign y with the returned array
        return X_selected.astype(np.float32), X_df.index

    # Removed original prepare_sequences, as it's now embedded in TimeSeriesSequence and won't be called directly by fit
    # def prepare_sequences(self, data, target=None):
    #     """Prepare sequences for time series models"""
    #     ...

    # Removed build_convlstm_model
    # def build_convlstm_model(self, input_shape):
    #     """Build ConvLSTM model for spatial-temporal patterns"""
    #     ...

    # Removed build_lstm_model
    # def build_lstm_model(self, input_shape):
    #     """Build standard LSTM model"""
    #     ...

    def build_conv1d_model(self, input_shape):
        """
        Build and compile a small Conv1D regressor.

        Architecture: Conv1D(32, k=3, relu) -> MaxPooling1D(2) -> Flatten ->
        Dense(32, relu) -> Dropout(0.3) -> Dense(1, linear). Compiled with
        Adam (lr=0.001), MAE loss and MAE metric.

        Args:
            input_shape: Shape of one sample, passed to the first layer.
        """
        layer_stack = [
            Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=input_shape),
            MaxPooling1D(pool_size=2),
            Flatten(),
            Dense(32, activation='relu'),
            Dropout(0.3),
            Dense(1, activation='linear'),
        ]
        network = Sequential(layer_stack)

        optimizer = Adam(learning_rate=0.001)
        network.compile(optimizer=optimizer, loss='mae', metrics=['mae'])
        return network

    def build_cnn_lstm_model(self, input_shape):
        print("CNN-LSTM model building skipped for current speed optimization, but can be reinstated later.")
        return None

    def train_lightgbm(self, X_train, y_train, X_val, y_val):
        """
        Train a LightGBM regressor with early stopping on the validation set.

        Any array containing NaN/inf is first patched with ``np.nan_to_num``
        (NaN -> 0, +inf -> 1e6, -inf -> -1e6) and a warning is printed.

        Returns:
            The booster returned by ``lgb.train``.
        """
        print("Training LightGBM model...")

        def _patch_non_finite(values, label):
            # Last-resort cleanup just before training; untouched when already finite
            if np.any(np.isnan(values)) or np.any(np.isinf(values)):
                print(f"WARNING: NaNs/Infs found in {label} for LightGBM. Applying emergency cleanup.")
                return np.nan_to_num(values, nan=0.0, posinf=1e6, neginf=-1e6)
            return values

        X_train = _patch_non_finite(X_train, "X_train")
        X_val = _patch_non_finite(X_val, "X_val")
        y_train = _patch_non_finite(y_train, "y_train")
        y_val = _patch_non_finite(y_val, "y_val")

        train_data = lgb.Dataset(X_train, label=y_train)
        # Sharing the reference keeps the validation binning consistent with training
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

        params = dict(
            objective='regression',
            metric='mae',
            boosting_type='gbdt',
            num_leaves=31,
            learning_rate=0.05,
            feature_fraction=0.9,
            bagging_fraction=0.8,
            bagging_freq=5,
            verbose=-1,
            random_state=42,
            n_estimators=1000,
        )
        stopping_callbacks = [lgb.early_stopping(50), lgb.log_evaluation(0)]

        booster = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
            num_boost_round=params['n_estimators'],
            callbacks=stopping_callbacks,
        )
        return booster

    def evaluate_model(self, y_true, y_pred, model_name):
        """
        Print MAE and Pearson correlation for a set of predictions.

        Returns:
            The Pearson correlation coefficient (the MAE is only printed).
        """
        mae = mean_absolute_error(y_true, y_pred)
        pearson_result = pearsonr(y_true, y_pred)
        correlation = pearson_result[0]  # second element is the p-value, unused here

        summary = f"{model_name} - MAE: {mae:.4f}, Pearson Correlation: {correlation:.4f}"
        print(summary)
        return correlation

    def fit(self, train_data_raw_initial_load):
        """
        Run the full training pipeline and store fitted models in ``self.models``.

        Steps: raw column pre-selection -> cleaning and feature engineering
        (cached in a Parquet checkpoint) -> feature selection -> robust scaling
        -> time-based train/validation split written to temporary Parquet files
        -> LightGBM -> Conv1D trained through ``TimeSeriesSequence`` generators
        -> weighted ensemble evaluation -> temporary file cleanup.

        Failures inside the LightGBM or deep-learning stages are caught and
        printed; the method then continues with whatever succeeded.

        Args:
            train_data_raw_initial_load: DataFrame with 'timestamp' (as a column,
                or as the index name), 'bid_qty', 'ask_qty', 'buy_qty',
                'sell_qty', 'volume', 'label' and the 'X*' feature columns.

        Returns:
            self
        """
        print("Starting training pipeline...")

        # Determine raw X_n columns from the initial load structure
        X_n_cols_raw = [col for col in train_data_raw_initial_load.columns if col.startswith('X') and col != 'label']
        basic_features_cols = ['timestamp', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume']

        preselected_X_n_features = X_n_cols_raw[:min(self.top_X_features_to_preselect, len(X_n_cols_raw))]
        print(f"Initially selected {len(preselected_X_n_features)} X_n features for direct Pandas load.")

        columns_to_process_raw_for_fit = basic_features_cols + preselected_X_n_features + ['label']

        train_df = None
        if os.path.exists(self._engineered_data_checkpoint_path):
            print(f"Checkpoint found. Loading processed data from {self._engineered_data_checkpoint_path}...")
            train_df = pd.read_parquet(self._engineered_data_checkpoint_path)
            if 'timestamp' in train_df.columns:
                train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
        else:
            print(f"Selecting columns from initial raw training data...")
            raw = train_data_raw_initial_load
            if 'timestamp' not in raw.columns and raw.index.name == 'timestamp':
                # Timestamp lives in the index: select the other columns first,
                # then materialise the index as a regular 'timestamp' column.
                non_timestamp_cols = [c for c in columns_to_process_raw_for_fit if c != 'timestamp']
                train_df = raw[non_timestamp_cols].reset_index()
            else:
                train_df = raw[columns_to_process_raw_for_fit].copy()
            train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
            gc.collect()
            train_df = self.optimize_memory(train_df)
            train_df = self.create_time_features(train_df)
            print(f"Saving engineered data to checkpoint: {self._engineered_data_checkpoint_path}...")
            train_df.to_parquet(self._engineered_data_checkpoint_path, index=False)
            print("Engineered data checkpoint saved.")

        print(f"Data shape after feature engineering: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
        feature_cols_final = [col for col in train_df.columns
                              if col not in ['timestamp', 'label']]
        X_df = train_df[feature_cols_final]
        y_df = train_df['label']
        print(f"Features shape before final selection: {X_df.shape[0]} rows, {X_df.shape[1]} columns")
        print(f"Target shape: {y_df.shape[0]} rows")

        # NOTE(review): the selector and the scaler below are fitted on ALL rows,
        # i.e. before the train/validation split, so validation scores are
        # computed on data the preprocessing has already seen.
        X_selected, valid_idx = self.select_features(X_df, y_df)

        del X_df, y_df
        gc.collect()

        X_scaled = self.scaler.fit_transform(X_selected)
        print("Final data validation (after scaling)...")
        if np.any(np.isnan(X_scaled)) or np.any(np.isinf(X_scaled)):
            print("ERROR: Still have invalid values after preprocessing! Applying emergency cleanup.")
            X_scaled = np.nan_to_num(X_scaled, nan=0.0, posinf=1e6, neginf=-1e6)
        print(f"Final training data shape: {X_scaled.shape}")

        del X_selected
        gc.collect()

        # Split into training and validation sets by timestamp. The split is
        # derived from train_df itself so the resulting row labels always match
        # the frame being sliced (also when it was loaded from the checkpoint).
        VALIDATION_SPLIT_DATE = '2024-01-01'
        timestamps = train_df['timestamp']
        original_train_indices = train_df.index[(timestamps < VALIDATION_SPLIT_DATE).values]
        original_val_indices = train_df.index[(timestamps >= VALIDATION_SPLIT_DATE).values]

        X_scaled_temp_df = pd.DataFrame(X_scaled, index=valid_idx, columns=self.selected_features)

        del X_scaled, timestamps
        gc.collect()

        # Only rows that survived feature selection's validity filter are usable
        actual_train_indices = original_train_indices.intersection(X_scaled_temp_df.index)
        actual_val_indices = original_val_indices.intersection(X_scaled_temp_df.index)

        # Store the row label as an explicit 'original_row_index' column so the
        # generators can filter on it with pyarrow. rename_axis works whether or
        # not the index already carries a name.
        X_train_df_to_save = X_scaled_temp_df.loc[actual_train_indices].rename_axis('original_row_index').reset_index()
        X_val_df_to_save = X_scaled_temp_df.loc[actual_val_indices].rename_axis('original_row_index').reset_index()
        y_train_df_to_save = pd.DataFrame({
            'original_row_index': np.asarray(actual_train_indices),
            'label': train_df['label'].loc[actual_train_indices].values,
        })
        y_val_df_to_save = pd.DataFrame({
            'original_row_index': np.asarray(actual_val_indices),
            'label': train_df['label'].loc[actual_val_indices].values,
        })

        # Save these DataFrames to Parquet files without the Pandas DataFrame index (it's now a column)
        X_train_df_to_save.to_parquet(self._scaled_X_path, index=False)
        y_train_df_to_save.to_parquet(self._scaled_y_path, index=False)
        X_val_df_to_save.to_parquet(self._scaled_val_X_path, index=False)
        y_val_df_to_save.to_parquet(self._scaled_val_y_path, index=False)

        print("Scaled training and validation data saved to separate Parquet files with 'original_row_index'.")

        # Delete the large DataFrames from memory right after saving
        del X_train_df_to_save, y_train_df_to_save, X_val_df_to_save, y_val_df_to_save
        del train_df, X_scaled_temp_df, valid_idx
        gc.collect()

        # Create Data Generators - pass min/max of original indices for filtering
        train_generator = TimeSeriesSequence(
            X_filepath=self._scaled_X_path,
            y_filepath=self._scaled_y_path,
            sequence_length=self.sequence_length,
            batch_size=16,
            feature_cols=self.selected_features,
            total_samples=len(actual_train_indices),
            original_indices_min=actual_train_indices.min(),
            original_indices_max=actual_train_indices.max()
        )
        val_generator = TimeSeriesSequence(
            X_filepath=self._scaled_val_X_path,
            y_filepath=self._scaled_val_y_path,  # validation labels, not the training file
            sequence_length=self.sequence_length,
            batch_size=16,
            feature_cols=self.selected_features,
            total_samples=len(actual_val_indices),
            original_indices_min=actual_val_indices.min(),
            original_indices_max=actual_val_indices.max()
        )
        print("Keras Data Generators created.")

        # Kept alive until the ensemble step; stays None if LightGBM fails
        lgb_pred = None
        lgb_score = 0
        try:
            # LightGBM still needs in-memory data, so load from the new Parquet files
            lgb_X_train = pd.read_parquet(self._scaled_X_path)
            lgb_y_train = pd.read_parquet(self._scaled_y_path)['label'].values
            lgb_X_val = pd.read_parquet(self._scaled_val_X_path)
            lgb_y_val = pd.read_parquet(self._scaled_val_y_path)['label'].values

            # Drop the 'original_row_index' column before passing to LightGBM
            if 'original_row_index' in lgb_X_train.columns:
                lgb_X_train = lgb_X_train.drop(columns=['original_row_index'])
            if 'original_row_index' in lgb_X_val.columns:
                lgb_X_val = lgb_X_val.drop(columns=['original_row_index'])

            lgb_model = self.train_lightgbm(
                lgb_X_train.values,
                lgb_y_train,
                lgb_X_val.values,
                lgb_y_val
            )

            # Predict on the same validation rows used for early stopping
            lgb_pred = lgb_model.predict(lgb_X_val.values)
            lgb_score = self.evaluate_model(lgb_y_val, lgb_pred, "LightGBM")
            self.models['lightgbm'] = lgb_model

            # lgb_pred is deliberately NOT deleted: the ensemble below needs it
            del lgb_X_train, lgb_y_train, lgb_X_val, lgb_y_val
            gc.collect()

        except Exception as e:
            print(f"LightGBM training or prediction failed: {e}")
            lgb_pred = None
            lgb_score = 0

        dl_predictions_list = []
        dl_weights = []

        # The first sequence_length - 1 validation rows have no full window,
        # so sequence models produce no prediction for them.
        full_y_val_for_dl_eval = pd.read_parquet(self._scaled_val_y_path)['label'].values
        y_val_dl_eval_aligned = full_y_val_for_dl_eval[self.sequence_length - 1:]

        print(f"Full Y_val for DL evaluation shape: {y_val_dl_eval_aligned.shape}")

        try:
            if len(train_generator) > 0:
                print(f"Proceeding with Deep Learning models. Training generator has {len(train_generator)} batches.")
                callbacks = [
                    EarlyStopping(patience=10, restore_best_weights=True, monitor='val_mae'),
                    ReduceLROnPlateau(patience=5, factor=0.5, min_lr=1e-6, monitor='val_mae')
                ]
                conv1d_score = 0
                try:
                    print("Training Conv1D model...")
                    # Peek at one batch only to learn the per-sample input shape
                    dummy_X, _ = train_generator.__getitem__(0)
                    input_shape_conv1d = dummy_X.shape[1:]
                    del dummy_X
                    gc.collect()

                    conv1d_model = self.build_conv1d_model(input_shape_conv1d)
                    if conv1d_model:
                        with tf.device('/GPU:0'): # Explicitly place on GPU
                            conv1d_model.fit(
                                train_generator,
                                validation_data=val_generator,
                                epochs=5, # Reduced epochs for faster testing
                                callbacks=callbacks,
                                verbose=2, # One line per epoch instead of per-batch logging
                            )
                        with tf.device('/GPU:0'): # Explicitly place on GPU for prediction
                            conv1d_pred = conv1d_model.predict(val_generator).flatten()
                        conv1d_score = self.evaluate_model(y_val_dl_eval_aligned, conv1d_pred, "Conv1D")
                        self.models['conv1d'] = conv1d_model
                except Exception as e:
                    print(f"Conv1D training or prediction failed: {e}")
                    conv1d_score = 0

                # Ensemble logic with LightGBM and Conv1D
                if 'conv1d' in self.models and conv1d_score > 0:
                    dl_predictions_list.append(conv1d_pred)
                    dl_weights.append(0.6)

                    dl_ensemble_weighted = np.average(dl_predictions_list, axis=0, weights=dl_weights)
                    if lgb_pred is not None:
                        # Drop the rows the sequence model cannot predict so both line up
                        lgb_pred_aligned = lgb_pred[self.sequence_length - 1:]
                        ensemble_pred = (lgb_pred_aligned * 0.4) + (dl_ensemble_weighted * 0.6)
                    else:
                        # LightGBM failed earlier: fall back to the deep-learning output alone
                        ensemble_pred = dl_ensemble_weighted
                    ensemble_score = self.evaluate_model(y_val_dl_eval_aligned, ensemble_pred, "Ensemble")
                    print(f"\nEnsemble score: {ensemble_score:.4f}")
                else:
                    print("No successful deep learning models to include in ensemble (only Conv1D was attempted).")
                    ensemble_score = lgb_score # Fallback to LGBM score if Conv1D failed

                print(f"\nBest individual model score: {max(lgb_score, conv1d_score):.4f}")
                print(f"Final overall ensemble score: {ensemble_score:.4f}")
            else:
                print("Skipping deep learning models as training generator is empty (e.g., due to insufficient data for sequence_length).")
                ensemble_score = lgb_score
        except Exception as e:
            print(f"Deep learning training failed during overall process (likely model build or generator issue): {e}")
            ensemble_score = lgb_score

        # Remove the temporary split files written above
        for path in [self._scaled_X_path, self._scaled_y_path, self._scaled_val_X_path, self._scaled_val_y_path]:
            if os.path.exists(path):
                os.remove(path)
                print(f"Cleaned up temporary file: {path}")
        gc.collect()
        return self

    def predict(self, test_data_raw_initial_load):
        """Generate one prediction per row of the raw test frame.

        LightGBM (if fitted) scores every row.  When a Conv1D model is
        available, its output is blended in (0.4 LightGBM / 0.6 Conv1D) for
        the rows from position ``sequence_length - 1`` onward, i.e. the rows
        for which a full look-back window exists; earlier rows keep the
        LightGBM value.  If LightGBM produced nothing, the Conv1D output is
        used unscaled for the rows it covers.  Rows for which no model
        produced a value are predicted as 0.0.

        Args:
            test_data_raw_initial_load: Raw test DataFrame.  Must contain
                'timestamp', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty',
                'volume' and 'ID' as columns.  The first
                ``top_X_features_to_preselect`` columns whose name starts
                with 'X' are used as well.

        Returns:
            A float32 NumPy array holding one prediction per input row, in
            input row order.

        Raises:
            KeyError: If a required column is missing from the test frame.
            ValueError: If the model has not been fitted
                (``selected_features`` is None).
        """
        print("Generating predictions...")

        basic_features_cols_test = ['timestamp', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume', 'ID']
        raw_columns = test_data_raw_initial_load.columns
        X_n_cols_raw_test = [col for col in raw_columns if col.startswith('X')]
        preselected_X_n_features_test = X_n_cols_raw_test[:self.top_X_features_to_preselect]
        columns_to_process_raw_for_predict = basic_features_cols_test + preselected_X_n_features_test

        missing_columns = [col for col in columns_to_process_raw_for_predict if col not in raw_columns]
        if missing_columns:
            raise KeyError(f"The following required columns are missing from the test data: {missing_columns}. Please ensure your test.parquet file contains these columns.")

        # Fail fast: no point running feature engineering for an unfitted model.
        if self.selected_features is None:
            raise ValueError("Model must be fitted before making predictions (selected_features is None)")

        # Only the row labels of the caller's frame are needed to put the
        # predictions back into input order; keeping the Index (not a copy of
        # the whole frame) avoids duplicating the raw test data in memory.
        raw_row_index = test_data_raw_initial_load.index

        print(f"Selecting columns from initial raw test data (only {len(columns_to_process_raw_for_predict)} columns)...")
        test_df = test_data_raw_initial_load[columns_to_process_raw_for_predict].copy()

        del test_data_raw_initial_load
        gc.collect()

        test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])

        # NOTE(review): the alignment below assumes optimize_memory and
        # create_time_features keep the frame's row index - confirm.
        test_df = self.optimize_memory(test_df)
        test_df = self.create_time_features(test_df)

        print(f"Test data shape after feature engineering: {test_df.shape[0]} rows, {test_df.shape[1]} columns")

        X_test_df_final = test_df[self.selected_features]
        X_test_df_final = X_test_df_final.replace([np.inf, -np.inf], np.nan)
        X_test_df_final = X_test_df_final.ffill().bfill().fillna(0)

        X_test_scaled = self.scaler.transform(X_test_df_final)

        if not np.all(np.isfinite(X_test_scaled)):
            print("WARNING: Invalid values in test data after scaling, applying emergency cleanup.")
            X_test_scaled = np.nan_to_num(X_test_scaled, nan=0.0, posinf=1e6, neginf=-1e6)

        X_test_scaled_df = pd.DataFrame(X_test_scaled, index=X_test_df_final.index, columns=self.selected_features)

        del X_test_df_final, test_df, X_test_scaled
        gc.collect()

        lgb_weight, dl_weight = 0.4, 0.6
        n_feature_rows = len(X_test_scaled_df)
        feature_index = X_test_scaled_df.index

        # Positional buffer aligned with X_test_scaled_df; 0.0 means "no model
        # produced a value for this row".
        blended = np.zeros(n_feature_rows, dtype=np.float64)
        has_lgb_predictions = False

        if 'lightgbm' in self.models:
            try:
                # The frame was already made finite above, so it is passed
                # directly instead of being copied and re-checked.
                lgb_pred_full = np.asarray(
                    self.models['lightgbm'].predict(X_test_scaled_df), dtype=np.float64
                ).reshape(-1)
                if len(lgb_pred_full) != n_feature_rows:
                    raise ValueError(f"expected {n_feature_rows} predictions, got {len(lgb_pred_full)}")
                blended[:] = lgb_pred_full
                has_lgb_predictions = True
                print("LightGBM predictions generated.")
                del lgb_pred_full
                gc.collect()
            except Exception as e:
                print(f"LightGBM prediction failed: {e}")

        conv1d_model = self.models.get('conv1d')
        if conv1d_model is not None:
            # Bound before the try so the finally clause can always refer to it.
            temp_test_X_path = './scaled_test_X.parquet'
            try:
                # The generator streams from disk to avoid OOM on large test
                # sets.  rename_axis guarantees the index column is called
                # 'original_row_index' even when the index already has a name.
                X_test_df_to_save = X_test_scaled_df.rename_axis('original_row_index').reset_index()
                X_test_df_to_save.to_parquet(temp_test_X_path, index=False)
                del X_test_df_to_save
                gc.collect()

                test_generator = TimeSeriesSequence(
                    X_filepath=temp_test_X_path,
                    y_filepath=temp_test_X_path,  # dummy for prediction, as in the original call
                    sequence_length=self.sequence_length,
                    batch_size=16,
                    feature_cols=self.selected_features,
                    total_samples=n_feature_rows,
                    original_indices_min=feature_index.min(),
                    original_indices_max=feature_index.max()
                )

                if len(test_generator) > 0:
                    conv1d_pred = None
                    try:
                        with tf.device('/GPU:0'):  # Explicitly place on GPU for prediction
                            conv1d_pred = conv1d_model.predict(test_generator).flatten()
                    except Exception as e:
                        print(f"Conv1D prediction during test failed: {e}")

                    if conv1d_pred is not None:
                        # Sequence k ends at row (sequence_length - 1 + k), so the
                        # DL output lines up with the tail of the feature frame.
                        first_covered = self.sequence_length - 1
                        n_expected = max(n_feature_rows - first_covered, 0)
                        n_aligned = min(len(conv1d_pred), n_expected)
                        if len(conv1d_pred) != n_expected:
                            print(f"WARNING: Conv1D returned {len(conv1d_pred)} predictions for {n_expected} sequence rows; blending the first {n_aligned}.")

                        covered = slice(first_covered, first_covered + n_aligned)
                        dl_part = conv1d_pred[:n_aligned].astype(np.float64)
                        if has_lgb_predictions:
                            blended[covered] = (blended[covered] * lgb_weight) + (dl_part * dl_weight)
                        else:
                            # Without a LightGBM value the weights collapse to the
                            # DL model alone instead of shrinking it to 60%.
                            blended[covered] = dl_part

                        print("Deep learning models included in predictions.")
                    else:
                        print("No successful deep learning models to include in test predictions.")
                else:
                    print("Test data too short for deep learning sequences. Using LightGBM only.")
            except Exception as e:
                print(f"Deep learning prediction setup failed: {e}")
                print("Falling back to LightGBM predictions only for test set.")
            finally:
                # Clean up temporary test data file
                if os.path.exists(temp_test_X_path):
                    os.remove(temp_test_X_path)
                    print(f"Cleaned up temporary test file: {temp_test_X_path}")

        del X_test_scaled_df
        gc.collect()

        # Map the per-feature-row values back onto the caller's row order by
        # index label; labels with no prediction get 0.0.  On duplicated
        # feature labels the last value wins, so the reindex source is unique.
        predictions_by_label = pd.Series(blended, index=feature_index)
        if not feature_index.is_unique:
            predictions_by_label = predictions_by_label[~predictions_by_label.index.duplicated(keep='last')]
        predictions = predictions_by_label.reindex(raw_row_index, fill_value=0.0).to_numpy(dtype=np.float32)

        if not np.all(np.isfinite(predictions)):
            print("WARNING: Invalid predictions detected (NaN/Inf), cleaning...")
            predictions = np.nan_to_num(predictions, nan=0.0, posinf=1.0, neginf=-1.0)

        print(f"Generated {len(predictions)} predictions")
        if predictions.size:
            print(f"Prediction range: [{predictions.min():.4f}, {predictions.max():.4f}]")

        return predictions
    
    # Removed _prepare_sequences_for_inference as it's replaced by TimeSeriesSequence for prediction
    # def _prepare_sequences_for_inference(self, data):
    #     """Helper to prepare sequences for inference, similar to original prepare_sequences."""
    #     ...


# Main execution function
def _promote_index_to_column(frame, column, frame_name):
    """Return *frame* with *column* exposed as a regular column where possible.

    If *column* is already a column the frame is returned untouched.  If the
    index is named *column* it is moved into a column.  Otherwise the index is
    reset and, when that yields a column called 'index', it is renamed to
    *column*.  *frame_name* is only used in the debug messages.
    """
    if column in frame.columns:
        return frame

    if frame.index.name == column:
        frame = frame.reset_index(names=[column])
        print(f"DEBUG: Resetting index '{column}' for {frame_name}. New columns: {frame.columns.tolist()}")
        return frame

    frame = frame.reset_index()
    if 'index' in frame.columns and column not in frame.columns:
        # In-place on the fresh reset_index result, so no extra copy is made.
        frame.rename(columns={'index': column}, inplace=True)
        print(f"DEBUG: Renamed 'index' to '{column}' for {frame_name}. New columns: {frame.columns.tolist()}")
    return frame


def _convert_timestamp_column(frame, frame_name):
    """Convert frame['timestamp'] to datetime in place, or warn if it is absent."""
    if 'timestamp' in frame.columns:
        frame['timestamp'] = pd.to_datetime(frame['timestamp'])
        print(f"DEBUG: 'timestamp' column found and converted to datetime in {frame_name}.")
    else:
        print(f"CRITICAL WARNING: 'timestamp' column still not found in {frame_name} after all attempts. This will likely cause issues.")
        print(f"DEBUG: {frame_name} columns are: {frame.columns.tolist()}")


def run_competition_pipeline(
    train_path='/kaggle/input/drw-crypto-market-prediction/train.parquet',
    test_path='/kaggle/input/drw-crypto-market-prediction/test.parquet',
    submission_path='/kaggle/working/submission.csv',
):
    """Run the complete competition pipeline and write the submission file.

    Loads the train and test parquet files, makes sure 'timestamp' (and, for
    the test set, 'ID') are regular columns, fits a CryptoMarketPredictor on
    the training data, predicts the test data and saves the result as CSV.

    Args:
        train_path: Parquet file with the training data.
        test_path: Parquet file with the test data.
        submission_path: Destination of the submission CSV.

    Returns:
        The submission DataFrame with columns 'ID' and 'Prediction'.
    """
    print("Loading data...")
    train_full_raw = pd.read_parquet(train_path)
    train_full_raw = _promote_index_to_column(train_full_raw, 'timestamp', 'train_full_raw')
    _convert_timestamp_column(train_full_raw, 'train_full_raw')

    test_full_raw = pd.read_parquet(test_path)
    # 'ID' first: once it is a column, a remaining unnamed index can be
    # turned into 'timestamp' by the second call.
    test_full_raw = _promote_index_to_column(test_full_raw, 'ID', 'test_full_raw')
    test_full_raw = _promote_index_to_column(test_full_raw, 'timestamp', 'test_full_raw')
    _convert_timestamp_column(test_full_raw, 'test_full_raw')

    print(f"\nTrain shape: {train_full_raw.shape}")
    print(f"Test shape: {test_full_raw.shape}")

    # Initialize and train model
    predictor = CryptoMarketPredictor(
        sequence_length=30,
        top_features=100,
        top_X_features_to_preselect=30
    )
    predictor.fit(train_full_raw)

    # The training frame is not used again; drop this reference before
    # prediction to lower peak memory.
    del train_full_raw
    gc.collect()

    predictions = predictor.predict(test_full_raw)

    # Create submission
    submission = pd.DataFrame({
        'ID': test_full_raw['ID'],
        'Prediction': predictions
    })

    # Save submission
    submission.to_csv(submission_path, index=False)
    print(f"Submission saved with {len(submission)} predictions")
    print(f"Prediction statistics - Mean: {predictions.mean():.4f}, Std: {predictions.std():.4f}")

    return submission

# Run the pipeline only when this file is executed as the main module, not
# when it is imported. The returned submission DataFrame is bound to the
# module-level name `submission`.
if __name__ == "__main__":
    submission = run_competition_pipeline()
