<a href="https://www.kaggle.com/code/nicholas33/drw-crypto-market-prediction-cb-lgbm-xgb-nb153?scriptVersionId=251045027" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# Step 1: Installations
!pip install catboost lightgbm xgboost prophet -q # Added prophet

# Step 2: Imports
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os
import gc
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import time
from prophet import Prophet # Added Prophet import

In [2]:
# Step 3: Class Definition (No changes needed here. The problem is in data loading/ID handling.)
class CryptoMarketPredictor:
    """
    An optimized pipeline using an ensemble of tree-based models (LGBM, XGBoost, CatBoost)
    with rich feature engineering to predict crypto market movements.
    """
    def __init__(self, top_features=100, top_X_features_to_preselect=30, use_future_lags=False):
        self.top_features = top_features
        self.top_X_features_to_preselect = top_X_features_to_preselect
        self.scaler = RobustScaler()
        self.feature_selector = None
        self.selected_features = None
        self.models = {}
        self.preselected_X_n_names = None
        self.use_future_lags = use_future_lags # New parameter

    def optimize_memory(self, df):
        """Optimize DataFrame memory usage."""
        print(f"Memory usage before optimization: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        for col in df.columns:
            col_type = df[col].dtype
            if col_type != 'object' and col not in ['timestamp', 'ID', 'label']: # Exclude timestamp, ID, label from downcasting
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else: # float
                    if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
        print(f"Memory usage after optimization: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        return df

    def clean_data(self, df):
        """Clean DataFrame by handling inf, NaN values."""
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        # Use forward-fill then backward-fill to propagate last valid observation
        for col in df.columns:
            if df[col].dtype == 'float32' or df[col].dtype == 'float64':
                df[col] = df[col].ffill().bfill()
        df.fillna(0, inplace=True) # Fill any remaining NaNs with 0
        return df

    def calculate_rsi_proxy(self, prices_series, window=14):
        """Calculate a proxy for the Relative Strength Index (RSI)."""
        delta = prices_series.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=window, min_periods=1).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=window, min_periods=1).mean()
        rs = gain / (loss + 1e-10) # Add epsilon to avoid division by zero
        rsi = 100 - (100 / (1 + rs))
        return rsi.fillna(50) # Fill initial NaNs with 50 (neutral RSI)

    def create_time_features(self, df):
        """Engineer a rich set of time-based features to capture market memory."""
        print("🛠️ Engineering rich time-based features...")

        # Basic price and imbalance features
        df['mid_price'] = (df['ask_qty'] + df['bid_qty']) / 2 if 'ask_qty' in df.columns and 'bid_qty' in df.columns else 0.0
        df['spread'] = df['ask_qty'] - df['bid_qty'] if 'ask_qty' in df.columns and 'bid_qty' in df.columns else 0.0
        df['imbalance'] = (df['bid_qty'] - df['ask_qty']) / (df['bid_qty'] + df['ask_qty'] + 1e-10) \
                          if 'bid_qty' in df.columns and 'ask_qty' in df.columns else 0.0

        # --- NEW CUSTOM FEATURES from the 0.9 notebook ---
        if 'bid_qty' in df.columns and 'ask_qty' in df.columns:
            df['bid_ask_ratio'] = df['bid_qty'] / (df['ask_qty'] + 1e-10)
        else:
            df['bid_ask_ratio'] = 0.0

        if 'buy_qty' in df.columns and 'sell_qty' in df.columns:
            df['buy_sell_ratio'] = df['buy_qty'] / (df['sell_qty'] + 1e-10)
        else:
            df['buy_sell_ratio'] = 0.0
        # --- END NEW CUSTOM FEATURES ---
        
        # Ensure 'volume' column exists before creating rolling features
        cols_for_rolling = ['mid_price', 'volume', 'imbalance']
        cols_for_rolling = [col for col in cols_for_rolling if col in df.columns]

        # Rolling window features (MA and STD)
        rolling_windows = [5, 10, 30, 60]
        for col in cols_for_rolling:
            for window in rolling_windows:
                df[f'{col}_ma_{window}'] = df[col].rolling(window, min_periods=1).mean()
                df[f'{col}_std_{window}'] = df[col].rolling(window, min_periods=1).std()

        # Lag features - adjusted based on use_future_lags
        lag_periods = [1, 2, 5, 10]
        cols_for_lags = ['mid_price', 'imbalance']
        cols_for_lags = [col for col in cols_for_lags if col in df.columns]
        for col in cols_for_lags:
            for lag in lag_periods:
                if self.use_future_lags:
                    df[f'{col}_lag_{lag}'] = df[col].shift(-lag) # Future lags (lead values)
                else:
                    df[f'{col}_lag_{lag}'] = df[col].shift(lag) # Past lags

        # Momentum feature - ensure 'mid_price' exists
        if 'mid_price' in df.columns:
            df['rsi_proxy_14'] = self.calculate_rsi_proxy(df['mid_price'], window=14)
        else:
            df['rsi_proxy_14'] = 50.0 # Default if mid_price is not available

        # Final cleanup to handle NaNs introduced by rolling/shifting
        df = self.clean_data(df)
        print(f"Feature engineering complete. Shape: {df.shape}")
        return df


    def select_features(self, X_df, y_df):
        """Select top k features using f_regression to reduce dimensionality."""
        print(f"Selecting top {self.top_features} features from {X_df.shape[1]} features...")
        n_features_to_select = min(self.top_features, X_df.shape[1])
        selector = SelectKBest(score_func=f_regression, k=n_features_to_select)
        
        X_df_clean = X_df.replace([np.inf, -np.inf], np.nan).fillna(0)
        y_df_clean = y_df.replace([np.inf, -np.inf], np.nan).fillna(0)

        if y_df_clean.nunique() == 1:
            print("Warning: Target variable is constant. Skipping feature selection.")
            self.selected_features = X_df_clean.columns.tolist()
            self.feature_selector = None
            return X_df_clean.values
            
        selector.fit(X_df_clean, y_df_clean)
        self.feature_selector = selector
        self.selected_features = X_df_clean.columns[selector.get_support()].tolist()
        print(f"Selected {len(self.selected_features)} features.")
        return X_df_clean[self.selected_features].values

    def evaluate_model(self, y_true, y_pred, model_name):
        """Evaluate model performance and print results."""
        mae = mean_absolute_error(y_true, y_pred)
        
        if np.std(y_true) == 0 or np.std(y_pred) == 0:
            correlation = np.nan
        else:
            correlation, _ = pearsonr(y_true, y_pred)
        
        print(f"📊 {model_name} - MAE: {mae:.4f}, Pearson Correlation: {correlation:.4f}")
        return correlation

    def train_lightgbm(self, X_train, y_train, X_val, y_val):
        """Train a LightGBM model."""
        print("\nTraining LightGBM model...")
        params = {
            'objective': 'regression_l1', 'metric': 'mae', 'n_estimators': 1500,
            'learning_rate': 0.03, 'feature_fraction': 0.8, 'bagging_fraction': 0.8,
            'bagging_freq': 1, 'lambda_l1': 0.1, 'lambda_l2': 0.1,
            'num_leaves': 31, 'verbose': -1, 'n_jobs': -1, 'seed': 42
        }
        model = lgb.LGBMRegressor(**params)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                  callbacks=[lgb.early_stopping(100, verbose=False)])
        return model

    def train_xgboost(self, X_train, y_train, X_val, y_val):
        """Train an XGBoost model."""
        print("Training XGBoost model...")
        params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'mae',
            'n_estimators': 1500,
            'learning_rate': 0.03,
            'tree_method': 'hist',
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'seed': 42,
            'n_jobs': -1,
        }
        model = xgb.XGBRegressor(**params)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], 
                  early_stopping_rounds=100, verbose=False) 
        return model

    def train_catboost(self, X_train, y_train, X_val, y_val):
        """Train a CatBoost model."""
        print("Training CatBoost model...")
        params = {
            'objective': 'MAE', 'eval_metric': 'MAE', 'iterations': 1500,
            'learning_rate': 0.03, 'random_seed': 42, 'logging_level': 'Silent',
            'l2_leaf_reg': 3, 'bagging_temperature': 1,
        }
        model = cb.CatBoostRegressor(**params)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
        return model

    # ---------------------------------FIT METHOD----------------------------------------------------
    def fit(self, train_data_raw):
        """Main training pipeline."""
        fit_start_time = time.time()
        print("🚀 Starting training pipeline...")

        # Ensure 'timestamp' exists and is datetime
        if 'timestamp' not in train_data_raw.columns:
            if train_data_raw.index.name == 'timestamp':
                train_data_raw.reset_index(inplace=True)
            elif 'index' in train_data_raw.columns:
                 train_data_raw.rename(columns={'index': 'timestamp'}, inplace=True)
            else:
                raise KeyError("The 'timestamp' column is missing from the DataFrame and cannot be inferred.")
        
        train_data_raw['timestamp'] = pd.to_datetime(train_data_raw['timestamp'])

        # --- 1. Filter data to the most recent 3 months ---
        print("Filtering data to the last 3 months for relevance and speed...")
        max_date_full = train_data_raw['timestamp'].max()
        three_months_prior = max_date_full - pd.DateOffset(months=3)
        train_df_filtered = train_data_raw[train_data_raw['timestamp'] >= three_months_prior].copy()
        
        if train_df_filtered.empty:
            raise ValueError("Training DataFrame is empty after filtering to the last 3 months. Adjust filter or provide more data.")

        print(f"Training on data from {train_df_filtered['timestamp'].min().date()} to {train_df_filtered['timestamp'].max().date()}. Shape: {train_df_filtered.shape}")
        
        # Ensure train_df is sorted by timestamp after filtering
        print("Sorting train_df by timestamp for chronological splitting...")
        train_df_filtered.sort_values(by='timestamp', inplace=True)
        train_df_filtered.reset_index(drop=True, inplace=True) # Reset index after sort for clean slicing

        # Now, use this filtered and sorted DataFrame for all subsequent steps
        train_df = train_df_filtered 
        del train_data_raw, train_df_filtered; gc.collect() # Free up memory from original and temp filtered df

        # --- 1a. More intelligent preselection of the anonymous features ---
        sample_df = train_df.sample(n=min(50000, len(train_df)), random_state=42)
        X_n_cols_raw = [c for c in sample_df.columns if c.startswith('X')]
        X_raw_sample = sample_df[X_n_cols_raw].copy()
        X_raw_sample.replace([np.inf, -np.inf], np.nan, inplace=True)
        X_raw_sample.fillna(0, inplace=True)
        y_raw_sample = sample_df['label'].fillna(0)

        pre_selector = SelectKBest(score_func=f_regression, k=self.top_X_features_to_preselect)
        if y_raw_sample.nunique() == 1:
            print("Warning: Sample target variable is constant. Preselecting all 'X' features.")
            self.preselected_X_n_names = X_raw_sample.columns.tolist()
        else:
            pre_selector.fit(X_raw_sample, y_raw_sample)
            self.preselected_X_n_names = X_raw_sample.columns[pre_selector.get_support()].tolist()

        print(f"Found the top {len(self.preselected_X_n_names)} 'X' features: {self.preselected_X_n_names}")
        del sample_df, X_raw_sample, y_raw_sample, pre_selector; gc.collect()

        # --- 2. Preprocessing and Feature Engineering ---
        # Add new custom features to the list of columns to use
        base_cols_for_fe = ['timestamp', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume', 'label']
        cols_to_use = base_cols_for_fe + self.preselected_X_n_names
        cols_to_use = [col for col in cols_to_use if col in train_df.columns]

        print("\n--- 1. Use the intelligent Pre-selected Subset for Engineering ---")
        print(f"Using {len(cols_to_use)} columns to generate features")
        print("-----------------------------------------------------\n")

        train_df = train_df[cols_to_use]
        train_df = self.optimize_memory(train_df)
        train_df = self.create_time_features(train_df) # This will now create new features like bid_ask_ratio

        print("\n--- 2. All Features After Engineering Step ---")
        engineered_features_list = train_df.columns.tolist()
        print(f"Created {len(engineered_features_list)} total features: {engineered_features_list}")
        print("------------------------------------------\n")

        # --- 3. Feature Selection ---
        feature_cols = [c for c in train_df.columns if c not in ['timestamp', 'ID', 'label']] # Exclude ID here too
        X_df = train_df[feature_cols]
        y_df_aligned = train_df['label'] # Align y_df with the current train_df after all processing

        X_selected_array = self.select_features(X_df, y_df_aligned)
        
        # --- 4. Scaling and Time-Based Validation Split (using last N rows) ---
        print("Splitting data into training and validation sets using last N rows for validation...")
        X_scaled = self.scaler.fit_transform(X_selected_array)
        
        total_samples = len(X_scaled)
        min_samples_required = 2 
        val_percentage = 0.2 # Use 20% of data for validation

        val_size = max(min_samples_required, int(total_samples * val_percentage))
        train_size = total_samples - val_size

        if train_size < min_samples_required:
            raise ValueError(f"Not enough data to create valid training set ({train_size} samples) after reserving {val_size} for validation. Total samples: {total_samples}")
        if val_size < min_samples_required:
            raise ValueError(f"Not enough data to create valid validation set ({val_size} samples). Total samples: {total_samples}")

        X_train, X_val = X_scaled[:train_size], X_scaled[train_size:]
        y_train, y_val = y_df_aligned.iloc[:train_size], y_df_aligned.iloc[train_size:]

        print(f"Train set size: {X_train.shape[0]}, Validation set size: {X_val.shape[0]}")
        del X_df, y_df_aligned, X_selected_array, X_scaled, train_df; gc.collect()

        # --- 5. Train Models ---
        if X_train.shape[0] < min_samples_required or X_val.shape[0] < min_samples_required:
            raise ValueError(f"Train or Validation set has fewer than {min_samples_required} samples after splitting. Cannot train models.")

        self.models['lgb'] = self.train_lightgbm(X_train, y_train, X_val, y_val)
        self.models['xgb'] = self.train_xgboost(X_train, y_train, X_val, y_val)
        self.models['cat'] = self.train_catboost(X_train, y_train, X_val, y_val)

        # --- 6. Evaluate Ensemble on Validation Set ---
        print("\n--- Validation Set Evaluation ---")
        val_predictions = {}
        for name, model in self.models.items():
            val_predictions[name] = model.predict(X_val)
            self.evaluate_model(y_val, val_predictions[name], name.upper())

        ensemble_pred = np.mean([val_predictions['lgb'], val_predictions['xgb'], val_predictions['cat']], axis=0)
        self.evaluate_model(y_val, ensemble_pred, "ENSEMBLE")

        print("\n✅ Training pipeline complete.")
        return self

    def predict(self, test_data_raw):
        """
        Generate predictions for test data in chunks to manage memory.
        """
        predict_start_time = time.time()
        print("\nGenerating predictions in chunks to manage memory...")
        if not self.models:
            raise ValueError("Models must be trained first. Call fit().")
        if not self.selected_features:
             raise ValueError("Features must be selected first. Call fit().")
        if not self.scaler:
            raise ValueError("Scaler must be fitted first. Call fit().")

        chunk_size = 100000
        num_chunks = int(np.ceil(len(test_data_raw) / chunk_size))
        all_predictions = []
        overlap = 60

        # Ensure 'timestamp' is correctly identified in test_data_raw
        if 'timestamp' not in test_data_raw.columns:
            if test_data_raw.index.name == 'timestamp':
                test_data_raw.reset_index(inplace=True)
            elif 'index' in test_data_raw.columns:
                test_data_raw.rename(columns={'index': 'timestamp'}, inplace=True)
            else:
                raise KeyError("The 'timestamp' column is missing from the test DataFrame and cannot be inferred.")

        # Store the original IDs before chunking, as predictions will be generated in order
        original_test_ids = test_data_raw['ID'].copy()

        for i in range(num_chunks):
            print(f"--> Processing chunk {i+1}/{num_chunks}...")

            start_idx = i * chunk_size
            end_idx = min((i + 1) * chunk_size, len(test_data_raw))
            start_with_overlap = max(0, start_idx - overlap) 
            chunk = test_data_raw.iloc[start_with_overlap:end_idx].copy()
            
            chunk['timestamp'] = pd.to_datetime(chunk['timestamp'])

            # Include new custom features in base_cols
            base_cols = ['timestamp', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume']
            cols_to_use_in_chunk = [col for col in base_cols + self.preselected_X_n_names if col in chunk.columns]
            
            if not cols_to_use_in_chunk:
                raise ValueError("No valid columns found in test chunk for feature engineering.")

            chunk = chunk[cols_to_use_in_chunk]

            chunk_featured = self.create_time_features(chunk) # This will now create new features like bid_ask_ratio

            chunk_featured = chunk_featured.iloc[start_idx - start_with_overlap : ]

            missing_features = [f for f in self.selected_features if f not in chunk_featured.columns]
            if missing_features:
                print(f"Warning: Missing features in test chunk: {missing_features}. Filling with 0.")
                for mf in missing_features:
                    chunk_featured[mf] = 0.0

            X_test_df = chunk_featured[self.selected_features]
            X_test_scaled = self.scaler.transform(X_test_df)
            X_test_scaled = np.nan_to_num(X_test_scaled)

            chunk_predictions = []
            for name, model in self.models.items():
                pred = model.predict(X_test_scaled)
                chunk_predictions.append(pred)

            ensemble_prediction = np.mean(chunk_predictions, axis=0)
            all_predictions.append(ensemble_prediction)

        final_predictions = np.concatenate(all_predictions)
        final_predictions = np.nan_to_num(final_predictions, nan=0.0)

        predict_duration = time.time() - predict_start_time
        print(f"Generated {len(final_predictions)} predictions in {predict_duration:.2f} seconds.")
        return final_predictions, original_test_ids

# --- NEW FUNCTION FOR PROPHET ENHANCEMENT ---
def train_prophet_enhancement(df_ensemble, prophet_weight=0.085):
    """
    Train a Prophet model on the ensemble predictions and blend with small weight.
    
    Parameters:
    df_ensemble: DataFrame with 'ID' and 'Prediction' columns from the initial ensemble.
    prophet_weight: Weight for Prophet predictions in the final blend.
    """
    print("\n--- Starting Prophet Enhancement ---")
    print("Training Prophet model for ensemble enhancement...")
    
    # Prepare data for Prophet: 'ds' (datetime) and 'y' (value)
    prophet_df = pd.DataFrame()
    # Convert IDs to hours from a base date to represent time for Prophet
    # Assuming IDs are sequential and represent time steps
    base_date = pd.to_datetime('2024-01-01') # Use a recent, arbitrary base date
    prophet_df['ds'] = base_date + pd.to_timedelta(df_ensemble['ID'] - df_ensemble['ID'].min(), unit='H')
    prophet_df['y'] = df_ensemble['Prediction'] # Use the ensemble's prediction as Prophet's target

    # Prophet model configuration (tuned from the 0.9 notebook)
    model = Prophet(
        growth='linear',
        changepoint_prior_scale=0.06,
        changepoint_range=0.9,
        n_changepoints=75,
        yearly_seasonality=True,
        weekly_seasonality=True,
        daily_seasonality=False, # Assuming data points are hourly, daily seasonality might be too fine-grained or not present
        seasonality_mode='multiplicative',
        seasonality_prior_scale=10.0,
        holidays_prior_scale=10.0,
        interval_width=0.95,
        uncertainty_samples=0, # Set to 0 for faster prediction, 300 for full Bayesian inference (slower)
    )
    
    # Add custom seasonalities for potential patterns (based on hourly IDs)
    # 730 hours ~ 30 days (monthly pattern)
    model.add_seasonality(name='pattern_730h', period=730, fourier_order=5, prior_scale=10)
    # 2190 hours ~ 3 months (quarterly pattern)
    model.add_seasonality(name='pattern_2190h', period=2190, fourier_order=10, prior_scale=5)
    
    # Add 'abs(mx-m)' as a regressor if you decide to calculate it in your pipeline
    # For now, we'll skip it as your current pipeline doesn't produce it.
    # If you want to add it, you'd need to modify your CryptoMarketPredictor to output it
    # and then pass it to this function via df_ensemble.
    # Example: if 'abs(mx-m)' in df_ensemble.columns:
    #     prophet_df['abs_mx_m'] = df_ensemble['abs(mx-m)']
    #     model.add_regressor('abs_mx_m', prior_scale=5.0)

    print("Fitting Prophet model...")
    model.fit(prophet_df)
    
    # Make predictions for the future (which is just the existing IDs)
    future = prophet_df[['ds']].copy()
    # If you added regressors, you'd need to add them to 'future' too:
    # if 'abs_mx_m' in prophet_df.columns:
    #    future['abs_mx_m'] = prophet_df['abs_mx_m']

    forecast = model.predict(future)
    
    prophet_predictions = forecast['yhat'].values
    
    # Blend with initial ensemble predictions
    ensemble_predictions = df_ensemble['Prediction'].values # Note: 'Prediction' column from your ensemble
    final_predictions = (1 - prophet_weight) * ensemble_predictions + prophet_weight * prophet_predictions
    
    df_final_prophet = pd.DataFrame({
        'ID': df_ensemble['ID'],
        'Prediction': final_predictions
    })
    
    print(f"\nProphet Enhancement Statistics:")
    print(f"Prophet weight used: {prophet_weight}")
    print(f"Mean adjustment (Prophet_pred - Ensemble_pred): {np.mean(prophet_predictions - ensemble_predictions):.6f}")
    print(f"Std adjustment: {np.std(prophet_predictions - ensemble_predictions):.6f}")
    print(f"Max absolute adjustment: {np.max(np.abs(prophet_predictions - ensemble_predictions)):.6f}")
    print("--- Prophet Enhancement Complete ---")
    
    return df_final_prophet


In [3]:
# Cell 3 (Main function with the crucial ID handling fix and print statements)
def main():
    """
    A wrapper function to define all variables locally and allow for cleanup.
    """
    print("Loading data...")
    train_full_raw = None
    test_full_raw = None
    predictor = None
    # Initialize all variables that will be 'del'eted in finally block
    initial_submission_df = None
    final_submission_df = None
    sample_submission = None 

    try:
        # 1. LOAD DATA
        train_full_raw = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/train.parquet')
        test_full_raw = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/test.parquet')
        
        # --- ROBUST ID/TIMESTAMP HANDLING ---
        def ensure_id_and_timestamp(df, df_name):
            df = df.reset_index()

            timestamp_col = None
            for col in df.columns:
                try:
                    temp_series = pd.to_datetime(df[col], errors='coerce')
                    if not temp_series.isna().all() and col not in ['index', 'ID']:
                        timestamp_col = col
                        break
                except Exception:
                    pass

            if timestamp_col and timestamp_col != 'timestamp':
                df.rename(columns={timestamp_col: 'timestamp'}, inplace=True)
            elif not timestamp_col and 'timestamp' not in df.columns:
                raise KeyError(f"The 'timestamp' column is missing from the {df_name} DataFrame and cannot be inferred.")

            id_col = None
            if 'ID' in df.columns:
                id_col = 'ID'
            else:
                potential_id_cols = [col for col in df.columns if col not in ['timestamp'] and df[col].dtype in ['int64', 'int32', 'Int64']]
                
                level_id_col = None
                for p_id_col in potential_id_cols:
                    if p_id_col.startswith('level_'):
                        level_id_col = p_id_col
                        break
                
                if level_id_col:
                    id_col = level_id_col
                    df.rename(columns={id_col: 'ID'}, inplace=True)
                elif len(potential_id_cols) >= 1:
                    id_col = potential_id_cols[0]
                    df.rename(columns={id_col: 'ID'}, inplace=True)
                else:
                    df['ID'] = df.index
                    print(f"Warning: 'ID' column not found in {df_name}. Using DataFrame index as 'ID'. This might cause submission issues.")

            df['timestamp'] = pd.to_datetime(df['timestamp'])
            
            if not pd.api.types.is_integer_dtype(df['ID']):
                try:
                    df['ID'] = pd.to_numeric(df['ID'], errors='coerce').astype('Int64') 
                    if df['ID'].isnull().any():
                        print(f"Warning: NaNs introduced in 'ID' column of {df_name} during conversion to integer. Filling with sequential numbers.")
                        df.loc[df['ID'].isnull(), 'ID'] = np.arange(df['ID'].isnull().sum()) 
                except Exception:
                    print(f"Warning: Could not convert 'ID' column to integer in {df_name}. Keeping original type.")

            return df

        train_full_raw = ensure_id_and_timestamp(train_full_raw, 'train')
        test_full_raw = ensure_id_and_timestamp(test_full_raw, 'test')
        # --- END NEW ROBUST ID/TIMESTAMP HANDLING ---


        print(f"\nTrain shape: {train_full_raw.shape}")
        print(f"Test shape: {test_full_raw.shape}")

        # --- NEW: Timestamp Reconstruction for Test Data ---
        timestamp_recon_path = '/kaggle/input/the-order-of-the-test-rows-2/closest_rows.csv'
        use_timestamp_reconstruction = os.path.exists(timestamp_recon_path)

        if use_timestamp_reconstruction:
            print("\nApplying timestamp reconstruction for test data...")
            try:
                t_recon = pd.read_csv(timestamp_recon_path, header=None).iloc[:, 0]
                new_order_indices = t_recon.to_numpy()
                
                test_full_raw.reset_index(drop=True, inplace=True)
                
                test_full_raw = test_full_raw.iloc[new_order_indices].copy()
                test_full_raw.reset_index(drop=True, inplace=True)
                print("Test data sorted by reconstructed timestamps.")

            except Exception as e:
                print(f"Warning: Failed to apply timestamp reconstruction: {e}. Proceeding without it.")
                traceback.print_exc()
                use_timestamp_reconstruction = False
        else:
            print("No timestamp reconstruction file found.")
        # --- END NEW: Timestamp Reconstruction ---


        # 2. INITIALIZE AND TRAIN MODEL
        predictor = CryptoMarketPredictor(top_features=80, top_X_features_to_preselect=25, use_future_lags=False)
        predictor.fit(train_full_raw)

        # 3. GENERATE PREDICTIONS
        predictions, original_test_ids = predictor.predict(test_full_raw)
        
        # Final sanity checks on predictions before saving
        print("\nPerforming final checks on generated predictions...")
        predictions = np.nan_to_num(predictions, nan=0.0, posinf=1.0, neginf=-1.0)
        predictions = np.clip(predictions, -1.0, 1.0)
        print(f"Predictions min: {np.min(predictions):.4f}, max: {np.max(predictions):.4f}")
        print(f"Number of NaNs after final check: {np.isnan(predictions).sum()}")
        print(f"Number of Infs after final check: {np.isinf(predictions).sum()}")

        # 4. CREATE INITIAL SUBMISSION DF (before Prophet)
        initial_submission_df = pd.DataFrame({'ID': original_test_ids, 'Prediction': predictions})
        
        # --- NEW: Apply Prophet Enhancement ---
        prophet_weight_to_use = 0.085
        final_submission_df = train_prophet_enhancement(initial_submission_df, prophet_weight=prophet_weight_to_use)
        # --- END NEW ---

        # 5. SAVE FINAL SUBMISSION
        final_submission_df.to_csv('submission.csv', index=False)
        print("\nSubmission file 'submission.csv' created successfully.")
        
        print("\nFirst 5 rows of submission:")
        print(final_submission_df.head())
        print("\nLast 5 rows of submission:")
        print(final_submission_df.tail())

    except Exception as e:
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # 6. CLEANUP
        print("\n🧹 Cleaning up variables to free memory...")
        # Check if variables are defined before attempting to delete them
        if 'train_full_raw' in locals() and train_full_raw is not None: del train_full_raw
        if 'test_full_raw' in locals() and test_full_raw is not None: del test_full_raw
        if 'predictor' in locals() and predictor is not None: del predictor
        if 'initial_submission_df' in locals() and initial_submission_df is not None: del initial_submission_df
        if 'final_submission_df' in locals() and final_submission_df is not None: del final_submission_df
        if 'sample_submission' in locals() and sample_submission is not None: del sample_submission # This was the problematic one
        gc.collect()
        print("Cleanup complete. Ready for another run.")

# --- This line calls the main function to start the pipeline ---
main()

Loading data...

Train shape: (525886, 788)
Test shape: (538150, 787)
No timestamp reconstruction file found.
🚀 Starting training pipeline...
Filtering data to the last 3 months for relevance and speed...
Training on data from 1970-01-01 to 1970-01-01. Shape: (525886, 788)
Sorting train_df by timestamp for chronological splitting...
Found the top 25 'X' features: ['X19', 'X20', 'X21', 'X22', 'X27', 'X28', 'X29', 'X218', 'X219', 'X287', 'X289', 'X291', 'X293', 'X295', 'X297', 'X298', 'X299', 'X465', 'X466', 'X614', 'X744', 'X751', 'X752', 'X753', 'X759']

--- 1. Use the intelligent Pre-selected Subset for Engineering ---
Using 31 columns to generate features
-----------------------------------------------------

Memory usage before optimization: 124.38 MB
Memory usage after optimization: 66.20 MB
🛠️ Engineering rich time-based features...
Feature engineering complete. Shape: (525886, 69)

--- 2. All Features After Engineering Step ---
Created 69 total features: ['timestamp', 'ask_qty', 

15:35:28 - cmdstanpy - INFO - Chain [1] start processing
15:41:43 - cmdstanpy - INFO - Chain [1] done processing



Prophet Enhancement Statistics:
Prophet weight used: 0.085
Mean adjustment (Prophet_pred - Ensemble_pred): -0.000131
Std adjustment: 0.387907
Max absolute adjustment: 1.003302
--- Prophet Enhancement Complete ---

Submission file 'submission.csv' created successfully.

First 5 rows of submission:
   ID  Prediction
0   1    0.140888
1   2   -0.353134
2   3   -0.421817
3   4   -0.116849
4   5   -0.647758

Last 5 rows of submission:
            ID  Prediction
538145  538146   -0.268505
538146  538147   -0.353183
538147  538148   -0.597914
538148  538149   -0.140501
538149  538150    0.038379

🧹 Cleaning up variables to free memory...
Cleanup complete. Ready for another run.
