In [None]:
# Cell 1: Imports
import mlflow
import mlflow.sklearn
import mlflow.lightgbm # Add other flavors if more models are used later (xgboost, catboost)

import pandas as pd
import numpy as np
import os
import joblib # For potential model saving/loading, though MLflow handles most
import time
import shutil # For cleaning up temp directories if any

from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
import lightgbm as lgb
from sklearn.linear_model import Ridge # Example meta-learner

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, udf # Keep for future Spark ML preprocessing part
# from pyspark.sql.types import ArrayType, DoubleType # Keep for future Spark ML preprocessing part
# from pyspark.ml.linalg import VectorUDT, DenseVector, SparseVector # Keep for future Spark ML preprocessing part

# Suppress LightGBM verbosity for HPO trials
import logging
logging.getLogger('lightgbm').setLevel(logging.ERROR)


# Databricks notebooks normally pre-define `spark`; build a session only when
# that name is not already present in this namespace.
if 'spark' not in locals():
    spark = (
        SparkSession.builder
        .appName("AdvancedML_MVP_Sequential")
        .getOrCreate()
    )

print("Imports successful.")

In [None]:
# Cell 2: Init Cell - Global Configurations

# --- MLflow Configuration ---
# !!! IMPORTANT: SET YOUR MLFLOW EXPERIMENT PATH !!!
# e.g., /Users/your.email@domain.com/MyProjectExperiment
# Get your username from dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get() if needed
EXPERIMENT_PATH = "/Users/your_username@example.com/MVP_Regression_HPO_Ensemble"

# --- Data Paths (Unity Catalog Volumes) ---
# !!! IMPORTANT: SET YOUR UNITY CATALOG VOLUME BASE PATH !!!
# Example: "/Volumes/my_main_catalog/my_bronze_schema/my_project_volume/"
UC_BASE_DATA_PATH = "/Volumes/delfos/" # As per your input

# --- Paths for Preprocessed Data (Output of your Spark ML Preprocessing Pipeline) ---
# These paths MUST point to Parquet files (or directories of Parquet files)
# containing 'features_array' and your label column.
# We'll assume a single version of preprocessed data for this MVP.
# !!! IMPORTANT: UPDATE THESE AFTER YOUR PREPROCESSING STEP SAVES DATA !!!
# Built with os.path.join (like the prediction directories below) so the result is
# correct whether or not UC_BASE_DATA_PATH ends with a slash.
SHARED_PROCESSED_TRAIN_PATH = os.path.join(UC_BASE_DATA_PATH, "processed_data", "train_processed.parquet")
SHARED_PROCESSED_TEST_PATH = os.path.join(UC_BASE_DATA_PATH, "processed_data", "test_processed.parquet")

# !!! IMPORTANT: SET YOUR ACTUAL LABEL COLUMN NAME AS IT APPEARS IN THE PARQUET FILES !!!
YOUR_LABEL_COLUMN_NAME = "target"

# --- Paths for Intermediate OOF/Test Predictions (will be created under UC_BASE_DATA_PATH) ---
UC_OOF_PREDS_DIR = os.path.join(UC_BASE_DATA_PATH, "oof_predictions")
UC_TEST_PREDS_DIR = os.path.join(UC_BASE_DATA_PATH, "test_predictions")
UC_FINAL_MODELS_DIR = os.path.join(UC_BASE_DATA_PATH, "final_models_hpo") # For non-MLflow saved models if any

# --- HPO Configuration ---
NUM_HPO_TRIALS = 25  # Number of trials for EACH base algorithm's HPO (keep low for MVP testing, increase later)
PRIMARY_METRIC = "rmse"  # Choose 'rmse' (to minimize) or 'r2' (to maximize, HPO will minimize -r2)

# Fail fast on a typo here: the HPO objective only supports these two values, and
# would otherwise report the problem once per trial as a failed run.
if PRIMARY_METRIC not in ("rmse", "r2"):
    raise ValueError(f"Unsupported PRIMARY_METRIC: {PRIMARY_METRIC!r} (expected 'rmse' or 'r2')")

# --- Base Algorithms to Run ---
# For MVP: DecisionTree, RandomForest, ExtraTrees, LightGBM
BASE_ALGORITHMS_TO_RUN = ['decision_tree', 'random_forest', 'extra_trees', 'lightgbm']

# --- Cross-Validation for OOF ---
K_FOLDS_OOF = 5 # Number of folds for generating Out-of-Fold predictions

# --- Reproducibility ---
GLOBAL_SEED = 117

# --- MLflow Setup ---
# Function get_or_create_experiment will be defined in the next cell
# experiment_id = get_or_create_experiment(EXPERIMENT_PATH, spark) # Pass spark if needed by function
# if experiment_id:
#    mlflow.set_experiment(experiment_id=experiment_id)
# else:
#    print("Error: MLflow experiment could not be set.")

# --- Ensemble Configuration ---
# Unfitted meta-learner instances keyed by the name used in MLflow run names.
META_LEARNERS_FOR_STACKING = {
    'ridge': Ridge(random_state=GLOBAL_SEED),
    'lgbm_meta': lgb.LGBMRegressor(random_state=GLOBAL_SEED, verbose=-1, n_jobs=-1) # Simple LGBM for meta
}

# --- Other Global Settings ---
MAX_METRICS_TO_LOG = 5 # Max number of metrics to log per MLflow run besides primary

# --- Driver-local paths used by pandas / os for the prediction files ---
# Unity Catalog volumes are addressed as /Volumes/<catalog>/<schema>/<volume>/... by
# POSIX-style file APIs as well as by Spark; /dbfs/Volumes/... is reserved and cannot be
# used to reach a volume. The /dbfs FUSE prefix is therefore added only for paths
# that are not already volume (or /dbfs) paths.
def _to_local_fs_path(path):
    """Return the path that driver-local file APIs (os, pandas) should use for `path`."""
    if path.startswith("dbfs:/"):
        path = path[len("dbfs:"):]
    if path.startswith(("/Volumes/", "/dbfs/")):
        return path
    return f"/dbfs{path}"


DBFS_UC_OOF_PREDS_DIR = _to_local_fs_path(UC_OOF_PREDS_DIR)
DBFS_UC_TEST_PREDS_DIR = _to_local_fs_path(UC_TEST_PREDS_DIR)

# Best effort: a failure here is reported but does not stop the notebook, because the
# directories may already exist or may be created by another writer.
try:
    os.makedirs(DBFS_UC_OOF_PREDS_DIR, exist_ok=True)
    os.makedirs(DBFS_UC_TEST_PREDS_DIR, exist_ok=True)
    print(f"Created/checked OOF directory: {DBFS_UC_OOF_PREDS_DIR}")
    print(f"Created/checked Test Preds directory: {DBFS_UC_TEST_PREDS_DIR}")
except Exception as e:
    print(f"Warning: Could not create directories using os.makedirs on {DBFS_UC_OOF_PREDS_DIR} or {DBFS_UC_TEST_PREDS_DIR}. This might be okay if Spark handles it or if paths are purely for Spark. Error: {e}")

# Echo the effective configuration, one setting per line, so the run is self-describing.
print(
    "--- Global Configurations Initialized ---",
    f"MLflow Experiment Path: {EXPERIMENT_PATH}",
    f"Unity Catalog Base Data Path: {UC_BASE_DATA_PATH}",
    f"Processed Train Data Path: {SHARED_PROCESSED_TRAIN_PATH}",
    f"Processed Test Data Path: {SHARED_PROCESSED_TEST_PATH}",
    f"Label Column: {YOUR_LABEL_COLUMN_NAME}",
    f"Global Seed: {GLOBAL_SEED}",
    f"Primary Metric for HPO: {PRIMARY_METRIC.upper()}",
    f"Number of HPO Trials per Algorithm: {NUM_HPO_TRIALS}",
    f"K-Folds for OOF: {K_FOLDS_OOF}",
    f"Base Algorithms to run: {BASE_ALGORITHMS_TO_RUN}",
    "--------------------------------------",
    sep="\n",
)

In [None]:
# Cell 3: Utility Functions & HPO/Model Training Components

# --- MLflow Utility ---
def get_or_create_experiment(experiment_name, spark_session):
    """Return the ID of the MLflow experiment `experiment_name`, creating it if needed.

    Args:
        experiment_name: Name of the experiment (on Databricks this can be a full
            workspace path).
        spark_session: Accepted for backward compatibility and not used; the same
            MLflow calls are made in every environment.

    Returns:
        The experiment ID, or None if the experiment could not be fetched or created.
        Errors are printed rather than raised, so callers must check for None.
    """
    try:
        experiment = mlflow.get_experiment_by_name(experiment_name)
        if experiment:
            print(f"MLflow experiment '{experiment_name}' found with ID: {experiment.experiment_id}")
            return experiment.experiment_id

        print(f"MLflow experiment '{experiment_name}' not found. Attempting to create.")
        # create_experiment returns the new ID, so no second lookup is needed.
        experiment_id = mlflow.create_experiment(name=experiment_name)
        print(f"MLflow experiment '{experiment_name}' created with ID: {experiment_id}")
        return experiment_id

    except mlflow.exceptions.MlflowException as e:
        message = str(e)
        already_exists = "RESOURCE_ALREADY_EXISTS" in message or (
            "Experiment with name" in message and "already exists" in message
        )
        if already_exists:
            # Another process created the experiment between our lookup and our create call.
            print(f"Race condition or experiment '{experiment_name}' was created concurrently. Fetching again.")
            experiment = mlflow.get_experiment_by_name(experiment_name)
            if experiment:
                print(f"Successfully fetched concurrently created experiment '{experiment_name}' with ID: {experiment.experiment_id}")
                return experiment.experiment_id
        print(f"MLflowException: Could not get or create experiment '{experiment_name}'. Error: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error in get_or_create_experiment for '{experiment_name}'. Error: {e}")
        return None

# --- Algorithm Search Spaces (Top ~5 HPs) ---
# Each entry maps an algorithm key to {'model_params': <Hyperopt space>}.
# Hyperopt labels carry a per-algorithm prefix (dt_, rf_, et_, lgbm_).
# hp.quniform samples are floats, so integer hyperparameters must be cast before
# they are passed to a model constructor.
def _forest_search_space(prefix):
    """Space shared by the random-forest and extra-trees regressors; `prefix` namespaces the labels."""
    return {
        'n_estimators': hp.quniform(f'{prefix}_n_estimators', 20, 200, 10), # Reduced for MVP speed
        'max_depth': hp.choice(f'{prefix}_max_depth', [5, 10, 15, None]),
        'min_samples_split': hp.quniform(f'{prefix}_min_samples_split', 2, 10, 1),
        'min_samples_leaf': hp.quniform(f'{prefix}_min_samples_leaf', 1, 10, 1),
        'max_features': hp.choice(f'{prefix}_max_features', ['sqrt', 'log2', None])
    }


_DECISION_TREE_SPACE = {
    'max_depth': hp.choice('dt_max_depth', [3, 5, 7, 10, 15, None]),
    'min_samples_split': hp.quniform('dt_min_samples_split', 2, 20, 1),
    'min_samples_leaf': hp.quniform('dt_min_samples_leaf', 1, 20, 1),
    # Poisson criterion left out: it targets count data
    'criterion': hp.choice('dt_criterion', ['squared_error', 'friedman_mse', 'absolute_error']),
    'splitter': hp.choice('dt_splitter', ['best', 'random'])
}

# NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0, and this
# space never sets it -- confirm whether tuning `subsample` has any effect here.
_LIGHTGBM_SPACE = {
    'n_estimators': hp.quniform('lgbm_n_estimators', 20, 200, 10), # Reduced for MVP speed
    'learning_rate': hp.loguniform('lgbm_learning_rate', np.log(0.01), np.log(0.2)),
    'num_leaves': hp.quniform('lgbm_num_leaves', 10, 100, 5), # Reduced for MVP speed
    'max_depth': hp.quniform('lgbm_max_depth', 3, 10, 1), # More constrained for MVP
    'subsample': hp.uniform('lgbm_subsample', 0.7, 1.0),
    'reg_alpha': hp.uniform('lgbm_reg_alpha', 0.0, 0.5) # L1 regularization
}

ALGORITHM_SEARCH_SPACES = {
    'decision_tree': {'model_params': _DECISION_TREE_SPACE},
    'random_forest': {'model_params': _forest_search_space('rf')},
    'extra_trees': {'model_params': _forest_search_space('et')},
    'lightgbm': {'model_params': _LIGHTGBM_SPACE}
    # Add XGBoostRegressor, CatBoostRegressor search spaces here when you include them
}
print("Search spaces defined.")


# --- HPO Objective Function (Generalized for Regression) ---
# The objective receives only the sampled hyperparameters, so everything else it needs
# is read from module-level names when it is called:
#   - HPO_PARENT_RUN_ID_FOR_OBJECTIVE and CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE, which
#     start as None here and are expected to be reassigned per algorithm by the HPO
#     orchestration code before fmin is called;
#   - the data paths, label column, primary metric and seed from the configuration cell.
HPO_PARENT_RUN_ID_FOR_OBJECTIVE = CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE = None

def load_processed_data_for_sklearn(train_path, test_path, label_col_name):
    """Read the train and test Parquet data and return NumPy arrays for scikit-learn.

    Each dataset must provide a 'features_array' column (one sequence of feature values
    per row) and the column named by `label_col_name`.

    Returns:
        (X_train, y_train, X_test, y_test); the label arrays are cast to float.

    Raises:
        Whatever exception reading or converting the data raised, after printing an
        error line that names both paths.
    """
    def _split_features_and_label(frame):
        # One feature sequence per row -> array of features; label column -> float array
        features = np.array(frame['features_array'].tolist())
        labels = frame[label_col_name].values.astype(float)
        return features, labels

    try:
        train_frame = pd.read_parquet(train_path)
        test_frame = pd.read_parquet(test_path)

        X_train, y_train = _split_features_and_label(train_frame)
        X_test, y_test = _split_features_and_label(test_frame)
    except Exception as load_error:
        print(f"ERROR loading/processing data from {train_path} or {test_path}: {load_error}")
        raise

    return X_train, y_train, X_test, y_test

def _sanitize_hyperparams(raw_hyperparams):
    """Convert Hyperopt samples into native Python values the model constructors accept.

    NumPy scalars are unwrapped first; only then are the integer-valued hyperparameters
    cast to int, so a sample such as np.float64(50.0) for n_estimators becomes 50
    rather than 50.0. None is preserved (e.g. max_depth=None).
    """
    integer_params = {'max_depth', 'min_samples_split', 'min_samples_leaf',
                      'n_estimators', 'num_leaves', 'iterations'}
    cleaned = {}
    for name, value in raw_hyperparams.items():
        if isinstance(value, np.generic):
            value = value.item()
        if name in integer_params and value is not None:
            value = int(value)
        cleaned[name] = value
    return cleaned


def objective_function_regression(hyperparams_from_hyperopt):
    """
    Objective function for Hyperopt HPO.

    Trains one regressor with the sampled hyperparameters inside its own MLflow run,
    logs parameters, metrics and the fitted model, and returns the loss to minimize.

    Reads these module-level names at call time: HPO_PARENT_RUN_ID_FOR_OBJECTIVE,
    CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE, SHARED_PROCESSED_TRAIN_PATH,
    SHARED_PROCESSED_TEST_PATH, YOUR_LABEL_COLUMN_NAME, PRIMARY_METRIC, GLOBAL_SEED
    and MAX_METRICS_TO_LOG.

    Args:
        hyperparams_from_hyperopt: dict of sampled hyperparameters for the current algorithm.

    Returns:
        On success: {'loss', 'status': STATUS_OK, 'run_id', 'attachments'} where loss is
        RMSE, or -R2 when PRIMARY_METRIC is 'r2'.
        On any error during loading/training/logging: {'loss': inf, 'status': 'fail',
        'run_id', 'error_message'}; the error is also logged to the trial's MLflow run.
    """
    sanitized_hyperparams = _sanitize_hyperparams(hyperparams_from_hyperopt)

    trial_run_name = f"Trial_{CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE}_{time.strftime('%Y%m%d-%H%M%S')}"

    # Every trial gets its OWN run. Passing run_id=<parent> would resume the parent run
    # instead, so all trials would write their params into that single run. With
    # nested=True MLflow links the trial to the currently active run; the explicit
    # parent tag covers the case where the parent run is not active in this process.
    trial_tags = None
    if HPO_PARENT_RUN_ID_FOR_OBJECTIVE:
        trial_tags = {"mlflow.parentRunId": HPO_PARENT_RUN_ID_FOR_OBJECTIVE}

    with mlflow.start_run(run_name=trial_run_name, nested=True, tags=trial_tags) as trial_run:
        mlflow.log_param("model_type_trial", CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE) # Log the actual model type for this trial
        mlflow.log_params(sanitized_hyperparams)
        mlflow.set_tag("seed", GLOBAL_SEED)

        try:
            # NOTE(review): the data is re-read from Parquet on every trial; consider
            # loading once outside the objective if trials become I/O bound.
            X_train, y_train, X_test, y_test = load_processed_data_for_sklearn(
                SHARED_PROCESSED_TRAIN_PATH, SHARED_PROCESSED_TEST_PATH, YOUR_LABEL_COLUMN_NAME
            )

            if CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE == 'decision_tree':
                model = DecisionTreeRegressor(**sanitized_hyperparams, random_state=GLOBAL_SEED)
            elif CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE == 'random_forest':
                model = RandomForestRegressor(**sanitized_hyperparams, random_state=GLOBAL_SEED, n_jobs=-1)
            elif CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE == 'extra_trees':
                model = ExtraTreesRegressor(**sanitized_hyperparams, random_state=GLOBAL_SEED, n_jobs=-1)
            elif CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE == 'lightgbm':
                model = lgb.LGBMRegressor(**sanitized_hyperparams, random_state=GLOBAL_SEED, n_jobs=-1, verbose=-1)
            else:
                raise ValueError(f"Unsupported model type in objective function: {CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE}")

            model.fit(X_train, y_train)
            # NOTE(review): the HPO loss is computed on the test split, so the selected
            # hyperparameters are tuned against the same data later used for evaluation.
            predictions = model.predict(X_test)

            rmse = np.sqrt(mean_squared_error(y_test, predictions))
            r2 = r2_score(y_test, predictions)
            mae = mean_absolute_error(y_test, predictions)

            metrics_to_log = {"rmse": rmse, "r2": r2, "mae": mae}
            logged_metrics_count = 0
            # Log up to MAX_METRICS_TO_LOG, in sorted-name order for consistency
            for m_name, m_val in sorted(metrics_to_log.items()):
                if logged_metrics_count < MAX_METRICS_TO_LOG:
                    mlflow.log_metric(m_name, m_val)
                    logged_metrics_count += 1

            signature = mlflow.models.infer_signature(X_test, pd.Series(predictions, name=YOUR_LABEL_COLUMN_NAME))
            if CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE == 'lightgbm':
                mlflow.lightgbm.log_model(model, "model", signature=signature)
            else:
                mlflow.sklearn.log_model(model, "model", signature=signature)

            mlflow.set_tag("status", "success")

            if PRIMARY_METRIC == 'rmse':
                loss = rmse
            elif PRIMARY_METRIC == 'r2':
                loss = -r2  # Hyperopt minimizes, so maximize R2 by minimizing its negative
            else:
                raise ValueError(f"Unsupported PRIMARY_METRIC for loss calculation: {PRIMARY_METRIC}")

            return {'loss': loss, 'status': STATUS_OK, 'run_id': trial_run.info.run_id,
                    'attachments': {'rmse': rmse, 'r2': r2, 'mae': mae, 'model_type': CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE}}

        except Exception as e:
            error_message_short = str(e)[:250] # MLflow param limit
            mlflow.log_param("error", error_message_short)
            mlflow.set_tag("status", "failed")
            print(f"TRIAL ERROR in run {trial_run.info.run_id} for model {CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE}: {e}")
            # Both supported losses (RMSE and -R2) are minimized, so +inf is the worst value for either.
            return {'loss': float('inf'), 'status': 'fail', 'run_id': trial_run.info.run_id,
                    'error_message': error_message_short}

print("Objective function defined.")


# --- OOF Generation and Final Model Training Function ---
def _build_base_regressor(model_type, hyperparams, seed):
    """Return a new, unfitted base regressor of `model_type` configured with `hyperparams`."""
    if model_type == 'decision_tree':
        return DecisionTreeRegressor(**hyperparams, random_state=seed)
    if model_type == 'random_forest':
        return RandomForestRegressor(**hyperparams, random_state=seed, n_jobs=-1)
    if model_type == 'extra_trees':
        return ExtraTreesRegressor(**hyperparams, random_state=seed, n_jobs=-1)
    if model_type == 'lightgbm':
        return lgb.LGBMRegressor(**hyperparams, random_state=seed, n_jobs=-1, verbose=-1)
    raise ValueError(f"Unsupported model type for OOF: {model_type}")


def train_final_model_and_generate_oof(model_type, best_hyperparams,
                                     train_data_path, test_data_path, label_col_name,
                                     k_folds, seed, mlflow_parent_run_name_prefix):
    """
    Trains a final model with best HPs, generates OOF predictions on train set
    and predictions on test set. Logs everything to MLflow.

    Steps, all inside one MLflow run named "<prefix>_<model_type>_OOF_Final":
      1. K-fold loop (shuffled, seeded): fit on the training folds, predict the held-out
         fold (OOF predictions) and the full test set.
      2. Average the per-fold test predictions.
      3. Write both prediction tables as Parquet under DBFS_UC_OOF_PREDS_DIR /
         DBFS_UC_TEST_PREDS_DIR and log them as run artifacts.
      4. Refit on the full training set, evaluate on the test set, log the model.

    Args:
        model_type: 'decision_tree', 'random_forest', 'extra_trees' or 'lightgbm'.
        best_hyperparams: keyword arguments for the model constructor (must already be
            of the types the constructor accepts).
        train_data_path, test_data_path, label_col_name: passed to
            load_processed_data_for_sklearn.
        k_folds: number of cross-validation folds.
        seed: random state for the folds and the models.
        mlflow_parent_run_name_prefix: prefix for the MLflow run name.

    Returns:
        On success, a dict with "status": "success", "model_type", "final_model_run_id",
        "oof_rmse", "oof_r2", "final_model_test_rmse", "final_model_test_r2",
        "final_model_test_mae", "oof_predictions_path" and "test_predictions_path".
        On any error, {"status": "failed", "model_type", "error_message"}; the error is
        printed and logged to the run instead of being raised.
    """
    with mlflow.start_run(run_name=f"{mlflow_parent_run_name_prefix}_{model_type}_OOF_Final", nested=False) as oof_parent_run:
        mlflow.log_params(best_hyperparams)
        mlflow.log_param("model_type", model_type)
        mlflow.log_param("k_folds_for_oof", k_folds)
        mlflow.set_tag("seed", seed)

        final_model_run_id = oof_parent_run.info.run_id
        print(f"Starting OOF generation and final model training for {model_type}. MLflow Run ID: {final_model_run_id}")

        try:
            X_full_train, y_full_train, X_test, y_test = load_processed_data_for_sklearn(
                train_data_path, test_data_path, label_col_name
            )

            oof_predictions = np.zeros(len(y_full_train))
            test_predictions_from_folds = np.zeros((len(y_test), k_folds)) # One column per fold, averaged below

            kf = KFold(n_splits=k_folds, shuffle=True, random_state=seed)

            for fold_num, (train_idx, val_idx) in enumerate(kf.split(X_full_train, y_full_train)):
                print(f"  Processing Fold {fold_num+1}/{k_folds} for {model_type}...")
                fold_model = _build_base_regressor(model_type, best_hyperparams, seed)
                fold_model.fit(X_full_train[train_idx], y_full_train[train_idx])
                # Each training row is predicted exactly once, by the model that did not see it
                oof_predictions[val_idx] = fold_model.predict(X_full_train[val_idx])
                test_predictions_from_folds[:, fold_num] = fold_model.predict(X_test)

            # Average test predictions from all folds
            final_test_predictions = np.mean(test_predictions_from_folds, axis=1)

            # Calculate OOF metrics
            oof_rmse = np.sqrt(mean_squared_error(y_full_train, oof_predictions))
            oof_r2 = r2_score(y_full_train, oof_predictions)
            mlflow.log_metric("oof_rmse", oof_rmse)
            mlflow.log_metric("oof_r2", oof_r2)
            print(f"  {model_type} OOF RMSE: {oof_rmse:.4f}, OOF R2: {oof_r2:.4f}")

            # Save OOF and Test predictions as artifacts (and to the shared dirs for ensembling)
            oof_df = pd.DataFrame({'true_label': y_full_train, f'oof_pred_{model_type}': oof_predictions})
            test_preds_df = pd.DataFrame({'true_label': y_test, f'test_pred_{model_type}': final_test_predictions})

            oof_file_path_parquet = os.path.join(DBFS_UC_OOF_PREDS_DIR, f"oof_preds_{model_type}.parquet")
            test_preds_file_path_parquet = os.path.join(DBFS_UC_TEST_PREDS_DIR, f"test_preds_{model_type}.parquet")

            oof_df.to_parquet(oof_file_path_parquet, index=False)
            test_preds_df.to_parquet(test_preds_file_path_parquet, index=False)

            mlflow.log_artifact(oof_file_path_parquet)
            mlflow.log_artifact(test_preds_file_path_parquet)

            # Record where the files live; only a leading /dbfs mount prefix is turned into
            # the dbfs: scheme, other paths are recorded unchanged.
            for tag_name, local_path in (
                (f"oof_preds_path_{model_type}", oof_file_path_parquet),
                (f"test_preds_path_{model_type}", test_preds_file_path_parquet),
            ):
                if local_path.startswith("/dbfs/"):
                    tag_value = "dbfs:" + local_path[len("/dbfs"):]
                else:
                    tag_value = local_path
                mlflow.set_tag(tag_name, tag_value)

            # Train final model on ALL training data
            print(f"  Training final {model_type} model on all training data...")
            final_model = _build_base_regressor(model_type, best_hyperparams, seed)
            final_model.fit(X_full_train, y_full_train)

            # Evaluate final model on test set
            final_model_test_preds = final_model.predict(X_test)
            final_model_rmse = np.sqrt(mean_squared_error(y_test, final_model_test_preds))
            final_model_r2 = r2_score(y_test, final_model_test_preds)
            final_model_mae = mean_absolute_error(y_test, final_model_test_preds)

            mlflow.log_metric("final_model_test_rmse", final_model_rmse)
            mlflow.log_metric("final_model_test_r2", final_model_r2)
            mlflow.log_metric("final_model_test_mae", final_model_mae)
            print(f"  {model_type} Final Model Test RMSE: {final_model_rmse:.4f}, R2: {final_model_r2:.4f}")

            # Log final model; the output column is named after the label column this
            # function was called with.
            signature = mlflow.models.infer_signature(X_test, pd.Series(final_model_test_preds, name=label_col_name))
            if model_type == 'lightgbm':
                mlflow.lightgbm.log_model(final_model, "final_model", signature=signature)
            else:
                mlflow.sklearn.log_model(final_model, "final_model", signature=signature)

            mlflow.set_tag("status", "success_oof_final")
            return {
                "status": "success", "model_type": model_type, "final_model_run_id": final_model_run_id,
                "oof_rmse": oof_rmse, "oof_r2": oof_r2,
                "final_model_test_rmse": final_model_rmse,
                "final_model_test_r2": final_model_r2,
                "final_model_test_mae": final_model_mae,
                "oof_predictions_path": oof_file_path_parquet,
                "test_predictions_path": test_preds_file_path_parquet
            }

        except Exception as e:
            print(f"ERROR during OOF/Final training for {model_type}: {e}")
            mlflow.set_tag("status", "failed_oof_final")
            mlflow.log_param("error_oof_final", str(e)[:250])
            return {"status": "failed", "model_type": model_type, "error_message": str(e)}

print("OOF generation and final model training function defined.")


# --- Ensemble Functions ---
def create_ensemble_features(base_model_types, oof_dir, test_preds_dir, label_col_name, train_data_path_for_true_labels):
    """Assemble meta-learner features from the saved base-model prediction files.

    For each model type, reads "oof_preds_<type>.parquet" from `oof_dir` and
    "test_preds_<type>.parquet" from `test_preds_dir`. A model is skipped (with a
    warning) when either file is missing. Training labels are taken from the
    'true_label' column of the first OOF file that has one.

    `label_col_name` and `train_data_path_for_true_labels` are accepted but not used.

    Returns:
        (X_meta_train, y_meta_train, X_meta_test): DataFrames with one
        'oof_pred_<type>' / 'test_pred_<type>' column per model and the label array,
        or (None, None, None) when no predictions or no true labels were found.
    """
    oof_columns = []
    test_columns = []
    true_labels_frame = None

    for algo in base_model_types:
        oof_file = os.path.join(oof_dir, f"oof_preds_{algo}.parquet")
        test_file = os.path.join(test_preds_dir, f"test_preds_{algo}.parquet")

        # Both files are required for a model to take part in the ensemble
        if not (os.path.exists(oof_file) and os.path.exists(test_file)):
            print(f"Warning: Prediction files for {algo} not found. Skipping for ensemble.")
            continue

        oof_frame = pd.read_parquet(oof_file)
        test_frame = pd.read_parquet(test_file)

        # Take the training labels from the first OOF file that carries them
        if true_labels_frame is None and 'true_label' in oof_frame.columns:
            true_labels_frame = oof_frame[['true_label']].copy()

        oof_columns.append(oof_frame[[f'oof_pred_{algo}']])
        test_columns.append(test_frame[[f'test_pred_{algo}']])

    if true_labels_frame is None or not oof_columns:
        print("Error: Not enough OOF predictions to build ensemble features, or true labels missing.")
        return None, None, None

    # Columns are placed side by side; rows are aligned on the frames' indexes
    X_meta_train = pd.concat(oof_columns, axis=1)
    X_meta_test = pd.concat(test_columns, axis=1)
    y_meta_train = true_labels_frame['true_label'].values

    return X_meta_train, y_meta_train, X_meta_test


def train_stacked_ensemble(meta_learner_name, meta_learner_model,
                           X_meta_train, y_meta_train, X_meta_test, y_true_test,
                           mlflow_parent_run_name_prefix, seed, primary_metric_config, max_metrics_config):
    """Fit a meta-learner on base-model predictions and evaluate it on the test set.

    Runs inside a new MLflow run named "<prefix>_Stacked_<meta_learner_name>". The
    given `meta_learner_model` is fitted in place; its simple (str/int/float/bool)
    parameters, up to `max_metrics_config` metrics (in sorted-name order) and the
    fitted meta-learner itself are logged. `primary_metric_config` is accepted but
    not used.

    Returns:
        {"status": "success", "meta_learner", "rmse", "r2", "run_id"} on success, or
        {"status": "failed", "meta_learner", "error_message"} if anything raised; the
        error is printed and logged to the run rather than re-raised.
    """
    stack_run_name = f"{mlflow_parent_run_name_prefix}_Stacked_{meta_learner_name}"
    with mlflow.start_run(run_name=stack_run_name, nested=False) as stack_run:
        mlflow.log_param("meta_learner_type", meta_learner_name)
        mlflow.set_tag("ensemble_type", "stacking")
        mlflow.set_tag("seed", seed)

        print(f"Training Stacked Ensemble with Meta-Learner: {meta_learner_name}")
        try:
            meta_learner_model.fit(X_meta_train, y_meta_train)

            # Record the meta-learner's own configuration, limited to plain scalar values
            if hasattr(meta_learner_model, 'get_params'):
                loggable_params = {}
                for param_name, param_value in meta_learner_model.get_params().items():
                    if isinstance(param_value, (str, int, float, bool)):
                        loggable_params[f"meta_{param_name}"] = param_value
                mlflow.log_params(loggable_params)

            stacked_test_predictions = meta_learner_model.predict(X_meta_test)

            rmse = np.sqrt(mean_squared_error(y_true_test, stacked_test_predictions))
            r2 = r2_score(y_true_test, stacked_test_predictions)
            mae = mean_absolute_error(y_true_test, stacked_test_predictions)

            stacked_metrics = {"stacked_rmse": rmse, "stacked_r2": r2, "stacked_mae": mae}
            for position, (metric_name, metric_value) in enumerate(sorted(stacked_metrics.items())):
                if not position < max_metrics_config:
                    break
                mlflow.log_metric(metric_name, metric_value)

            print(f"  Stacked ({meta_learner_name}) Test RMSE: {rmse:.4f}, R2: {r2:.4f}")

            # Only the meta-learner is logged; packaging base models + meta-learner for
            # inference would need a custom PyFunc model.
            if isinstance(meta_learner_model, lgb.LGBMRegressor):
                log_model = mlflow.lightgbm.log_model
            else:
                log_model = mlflow.sklearn.log_model
            output_example = pd.Series(stacked_test_predictions, name=YOUR_LABEL_COLUMN_NAME)
            log_model(
                meta_learner_model,
                f"meta_learner_{meta_learner_name}",
                signature=mlflow.models.infer_signature(X_meta_test, output_example),
            )

            mlflow.set_tag("status", "success_stacking")
            return {"status": "success", "meta_learner": meta_learner_name, "rmse": rmse, "r2": r2, "run_id": stack_run.info.run_id}

        except Exception as stacking_error:
            print(f"ERROR training stacked ensemble ({meta_learner_name}): {stacking_error}")
            mlflow.set_tag("status", "failed_stacking")
            mlflow.log_param("error_stacking", str(stacking_error)[:250])
            return {"status": "failed", "meta_learner": meta_learner_name, "error_message": str(stacking_error)}


# Floor applied to an OOF RMSE before it is inverted, so that a perfect (zero)
# score yields a very large but finite weight instead of a ZeroDivisionError.
_MIN_RMSE_FOR_WEIGHTING = 1e-12


def _equal_ensemble_weights(n_models):
    """Return `n_models` identical weights that sum to 1."""
    return [1.0 / n_models] * n_models


def _compute_ensemble_weights(base_model_metrics_oof, primary_metric_config):
    """Convert per-model OOF scores into (weights, strategy_label).

    'rmse' uses normalised inverse RMSE, 'r2' uses normalised R2 clipped at 0,
    anything else (or an all-zero score sum) falls back to equal weights, in
    which case the label is "equal_weights".
    """
    n_models = len(base_model_metrics_oof)

    if primary_metric_config == 'rmse':  # Lower RMSE is better, so invert it
        scores = []
        for model_info in base_model_metrics_oof:
            oof_rmse = model_info['oof_rmse']
            if oof_rmse is None or not np.isfinite(oof_rmse) or oof_rmse < 0:
                # An RMSE can never be negative/NaN: give such a model no say.
                scores.append(0.0)
            else:
                scores.append(1.0 / max(oof_rmse, _MIN_RMSE_FOR_WEIGHTING))
        fallback_warning = "Warning: Sum of inverse OOF RMSE is zero, cannot calculate RMSE-based weights. Defaulting to equal weights."
    elif primary_metric_config == 'r2':  # Higher R2 is better
        # Negative R2 (worse than predicting the mean) is clipped to 0.
        scores = [max(0, model_info['oof_r2']) for model_info in base_model_metrics_oof]
        fallback_warning = "Warning: Sum of positive OOF R2 is zero, cannot calculate R2-based weights. Defaulting to equal weights."
    else:
        print(f"Warning: Unknown primary metric '{primary_metric_config}' for weighting. Defaulting to equal weights.")
        return _equal_ensemble_weights(n_models), "equal_weights"

    total_score = sum(scores)
    if total_score == 0:
        print(fallback_warning)
        return _equal_ensemble_weights(n_models), "equal_weights"
    return [score / total_score for score in scores], f"based_on_oof_{primary_metric_config}"


def _aligned_prediction_matrix(X_meta_test, base_model_metrics_oof):
    """Return test predictions as a 2-D float array, one column per model, in weight order.

    If every model has a uniquely named column `test_pred_<model_type>` the
    columns are picked by name; otherwise the frame's own column order is used.
    NOTE(review): the `test_pred_<model_type>` naming is taken from a comment at
    the call site, not from the code that builds X_meta_test - confirm.
    """
    expected_columns = [f"test_pred_{model_info['model_type']}" for model_info in base_model_metrics_oof]
    available_columns = set(X_meta_test.columns)
    names_are_usable = (
        len(set(expected_columns)) == len(expected_columns)
        and all(column in available_columns for column in expected_columns)
    )
    if names_are_usable:
        return X_meta_test[expected_columns].to_numpy(dtype=float)
    return X_meta_test.to_numpy(dtype=float)


def calculate_weighted_ensemble(base_model_metrics_oof, # List of dicts: [{'model_type':'rf', 'oof_rmse':0.5, 'test_pred_path':'...'}, ...]
                                X_meta_test, y_true_test,
                                primary_metric_config, max_metrics_config,
                                mlflow_parent_run_name_prefix):
    """Calculates and evaluates a weighted ensemble inside its own MLflow run.

    Args:
        base_model_metrics_oof: List of dicts, one per base model. Reads
            'model_type' always, 'oof_rmse' when weighting by RMSE and
            'oof_r2' when weighting by R2.
        X_meta_test: DataFrame with one column of test predictions per base model.
        y_true_test: True test labels, passed to the sklearn metric functions.
        primary_metric_config: 'rmse' or 'r2'; any other value gives equal weights.
        max_metrics_config: Maximum number of ensemble metrics to log
            (taken in alphabetical order of metric name).
        mlflow_parent_run_name_prefix: Prefix for the MLflow run name.

    Returns:
        On success: {"status": "success", "rmse", "r2", "run_id"}.
        On any error: {"status": "failed", "error_message"}; the exception is
        not re-raised, it is recorded on the MLflow run instead.
    """
    with mlflow.start_run(run_name=f"{mlflow_parent_run_name_prefix}_WeightedEnsemble", nested=False) as weighted_run:
        mlflow.set_tag("ensemble_type", "weighted_average")
        print("Calculating Weighted Ensemble...")

        try:
            if not base_model_metrics_oof:
                raise ValueError("No base model metrics supplied; cannot build a weighted ensemble.")

            weights, weighting_strategy = _compute_ensemble_weights(base_model_metrics_oof, primary_metric_config)

            # Log the strategy actually used ("equal_weights" when a fallback kicked in).
            mlflow.log_param("weighting_strategy", weighting_strategy)
            metric_key = 'oof_rmse' if primary_metric_config == 'rmse' else 'oof_r2'
            for weight, model_info in zip(weights, base_model_metrics_oof):
                mlflow.log_param(f"weight_{model_info['model_type']}", weight)
                # The score may legitimately be absent when weights were not derived from it.
                metric_value = model_info.get(metric_key)
                if metric_value is not None:
                    mlflow.log_metric(f"oof_metric_for_weight_{model_info['model_type']}", metric_value)

            # Combine test predictions using weights
            if X_meta_test.shape[1] != len(weights):
                raise ValueError(f"Mismatch between number of models for weighting ({len(weights)}) and available test predictions ({X_meta_test.shape[1]})")

            prediction_matrix = _aligned_prediction_matrix(X_meta_test, base_model_metrics_oof)
            weighted_predictions = prediction_matrix @ np.asarray(weights, dtype=float)

            rmse = np.sqrt(mean_squared_error(y_true_test, weighted_predictions))
            r2 = r2_score(y_true_test, weighted_predictions)
            mae = mean_absolute_error(y_true_test, weighted_predictions)

            metrics_to_log = {"weighted_rmse": rmse, "weighted_r2": r2, "weighted_mae": mae}
            for position, (m_name, m_val) in enumerate(sorted(metrics_to_log.items())):
                if position >= max_metrics_config:
                    break
                mlflow.log_metric(m_name, m_val)

            print(f"  Weighted Ensemble Test RMSE: {rmse:.4f}, R2: {r2:.4f}")
            mlflow.set_tag("status", "success_weighted")
            return {"status": "success", "rmse": rmse, "r2": r2, "run_id": weighted_run.info.run_id}

        except Exception as e:
            print(f"ERROR calculating weighted ensemble: {e}")
            mlflow.set_tag("status", "failed_weighted")
            mlflow.log_param("error_weighted", str(e)[:250])
            return {"status": "failed", "error_message": str(e)}

# Confirmation banners: reaching this point means every definition above
# in this cell was executed without raising.
print(
    "Ensemble functions defined.",
    "--- All Utility Functions and HPO Components Defined ---",
    sep="\n",
)

In [None]:
# Cell 4: Main Orchestration Logic

print("--- Starting Main Orchestration ---")

# --- 0. Setup MLflow Experiment ---
# Experiment ID is fetched/created using the global EXPERIMENT_PATH
# This needs to be done once.
try:
    # Make sure spark session from Init cell is used if get_or_create_experiment needs it
    experiment_id = get_or_create_experiment(EXPERIMENT_PATH, spark)
    if experiment_id:
        mlflow.set_experiment(experiment_id=experiment_id)
        print(f"MLflow experiment '{EXPERIMENT_PATH}' is set with ID: {experiment_id}")
    else:
        raise RuntimeError("MLflow experiment could not be set. Halting.")
except Exception as e:
    print(f"CRITICAL: Could not initialize MLflow experiment. Please check EXPERIMENT_PATH and permissions. Error: {e}")
    # dbutils.notebook.exit("MLflow experiment setup failed") # If in Databricks notebook and want to halt
    # Re-raise so the cell actually halts: without this, every later phase
    # would keep running and create MLflow runs outside the intended experiment.
    raise

# HPO_PARENT_RUN_ID_FOR_OBJECTIVE and CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE are
# module-level variables that the objective function will read.
# They will be updated for each algorithm's HPO campaign.
# No `global` statement is needed here: assignments made at cell (module) level
# are already global, so the statement would be a no-op.


# --- 1. Individual HPO for each Base Algorithm (Sequential) ---
# For every algorithm in BASE_ALGORITHMS_TO_RUN this opens one MLflow "campaign"
# run, runs a sequential hyperopt search inside it, and records the winning
# hyperparameters in best_hpo_results_per_algorithm for the next phase.
print("\n--- Phase X.A: Individual Hyperparameter Optimization for Base Models ---")
best_hpo_results_per_algorithm = {} # To store best HPs and trial run_id for each algo

for algo_type in BASE_ALGORITHMS_TO_RUN:
    print(f"\nStarting HPO for Algorithm: {algo_type}...")
    if algo_type not in ALGORITHM_SEARCH_SPACES:
        print(f"Warning: Search space for {algo_type} not defined. Skipping HPO.")
        continue

    with mlflow.start_run(run_name=f"HPO_Campaign_{algo_type}", nested=False) as hpo_campaign_parent_run:
        # Module-level state consumed by the objective function (defined outside this cell).
        HPO_PARENT_RUN_ID_FOR_OBJECTIVE = hpo_campaign_parent_run.info.run_id
        CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE = algo_type # Set for the objective function

        mlflow.log_param("algorithm_being_optimized", algo_type)
        mlflow.log_param("num_hpo_trials_config", NUM_HPO_TRIALS)
        mlflow.log_param("primary_metric_config", PRIMARY_METRIC)
        mlflow.log_param("global_seed", GLOBAL_SEED)
        # Log the specific search space for this algorithm if possible (can be complex for hp objects)
        # mlflow.log_dict(ALGORITHM_SEARCH_SPACES[algo_type]['model_params'], f"search_space_{algo_type}.json")

        hpo_trials_database = Trials() # Sequential trials

        try:
            # The objective function implicitly uses CURRENT_ALGORITHM_TYPE_FOR_OBJECTIVE
            # and HPO_PARENT_RUN_ID_FOR_OBJECTIVE.
            # It also uses global vars for data paths, label col, seed, primary metric.

            # The search space for `fmin` should be just the 'model_params' part
            current_search_space = ALGORITHM_SEARCH_SPACES[algo_type]['model_params']

            best_hyperopt_indices = fmin(
                fn=objective_function_regression, # This now gets hyperparams for the current_algo_type
                space=current_search_space,
                algo=tpe.suggest,
                max_evals=NUM_HPO_TRIALS,
                trials=hpo_trials_database,
                rstate=np.random.default_rng(GLOBAL_SEED) # For hyperopt's internal randomness
            )

            # fmin returns indices for choice-type parameters; space_eval maps them back to real values.
            best_actual_params = space_eval(current_search_space, best_hyperopt_indices)

            # Find the best trial from the hpo_trials_database
            best_trial_obj = hpo_trials_database.best_trial
            best_trial_run_id = None
            best_trial_loss = float('inf')
            best_trial_attachments = {}

            if best_trial_obj and best_trial_obj['result']['status'] == STATUS_OK:
                best_trial_run_id = best_trial_obj['result'].get('run_id')
                best_trial_loss = best_trial_obj['result']['loss']

                # hyperopt removes an 'attachments' entry from the objective's result
                # dict and stores it on the Trials object under a per-trial key, so
                # the result dict alone is normally empty here. Fall back to the
                # Trials attachment store for this trial.
                # NOTE(review): assumes objective_function_regression returns its extra
                # values under 'attachments' - confirm against its definition.
                best_trial_attachments = dict(best_trial_obj['result'].get('attachments') or {})
                if not best_trial_attachments:
                    attachment_key_prefix = hpo_trials_database.aname(best_trial_obj, "")
                    best_trial_attachments = {
                        stored_key[len(attachment_key_prefix):]: stored_value
                        for stored_key, stored_value in hpo_trials_database.attachments.items()
                        if stored_key.startswith(attachment_key_prefix)
                    }

                print(f"  Best HPO trial for {algo_type}: Loss={best_trial_loss:.4f}, Params={best_actual_params}, MLflow Run ID={best_trial_run_id}")
                mlflow.log_params({f"best_hpo_{k}": v for k,v in best_actual_params.items()})
                mlflow.log_metric("best_hpo_loss", best_trial_loss)
                if best_trial_run_id:
                    mlflow.set_tag("best_hpo_trial_run_id", best_trial_run_id)
                for att_k, att_v in best_trial_attachments.items():
                    # Only numeric attachments can be logged as metrics.
                    if isinstance(att_v, (int, float)) and att_k != "model_type": # model_type is already a param
                        mlflow.log_metric(f"best_hpo_trial_{att_k}", att_v)

                best_hpo_results_per_algorithm[algo_type] = {
                    "best_params": best_actual_params,
                    "best_trial_run_id": best_trial_run_id, # This is the run for the specific trial
                    "hpo_campaign_run_id": HPO_PARENT_RUN_ID_FOR_OBJECTIVE,
                    "attachments": best_trial_attachments
                }
                mlflow.set_tag("status_hpo_campaign", "success")
            else:
                print(f"  HPO for {algo_type} did not yield a successful best trial.")
                mlflow.set_tag("status_hpo_campaign", "no_successful_trial")

        except Exception as e:
            # Best effort: a failed campaign is recorded on its run and the loop
            # moves on to the next algorithm.
            print(f"ERROR during HPO campaign for {algo_type}: {e}")
            mlflow.set_tag("status_hpo_campaign", "failed")
            mlflow.log_param("error_hpo_campaign", str(e)[:250])

# --- 2. OOF Generation & Final Base Model Training (Sequential) ---
# Walks over the winners of the HPO phase; each algorithm with usable best
# hyperparameters is handed to train_final_model_and_generate_oof, and only
# results reporting status 'success' are kept for the ensemble phase.
print("\n--- Phase X.B: OOF Prediction Generation & Final Base Model Training ---")
final_base_model_details = {} # To store info about final models and their prediction paths

for algo_type, hpo_result in best_hpo_results_per_algorithm.items():
    # Guard: nothing to train without a non-empty set of best params.
    if not (hpo_result and hpo_result.get("best_params")):
        print(f"Skipping OOF for {algo_type} as no successful HPO result was found.")
        continue

    print(f"\nGenerating OOF & Final Model for: {algo_type} with best params: {hpo_result['best_params']}")

    oof_result = train_final_model_and_generate_oof(
        model_type=algo_type,
        best_hyperparams=hpo_result['best_params'],
        train_data_path=SHARED_PROCESSED_TRAIN_PATH,
        test_data_path=SHARED_PROCESSED_TEST_PATH,
        label_col_name=YOUR_LABEL_COLUMN_NAME,
        k_folds=K_FOLDS_OOF,
        seed=GLOBAL_SEED,
        mlflow_parent_run_name_prefix="MVP" # To distinguish these runs
    )

    # Guard: report the failure and move on to the next algorithm.
    if oof_result['status'] != 'success':
        print(f"  Failed to generate OOF/final model for {algo_type}: {oof_result.get('error_message')}")
        continue

    final_base_model_details[algo_type] = oof_result
    print(f"  Successfully generated OOF and final model for {algo_type}.")


# --- 3. Ensemble Creation ---
# Builds meta-features from the base models' OOF / test predictions, then
# evaluates (A) a metric-weighted average and (B) one stacked ensemble per
# configured meta-learner. Skipped when no base model finished successfully
# or when the test labels cannot be loaded.
print("\n--- Phase X.C: Ensemble Creation ---")
if not final_base_model_details:
    print("No base models successfully processed for OOF. Skipping ensemble creation.")
else:
    # Prepare data for ensembling
    # We need y_true_test for evaluating ensembles. Let's load it once.
    try:
        _, _, _, y_true_test_for_ensemble = load_processed_data_for_sklearn(
            SHARED_PROCESSED_TRAIN_PATH, SHARED_PROCESSED_TEST_PATH, YOUR_LABEL_COLUMN_NAME
        )
    except Exception as e:
        print(f"CRITICAL: Could not load test labels for ensemble evaluation. Error: {e}. Skipping ensembles.")
        y_true_test_for_ensemble = None

    if y_true_test_for_ensemble is not None:
        ensemble_base_model_types = list(final_base_model_details.keys())

        # This creates X_meta_train, y_meta_train, X_meta_test
        # y_meta_train comes from the 'true_label' column in the OOF parquet files
        meta_features_tuple = create_ensemble_features(
            base_model_types=ensemble_base_model_types,
            oof_dir=DBFS_UC_OOF_PREDS_DIR, # Use DBFS path for pandas
            test_preds_dir=DBFS_UC_TEST_PREDS_DIR, # Use DBFS path for pandas
            label_col_name=YOUR_LABEL_COLUMN_NAME,
            train_data_path_for_true_labels=SHARED_PROCESSED_TRAIN_PATH # Only to get y_train if not in OOF df
        )

        if meta_features_tuple and meta_features_tuple[0] is not None:
            X_meta_train, y_meta_train, X_meta_test = meta_features_tuple
            print(f"  Meta features created for stacking: X_meta_train shape {X_meta_train.shape}, X_meta_test shape {X_meta_test.shape}")

            # --- 3.A Weighted Ensemble ---
            print("\n  Creating Weighted Ensemble...")
            # Need to gather OOF metrics for weighting
            base_model_oof_metrics_for_weighting = []
            # Label variance is loop-invariant: compute it once.
            y_meta_train_variance = np.var(y_meta_train)
            for algo, details in final_base_model_details.items():
                if details['status'] == 'success':
                    # Estimate R2 as 1 - MSE/Var(y) when it was not reported directly.
                    # With a constant target R2 is undefined; use 0.0 rather than
                    # crediting the model with a perfect score.
                    if y_meta_train_variance > 0:
                        estimated_oof_r2 = 1.0 - details['oof_rmse'] ** 2 / y_meta_train_variance
                    else:
                        estimated_oof_r2 = 0.0
                    base_model_oof_metrics_for_weighting.append({
                        'model_type': algo,
                        'oof_rmse': details['oof_rmse'], # This was logged from OOF calculation
                        'oof_r2': details.get('oof_r2', estimated_oof_r2),
                        'test_pred_path': details['test_predictions_path'] # Not used by current weighted function, but good to have
                    })

            if base_model_oof_metrics_for_weighting:
                weighted_ensemble_result = calculate_weighted_ensemble(
                    base_model_metrics_oof=base_model_oof_metrics_for_weighting,
                    X_meta_test=X_meta_test.copy(), # X_meta_test has columns like 'test_pred_decision_tree', etc.
                    y_true_test=y_true_test_for_ensemble,
                    primary_metric_config=PRIMARY_METRIC,
                    max_metrics_config=MAX_METRICS_TO_LOG,
                    mlflow_parent_run_name_prefix="MVP"
                )
                if weighted_ensemble_result['status'] == 'success':
                    print(f"  Weighted Ensemble Test RMSE: {weighted_ensemble_result['rmse']:.4f}, R2: {weighted_ensemble_result['r2']:.4f}")
                else:
                    # Report the failure, mirroring the stacking branch below.
                    print(f"  Failed to calculate Weighted Ensemble: {weighted_ensemble_result.get('error_message')}")
            else:
                print("  Not enough successful base models with OOF metrics to create weighted ensemble.")


            # --- 3.B Stacked Ensemble ---
            print("\n  Creating Stacked Ensembles...")
            for meta_learner_key, meta_learner_instance in META_LEARNERS_FOR_STACKING.items():
                print(f"    Stacking with Meta-Learner: {meta_learner_key}")
                # Copies keep each meta-learner from seeing changes made by a previous one.
                stacking_result = train_stacked_ensemble(
                    meta_learner_name=meta_learner_key,
                    meta_learner_model=meta_learner_instance, # Pass the actual model instance
                    X_meta_train=X_meta_train.copy(),
                    y_meta_train=y_meta_train.copy(),
                    X_meta_test=X_meta_test.copy(),
                    y_true_test=y_true_test_for_ensemble.copy(),
                    mlflow_parent_run_name_prefix="MVP",
                    seed=GLOBAL_SEED,
                    primary_metric_config=PRIMARY_METRIC, # Not directly used by stacker HPO, but good for consistency
                    max_metrics_config=MAX_METRICS_TO_LOG
                )
                if stacking_result['status'] == 'success':
                    print(f"    Stacked Ensemble ({meta_learner_key}) Test RMSE: {stacking_result['rmse']:.4f}, R2: {stacking_result['r2']:.4f}")
                else:
                    print(f"    Failed to train Stacked Ensemble ({meta_learner_key}): {stacking_result.get('error_message')}")
        else:
            print("  Failed to create meta-features for stacking. Skipping.")
    else:
        print("Skipping ensemble creation due to failure in loading test labels for evaluation.")

print("\n--- Main Orchestration Completed ---")

In [None]:
# Cell 5: (Optional) Clean up temporary files from DBFS if created directly by pandas
# Note: Spark writes (like .write.parquet) to UC Volumes are managed by Spark.
# If pandas created files via /dbfs/ paths, you might want to clean them.
# Example:
# try:
#    if os.path.exists(DBFS_UC_OOF_PREDS_DIR):
#        print(f"Cleaning up OOF predictions directory: {DBFS_UC_OOF_PREDS_DIR}")
#        shutil.rmtree(DBFS_UC_OOF_PREDS_DIR) # Careful with this!
#    if os.path.exists(DBFS_UC_TEST_PREDS_DIR):
#        print(f"Cleaning up test predictions directory: {DBFS_UC_TEST_PREDS_DIR}")
#        shutil.rmtree(DBFS_UC_TEST_PREDS_DIR)
# except Exception as e:
#    print(f"Error during cleanup: {e}")

# If spark session is no longer needed by other cells:
# spark.stop()