In [None]:
# Cell 1: Imports (Add/Update from previous full script)
import mlflow
# No need for mlflow.spark here for preprocessing, but keep for HPO orchestration
# import mlflow.sklearn # Will be used in HPO and for logging preprocessing components

import pandas as pd
import numpy as np
import os
import joblib # For saving/loading Python objects like scalers, arrays
import time
import shutil # For cleaning up temp directories if any

from sklearn.model_selection import KFold # For K-fold target encoding if implemented manually
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import category_encoders as ce # For TargetEncoder - install if not present

from pyspark.sql import SparkSession # Still need SparkSession for environment context

# Suppress warnings if any from category_encoders or others for cleaner output
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)


# Reuse an existing `spark` binding when the environment provides one;
# only build a session when the name is not defined yet.
try:
    spark
except NameError:
    spark = SparkSession.builder.appName("Pandas_Preprocessing_MVP").getOrCreate()

print("Imports for Pandas Preprocessing successful.")

In [None]:
# Cell 2: Init Cell - Global Configurations (Review and Update)
# All values below are placeholders/defaults. Edit the ones marked CHANGE / IMPORTANT
# before running the notebook against real data.

# --- MLflow Configuration ---
# Name (workspace path) of the MLflow experiment used by this notebook.
EXPERIMENT_PATH = "/Users/your_username@example.com/MVP_Regression_Pandas_Preprocessing" # CHANGE

# --- Data Paths (Unity Catalog Volumes) ---
# Base path; every raw and processed path below is derived from it. Keep the trailing slash,
# because the derived paths are built by plain string concatenation.
UC_BASE_DATA_PATH = "/Volumes/delfos/"
RAW_TRAIN_DATA_PATH = f"{UC_BASE_DATA_PATH}raw_data/train.parquet" # Path to RAW Parquet train data
RAW_TEST_DATA_PATH = f"{UC_BASE_DATA_PATH}raw_data/test.parquet"   # Path to RAW Parquet test data

# --- Paths for PROCESSED Data (Output of this Pandas preprocessing script) ---
# These will be .joblib files containing dicts of NumPy arrays {'X': ..., 'y': ...}
# Saved to UC Volume via /dbfs/ prefix for joblib
# NOTE(review): this resolves to /dbfs/Volumes/delfos/processed_data_pandas_mvp_v1/.
# Confirm that the /dbfs prefix is valid for /Volumes paths on your cluster - TODO verify.
DBFS_PROCESSED_DATA_DIR = f"/dbfs{UC_BASE_DATA_PATH}processed_data_pandas_mvp_v1/" # Use /dbfs for joblib
SHARED_PROCESSED_TRAIN_PATH = os.path.join(DBFS_PROCESSED_DATA_DIR, "train_processed_data.joblib")
SHARED_PROCESSED_TEST_PATH = os.path.join(DBFS_PROCESSED_DATA_DIR, "test_processed_data.joblib")

# --- Path for saving the FITTED PREPROCESSING COMPONENTS ---
# Stored in the same directory as the processed data.
DBFS_PREPROCESSOR_COMPONENTS_PATH = os.path.join(DBFS_PROCESSED_DATA_DIR, "preprocessor_components.joblib")


# !!! IMPORTANT: SET YOUR ACTUAL LABEL COLUMN NAME (must exist in raw data) !!!
YOUR_LABEL_COLUMN_NAME = "target"

# --- Define your categorical and numerical columns from the RAW data ---
# !!! IMPORTANT: UPDATE THESE LISTS BASED ON YOUR ACTUAL RAW DATASET !!!
CATEGORICAL_COLUMNS_RAW = ["category_feature_1", "category_feature_2"]
NUMERICAL_COLUMNS_RAW = ["numerical_feature_1", "numerical_feature_2"]

# --- MLflow Configuration for saving the preprocessing "model" ---
PREPROCESSING_EXPERIMENT_PATH = EXPERIMENT_PATH # Or a dedicated one
# Sub-directory name used when the components file is logged as an MLflow artifact.
MLFLOW_PANDAS_PREPROCESSOR_ARTIFACT_PATH = "pandas_preprocessor"

# --- Target Encoding Configuration (for category_encoders or manual) ---
# For category_encoders.TargetEncoder, smoothing is a parameter.
# If doing manual K-Fold TE, this might be relevant.
# NOTE(review): whether 10.0 matches the library default depends on the installed
# category_encoders version - confirm before relying on that.
TARGET_ENCODING_SMOOTHING = 10.0 # Default for category_encoders, can be tuned

# --- Reproducibility ---
GLOBAL_SEED = 117

# --- Other Global Settings from previous Init Cell ---
# (NUM_HPO_TRIALS, PRIMARY_METRIC, BASE_ALGORITHMS_TO_RUN, K_FOLDS_OOF, etc.
#  are for the HPO script, not strictly needed here but good for context if this cell is shared)

# Ensure output directories exist
# Best-effort: a failure is only reported as a warning, so later saves may still fail.
try:
    os.makedirs(DBFS_PROCESSED_DATA_DIR, exist_ok=True)
    print(f"Checked/created processed data directory: {DBFS_PROCESSED_DATA_DIR}")
except Exception as e:
    print(f"Warning: Could not create directory {DBFS_PROCESSED_DATA_DIR}. Error: {e}")

print("--- Pandas Preprocessing Configurations Initialized ---")
# ... (print other relevant configs)

In [None]:
# Cell 3: Pandas Preprocessing Functions

def load_raw_data_to_pandas(uc_volume_parquet_path: str) -> pd.DataFrame:
    """Read a Parquet file into a Pandas DataFrame.

    A path that starts with ``/Volumes/`` is prefixed with ``/dbfs`` before it is
    handed to ``pd.read_parquet``; any other path is used unchanged.
    NOTE(review): confirm the ``/dbfs`` prefix is required for ``/Volumes`` paths
    on the target cluster.

    Args:
        uc_volume_parquet_path: Location of the Parquet file.

    Returns:
        The loaded DataFrame.

    Raises:
        Exception: Any error from the read is printed and then re-raised.
    """
    is_volume_path = uc_volume_parquet_path.startswith("/Volumes/")
    resolved_path = f"/dbfs{uc_volume_parquet_path}" if is_volume_path else uc_volume_parquet_path

    print(f"  Loading Pandas DataFrame from: {resolved_path}")
    try:
        frame = pd.read_parquet(resolved_path)
        print(f"    Successfully loaded. Shape: {frame.shape}")
        return frame
    except Exception as e:
        print(f"    ERROR loading Parquet from {resolved_path}: {e}")
        raise

def fit_pandas_preprocessor(train_pdf: pd.DataFrame,
                            categorical_cols: list,
                            numerical_cols: list,
                            label_col: str,
                            te_smoothing: float,
                            global_seed: int):
    """
    Fits preprocessing components (imputers, target encoder, scaler) on training data.

    Steps, in order: median imputation of numerical columns, constant-string
    imputation of categorical columns, target encoding of categorical columns,
    then standard scaling of the numerical + encoded categorical columns.

    Args:
        train_pdf: Training data holding the label and every listed feature column.
        categorical_cols: Names of categorical feature columns (may be empty or None).
        numerical_cols: Names of numerical feature columns (may be empty or None).
        label_col: Name of the target column; its values are cast to float.
        te_smoothing: Smoothing value forwarded to ``ce.TargetEncoder``.
        global_seed: Kept for interface compatibility; nothing in this function
            consumes it at the moment.

    Returns:
        Tuple ``(X_train_scaled_np, y_train_np, fitted_components)``.
        ``X_train_scaled_np`` has one column per entry of
        ``numerical_cols + categorical_cols`` (in that order). When no feature
        columns are listed, it is the untouched values of every non-label column
        (or an empty array) and ``fitted_components`` stays empty.

    Raises:
        KeyError: If ``label_col`` or any listed feature column is absent.
    """
    print("  Fitting Pandas preprocessor...")
    # Normalise once so that None (or a tuple) behaves like a list everywhere below,
    # including in the list concatenation used for the scaling step.
    categorical_cols = list(categorical_cols or [])
    numerical_cols = list(numerical_cols or [])

    X_train = train_pdf.drop(columns=[label_col])
    y_train = train_pdf[label_col].astype(float)

    # Fail early with a single readable message instead of a KeyError raised
    # from the middle of one of the steps.
    missing_cols = [col for col in numerical_cols + categorical_cols if col not in X_train.columns]
    if missing_cols:
        raise KeyError(f"Columns not found in training features: {missing_cols}")

    fitted_components = {}

    # 1. Impute Numerical Features (Median)
    if numerical_cols:
        num_imputer = SimpleImputer(strategy="median")
        X_train[numerical_cols] = num_imputer.fit_transform(X_train[numerical_cols])
        fitted_components['numerical_imputer'] = num_imputer
        print(f"    Fitted Numerical Imputer for: {numerical_cols}")

    # 2. Impute Categorical Features (constant placeholder string)
    if categorical_cols:
        # A dedicated placeholder keeps "missing" as its own category for the encoder.
        cat_imputer_fill_value = "__MISSING__"
        cat_imputer = SimpleImputer(strategy="constant", fill_value=cat_imputer_fill_value)
        X_train[categorical_cols] = cat_imputer.fit_transform(X_train[categorical_cols])
        fitted_components['categorical_imputer'] = cat_imputer
        fitted_components['categorical_imputer_fill_value'] = cat_imputer_fill_value # Store for transform
        print(f"    Fitted Categorical Imputer for: {categorical_cols} (using fill_value='{cat_imputer_fill_value}')")

    # 3. Target Encoding for Categorical Features
    # NOTE(review): fit_transform encodes the training rows with statistics computed
    # from those same rows; smoothing only shrinks rare categories toward the global
    # mean. If leakage matters for the downstream model, use an out-of-fold scheme -
    # confirm against the category_encoders documentation.
    if categorical_cols:
        # The encoder is told explicitly which columns to encode; cast them to
        # 'category' so they are treated as categorical regardless of raw dtype.
        for col in categorical_cols:
            X_train[col] = X_train[col].astype('category')

        target_encoder = ce.TargetEncoder(cols=categorical_cols, smoothing=te_smoothing, handle_unknown='value', handle_missing='value')
        X_train[categorical_cols] = target_encoder.fit_transform(X_train[categorical_cols], y_train)
        fitted_components['target_encoder'] = target_encoder
        print(f"    Fitted TargetEncoder for: {categorical_cols} with smoothing={te_smoothing}")

    # 4. Scale All Features (Numerical + Target Encoded Categoricals)
    feature_columns_for_scaling = numerical_cols + categorical_cols # Cat cols are now numerically encoded

    if feature_columns_for_scaling:
        scaler = StandardScaler()

        for col in feature_columns_for_scaling:
            X_train[col] = X_train[col].astype(float)
            # Safety net: if an earlier step left NaNs behind, fill them with the column
            # median and remember that median so transform can apply the same value.
            if X_train[col].isnull().any():
                print(f"    Warning: NaNs detected in column {col} before scaling. Applying median imputation.")
                median_val = X_train[col].median()
                X_train[col] = X_train[col].fillna(median_val)
                fitted_components.setdefault('post_te_imputer_medians', {})[col] = median_val

        X_train_scaled_np = scaler.fit_transform(X_train[feature_columns_for_scaling])
        fitted_components['scaler'] = scaler
        fitted_components['feature_columns_for_scaling'] = feature_columns_for_scaling # Store order
        print(f"    Fitted StandardScaler for features: {feature_columns_for_scaling}")
    else:
        # No feature columns were listed, so nothing was imputed, encoded or scaled.
        # Pass the remaining columns through unchanged (or an empty array if there are none).
        print("    No features identified for scaling. X_train might be empty or only have non-scalable features.")
        X_train_scaled_np = np.array([]) if X_train.empty else X_train.values

    y_train_np = y_train.values
    print("  Pandas preprocessor fitting complete.")
    return X_train_scaled_np, y_train_np, fitted_components


def transform_pandas_preprocessor(raw_pdf: pd.DataFrame,
                                  fitted_components: dict,
                                  categorical_cols: list,
                                  numerical_cols: list,
                                  label_col: str = None, # Label col might not be in test data for prediction
                                  is_train_data=False): # Kept for interface compatibility; currently unused
    """
    Applies fitted preprocessing components to new data.

    The steps mirror the fit function: numerical imputation, categorical imputation,
    target encoding, then scaling. A step is applied only when its component is
    present in ``fitted_components``.

    Args:
        raw_pdf: Data to transform. It is not modified; a copy is processed.
        fitted_components: Dictionary produced by the fit step. Keys read here:
            'numerical_imputer', 'categorical_imputer', 'target_encoder', 'scaler',
            'feature_columns_for_scaling', 'post_te_imputer_medians'.
        categorical_cols: Names of categorical feature columns (may be empty or None).
        numerical_cols: Names of numerical feature columns (may be empty or None).
        label_col: Optional label column; when present in ``raw_pdf`` it is split
            off and returned as a float array.
        is_train_data: Not used by the current implementation.

    Returns:
        Tuple ``(X_np, y_np)`` where ``y_np`` is None when no label column was found.

    Raises:
        KeyError: If a column needed by one of the available components is missing.
    """
    print("  Transforming data with Pandas preprocessor...")
    categorical_cols = list(categorical_cols or [])
    numerical_cols = list(numerical_cols or [])

    # Prepare X and y (if label_col is present)
    if label_col and label_col in raw_pdf.columns:
        X_data = raw_pdf.drop(columns=[label_col]).copy()
        y_data_np = raw_pdf[label_col].astype(float).values
    else:
        X_data = raw_pdf.copy()
        y_data_np = None

    feature_columns_for_scaling = fitted_components.get('feature_columns_for_scaling', [])
    use_num_imputer = bool(numerical_cols) and 'numerical_imputer' in fitted_components
    use_cat_imputer = bool(categorical_cols) and 'categorical_imputer' in fitted_components
    use_target_encoder = bool(categorical_cols) and 'target_encoder' in fitted_components
    use_scaler = bool(feature_columns_for_scaling) and 'scaler' in fitted_components

    # Report every missing column at once, before any step starts.
    required_cols = []
    if use_num_imputer:
        required_cols.extend(numerical_cols)
    if use_cat_imputer or use_target_encoder:
        required_cols.extend(categorical_cols)
    if use_scaler:
        required_cols.extend(feature_columns_for_scaling)
    missing_cols = [col for col in dict.fromkeys(required_cols) if col not in X_data.columns]
    if missing_cols:
        raise KeyError(f"Columns not found in data to transform: {missing_cols}")

    # Steps 1-3 replace whole columns (same form as the fit function) so the resulting
    # dtype comes from the transformer output rather than from in-place `.loc` rules.

    # 1. Impute Numerical
    if use_num_imputer:
        X_data[numerical_cols] = fitted_components['numerical_imputer'].transform(X_data[numerical_cols])
        print(f"    Applied Numerical Imputer for: {numerical_cols}")

    # 2. Impute Categorical
    if use_cat_imputer:
        X_data[categorical_cols] = fitted_components['categorical_imputer'].transform(X_data[categorical_cols])
        print(f"    Applied Categorical Imputer for: {categorical_cols}")

    # 3. Target Encode Categorical
    if use_target_encoder:
        # Same dtype preparation as during fit.
        for col in categorical_cols:
            X_data[col] = X_data[col].astype('category')

        # Only X is passed: the encoder applies what it learned during fit.
        X_data[categorical_cols] = fitted_components['target_encoder'].transform(X_data[categorical_cols])
        print(f"    Applied TargetEncoder for: {categorical_cols}")

    # 4. Scale All Features
    if use_scaler:
        stored_medians = fitted_components.get('post_te_imputer_medians', {})
        for col in feature_columns_for_scaling:
            X_data[col] = X_data[col].astype(float)
            # Prefer the median recorded at fit time for this column, if any.
            if col in stored_medians and X_data[col].isnull().any():
                print(f"    Post-TE Imputing NaNs in {col} with stored median before scaling.")
                X_data[col] = X_data[col].fillna(stored_medians[col])
            # Last resort so the scaler does not receive NaNs.
            if X_data[col].isnull().any():
                print(f"    Warning: NaNs still present in {col} before scaling and no stored median. Filling with 0.")
                X_data[col] = X_data[col].fillna(0)

        X_data_scaled_np = fitted_components['scaler'].transform(X_data[feature_columns_for_scaling])
        print(f"    Applied StandardScaler for features: {feature_columns_for_scaling}")
    elif not feature_columns_for_scaling and not X_data.empty: # No scaling was fitted
        X_data_scaled_np = X_data.values
    elif X_data.empty:
        X_data_scaled_np = np.array([])
    else: # Feature list recorded but no scaler available: return the columns unscaled
        X_data_scaled_np = X_data[feature_columns_for_scaling].values

    print("  Pandas data transformation complete.")
    return X_data_scaled_np, y_data_np


def save_processed_data_and_components(X_np, y_np, file_path, components_dict, components_path):
    """Persist processed arrays and, optionally, the fitted components with joblib.

    Args:
        X_np: Processed feature matrix, stored under key 'X'.
        y_np: Label array (may be None), stored under key 'y'.
        file_path: Destination of the data payload.
        components_dict: Fitted components; skipped when empty or None.
        components_path: Destination of the components file; skipped when empty or None.

    Raises:
        Exception: A failure while writing the data payload is printed and re-raised.
            A failure while writing the components is only printed.
    """
    print(f"  Saving processed data to: {file_path}")
    try:
        joblib.dump({'X': X_np, 'y': y_np}, file_path)
        print(f"    Data saved successfully.")
    except Exception as e:
        print(f"    ERROR saving data to {file_path}: {e}")
        raise

    # Nothing more to do unless both the components and a target path were supplied.
    if not (components_dict and components_path):
        return

    print(f"  Saving preprocessor components to: {components_path}")
    try:
        joblib.dump(components_dict, components_path)
        print(f"    Preprocessor components saved successfully.")
    except Exception as e:
        # Deliberately best-effort: the data payload has already been written.
        print(f"    ERROR saving components to {components_path}: {e}")

print("--- Pandas Preprocessing Utility Functions Defined ---")

In [None]:
# Cell 4: Main Preprocessing Orchestration (Pandas - Dual Save)
#
# Runs the pipeline end to end using the constants from Cell 2 and the helpers from Cell 3:
#   load raw parquet -> fit on train -> save train outputs -> transform test -> save test outputs.
# "Dual save": each split is written as a joblib payload {'X': ..., 'y': ...} and, when the
# feature names are known, as a Parquet file whose columns carry those names.

print("--- Starting Pandas Preprocessing Orchestration (Dual Save) ---")

# Parquet outputs live next to the joblib payloads.
SHARED_PROCESSED_TRAIN_PARQUET_NAMED_COLS_PATH = os.path.join(DBFS_PROCESSED_DATA_DIR, "train_processed_named_cols.parquet")
SHARED_PROCESSED_TEST_PARQUET_NAMED_COLS_PATH = os.path.join(DBFS_PROCESSED_DATA_DIR, "test_processed_named_cols.parquet")


def _to_named_frame(X_np, y_np, feature_names, label_col):
    """Wrap a processed feature matrix in a DataFrame with named columns (plus the label if given)."""
    named_pdf = pd.DataFrame(X_np, columns=list(feature_names))
    if y_np is not None:
        named_pdf[label_col] = y_np
    return named_pdf


# --- 0. Set MLflow Experiment for Preprocessing ---
# mlflow.set_experiment activates the named experiment, creating it when it does not exist.
try:
    preprocessing_experiment = mlflow.set_experiment(PREPROCESSING_EXPERIMENT_PATH)
    preprocessing_mlflow_experiment_id = preprocessing_experiment.experiment_id
    print(f"MLflow experiment '{PREPROCESSING_EXPERIMENT_PATH}' for preprocessing is set with ID: {preprocessing_mlflow_experiment_id}")
except Exception as e:
    print(f"CRITICAL: Could not initialize MLflow experiment for preprocessing. Error: {e}")
    # Stop here: continuing would log the run to whatever experiment happens to be active.
    raise

# --- 1. Load Raw Data ---
print(f"\nLoading RAW training data for Pandas preprocessing...")
raw_train_pdf = load_raw_data_to_pandas(RAW_TRAIN_DATA_PATH) # Uses UC Volume path
print(f"\nLoading RAW test data for Pandas preprocessing...")
raw_test_pdf = load_raw_data_to_pandas(RAW_TEST_DATA_PATH)   # Uses UC Volume path

# --- 2. Fit Preprocessor on Training Data ---
fitted_components_dict = None
processed_train_pdf_named_cols = None
with mlflow.start_run(run_name="Pandas_Preprocessor_Fit_DualSave") as preproc_run:
    print(f"\nFitting preprocessor. MLflow Run ID: {preproc_run.info.run_id}")
    mlflow.log_param("label_column", YOUR_LABEL_COLUMN_NAME)
    mlflow.log_param("categorical_features_raw", ", ".join(CATEGORICAL_COLUMNS_RAW))
    mlflow.log_param("numerical_features_raw", ", ".join(NUMERICAL_COLUMNS_RAW))
    mlflow.log_param("target_encoding_smoothing", TARGET_ENCODING_SMOOTHING)
    mlflow.log_param("global_seed", GLOBAL_SEED)
    mlflow.set_tag("preprocessing_type", "pandas_mvp_dual_save")

    try:
        # fit_pandas_preprocessor returns exactly three values: X, y, components.
        X_train_processed_np, y_train_processed_np, fitted_components_dict = fit_pandas_preprocessor(
            raw_train_pdf, CATEGORICAL_COLUMNS_RAW, NUMERICAL_COLUMNS_RAW,
            YOUR_LABEL_COLUMN_NAME, TARGET_ENCODING_SMOOTHING, GLOBAL_SEED
        )
        print(f"  Processed X_train_np shape: {X_train_processed_np.shape}, y_train_np shape: {y_train_processed_np.shape}")

        # Save 1: NumPy arrays as .joblib (for HPO function) + the fitted components
        save_processed_data_and_components(
            X_train_processed_np, y_train_processed_np, SHARED_PROCESSED_TRAIN_PATH,
            fitted_components_dict, DBFS_PREPROCESSOR_COMPONENTS_PATH
        )

        # Save 2: Pandas DataFrame with named columns as Parquet.
        # Column names are only known when the scaler step recorded them.
        processed_feature_names = fitted_components_dict.get('feature_columns_for_scaling') or []
        if processed_feature_names:
            processed_train_pdf_named_cols = _to_named_frame(
                X_train_processed_np, y_train_processed_np,
                processed_feature_names, YOUR_LABEL_COLUMN_NAME
            )
            processed_train_pdf_named_cols.to_parquet(SHARED_PROCESSED_TRAIN_PARQUET_NAMED_COLS_PATH, index=False)
            print(f"  Processed train_pdf_named_cols shape: {processed_train_pdf_named_cols.shape}")
            mlflow.set_tag("processed_train_parquet_named_path", SHARED_PROCESSED_TRAIN_PARQUET_NAMED_COLS_PATH.replace("/dbfs", "dbfs:"))
        else:
            print("  No feature names recorded by the preprocessor; skipping named-column Parquet for train.")

        mlflow.log_artifact(DBFS_PREPROCESSOR_COMPONENTS_PATH, artifact_path=MLFLOW_PANDAS_PREPROCESSOR_ARTIFACT_PATH)
        # Log paths to the saved data as tags for traceability
        mlflow.set_tag("processed_train_joblib_path", SHARED_PROCESSED_TRAIN_PATH.replace("/dbfs", "dbfs:"))
        print(f"  Logged fitted preprocessor components and data paths to MLflow.")
        mlflow.set_tag("status_fit", "success")

    except Exception as e:
        print(f"  ERROR during preprocessor fitting: {e}")
        mlflow.log_param("error_fit", str(e)[:250])
        mlflow.set_tag("status_fit", "failed")
        raise

# --- 3. Transform Test Data using Fitted Preprocessor ---
processed_test_pdf_named_cols = None
if fitted_components_dict:
    print("\nTransforming TEST data using fitted preprocessor...")
    try:
        # transform_pandas_preprocessor returns exactly two values: X and y (y may be None).
        X_test_processed_np, y_test_processed_np = transform_pandas_preprocessor(
            raw_test_pdf, fitted_components_dict, CATEGORICAL_COLUMNS_RAW,
            NUMERICAL_COLUMNS_RAW, YOUR_LABEL_COLUMN_NAME, is_train_data=False
        )
        print(f"  Processed X_test_np shape: {X_test_processed_np.shape}, y_test_np shape: {y_test_processed_np.shape if y_test_processed_np is not None else 'N/A'}")

        # Save 1: NumPy arrays as .joblib (components were already saved with the train data)
        save_processed_data_and_components(
            X_test_processed_np, y_test_processed_np, SHARED_PROCESSED_TEST_PATH,
            None, None
        )

        # Save 2: Pandas DataFrame with named columns as Parquet
        test_feature_names = fitted_components_dict.get('feature_columns_for_scaling') or []
        if test_feature_names:
            processed_test_pdf_named_cols = _to_named_frame(
                X_test_processed_np, y_test_processed_np,
                test_feature_names, YOUR_LABEL_COLUMN_NAME
            )
            processed_test_pdf_named_cols.to_parquet(SHARED_PROCESSED_TEST_PARQUET_NAMED_COLS_PATH, index=False)
            print(f"  Processed test_pdf_named_cols shape: {processed_test_pdf_named_cols.shape}")
        else:
            print("  No feature names recorded by the preprocessor; skipping named-column Parquet for test.")
        print("  Test data transformed and saved.")
    except Exception as e:
        print(f"  ERROR during test data transformation: {e}")
        raise
else:
    print("CRITICAL: Preprocessor fitting failed. Cannot transform test data.")

print("\n--- Pandas Preprocessing Orchestration (Dual Save) Completed ---")
print(f"Joblib processed training data should be at: {SHARED_PROCESSED_TRAIN_PATH}")
print(f"Joblib processed test data should be at: {SHARED_PROCESSED_TEST_PATH}")
print(f"Named Parquet processed training data should be at: {SHARED_PROCESSED_TRAIN_PARQUET_NAMED_COLS_PATH}")
print(f"Named Parquet processed test data should be at: {SHARED_PROCESSED_TEST_PARQUET_NAMED_COLS_PATH}")
print(f"Fitted preprocessor components should be at: {DBFS_PREPROCESSOR_COMPONENTS_PATH}")

In [None]:
# <-------------------- CELL 1: IMPORTS -------------------->
print("Cell 1: Imports - Executing...")
import mlflow
# No mlflow.spark specifically needed for this Pandas preprocessing script,
# but mlflow itself is used.
# The HPO script will use mlflow.sklearn, mlflow.lightgbm etc.

import pandas as pd
import numpy as np
import os
import joblib # For saving/loading Python objects like scalers, arrays, and our components dict
import time
import shutil # For cleaning up temp directories if any (not used extensively here yet)

from sklearn.model_selection import KFold # Kept for potential future K-Fold TE, not used in current TargetEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import category_encoders as ce # For TargetEncoder

from pyspark.sql import SparkSession # Still useful for environment context, paths, and future Spark ML

# Suppress common warnings for cleaner output during MVP development
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning) # If category_encoders uses deprecated features

# Reuse an existing `spark` binding when there is one (Databricks notebooks usually
# provide it); otherwise build a session and say so.
try:
    spark
except NameError:
    spark = SparkSession.builder.appName("Pandas_Preprocessing_MVP_Full").getOrCreate()
    print("SparkSession created.")
else:
    print("SparkSession already exists.")

print("Imports successful for Pandas Preprocessing.")
print("-" * 50)

# <-------------------- CELL 2: INIT CELL - GLOBAL CONFIGURATIONS -------------------->
# Defines every path and tunable used by the preprocessing code, creates the versioned
# output directory, then prints the resolved configuration for the run log.
print("\nCell 2: Global Configurations - Executing...")

# --- MLflow Configuration ---
# !!! IMPORTANT: SET YOUR MLFLOW EXPERIMENT PATH !!!
# Example: /Users/your.email@domain.com/MyProjectExperiment_Preprocessing
# You can get your username programmatically in Databricks:
# current_user = spark.sql("SELECT current_user()").collect()[0][0]
# EXPERIMENT_PATH = f"/Users/{current_user}/MVP_Regression_Pandas_Preprocessing_Full"
EXPERIMENT_PATH = "/Users/your_username@example.com/MVP_Regression_Pandas_Preprocessing_Full" # CHANGE THIS

# --- Data Paths (Unity Catalog Volumes) ---
# !!! IMPORTANT: SET YOUR UNITY CATALOG VOLUME BASE PATH !!!
# Example: "/Volumes/my_main_catalog/my_bronze_schema/my_project_volume/"
# Keep the trailing slash: the derived paths below are built by plain string concatenation.
UC_BASE_DATA_PATH = "/Volumes/delfos/" # As per your input

# --- Paths for RAW Data (Input to this preprocessing script) ---
# !!! IMPORTANT: UPDATE THESE TO YOUR ACTUAL RAW DATA PATHS IN UC VOLUMES !!!
RAW_TRAIN_DATA_PATH = f"{UC_BASE_DATA_PATH}raw_data/train.parquet" # Example
RAW_TEST_DATA_PATH = f"{UC_BASE_DATA_PATH}raw_data/test.parquet"   # Example

# --- Paths for PROCESSED Data (Output of this Pandas preprocessing script) ---
# Using a versioned directory for processed data
# With the defaults this resolves to /dbfs/Volumes/delfos/processed_data/v1_pandas.
# NOTE(review): confirm that the /dbfs prefix is valid for /Volumes paths on your
# cluster - TODO verify.
PROCESSED_DATA_VERSION = "v1_pandas"
DBFS_PROCESSED_DATA_DIR_BASE = f"/dbfs{UC_BASE_DATA_PATH}processed_data/" # /dbfs/ prefix for Python os/joblib
PROCESSED_DATA_DIR_VERSIONED = os.path.join(DBFS_PROCESSED_DATA_DIR_BASE, PROCESSED_DATA_VERSION)

# For .joblib files (NumPy arrays for HPO objective function)
SHARED_PROCESSED_TRAIN_PATH_JOBLIB = os.path.join(PROCESSED_DATA_DIR_VERSIONED, "train_processed_data.joblib")
SHARED_PROCESSED_TEST_PATH_JOBLIB = os.path.join(PROCESSED_DATA_DIR_VERSIONED, "test_processed_data.joblib")

# For Parquet files with named columns (for inspection, other tools)
SHARED_PROCESSED_TRAIN_PARQUET_NAMED_COLS_PATH = os.path.join(PROCESSED_DATA_DIR_VERSIONED, "train_processed_named_cols.parquet")
SHARED_PROCESSED_TEST_PARQUET_NAMED_COLS_PATH = os.path.join(PROCESSED_DATA_DIR_VERSIONED, "test_processed_named_cols.parquet")

# --- Path for saving the FITTED PREPROCESSING COMPONENTS ---
# Stored in the same versioned directory as the processed data.
DBFS_PREPROCESSOR_COMPONENTS_PATH = os.path.join(PROCESSED_DATA_DIR_VERSIONED, "preprocessor_components.joblib")

# !!! IMPORTANT: SET YOUR ACTUAL LABEL COLUMN NAME (must exist in raw data) !!!
YOUR_LABEL_COLUMN_NAME = "target" # Example, change this

# --- Define your categorical and numerical columns from the RAW data ---
# !!! IMPORTANT: UPDATE THESE LISTS BASED ON YOUR ACTUAL RAW DATASET !!!
CATEGORICAL_COLUMNS_RAW = ["category_feature_1", "category_feature_2"] # Example
NUMERICAL_COLUMNS_RAW = ["numerical_feature_1", "numerical_feature_2", "numerical_feature_3"] # Example

# --- MLflow Configuration for saving the preprocessing "model" (components) ---
MLFLOW_PANDAS_PREPROCESSOR_ARTIFACT_PATH = "pandas_preprocessor_components" # Directory in MLflow artifacts

# --- Target Encoding Configuration ---
TARGET_ENCODING_SMOOTHING = 20.0 # Smoothing factor for category_encoders.TargetEncoder

# --- Reproducibility ---
GLOBAL_SEED = 117

# --- Ensure output directories exist (using /dbfs/ path for os.makedirs) ---
# Best-effort: a failure is only reported as a warning, so later saves may still fail.
try:
    os.makedirs(PROCESSED_DATA_DIR_VERSIONED, exist_ok=True)
    print(f"Checked/created base processed data directory: {PROCESSED_DATA_DIR_VERSIONED}")
except Exception as e:
    print(f"Warning: Could not create directory {PROCESSED_DATA_DIR_VERSIONED} using os.makedirs. Error: {e}")
    print("Ensure the UC Volume path is correct and accessible if this fails.")

# Echo the resolved configuration so the values used for this run appear in the cell output.
print(f"--- Global Configurations Initialized (Pandas Preprocessing) ---")
print(f"MLflow Experiment Path for Preprocessing: {EXPERIMENT_PATH}")
print(f"Unity Catalog Base Data Path: {UC_BASE_DATA_PATH}")
print(f"Raw Train Data Path: {RAW_TRAIN_DATA_PATH}")
print(f"Raw Test Data Path: {RAW_TEST_DATA_PATH}")
print(f"  Output Joblib Processed Train Data Path: {SHARED_PROCESSED_TRAIN_PATH_JOBLIB}")
print(f"  Output Joblib Processed Test Data Path: {SHARED_PROCESSED_TEST_PATH_JOBLIB}")
print(f"  Output Parquet (Named Cols) Processed Train Path: {SHARED_PROCESSED_TRAIN_PARQUET_NAMED_COLS_PATH}")
print(f"  Output Parquet (Named Cols) Processed Test Path: {SHARED_PROCESSED_TEST_PARQUET_NAMED_COLS_PATH}")
print(f"  Output Preprocessor Components Path: {DBFS_PREPROCESSOR_COMPONENTS_PATH}")
print(f"Label Column: {YOUR_LABEL_COLUMN_NAME}")
print(f"Categorical Columns (Raw): {CATEGORICAL_COLUMNS_RAW}")
print(f"Numerical Columns (Raw): {NUMERICAL_COLUMNS_RAW}")
print(f"Global Seed: {GLOBAL_SEED}")
print(f"Target Encoding Smoothing: {TARGET_ENCODING_SMOOTHING}")
print("-" * 50)


# <-------------------- CELL 3: UTILITY FUNCTIONS & PANDAS PREPROCESSING LOGIC -------------------->
print("\nCell 3: Utility Functions & Pandas Preprocessing Logic - Defining...")

# --- MLflow Utility ---
def get_or_create_experiment(experiment_name_param, spark_session_param): # Added params to avoid global clashes
    """Return the ID of the named MLflow experiment, creating the experiment if needed.

    Args:
        experiment_name_param: Experiment name to look up or create.
        spark_session_param: Accepted for call-site compatibility; not used here.

    Returns:
        The experiment ID, or None when the experiment could neither be found nor
        created (the error is printed rather than raised).
    """
    try:
        existing = mlflow.get_experiment_by_name(experiment_name_param)
        if not existing:
            print(f"MLflow experiment '{experiment_name_param}' not found. Attempting to create.")
            # Default artifact location is used; none is passed explicitly.
            created_id = mlflow.create_experiment(name=experiment_name_param)
            print(f"MLflow experiment '{experiment_name_param}' created with ID: {created_id}")
            return created_id
        print(f"MLflow experiment '{experiment_name_param}' found with ID: {existing.experiment_id}")
        return existing.experiment_id
    except mlflow.exceptions.MlflowException as e:
        message = str(e)
        already_exists = "RESOURCE_ALREADY_EXISTS" in message or (
            "Experiment with name" in message and "already exists" in message
        )
        if already_exists:
            # Creation reported a duplicate: look the experiment up once more.
            print(f"Race condition or experiment '{experiment_name_param}' was created concurrently. Fetching again.")
            refetched = mlflow.get_experiment_by_name(experiment_name_param)
            if refetched:
                print(f"Successfully fetched concurrently created experiment '{experiment_name_param}' with ID: {refetched.experiment_id}")
                return refetched.experiment_id
        print(f"MLflowException: Could not get or create experiment '{experiment_name_param}'. Error: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error in get_or_create_experiment for '{experiment_name_param}'. Error: {e}")
        return None

# --- Pandas Preprocessing Functions ---
def load_raw_data_to_pandas_from_uc_volume(uc_volume_parquet_path: str) -> pd.DataFrame:
    """Read a Parquet file into a Pandas DataFrame, prefixing ``/Volumes/`` paths with ``/dbfs``.

    Paths that do not start with ``/Volumes/`` are used as given, after a warning is printed.
    NOTE(review): confirm the ``/dbfs`` prefix is required for ``/Volumes`` paths on
    the target cluster.

    Args:
        uc_volume_parquet_path: Location of the Parquet file.

    Returns:
        The loaded DataFrame.

    Raises:
        Exception: Any error from the read is printed and then re-raised.
    """
    if not uc_volume_parquet_path.startswith("/Volumes/"):
        # Assume it might already be a /dbfs/ path or other local path
        print(f"Warning: Path '{uc_volume_parquet_path}' does not start with /Volumes/. Assuming it's a direct /dbfs/ path or local.")
        resolved_path = uc_volume_parquet_path
    else:
        resolved_path = f"/dbfs{uc_volume_parquet_path}"

    print(f"  Attempting to load Pandas DataFrame from: {resolved_path}")
    try:
        frame = pd.read_parquet(resolved_path)
        print(f"    Successfully loaded. Shape: {frame.shape}")
        return frame
    except Exception as e:
        print(f"    ERROR loading Parquet from {resolved_path}: {e}")
        raise

def fit_pandas_preprocessor_mvp(train_pdf: pd.DataFrame, 
                                categorical_cols: list, 
                                numerical_cols: list, 
                                label_col: str,
                                te_smoothing_factor: float):
    """
    Fit the MVP preprocessing chain on training data and return the transformed result.

    Steps run in this order, each one only on columns that exist in `train_pdf`:
      1. median imputation of the numerical columns,
      2. constant-string imputation of the categorical columns,
      3. target encoding of the categorical columns imputed in step 2,
      4. standard scaling of every column handled by step 1 or step 3.

    Args:
        train_pdf: Training data including the label column. A copy is transformed.
        categorical_cols: Names of categorical feature columns.
        numerical_cols: Names of numerical feature columns.
        label_col: Name of the label column; its values are cast to float.
        te_smoothing_factor: Value passed as `smoothing` to the target encoder.

    Returns:
        A 3-tuple of:
        - DataFrame holding the transformed features under their original names,
          with the (float) label column added back.
        - Series of labels cast to float.
        - Dictionary of fitted components and the column lists they were fitted on;
          a key is present only when the corresponding step actually ran.
    """
    print("  Fitting Pandas preprocessor (MVP version)...")
    features_df = train_pdf.drop(columns=[label_col], errors='ignore').copy()
    y_train_series = train_pdf[label_col].astype(float).copy()

    fitted_components = {}

    # Step 1: median imputation of numerical features.
    if not numerical_cols:
        print("    No numerical columns specified for imputation.")
    else:
        present_numeric = [name for name in numerical_cols if name in features_df.columns]
        if not present_numeric:
            print("    No valid numerical columns found in data for imputation.")
        else:
            median_imputer = SimpleImputer(strategy="median")
            features_df[present_numeric] = median_imputer.fit_transform(features_df[present_numeric])
            fitted_components['numerical_imputer'] = median_imputer
            # Remember exactly which columns the imputer saw.
            fitted_components['numerical_cols_fitted'] = present_numeric
            print(f"    Fitted Numerical Imputer for: {present_numeric}")

    # Step 2: constant-string imputation of categorical features.
    if not categorical_cols:
        print("    No categorical columns specified for imputation.")
    else:
        present_categorical = [name for name in categorical_cols if name in features_df.columns]
        if not present_categorical:
            print("    No valid categorical columns found in data for imputation.")
        else:
            missing_token = "__MISSING_CATEGORY__"
            constant_imputer = SimpleImputer(strategy="constant", fill_value=missing_token)
            features_df[present_categorical] = constant_imputer.fit_transform(features_df[present_categorical])
            fitted_components['categorical_imputer'] = constant_imputer
            fitted_components['categorical_cols_fitted_for_impute'] = present_categorical
            print(f"    Fitted Categorical Imputer for: {present_categorical} (using fill_value='{missing_token}')")

    # Step 3: target encoding, limited to the columns step 2 actually imputed.
    encode_cols = fitted_components.get('categorical_cols_fitted_for_impute', [])
    if not encode_cols:
        print("    No categorical columns available for Target Encoding.")
    else:
        # The encoder is handed category-typed columns.
        for name in encode_cols:
            features_df[name] = features_df[name].astype('category')

        target_encoder = ce.TargetEncoder(
            cols=encode_cols,
            smoothing=te_smoothing_factor,
            handle_unknown='value',
            handle_missing='value',
        )
        encoded_block = target_encoder.fit_transform(features_df[encode_cols], y_train_series)
        features_df[encode_cols] = encoded_block
        fitted_components['target_encoder'] = target_encoder
        fitted_components['categorical_cols_target_encoded'] = encode_cols
        print(f"    Fitted TargetEncoder for: {encode_cols} with smoothing={te_smoothing_factor}")

    # Step 4: scale the imputed numerical columns followed by the encoded categorical ones.
    scale_cols = [
        *fitted_components.get('numerical_cols_fitted', []),
        *fitted_components.get('categorical_cols_target_encoded', []),
    ]

    X_train_processed_df_named_cols = features_df.copy()

    if not scale_cols:
        print("    No features identified for scaling. Output X will be based on previous steps.")
    else:
        # Cast to float and patch any NaN still present, recording the median used
        # so the same value can be applied at transform time.
        fallback_medians = {}
        for name in scale_cols:
            X_train_processed_df_named_cols[name] = X_train_processed_df_named_cols[name].astype(float)
            if not X_train_processed_df_named_cols[name].isnull().any():
                continue
            column_median = X_train_processed_df_named_cols[name].median()
            X_train_processed_df_named_cols[name] = X_train_processed_df_named_cols[name].fillna(column_median)
            fallback_medians[name] = column_median
            print(f"    Post-TE/Pre-Scaling Imputing NaNs in {name} with median: {column_median}")
        if fallback_medians:
            fitted_components['post_te_imputer_medians'] = fallback_medians

        scaler = StandardScaler()
        scaled_values = scaler.fit_transform(X_train_processed_df_named_cols[scale_cols])
        X_train_processed_df_named_cols[scale_cols] = scaled_values

        fitted_components['scaler'] = scaler
        # Column order matters for anyone rebuilding the feature matrix later.
        fitted_components['feature_columns_for_scaling'] = scale_cols
        print(f"    Fitted StandardScaler for features: {scale_cols}")

    # Put the label back so the named-column output carries it.
    X_train_processed_df_named_cols[label_col] = y_train_series.values

    print("  Pandas preprocessor fitting complete.")
    return X_train_processed_df_named_cols, y_train_series, fitted_components


def transform_pandas_preprocessor_mvp(raw_pdf: pd.DataFrame, 
                                      fitted_components: dict,
                                      label_col: str = None):
    """
    Apply previously fitted preprocessing components to a DataFrame.

    The column lists recorded in `fitted_components` drive every step; within
    each step only the recorded columns that exist in `raw_pdf` are passed to
    the fitted object. A step is skipped when its column list or its fitted
    object is absent from `fitted_components`.

    Args:
        raw_pdf: Data to transform. A copy is transformed.
        fitted_components: Dictionary produced when the preprocessor was fitted.
        label_col: Optional label column name. When given and present in
            `raw_pdf`, it is split off, cast to float and added back at the end.

    Returns:
        A 2-tuple of:
        - DataFrame with transformed values under the original feature names
          (plus the label column when it was present).
        - Series of float labels, or None when no label column was available.
    """
    print("  Transforming data with fitted Pandas preprocessor (MVP version)...")

    label_available = bool(label_col) and label_col in raw_pdf.columns
    if label_available:
        working_df = raw_pdf.drop(columns=[label_col], errors='ignore').copy()
        y_data_series = raw_pdf[label_col].astype(float).copy()
    else:
        working_df = raw_pdf.copy()
        y_data_series = None

    # Step 1: numerical imputation.
    fitted_numeric = fitted_components.get('numerical_cols_fitted', [])
    if fitted_numeric and 'numerical_imputer' in fitted_components:
        numeric_present = [name for name in fitted_numeric if name in working_df.columns]
        if numeric_present:
            numeric_imputer = fitted_components['numerical_imputer']
            working_df[numeric_present] = numeric_imputer.transform(working_df[numeric_present])
            print(f"    Applied Numerical Imputer for: {numeric_present}")

    # Step 2: categorical imputation.
    fitted_categorical = fitted_components.get('categorical_cols_fitted_for_impute', [])
    if fitted_categorical and 'categorical_imputer' in fitted_components:
        categorical_present = [name for name in fitted_categorical if name in working_df.columns]
        if categorical_present:
            categorical_imputer = fitted_components['categorical_imputer']
            working_df[categorical_present] = categorical_imputer.transform(working_df[categorical_present])
            print(f"    Applied Categorical Imputer for: {categorical_present}")

    # Step 3: target encoding.
    fitted_encoded = fitted_components.get('categorical_cols_target_encoded', [])
    if fitted_encoded and 'target_encoder' in fitted_components:
        encode_present = [name for name in fitted_encoded if name in working_df.columns]
        if encode_present:
            # The encoder is handed category-typed columns, as during fitting.
            for name in encode_present:
                working_df[name] = working_df[name].astype('category')

            encoded_block = fitted_components['target_encoder'].transform(working_df[encode_present])
            working_df[encode_present] = encoded_block
            print(f"    Applied TargetEncoder for: {encode_present}")

    # Step 4: scaling.
    fitted_scale_cols = fitted_components.get('feature_columns_for_scaling', [])
    processed_pdf_named_cols = working_df.copy()

    if not fitted_scale_cols or 'scaler' not in fitted_components:
        print("    No features to scale or scaler not found in fitted_components.")
    else:
        scale_present = [name for name in fitted_scale_cols if name in processed_pdf_named_cols.columns]
        if scale_present:
            # Cast to float; leftover NaNs get the median stored at fit time,
            # or 0 when no median was stored for that column.
            stored_medians = fitted_components.get('post_te_imputer_medians', {})
            for name in scale_present:
                processed_pdf_named_cols[name] = processed_pdf_named_cols[name].astype(float)
                if not processed_pdf_named_cols[name].isnull().any():
                    continue
                fill_val = stored_medians.get(name, 0)
                processed_pdf_named_cols[name] = processed_pdf_named_cols[name].fillna(fill_val)
                print(f"    Post-TE/Pre-Scaling Imputing NaNs in {name} with: {fill_val}")

            scaled_values = fitted_components['scaler'].transform(processed_pdf_named_cols[scale_present])
            processed_pdf_named_cols[scale_present] = scaled_values
            print(f"    Applied StandardScaler for features: {scale_present}")

    # Re-attach the label when one was split off above.
    if y_data_series is not None:
        processed_pdf_named_cols[label_col] = y_data_series.values

    print("  Pandas data transformation complete.")
    return processed_pdf_named_cols, y_data_series


def save_processed_outputs(processed_pdf_named_cols: pd.DataFrame, 
                           label_series: pd.Series, # Can be None for test data without label
                           feature_cols_in_order: list, # To extract X_np correctly
                           joblib_file_path: str, 
                           parquet_file_path: str,
                           label_col_name_in_output: str):
    """Saves processed data as both joblib (NumPy) and Parquet (named cols).

    The joblib file holds a dict ``{'X': X_np, 'y': y_np}``; the Parquet file
    holds `processed_pdf_named_cols` (with the label column added if it is
    missing and `label_series` is given). The parent directory of each output
    file is created if it does not exist yet.

    Args:
        processed_pdf_named_cols: Processed data with named columns.
        label_series: Labels, or None when the data has no label.
        feature_cols_in_order: Columns (in order) that form ``X``. When empty,
            ``X`` is built from all columns except the label column.
        joblib_file_path: Target path of the joblib file. A path starting with
            ``/Volumes/`` is prefixed with ``/dbfs``.
        parquet_file_path: Target path of the Parquet file, same prefix rule.
        label_col_name_in_output: Name of the label column in the Parquet output.

    Returns:
        None. Write failures are printed and NOT re-raised (best-effort save),
        so a caller cannot tell from this call alone whether the files exist.
    """
    
    # 1. Prepare NumPy arrays for joblib
    if feature_cols_in_order and not processed_pdf_named_cols[feature_cols_in_order].empty:
        X_np = processed_pdf_named_cols[feature_cols_in_order].values
    elif not processed_pdf_named_cols.empty and not feature_cols_in_order: # E.g. if all columns were features after label drop
        temp_X_pdf = processed_pdf_named_cols.drop(columns=[label_col_name_in_output], errors='ignore')
        X_np = temp_X_pdf.values
        print(f"Warning: feature_cols_in_order was empty, derived X_np from all columns minus label. Shape: {X_np.shape}")
    else:
        X_np = np.array([])
        print("Warning: X_np is empty.")

    y_np = label_series.values if label_series is not None else np.array([])

    # Save joblib
    dbfs_joblib_path = joblib_file_path
    if joblib_file_path.startswith("/Volumes/"):
        dbfs_joblib_path = f"/dbfs{joblib_file_path}"
    print(f"  Saving joblib data (X_np shape: {X_np.shape}, y_np shape: {y_np.shape}) to: {dbfs_joblib_path}")
    try:
        # A missing output directory would otherwise make the dump fail and the
        # failure would only be printed below.
        joblib_parent_dir = os.path.dirname(dbfs_joblib_path)
        if joblib_parent_dir:
            os.makedirs(joblib_parent_dir, exist_ok=True)
        payload = {'X': X_np, 'y': y_np}
        joblib.dump(payload, dbfs_joblib_path)
        print(f"    Joblib data saved successfully to {dbfs_joblib_path}")
    except Exception as e:
        print(f"    ERROR saving joblib data to {dbfs_joblib_path}: {e}")
        # raise # Decide if this is a critical error to stop the flow

    # Save Parquet with named columns
    dbfs_parquet_path = parquet_file_path
    if parquet_file_path.startswith("/Volumes/"):
        dbfs_parquet_path = f"/dbfs{parquet_file_path}"
    print(f"  Saving named cols Parquet data (shape: {processed_pdf_named_cols.shape}) to: {dbfs_parquet_path}")
    try:
        parquet_parent_dir = os.path.dirname(dbfs_parquet_path)
        if parquet_parent_dir:
            os.makedirs(parquet_parent_dir, exist_ok=True)

        # Ensure label column is correctly named if it was added back
        df_to_save = processed_pdf_named_cols.copy()
        if label_series is not None and label_col_name_in_output not in df_to_save.columns:
            df_to_save[label_col_name_in_output] = label_series.values
        
        df_to_save.to_parquet(dbfs_parquet_path, index=False)
        print(f"    Named cols Parquet data saved successfully to {dbfs_parquet_path}")
    except Exception as e:
        print(f"    ERROR saving named cols Parquet to {dbfs_parquet_path}: {e}")
        # raise

def save_fitted_components_artifact(components_dict, components_dbfs_path, mlflow_artifact_path_name):
    """Dump the fitted components to a file with joblib and log that file to MLflow.

    Does nothing when `components_dict` or `components_dbfs_path` is falsy.
    Any error raised while dumping or logging is printed and not re-raised.

    Args:
        components_dict: Fitted preprocessing components to persist.
        components_dbfs_path: File path the components are dumped to.
        mlflow_artifact_path_name: Artifact path passed to ``mlflow.log_artifact``.
    """
    nothing_to_save = not (components_dict and components_dbfs_path)
    if nothing_to_save:
        return

    print(f"  Saving preprocessor components to: {components_dbfs_path}")
    try:
        joblib.dump(components_dict, components_dbfs_path)
        print(f"    Preprocessor components saved successfully to {components_dbfs_path}.")
        # The file just written is attached to the active MLflow run.
        mlflow.log_artifact(components_dbfs_path, artifact_path=mlflow_artifact_path_name)
        print(f"    Logged fitted preprocessor components to MLflow artifact path: {mlflow_artifact_path_name}")
    except Exception as save_error:
        print(f"    ERROR saving/logging components from {components_dbfs_path}: {save_error}")
        # Decide if this is critical
            
# End-of-cell marker: confirms that every utility function above has been
# defined, followed by a 50-character separator line in the cell output.
print("--- All Pandas Preprocessing Utility Functions Defined ---")
print("-" * 50)

# <-------------------- CELL 4: MAIN PREPROCESSING ORCHESTRATION (PANDAS) -------------------->
# Flow of this cell:
#   0. resolve/set the MLflow experiment,
#   1. load the raw train and test Parquet files into Pandas,
#   2. fit the preprocessor on train inside one MLflow run, save outputs + components,
#   3. transform test with the fitted components, save outputs, tag the same run.
# Configuration names used below (EXPERIMENT_PATH, RAW_*_DATA_PATH, YOUR_LABEL_COLUMN_NAME,
# CATEGORICAL_COLUMNS_RAW, NUMERICAL_COLUMNS_RAW, TARGET_ENCODING_SMOOTHING, GLOBAL_SEED,
# PROCESSED_DATA_VERSION, SHARED_PROCESSED_*_PATH*, DBFS_PREPROCESSOR_COMPONENTS_PATH,
# MLFLOW_PANDAS_PREPROCESSOR_ARTIFACT_PATH) are defined outside this cell.
print("\nCell 4: Main Pandas Preprocessing Orchestration - Executing...")

# --- 0. Set MLflow Experiment for Preprocessing ---
# This variable `preprocessing_mlflow_experiment_id` is used in the MLflow run context
# NOTE(review): a `global` statement at module/cell scope has no effect; the name is
# already a module-level variable.
global preprocessing_mlflow_experiment_id # Make it global if used inside functions called later
preprocessing_mlflow_experiment_id = None
try:
    # get_or_create_experiment returns None when it could not get or create the experiment.
    preprocessing_mlflow_experiment_id = get_or_create_experiment(EXPERIMENT_PATH, spark) # Use EXPERIMENT_PATH from Init
    if preprocessing_mlflow_experiment_id:
        mlflow.set_experiment(experiment_id=preprocessing_mlflow_experiment_id)
        print(f"MLflow experiment '{EXPERIMENT_PATH}' for preprocessing is set with ID: {preprocessing_mlflow_experiment_id}")
    else:
        raise Exception("Preprocessing MLflow experiment could not be set. Halting.")
except Exception as e:
    # The error is only printed here; the `if` guard below decides whether the cell continues.
    # NOTE(review): if set_experiment itself raised, the id is already non-None and the
    # guard below will still let the cell proceed.
    print(f"CRITICAL: Could not initialize MLflow experiment for preprocessing. Error: {e}")
    # In a real script, you might use dbutils.notebook.exit("MLflow experiment setup failed") here

# Ensure execution proceeds only if experiment is set
if preprocessing_mlflow_experiment_id:
    # --- 1. Load Raw Data into Pandas DataFrames ---
    # Load errors are re-raised by the loader, so a failed read stops the cell here.
    print(f"\nLoading RAW training data for Pandas preprocessing...")
    raw_train_pdf = load_raw_data_to_pandas_from_uc_volume(RAW_TRAIN_DATA_PATH)
    print(f"RAW training data columns: {raw_train_pdf.columns.tolist()}")
    raw_train_pdf.info(verbose=True, show_counts=True)


    print(f"\nLoading RAW test data for Pandas preprocessing...")
    raw_test_pdf = load_raw_data_to_pandas_from_uc_volume(RAW_TEST_DATA_PATH)
    raw_test_pdf.info(verbose=True, show_counts=True)


    # --- 2. Fit Preprocessor on Training Data & Save Outputs ---
    fitted_components_dict = None # Initialize
    with mlflow.start_run(run_name="Pandas_Preprocessor_Fit_Save_MVP") as preproc_run:
        # Keep the run id: step 3 re-enters this run by id to add the test-data tags.
        run_id_for_preproc = preproc_run.info.run_id
        print(f"\nFitting preprocessor and saving outputs. MLflow Run ID: {run_id_for_preproc}")
        mlflow.log_param("label_column", YOUR_LABEL_COLUMN_NAME)
        mlflow.log_param("categorical_features_raw", ", ".join(CATEGORICAL_COLUMNS_RAW))
        mlflow.log_param("numerical_features_raw", ", ".join(NUMERICAL_COLUMNS_RAW))
        mlflow.log_param("target_encoding_smoothing", TARGET_ENCODING_SMOOTHING)
        mlflow.log_param("global_seed_config", GLOBAL_SEED) # Though seed is less impactful here than in model training
        mlflow.set_tag("preprocessing_type", "pandas_mvp_full_script")
        mlflow.set_tag("data_version_processed", PROCESSED_DATA_VERSION)

        try:
            # Fit preprocessor
            processed_train_pdf_named_cols, y_train_series, \
            fitted_components_dict = fit_pandas_preprocessor_mvp(
                raw_train_pdf,
                CATEGORICAL_COLUMNS_RAW,
                NUMERICAL_COLUMNS_RAW,
                YOUR_LABEL_COLUMN_NAME,
                TARGET_ENCODING_SMOOTHING
            )
            print(f"  Fit complete. Processed train_pdf_named_cols shape: {processed_train_pdf_named_cols.shape}, y_train_series shape: {y_train_series.shape}")

            # Save processed training data (both formats)
            # NOTE(review): save_processed_outputs prints its own write errors and does not
            # re-raise them, so reaching the "success" tag below does not prove the files exist.
            save_processed_outputs(
                processed_pdf_named_cols=processed_train_pdf_named_cols,
                label_series=y_train_series, # Pass y_train_series to ensure label column is handled correctly for Parquet save
                feature_cols_in_order=fitted_components_dict.get('feature_columns_for_scaling', []), # Important for X_np
                joblib_file_path=SHARED_PROCESSED_TRAIN_PATH_JOBLIB,
                parquet_file_path=SHARED_PROCESSED_TRAIN_PARQUET_NAMED_COLS_PATH,
                label_col_name_in_output=YOUR_LABEL_COLUMN_NAME
            )
            
            # Save fitted components and log to MLflow
            # (this helper also prints, rather than raises, on failure)
            save_fitted_components_artifact(fitted_components_dict, DBFS_PREPROCESSOR_COMPONENTS_PATH, MLFLOW_PANDAS_PREPROCESSOR_ARTIFACT_PATH)
            
            # Log paths to the saved data as params or tags for traceability
            # The tags store the paths with "/dbfs" rewritten to "dbfs:".
            mlflow.set_tag("processed_train_joblib_path", SHARED_PROCESSED_TRAIN_PATH_JOBLIB.replace("/dbfs", "dbfs:"))
            mlflow.set_tag("processed_train_parquet_named_path", SHARED_PROCESSED_TRAIN_PARQUET_NAMED_COLS_PATH.replace("/dbfs", "dbfs:"))
            mlflow.set_tag("preprocessor_components_path", DBFS_PREPROCESSOR_COMPONENTS_PATH.replace("/dbfs", "dbfs:"))
            mlflow.set_tag("status_fit", "success")
            print("  Training data processed and components saved successfully.")

        except Exception as e:
            print(f"  ERROR during preprocessor fitting or saving training data: {e}")
            # Only the first 250 characters of the message are logged as a param.
            mlflow.log_param("error_fit", str(e)[:250]) # Log error to MLflow
            mlflow.set_tag("status_fit", "failed")
            # dbutils.notebook.exit(f"Preprocessing fitting failed: {e}") # Halt on critical error
            raise # Re-raise to ensure failure is noted

    # --- 3. Transform Test Data using Fitted Preprocessor & Save Outputs ---
    # Runs after the `with` block above has exited; MLflow tagging below re-enters the run by id.
    if fitted_components_dict: # Proceed only if fitting was successful
        print("\nTransforming TEST data using fitted preprocessor...")
        # No new MLflow run needed here, this is part of the same preprocessing "job"
        # Or, if preferred, could be a separate run for "transform_test_data"
        try:
            processed_test_pdf_named_cols, y_test_series = transform_pandas_preprocessor_mvp(
                raw_test_pdf,
                fitted_components_dict,
                label_col=YOUR_LABEL_COLUMN_NAME # Pass label_col if test set has it for consistency in y_test_series
            )
            print(f"  Test data transformed. Processed test_pdf_named_cols shape: {processed_test_pdf_named_cols.shape}")
            # y_test_series is None when the test data has no label column.
            if y_test_series is not None:
                print(f"  y_test_series shape: {y_test_series.shape}")

            # Save processed test data (both formats)
            save_processed_outputs(
                processed_pdf_named_cols=processed_test_pdf_named_cols,
                label_series=y_test_series,
                feature_cols_in_order=fitted_components_dict.get('feature_columns_for_scaling', []),
                joblib_file_path=SHARED_PROCESSED_TEST_PATH_JOBLIB,
                parquet_file_path=SHARED_PROCESSED_TEST_PARQUET_NAMED_COLS_PATH,
                label_col_name_in_output=YOUR_LABEL_COLUMN_NAME
            )
            # Log test data paths to the same MLflow run as the preprocessor components
            with mlflow.start_run(run_id=run_id_for_preproc, nested=False): # Re-open run if closed or use existing
                 mlflow.set_tag("processed_test_joblib_path", SHARED_PROCESSED_TEST_PATH_JOBLIB.replace("/dbfs", "dbfs:"))
                 mlflow.set_tag("processed_test_parquet_named_path", SHARED_PROCESSED_TEST_PARQUET_NAMED_COLS_PATH.replace("/dbfs", "dbfs:"))
                 mlflow.set_tag("status_transform_test", "success")

            print("  Test data transformed and saved in both formats.")
        except Exception as e:
            print(f"  ERROR during test data transformation or saving: {e}")
            # Record the failure on the same run before propagating it.
            with mlflow.start_run(run_id=run_id_for_preproc, nested=False): # Re-open run if closed
                mlflow.log_param("error_transform_test", str(e)[:250])
                mlflow.set_tag("status_transform_test", "failed")
            # dbutils.notebook.exit(f"Test data transformation failed: {e}") # Halt on critical error
            raise
    else:
        # Fit errors are re-raised above, so this branch is reached when fitting returned
        # a falsy components dict (e.g. an empty dict because no preprocessing step ran).
        print("CRITICAL: Preprocessor fitting failed or did not produce components. Cannot transform test data.")

    # Summary of where each output is expected to be.
    print("\n--- Pandas Preprocessing Orchestration (Full Script) Completed ---")
    print(f"Joblib processed training data should be at: {SHARED_PROCESSED_TRAIN_PATH_JOBLIB}")
    print(f"Joblib processed test data should be at: {SHARED_PROCESSED_TEST_PATH_JOBLIB}")
    print(f"Named Parquet processed training data should be at: {SHARED_PROCESSED_TRAIN_PARQUET_NAMED_COLS_PATH}")
    print(f"Named Parquet processed test data should be at: {SHARED_PROCESSED_TEST_PARQUET_NAMED_COLS_PATH}")
    print(f"Fitted preprocessor components (transformer) saved at: {DBFS_PREPROCESSOR_COMPONENTS_PATH}")
    print(f"MLflow Run ID for this preprocessing: {run_id_for_preproc}")
    print("You can now proceed to the HPO and Model Training script/cells using these outputs.")

else:
    print("Halting script because MLflow experiment for preprocessing could not be set.")

print("-" * 50)

# <-------------------- CELL X: EXAMPLE USAGE OF SAVED PANDAS PREPROCESSOR FOR NEW DATA (INFERENCE) -------------------->
# This is a conceptual cell showing how you would use the saved components later for prediction.
# You would run this in a new session/notebook.

# print("\nCell X: Example - Using Saved Pandas Preprocessor for New Data (Inference)...")

# # --- 1. Define Configuration (should match context of saved preprocessor) ---
# # These paths and column names should correspond to how the preprocessor was trained.
# PREPROC_COMPONENTS_TO_LOAD_PATH = DBFS_PREPROCESSOR_COMPONENTS_PATH # From Init Cell of this script, or from MLflow run
# # Example of loading from MLflow if you have the run_id and artifact path:
# # PREPROC_RUN_ID_FOR_LOADING = "your_mlflow_run_id_where_preprocessor_was_saved"
# # MLFLOW_ARTIFACT_DIR_FOR_LOADING = "pandas_preprocessor_components" # Artifact path used during logging
# # COMPONENTS_FILENAME = "preprocessor_components.joblib"

# NEW_RAW_DATA_UC_PATH_FOR_PREDICTION = "/Volumes/delfos/new_inference_data/new_data_for_prediction.parquet" # !!! REPLACE !!!

# # Feature columns must match those used during fitting the preprocessor
# CATEGORICAL_COLS_FOR_PREDICTION = CATEGORICAL_COLUMNS_RAW # From Init Cell
# NUMERICAL_COLS_FOR_PREDICTION = NUMERICAL_COLUMNS_RAW     # From Init Cell

# # --- 2. Load the Fitted Preprocessing Components ---
# print("Loading fitted preprocessor components for inference...")
# try:
#     # If loading from MLflow (ensure load_components_from_mlflow function is defined or use direct client):
#     # from Cell 3 of previous response for the function load_components_from_mlflow
#     # loaded_components_for_inference = load_components_from_mlflow(
#     # PREPROC_RUN_ID_FOR_LOADING,
#     # MLFLOW_ARTIFACT_DIR_FOR_LOADING,
#     #    COMPONENTS_FILENAME
#     # )
#     # Or loading directly from DBFS path:
#     loaded_components_for_inference = joblib.load(PREPROC_COMPONENTS_TO_LOAD_PATH)
#     print("Preprocessor components loaded successfully for inference.")
# except Exception as e:
#     print(f"CRITICAL ERROR: Could not load preprocessor components for inference. {e}")
#     # dbutils.notebook.exit("Failed to load preprocessor components for inference")

# if 'loaded_components_for_inference' in locals():
#     # --- 3. Load New Raw Data for Prediction ---
#     print(f"Loading new raw data for prediction from {NEW_RAW_DATA_UC_PATH_FOR_PREDICTION}...")
#     try:
#         new_raw_pdf_for_prediction = load_raw_data_to_pandas_from_uc_volume(NEW_RAW_DATA_UC_PATH_FOR_PREDICTION)
#     except Exception as e:
#         print(f"CRITICAL ERROR: Could not load new raw data for prediction. {e}")
#         # dbutils.notebook.exit("Failed to load new raw data for prediction")

#     if 'new_raw_pdf_for_prediction' in locals():
#         # --- 4. Apply Preprocessing to New Data ---
#         print("Applying preprocessing to new data for prediction...")
#         try:
#             # The transform function returns two values: (processed_pdf_named_cols, y_data_series).
#             # y_data_series is None when no label_col is given.
#             processed_new_pdf_named_cols, _ = transform_pandas_preprocessor_mvp(
#                 raw_pdf=new_raw_pdf_for_prediction,
#                 fitted_components=loaded_components_for_inference,
#                 label_col=None # No label column in new data for prediction typically
#             )
#             # Build the NumPy feature matrix using the column order stored when the scaler was fitted
#             X_new_processed_np = processed_new_pdf_named_cols[loaded_components_for_inference['feature_columns_for_scaling']].values
#             print(f"Preprocessing of new data complete. Processed feature shape (NumPy): {X_new_processed_np.shape}")
#             print(f"Processed new data DataFrame shape (Named Cols): {processed_new_pdf_named_cols.shape}")

#             # X_new_processed_np is now ready for your trained ML model's predict() method
#             # For example:
#             # final_trained_model = mlflow.sklearn.load_model("runs:/<your_best_model_run_id>/model")
#             # predictions_on_new_data = final_trained_model.predict(X_new_processed_np)
#             # print(f"Sample predictions on new data: {predictions_on_new_data[:5]}")

#         except Exception as e:
#             print(f"ERROR during preprocessing or prediction on new data: {e}")

# print("-" * 50)