In [None]:
# Cell 1: Imports (Add/Update from previous full script)
import mlflow
# No need for mlflow.spark here for preprocessing, but keep for HPO orchestration
# import mlflow.sklearn # Will be used in HPO and for logging preprocessing components

import pandas as pd
import numpy as np
import os
import joblib # For saving/loading Python objects like scalers, arrays
import time
import shutil # For cleaning up temp directories if any

from sklearn.model_selection import KFold # For K-fold target encoding if implemented manually
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import category_encoders as ce # For TargetEncoder - install if not present

from pyspark.sql import SparkSession # Still need SparkSession for environment context

# Suppress warnings if any from category_encoders or others for cleaner output
import warnings
# Keep cell output short by silencing FutureWarning / UserWarning messages.
# NOTE(review): these filters are not scoped to category_encoders; every FutureWarning and
# UserWarning emitted after this point is hidden, including deprecation notices from other
# libraries -- consider narrowing with `module=` once the pipeline is stable.
warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=UserWarning)


# Reuse the session already bound to `spark` in this namespace when there is one;
# otherwise obtain one through getOrCreate().
if 'spark' not in globals():
    spark = SparkSession.builder.appName("Pandas_Preprocessing_MVP").getOrCreate()

print("Imports for Pandas Preprocessing successful.")

In [None]:
# Cell 2: Init Cell - Global Configurations (Review and Update)
# Every value a reader may need to tune for this preprocessing stage is defined here.

# --- Reproducibility ---
GLOBAL_SEED = 117

# --- Dataset schema (taken from the RAW data) ---
# !!! IMPORTANT: the label column must exist in the raw data; set it to your real column name !!!
YOUR_LABEL_COLUMN_NAME = "target"
# !!! IMPORTANT: replace these placeholder lists with the columns of your actual raw dataset !!!
CATEGORICAL_COLUMNS_RAW = ["category_feature_1", "category_feature_2"]
NUMERICAL_COLUMNS_RAW = ["numerical_feature_1", "numerical_feature_2"]

# --- Target Encoding ---
# Passed as `smoothing` to category_encoders.TargetEncoder; can be tuned.
TARGET_ENCODING_SMOOTHING = 10.0

# --- MLflow ---
# Placeholder workspace path -- CHANGE it to your own user / experiment location.
EXPERIMENT_PATH = "/Users/your_username@example.com/MVP_Regression_Pandas_Preprocessing"
# Experiment that receives the preprocessing run (point it at a dedicated path if preferred).
PREPROCESSING_EXPERIMENT_PATH = EXPERIMENT_PATH
# Artifact sub-folder under which the fitted components file is logged.
MLFLOW_PANDAS_PREPROCESSOR_ARTIFACT_PATH = "pandas_preprocessor"

# --- RAW input data (Unity Catalog Volumes) ---
# The base path must keep its trailing slash: the paths below are built by plain concatenation.
UC_BASE_DATA_PATH = "/Volumes/delfos/"
RAW_TRAIN_DATA_PATH = UC_BASE_DATA_PATH + "raw_data/train.parquet"
RAW_TEST_DATA_PATH = UC_BASE_DATA_PATH + "raw_data/test.parquet"

# --- PROCESSED output data (written by this notebook) ---
# Each .joblib file holds a dict of NumPy arrays: {'X': ..., 'y': ...}.
# NOTE(review): the "/dbfs" prefix is prepended to a "/Volumes/..." path here. Databricks
# documents "/Volumes/<catalog>/<schema>/<volume>/..." as the local-file path for UC volumes,
# so confirm that "/dbfs/Volumes/..." actually resolves on the target cluster.
DBFS_PROCESSED_DATA_DIR = "/dbfs" + UC_BASE_DATA_PATH + "processed_data_pandas_mvp_v1/"
SHARED_PROCESSED_TRAIN_PATH = os.path.join(DBFS_PROCESSED_DATA_DIR, "train_processed_data.joblib")
SHARED_PROCESSED_TEST_PATH = os.path.join(DBFS_PROCESSED_DATA_DIR, "test_processed_data.joblib")

# --- FITTED PREPROCESSING COMPONENTS (imputers, encoder, scaler) ---
DBFS_PREPROCESSOR_COMPONENTS_PATH = os.path.join(DBFS_PROCESSED_DATA_DIR, "preprocessor_components.joblib")

# --- Other global settings ---
# NUM_HPO_TRIALS, PRIMARY_METRIC, BASE_ALGORITHMS_TO_RUN, K_FOLDS_OOF, etc. belong to the HPO
# script; they are not needed by this notebook.

# Create the output directory up front. A failure is only reported here, not raised,
# so a later save to this directory may still fail.
try:
    os.makedirs(DBFS_PROCESSED_DATA_DIR, exist_ok=True)
except Exception as e:
    print(f"Warning: Could not create directory {DBFS_PROCESSED_DATA_DIR}. Error: {e}")
else:
    print(f"Checked/created processed data directory: {DBFS_PROCESSED_DATA_DIR}")

print("--- Pandas Preprocessing Configurations Initialized ---")
# ... (print other relevant configs)

In [None]:
# Cell 3: Pandas Preprocessing Functions

def load_raw_data_to_pandas(uc_volume_parquet_path: str) -> pd.DataFrame:
    """Read one Parquet file into a Pandas DataFrame.

    A path that starts with "/Volumes/" is rewritten to "/dbfs/Volumes/..." before reading;
    any other path is handed to pandas unchanged.

    NOTE(review): confirm that the "/dbfs" prefix is required (or even valid) for Unity
    Catalog volume paths on the target cluster.

    Args:
        uc_volume_parquet_path: Location of the Parquet file.

    Returns:
        The loaded DataFrame.

    Raises:
        Whatever `pd.read_parquet` raises; the error is printed and then re-raised.
    """
    is_volume_path = uc_volume_parquet_path.startswith("/Volumes/")
    dbfs_path = f"/dbfs{uc_volume_parquet_path}" if is_volume_path else uc_volume_parquet_path

    print(f"  Loading Pandas DataFrame from: {dbfs_path}")
    try:
        loaded_pdf = pd.read_parquet(dbfs_path)
        print(f"    Successfully loaded. Shape: {loaded_pdf.shape}")
    except Exception as load_error:
        print(f"    ERROR loading Parquet from {dbfs_path}: {load_error}")
        raise
    return loaded_pdf

def fit_pandas_preprocessor(train_pdf: pd.DataFrame, 
                            categorical_cols: list, 
                            numerical_cols: list, 
                            label_col: str,
                            te_smoothing: float,
                            global_seed: int):
    """
    Fit the preprocessing chain on the training frame and return the processed arrays.

    Stages, applied in order to a frame obtained by dropping `label_col` from `train_pdf`:
      1. median imputation of `numerical_cols`;
      2. constant ("__MISSING__") imputation of `categorical_cols`;
      3. target encoding of `categorical_cols` against the label (cast to float);
      4. standard scaling of `numerical_cols + categorical_cols`, in that order.

    Args:
        train_pdf: Training data including the label column.
        categorical_cols: Names of the categorical feature columns.
        numerical_cols: Names of the numerical feature columns.
        label_col: Name of the label column.
        te_smoothing: `smoothing` value handed to `ce.TargetEncoder`.
        global_seed: Accepted for interface symmetry; not used by this function.

    Returns:
        (X_train_scaled_np, y_train_np, fitted_components) where `fitted_components` maps
        'numerical_imputer', 'categorical_imputer', 'categorical_imputer_fill_value',
        'target_encoder', 'post_te_imputer_medians' (only if NaNs were found before scaling),
        'scaler' and 'feature_columns_for_scaling' to the objects fitted here. When both
        column lists are empty the dict is empty and X is the raw feature values (or an
        empty 1-D array when the feature frame is empty).

    NOTE(review): the target encoder is fitted and applied on the same training rows, so the
    encoded training features have seen their own labels -- confirm this is acceptable or
    switch to an out-of-fold scheme.
    """
    print("  Fitting Pandas preprocessor...")
    features_pdf = train_pdf.drop(columns=[label_col])
    target_series = train_pdf[label_col].astype(float)

    fitted_components = {}

    # Stage 1: numerical gaps are filled with the per-column median.
    if numerical_cols:
        median_filler = SimpleImputer(strategy="median")
        features_pdf[numerical_cols] = median_filler.fit_transform(features_pdf[numerical_cols])
        fitted_components['numerical_imputer'] = median_filler
        print(f"    Fitted Numerical Imputer for: {numerical_cols}")

    if categorical_cols:
        # Stage 2: categorical gaps become an explicit sentinel category.
        missing_token = "__MISSING__"
        constant_filler = SimpleImputer(strategy="constant", fill_value=missing_token)
        features_pdf[categorical_cols] = constant_filler.fit_transform(features_pdf[categorical_cols])
        fitted_components['categorical_imputer'] = constant_filler
        fitted_components['categorical_imputer_fill_value'] = missing_token  # kept for the transform stage
        print(f"    Fitted Categorical Imputer for: {categorical_cols} (using fill_value='{missing_token}')")

        # Stage 3: replace each category with a target-derived number.
        # The columns are cast to 'category' dtype before being handed to the encoder.
        for cat_name in categorical_cols:
            features_pdf[cat_name] = features_pdf[cat_name].astype('category')

        encoder = ce.TargetEncoder(cols=categorical_cols, smoothing=te_smoothing, handle_unknown='value', handle_missing='value')
        features_pdf[categorical_cols] = encoder.fit_transform(features_pdf[categorical_cols], target_series)
        fitted_components['target_encoder'] = encoder
        print(f"    Fitted TargetEncoder for: {categorical_cols} with smoothing={te_smoothing}")

    # Stage 4: scale numerical and (now numeric) encoded categorical columns together.
    feature_columns_for_scaling = numerical_cols + categorical_cols

    if not feature_columns_for_scaling:
        # Nothing was configured: hand back the untouched feature values, or an empty
        # 1-D array when the feature frame itself is empty.
        print("    No features identified for scaling. X_train might be empty or only have non-scalable features.")
        X_train_scaled_np = np.array([]) if features_pdf.empty else features_pdf.values
    else:
        for feature_name in feature_columns_for_scaling:
            column_values = features_pdf[feature_name].astype(float)
            # Safety net: any NaN still present is filled with the column median, and
            # that median is remembered so the transform stage can reuse it.
            if column_values.isnull().any():
                print(f"    Warning: NaNs detected in column {feature_name} before scaling. Applying median imputation.")
                median_val = column_values.median()
                column_values = column_values.fillna(median_val)
                fitted_components.setdefault('post_te_imputer_medians', {})[feature_name] = median_val
            features_pdf[feature_name] = column_values

        scaler = StandardScaler()
        X_train_scaled_np = scaler.fit_transform(features_pdf[feature_columns_for_scaling])
        fitted_components['scaler'] = scaler
        fitted_components['feature_columns_for_scaling'] = feature_columns_for_scaling  # column order matters
        print(f"    Fitted StandardScaler for features: {feature_columns_for_scaling}")

    y_train_np = target_series.values
    print("  Pandas preprocessor fitting complete.")
    return X_train_scaled_np, y_train_np, fitted_components


def transform_pandas_preprocessor(raw_pdf: pd.DataFrame, 
                                  fitted_components: dict,
                                  categorical_cols: list, 
                                  numerical_cols: list, 
                                  label_col: str = None, # Label col might not be in test data for prediction
                                  is_train_data: bool = False): # Currently unused; kept for interface compatibility
    """
    Applies fitted preprocessing components to new data.

    The stages run in order: numerical imputation, categorical imputation, target encoding,
    and scaling. Each stage is skipped when its component is absent from `fitted_components`.
    `raw_pdf` itself is not modified; all work happens on a copy.

    Args:
        raw_pdf: Raw data to transform. May or may not contain `label_col`.
        fitted_components: Dict of fitted objects, read under the keys 'numerical_imputer',
            'categorical_imputer', 'target_encoder', 'scaler', 'feature_columns_for_scaling'
            and, optionally, 'post_te_imputer_medians'.
        categorical_cols: Names of the categorical feature columns.
        numerical_cols: Names of the numerical feature columns.
        label_col: Optional label column; when present in `raw_pdf` it is split off and
            returned as a float array.
        is_train_data: Not used by this function.

    Returns:
        (X_np, y_np). `y_np` is None when `label_col` is not given or not found in `raw_pdf`.
    """
    print("  Transforming data with Pandas preprocessor...")
    
    # Prepare X and y (if label_col is present). Work on a copy so the caller's frame is untouched.
    if label_col and label_col in raw_pdf.columns:
        X_data = raw_pdf.drop(columns=[label_col]).copy()
        y_data_np = raw_pdf[label_col].astype(float).values
    else:
        X_data = raw_pdf.copy()
        y_data_np = None

    # Stage outputs are written back with `X_data[cols] = ...`, which REPLACES the columns.
    # The previous `X_data.loc[:, cols] = ...` form sets values in place inside the existing
    # column dtype; writing float encodings into 'category' columns that way is an
    # incompatible-dtype setitem (FutureWarning on pandas >= 2.1, slated to become an error).

    # 1. Impute Numerical
    if numerical_cols and 'numerical_imputer' in fitted_components:
        X_data[numerical_cols] = fitted_components['numerical_imputer'].transform(X_data[numerical_cols])
        print(f"    Applied Numerical Imputer for: {numerical_cols}")

    # 2. Impute Categorical
    if categorical_cols and 'categorical_imputer' in fitted_components:
        X_data[categorical_cols] = fitted_components['categorical_imputer'].transform(X_data[categorical_cols])
        print(f"    Applied Categorical Imputer for: {categorical_cols}")
    
    # 3. Target Encode Categorical
    if categorical_cols and 'target_encoder' in fitted_components:
        # Cast to 'category' dtype before handing the columns to the encoder.
        for col in categorical_cols:
            X_data[col] = X_data[col].astype('category')
        
        # Only X is passed: the encoder applies what it learned at fit time, so no labels
        # are needed (or wanted) when transforming new data.
        X_data[categorical_cols] = fitted_components['target_encoder'].transform(X_data[categorical_cols])
        print(f"    Applied TargetEncoder for: {categorical_cols}")

    # 4. Scale All Features, in the column order recorded at fit time
    feature_columns_for_scaling = fitted_components.get('feature_columns_for_scaling', [])
    if feature_columns_for_scaling and 'scaler' in fitted_components:
        stored_medians = fitted_components.get('post_te_imputer_medians', {})
        for col in feature_columns_for_scaling:
            X_data[col] = X_data[col].astype(float)
            # Prefer the median recorded at fit time for this column, when there is one.
            if col in stored_medians:
                 if X_data[col].isnull().any():
                    print(f"    Post-TE Imputing NaNs in {col} with stored median before scaling.")
                    X_data[col] = X_data[col].fillna(stored_medians[col])
            # Last resort: no stored median covers the remaining NaNs, so they are filled with 0.
            # NOTE(review): 0 is applied before scaling, i.e. on the unscaled value range -- confirm
            # this default is acceptable for the features involved.
            if X_data[col].isnull().any():
                print(f"    Warning: NaNs still present in {col} before scaling and no stored median. Filling with 0.")
                X_data[col] = X_data[col].fillna(0)

        X_data_scaled_np = fitted_components['scaler'].transform(X_data[feature_columns_for_scaling])
        print(f"    Applied StandardScaler for features: {feature_columns_for_scaling}")
    elif not feature_columns_for_scaling and not X_data.empty: # No scaling was fitted
        X_data_scaled_np = X_data.values
    elif X_data.empty:
        X_data_scaled_np = np.array([])
    else: # Feature list recorded but no scaler available: return the unscaled feature columns
        X_data_scaled_np = X_data[feature_columns_for_scaling].values

    print("  Pandas data transformation complete.")
    # y_data_np is already None when no label column was available.
    return X_data_scaled_np, y_data_np


def save_processed_data_and_components(X_np, y_np, file_path, components_dict, components_path):
    """Saves processed NumPy arrays and, optionally, fitted components using joblib.

    Args:
        X_np: Processed feature array, stored under key 'X'.
        y_np: Label array (may be None), stored under key 'y'.
        file_path: Destination of the data payload.
        components_dict: Fitted components to persist, or None to skip this step.
        components_path: Destination of the components file; a falsy value skips this step.

    Raises:
        Re-raises any error from writing the data payload. A failure while writing the
        components is only printed (best-effort).
    """
    print(f"  Saving processed data to: {file_path}")
    try:
        payload = {'X': X_np, 'y': y_np}
        joblib.dump(payload, file_path)
        print(f"    Data saved successfully.")
    except Exception as e:
        print(f"    ERROR saving data to {file_path}: {e}")
        raise

    # Test against None instead of truthiness: an empty components dict is a valid result
    # and must still be written, otherwise the file the caller expects at `components_path`
    # is missing or left over from an earlier run.
    if components_dict is not None and components_path:
        print(f"  Saving preprocessor components to: {components_path}")
        try:
            joblib.dump(components_dict, components_path)
            print(f"    Preprocessor components saved successfully.")
        except Exception as e:
            # Deliberately not re-raised: the data payload has already been written.
            # NOTE(review): after a failure here the file at `components_path` may be absent
            # or stale -- callers that go on to read or log it should verify it first.
            print(f"    ERROR saving components to {components_path}: {e}")
            
# Marker printed once this cell has executed, i.e. the helper functions above are defined.
print("--- Pandas Preprocessing Utility Functions Defined ---")

In [None]:
# Cell 4: Main Preprocessing Orchestration (Pandas)
# Sequence: select the MLflow experiment -> load raw train/test frames -> fit the preprocessor
# on train inside an MLflow run (saving arrays + components) -> transform and save test.

print("--- Starting Pandas Preprocessing Orchestration ---")

# --- 0. Set MLflow Experiment for Preprocessing ---
# NOTE(review): `get_or_create_experiment` is not defined in the cells shown here; it is
# presumably provided by an earlier script/cell -- confirm it is available on a fresh kernel.
# Any failure in this block (including the explicit `raise` below) is caught by the `except`,
# which only prints; the notebook then continues without this experiment having been set.
try:
    # Make sure spark session from Init cell is used if get_or_create_experiment needs it
    preprocessing_mlflow_experiment_id = get_or_create_experiment(PREPROCESSING_EXPERIMENT_PATH, spark)
    if preprocessing_mlflow_experiment_id:
        # Setting experiment for the whole notebook session for this stage
        mlflow.set_experiment(experiment_id=preprocessing_mlflow_experiment_id)
        print(f"MLflow experiment '{PREPROCESSING_EXPERIMENT_PATH}' for preprocessing is set with ID: {preprocessing_mlflow_experiment_id}")
    else:
        raise Exception("Preprocessing MLflow experiment could not be set. Halting.")
except Exception as e:
    print(f"CRITICAL: Could not initialize MLflow experiment for preprocessing. Error: {e}")
    # Consider dbutils.notebook.exit("MLflow experiment setup failed")

# --- 1. Load Raw Data into Pandas DataFrames ---
# Paths are from Init Cell. Ensure these Parquet files exist in UC Volumes.
# A load failure propagates (the loader re-raises), stopping the cell here.
print(f"\nLoading RAW training data for Pandas preprocessing...")
raw_train_pdf = load_raw_data_to_pandas(RAW_TRAIN_DATA_PATH)
print(f"RAW training data features: {raw_train_pdf.columns.tolist()}")
raw_train_pdf.info()

print(f"\nLoading RAW test data for Pandas preprocessing...")
raw_test_pdf = load_raw_data_to_pandas(RAW_TEST_DATA_PATH)

# --- 2. Fit Preprocessor on Training Data ---
# This MLflow run will log the parameters and the fitted components artifact
# `fitted_components_dict` starts as None so step 3 can tell whether fitting produced anything.
fitted_components_dict = None
with mlflow.start_run(run_name="Pandas_Preprocessor_Fit") as preproc_run:
    print(f"\nFitting preprocessor. MLflow Run ID: {preproc_run.info.run_id}")
    mlflow.log_param("label_column", YOUR_LABEL_COLUMN_NAME)
    mlflow.log_param("categorical_features_raw", ", ".join(CATEGORICAL_COLUMNS_RAW))
    mlflow.log_param("numerical_features_raw", ", ".join(NUMERICAL_COLUMNS_RAW))
    mlflow.log_param("target_encoding_smoothing", TARGET_ENCODING_SMOOTHING)
    mlflow.log_param("global_seed", GLOBAL_SEED)
    mlflow.set_tag("preprocessing_type", "pandas_mvp")

    try:
        X_train_processed_np, y_train_processed_np, fitted_components_dict = fit_pandas_preprocessor(
            raw_train_pdf,
            CATEGORICAL_COLUMNS_RAW,
            NUMERICAL_COLUMNS_RAW,
            YOUR_LABEL_COLUMN_NAME,
            TARGET_ENCODING_SMOOTHING,
            GLOBAL_SEED
        )
        print(f"  Processed X_train_np shape: {X_train_processed_np.shape}, y_train_np shape: {y_train_processed_np.shape}")

        # Save processed training data and the fitted components
        save_processed_data_and_components(
            X_train_processed_np, y_train_processed_np, 
            SHARED_PROCESSED_TRAIN_PATH, # Path from Init Cell (DBFS path for joblib)
            fitted_components_dict, DBFS_PREPROCESSOR_COMPONENTS_PATH # Path from Init Cell
        )
        
        # Log components as artifact in MLflow
        # The components hold fitted estimator objects, so the joblib file written above is
        # logged as an artifact instead of trying to serialize the dict as JSON.
        # NOTE(review): this reads the file from disk; if the components save above did not
        # write it, this call fails or logs an older file -- confirm the save succeeded.
        mlflow.log_artifact(DBFS_PREPROCESSOR_COMPONENTS_PATH, artifact_path=MLFLOW_PANDAS_PREPROCESSOR_ARTIFACT_PATH)
        print(f"  Logged fitted preprocessor components to MLflow artifact path: {MLFLOW_PANDAS_PREPROCESSOR_ARTIFACT_PATH}")
        mlflow.set_tag("status_fit", "success")

    except Exception as e:
        print(f"  ERROR during preprocessor fitting: {e}")
        # Only the first 250 characters of the error text are logged as a parameter.
        mlflow.log_param("error_fit", str(e)[:250])
        mlflow.set_tag("status_fit", "failed")
        raise # Re-raise to stop execution if fitting fails

# --- 3. Transform Test Data using Fitted Preprocessor ---
# This step runs after the `with` block above has ended, i.e. outside the fitting run.
# The truthiness check is False for None and also for an empty components dict.
if fitted_components_dict: # Proceed only if fitting was successful
    print("\nTransforming TEST data using fitted preprocessor...")
    try:
        X_test_processed_np, y_test_processed_np = transform_pandas_preprocessor(
            raw_test_pdf,
            fitted_components_dict,
            CATEGORICAL_COLUMNS_RAW,
            NUMERICAL_COLUMNS_RAW,
            YOUR_LABEL_COLUMN_NAME, # Pass label col to get y_test_processed_np
            is_train_data=False
        )
        print(f"  Processed X_test_np shape: {X_test_processed_np.shape}, y_test_np shape: {y_test_processed_np.shape if y_test_processed_np is not None else 'N/A'}")

        # Save processed test data
        # The y_test_processed_np is needed by HPO objective function for evaluation
        save_processed_data_and_components(
            X_test_processed_np, y_test_processed_np,
            SHARED_PROCESSED_TEST_PATH, # Path from Init Cell (DBFS path for joblib)
            None, None # Don't re-save components here
        )
        print("  Test data transformed and saved.")
    except Exception as e:
        print(f"  ERROR during test data transformation: {e}")
        # Treated as fatal: the error is re-raised after being printed.
        raise
else:
    print("CRITICAL: Preprocessor fitting failed or did not produce components. Cannot transform test data.")


# Final summary of where the outputs are expected; these lines print regardless of which
# branch of step 3 ran.
print("\n--- Pandas Preprocessing Orchestration Completed ---")
print(f"Processed training data should be at: {SHARED_PROCESSED_TRAIN_PATH}")
print(f"Processed test data should be at: {SHARED_PROCESSED_TEST_PATH}")
print(f"Fitted preprocessor components should be at: {DBFS_PREPROCESSOR_COMPONENTS_PATH}")
print("You can now proceed to the HPO and Model Training script/cells.")