In [None]:
# <-------------------- CELL 1: IMPORTS -------------------->
print("Cell 1: Imports - Executing...")
import mlflow
import mlflow.pyfunc # For saving custom Python models

import pandas as pd
import numpy as np
import os
import joblib # For saving the pyfunc model's components IF NOT relying solely on mlflow pickling the instance
import time
from datetime import datetime

from sklearn.model_selection import train_test_split # Used for month-wise splits
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import category_encoders as ce # For TargetEncoder

from pyspark.sql import SparkSession # Still useful for environment context

# Quieten the warning categories that clutter notebook output
import warnings
for _noisy_category in (FutureWarning, UserWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)
del _noisy_category  # do not leave the loop variable behind in the notebook namespace
warnings.filterwarnings('ignore', message="Previously subsetted data...") # Emitted by category_encoders

# Reuse the notebook-provided `spark` handle when there is one; otherwise build a session
if 'spark' in locals():
    print("SparkSession already exists.")
else:
    spark = SparkSession.builder.appName("Pandas_FE_Preprocessing_Full_MVP").getOrCreate()
    print("SparkSession created.")

print("Imports successful for Pandas Preprocessing Pipeline with Feature Engineering.")
print("-" * 50)

# <-------------------- CELL 2: INIT CELL - GLOBAL CONFIGURATIONS FOR PREPROCESSING -------------------->
print("\nCell 2: Global Configurations for Preprocessing - Executing...")

# --- MLflow Configuration ---
# !!! IMPORTANT: SET YOUR MLFLOW EXPERIMENT PATH !!!
PREPROCESSING_EXPERIMENT_PATH = "/Users/your_username@example.com/MVP_Classification_FE_Preprocessing_Full" # CHANGE THIS

# --- Data Paths (Unity Catalog Volumes) ---
# !!! IMPORTANT: SET YOUR UNITY CATALOG VOLUME BASE PATH !!!
# The value must end with "/" because the paths below are built by plain string concatenation.
UC_BASE_DATA_PATH = "/Volumes/delfos/" # As per your input

# --- Path to the FULL RAW input dataset ---
# !!! IMPORTANT: UPDATE THIS TO YOUR ACTUAL FULL RAW DATASET PATH IN UC VOLUMES !!!
FULL_RAW_DATA_PARQUET_PATH = f"{UC_BASE_DATA_PATH}raw_data_full/full_dataset_generic.parquet" # Example

# --- Date Column for Stratified Splitting ---
# !!! IMPORTANT: SET THE NAME OF YOUR DATE/TIMESTAMP COLUMN IN THE RAW DATA !!!
DATE_COLUMN_FOR_SPLIT = "date_col" # Example: 'order_date', 'policy_start_date'

# --- Output Paths for INTERMEDIATE RAW SPLIT Data (Optional, but good for traceability) ---
# Using /dbfs/ prefix for direct Pandas/os operations on UC Volumes
# NOTE(review): the input path above uses "/Volumes/..." directly while the output paths are
# prefixed with "/dbfs"; the later revision of this notebook drops the "/dbfs" prefix for
# UC Volumes entirely. Confirm that "/dbfs/Volumes/..." is actually reachable on your cluster.
DBFS_RAW_SPLITS_DIR = f"/dbfs{UC_BASE_DATA_PATH}raw_splits_v1_fe/" # Unique name
RAW_TRAIN_SPLIT_PATH = os.path.join(DBFS_RAW_SPLITS_DIR, "raw_train_split.parquet")
RAW_TEST_SPLIT_PATH = os.path.join(DBFS_RAW_SPLITS_DIR, "raw_test_split.parquet")

# --- Output Paths for FINAL PROCESSED Data (Parquet with Named Columns) ---
PROCESSED_DATA_VERSION_FE = "v1_pandas_fe_final" # Versioning for processed data
DBFS_PROCESSED_DATA_DIR_BASE_FE = f"/dbfs{UC_BASE_DATA_PATH}processed_data/"
PROCESSED_DATA_DIR_VERSIONED_FE = os.path.join(DBFS_PROCESSED_DATA_DIR_BASE_FE, PROCESSED_DATA_VERSION_FE)

# These paths will point to the Parquet files with named columns
SHARED_PROCESSED_TRAIN_PATH = os.path.join(PROCESSED_DATA_DIR_VERSIONED_FE, "train_processed_named_cols.parquet")
SHARED_PROCESSED_TEST_PATH = os.path.join(PROCESSED_DATA_DIR_VERSIONED_FE, "test_processed_named_cols.parquet")

# --- MLflow artifact path for the saved pyfunc preprocessor model ---
MLFLOW_PYFUNC_PREPROCESSOR_ARTIFACT_PATH = "pandas_full_preprocessor"

# !!! IMPORTANT: SET YOUR ACTUAL BINARY TARGET LABEL COLUMN NAME (must exist in raw data) !!!
YOUR_TARGET_COLUMN_NAME = "target_binary" # Example: 0 or 1

# --- Define your categorical and numerical columns from the RAW data ---
# These are columns read from FULL_RAW_DATA_PARQUET_PATH *before* any FE in the PyFunc model.
# The PyFunc model will then create new features based on these (e.g. from date_col, age_col, premium_col).
# !!! IMPORTANT: UPDATE THESE LISTS BASED ON YOUR ACTUAL RAW DATASET !!!
CATEGORICAL_COLUMNS_RAW = ["cat_feat_1", "cat_feat_2", "region_col"] # Example
NUMERICAL_COLUMNS_RAW = ["num_feat_1", "age_col", "premium_col", "interactions_col"] # Example

# --- Preprocessing Configuration ---
TEST_SET_SPLIT_RATIO = 0.20 # Fraction of each month's rows sent to the test split
TARGET_ENCODING_SMOOTHING = 20.0 # For category_encoders.TargetEncoder
CATEGORICAL_IMPUTE_CONSTANT = "__MISSING_CAT__" # For SimpleImputer
NUMERICAL_IMPUTE_STRATEGY = "median" # For SimpleImputer

# --- Reproducibility ---
GLOBAL_SEED = 117

# --- Feature Engineering Configuration (Example) ---
# These original column names are used as basis for FE. They must be in NUMERICAL_COLUMNS_RAW or DATE_COLUMN_FOR_SPLIT.
AGE_COL_FOR_FE = "age_col" # Must be in NUMERICAL_COLUMNS_RAW
PREMIUM_COL_FOR_FE = "premium_col" # Must be in NUMERICAL_COLUMNS_RAW
INTERACTIONS_COL_FOR_FE = "interactions_col" # Example, must be in NUMERICAL_COLUMNS_RAW

# For binning age (example). There must be exactly one label per interval (len(bins) - 1).
# NOTE(review): the preprocessor's pd.cut call uses right=False, so the intervals are
# [0, 18), [18, 30), ... while the labels read as right-inclusive ranges - confirm the intent.
AGE_BINS = [0, 18, 30, 45, 60, 120]
AGE_BIN_LABELS = ['0-18', '19-30', '31-45', '46-60', '60+']

# Create both output directories up front (os.makedirs goes through the /dbfs/ prefix);
# a failure is reported as a warning and does not stop the notebook.
try:
    os.makedirs(DBFS_RAW_SPLITS_DIR, exist_ok=True)
    os.makedirs(PROCESSED_DATA_DIR_VERSIONED_FE, exist_ok=True)
    print(
        f"Checked/created raw splits directory: {DBFS_RAW_SPLITS_DIR}",
        f"Checked/created processed data directory: {PROCESSED_DATA_DIR_VERSIONED_FE}",
        sep="\n",
    )
except Exception as dir_err:
    print(f"Warning: Could not create directory. Ensure UC Volume '{UC_BASE_DATA_PATH}' exists and you have write permissions. Error: {dir_err}")

# Echo the effective configuration, one setting per line
print("\n".join([
    "--- Preprocessing Global Configurations (with FE) Initialized ---",
    f"MLflow Experiment Path for Preprocessing: {PREPROCESSING_EXPERIMENT_PATH}",
    f"Full Raw Data Input Path: {FULL_RAW_DATA_PARQUET_PATH}",
    f"Date Column for Split: {DATE_COLUMN_FOR_SPLIT}",
    f"Target Column: {YOUR_TARGET_COLUMN_NAME}",
    f"  Output Processed Train Data Path (Parquet Named Cols): {SHARED_PROCESSED_TRAIN_PATH}",
    f"  Output Processed Test Data Path (Parquet Named Cols): {SHARED_PROCESSED_TEST_PATH}",
    f"  MLflow Pyfunc Preprocessor Artifact Path: {MLFLOW_PYFUNC_PREPROCESSOR_ARTIFACT_PATH}",
    f"Categorical Columns (Raw): {CATEGORICAL_COLUMNS_RAW}",
    f"Numerical Columns (Raw): {NUMERICAL_COLUMNS_RAW}",
    f"Global Seed: {GLOBAL_SEED}",
    "-" * 50,
]))


# <-------------------- CELL 3: PREPROCESSING LOGIC & MLFLOW PYFUNC MODEL CLASS -------------------->
print("\nCell 3: Preprocessing Logic & Pyfunc Model Class - Defining (with Custom Initial Formatting & Feature Engineering)...")

# --- MLflow Utility ---
def get_or_create_experiment(experiment_name_param, spark_session_param=None): # spark_session_param optional
    """Look up an MLflow experiment by name, creating it if it does not exist.

    Args:
        experiment_name_param: Name/path of the MLflow experiment.
        spark_session_param: Accepted for call-site compatibility; not used here.

    Returns:
        The experiment ID, or None when the lookup or creation raised
        (the error is printed rather than propagated).
    """
    try:
        existing = mlflow.get_experiment_by_name(experiment_name_param)
        if not existing:
            print(f"MLflow experiment '{experiment_name_param}' not found. Attempting to create.")
            created_id = mlflow.create_experiment(name=experiment_name_param)
            print(f"MLflow experiment '{experiment_name_param}' created with ID: {created_id}")
            return created_id

        print(f"MLflow experiment '{experiment_name_param}' found with ID: {existing.experiment_id}")
        return existing.experiment_id
    except Exception as lookup_err:
        # Best-effort: report the problem and let the caller deal with a missing ID.
        print(f"Error in get_or_create_experiment for '{experiment_name_param}'. Error: {lookup_err}")
        return None

# --- Data Splitting Function ---
def split_by_month_and_stratify(full_pdf: pd.DataFrame, 
                                date_col: str, 
                                target_col: str, 
                                test_size: float, 
                                random_state: int) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split a DataFrame into train/test sets month by month, stratified on the target.

    Rows are grouped by the calendar month of ``date_col`` and each group is split
    with ``train_test_split``. Stratification on ``target_col`` is used when every
    class in the group has at least two rows; otherwise (or when scikit-learn
    rejects the stratified split) a plain random split is used. Groups with a
    single row go to train.

    Rows whose date is missing (NaT) are kept and handled as their own group
    labelled ``"NaT"``; previously they were silently dropped because ``groupby``
    excludes NA keys by default.

    Args:
        full_pdf: Input data; not modified.
        date_col: Column convertible with ``pd.to_datetime``.
        target_col: Label column used for stratification.
        test_size: Test fraction passed to ``train_test_split``.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        ``(train_df, test_df)``. ``date_col`` is datetime-typed in both outputs.

    Raises:
        ValueError: If a required column is missing or the date column cannot be
            converted to datetimes.
    """
    print(f"  Starting stratified split by month from column '{date_col}' and target '{target_col}'...")
    if date_col not in full_pdf.columns:
        raise ValueError(f"Date column '{date_col}' not found in DataFrame.")
    if target_col not in full_pdf.columns:
        raise ValueError(f"Target column '{target_col}' not found in DataFrame.")

    try:
        temp_df = full_pdf.copy()
        temp_df[date_col] = pd.to_datetime(temp_df[date_col])
        # String keys ("YYYY-MM") sort chronologically and turn NaT into the literal
        # "NaT", so undated rows form a regular group instead of being dropped.
        temp_df['year_month_group'] = temp_df[date_col].dt.to_period('M').astype(str)
    except Exception as e:
        raise ValueError(f"Could not convert column '{date_col}' to datetime or extract year_month. Error: {e}") from e

    train_dfs_list = []
    test_dfs_list = []

    # For train_test_split to be able to place each class on both sides of the split
    min_samples_per_class_needed = 2

    for month_period, group_data in temp_df.groupby('year_month_group'):
        print(f"    Splitting for month-year: {month_period}, group size: {len(group_data)}")
        if len(group_data) < 2:
            # groupby never yields empty groups, so this is always a single row.
            print(f"      Group for {month_period} is too small ({len(group_data)}). Assigning to train.")
            train_dfs_list.append(group_data)
            continue

        target_counts = group_data[target_col].value_counts()

        if len(target_counts) < 2 or target_counts.min() < min_samples_per_class_needed:
            print(f"      Not enough class diversity or samples in {month_period} for stratification (counts: {target_counts.to_dict()}). Performing random split.")
            month_train_df, month_test_df = train_test_split(
                group_data, test_size=test_size, random_state=random_state, shuffle=True
            )
        else:
            try:
                month_train_df, month_test_df = train_test_split(
                    group_data, test_size=test_size, random_state=random_state, 
                    stratify=group_data[target_col], shuffle=True
                )
            except ValueError as ve: # e.g. test split smaller than the number of classes
                print(f"      Stratification failed for {month_period} (Error: {ve}). Performing random split.")
                month_train_df, month_test_df = train_test_split(
                    group_data, test_size=test_size, random_state=random_state, shuffle=True
                )
        train_dfs_list.append(month_train_df)
        if not month_test_df.empty: # Only append if test_df is not empty
            test_dfs_list.append(month_test_df)

    # Fall back to an empty frame with the input's columns when a side received nothing.
    final_train_df = pd.concat(train_dfs_list).drop(columns=['year_month_group']) if train_dfs_list else pd.DataFrame(columns=full_pdf.columns)
    final_test_df = pd.concat(test_dfs_list).drop(columns=['year_month_group']) if test_dfs_list else pd.DataFrame(columns=full_pdf.columns)

    print(f"  Splitting complete. Train shape: {final_train_df.shape}, Test shape: {final_test_df.shape}")
    if not final_train_df.empty: print(f"  Train target distribution:\n{final_train_df[target_col].value_counts(normalize=True, dropna=False)}")
    if not final_test_df.empty: print(f"  Test target distribution:\n{final_test_df[target_col].value_counts(normalize=True, dropna=False)}")
    return final_train_df, final_test_df


# --- Pandas Preprocessor as an mlflow.pyfunc.PythonModel (with Feature Engineering) ---
class PandasFeatureEngineeringPreprocessor(mlflow.pyfunc.PythonModel):
    
    def __init__(self, 
                 raw_categorical_cols, raw_numerical_cols, 
                 date_col_for_fe, age_col_for_fe, premium_col_for_fe, interactions_col_for_fe,
                 age_bins_for_fe, age_bin_labels_for_fe,
                 label_col_for_te_fitting, 
                 te_smoothing_factor, 
                 cat_impute_constant, num_impute_strategy,
                 global_seed):
        """Store the preprocessing configuration; nothing is fitted here.

        The two raw column lists are copied; every other argument is stored as
        given. The fitted-state containers start out empty.
        """
        # Raw column configuration (copied so later edits to the caller's lists do not leak in)
        self.raw_categorical_cols_config = list(raw_categorical_cols)
        self.raw_numerical_cols_config = list(raw_numerical_cols)

        # Source columns and binning settings used by feature engineering
        self.date_col_for_fe = date_col_for_fe
        self.age_col_for_fe = age_col_for_fe
        self.premium_col_for_fe = premium_col_for_fe
        self.interactions_col_for_fe = interactions_col_for_fe
        self.age_bins_for_fe = age_bins_for_fe
        self.age_bin_labels_for_fe = age_bin_labels_for_fe

        # Target-encoding settings; the label column is excluded from feature lists
        self.label_col_for_te_fitting = label_col_for_te_fitting
        self.te_smoothing_factor = te_smoothing_factor

        # Imputation settings and seed
        self.cat_impute_constant = cat_impute_constant
        self.num_impute_strategy = num_impute_strategy
        self.global_seed = global_seed

        # State filled in during fitting
        self.fitted_components = {}               # imputers, encoders, scalers
        self.feature_engineering_details = {}     # names of engineered features
        self.final_feature_columns_in_order = []  # output columns and their order

    def _get_valid_cols(self, df, col_list):
        """Return the entries of ``col_list`` that are columns of ``df``, minus the label column.

        Order and duplicates of ``col_list`` are preserved.
        """
        present_cols = []
        for candidate in col_list:
            if candidate == self.label_col_for_te_fitting:
                continue
            if candidate in df.columns:
                present_cols.append(candidate)
        return present_cols

    def _apply_initial_custom_formatting(self, df_input: pd.DataFrame) -> pd.DataFrame:
        """Hook for project-specific clean-up of the raw frame.

        As shipped this is a placeholder: it returns an unmodified copy of
        ``df_input`` and only prints progress messages.
        """
        formatted_df = df_input.copy()
        print("    Custom Preprocessing Step 1: Applying initial custom data formatting...")
        # !!! PUT YOUR ACTUAL CUSTOM FORMATTING LOGIC HERE !!!
        # Input: the raw DataFrame (without the label if it was separated earlier).
        # Output: the formatted DataFrame. The columns named in
        # raw_categorical_cols_config and raw_numerical_cols_config must exist
        # after this step (either already present or created here).
        #
        # Example (adapt to your real column names):
        # if 'some_text_column' in formatted_df.columns:
        #     formatted_df['some_text_column'] = formatted_df['some_text_column'].astype(str).str.lower().str.strip()
        #     print("      Example custom formatting: 'some_text_column' processed.")
        # else:
        #     print("      Example custom formatting: 'some_text_column' not found.")
        print("    Custom Preprocessing Step 1: Initial custom data formatting complete.")
        return formatted_df

    def _engineer_features(self, df_input: pd.DataFrame, is_fitting_phase: bool) -> pd.DataFrame:
        """Add date, numeric-transform and age-bin features to a copy of ``df_input``.

        Created columns (each only when its source column is configured and present):
          * ``fe_month``, ``fe_day_of_week``, ``fe_is_weekend`` - string-valued,
            derived from ``date_col_for_fe``; unparseable dates give ``'-1'`` for
            month/day-of-week and ``'0'`` for is-weekend.
          * ``fe_premium_*``, ``fe_age_*``, ``fe_interactions_*`` with suffixes
            ``_log1p`` and ``_sq``; the source column is coerced to numeric in place
            and missing values are treated as 0 for these transforms.
          * ``fe_<age_col>_binned`` - string label from ``pd.cut``; ages that are
            missing or fall outside the bins get ``cat_impute_constant``.

        Args:
            df_input: Frame to enrich; not modified.
            is_fitting_phase: When True, the names of the created columns are
                recorded in ``self.feature_engineering_details``.

        Returns:
            The enriched copy.
        """
        df = df_input.copy()
        print("    Feature Engineering Step 2: Creating new features...")
        
        engineered_categoricals_temp = []
        engineered_numericals_temp = []

        # Date/Time Features
        if self.date_col_for_fe and self.date_col_for_fe in df.columns:
            try:
                s_date = pd.to_datetime(df[self.date_col_for_fe], errors='coerce')
                if not s_date.isnull().all(): # Proceed if some dates are valid
                    df['fe_month'] = s_date.dt.month.fillna(-1).astype(int).astype(str) # Impute NaT with -1 then to str
                    df['fe_day_of_week'] = s_date.dt.dayofweek.fillna(-1).astype(int).astype(str)
                    df['fe_is_weekend'] = s_date.dt.dayofweek.isin([5,6]).astype(int).astype(str)
                    engineered_categoricals_temp.extend(['fe_month', 'fe_day_of_week', 'fe_is_weekend'])
                    print(f"      FE: Created date features: month, day_of_week, is_weekend from {self.date_col_for_fe}")
            except Exception as e_date: print(f"      Warning: Could not create date features from {self.date_col_for_fe}. Error: {e_date}")
        
        # Numerical Transformations
        for col_name, new_col_prefix in [
            (self.premium_col_for_fe, "fe_premium"), 
            (self.age_col_for_fe, "fe_age"), 
            (self.interactions_col_for_fe, "fe_interactions") # Example
        ]:
            if col_name and col_name in df.columns:
                df[col_name] = pd.to_numeric(df[col_name], errors='coerce') # Ensure numeric
                # Negative values are clipped to 0 so log1p stays defined
                df[f'{new_col_prefix}_log1p'] = np.log1p(df[col_name].fillna(0).clip(lower=0))
                df[f'{new_col_prefix}_sq'] = df[col_name].fillna(0)**2
                engineered_numericals_temp.extend([f'{new_col_prefix}_log1p', f'{new_col_prefix}_sq'])
                print(f"      FE: Created log1p and squared features for {col_name}")

        # Binning Age
        if self.age_col_for_fe and self.age_col_for_fe in df.columns and self.age_bins_for_fe and self.age_bin_labels_for_fe:
            age_col_binned_name = f'fe_{self.age_col_for_fe}_binned'
            # Ensure age column is numeric before binning
            df[self.age_col_for_fe] = pd.to_numeric(df[self.age_col_for_fe], errors='coerce')
            # NOTE(review): right=False makes the intervals left-closed ([0, 18), [18, 30), ...);
            # confirm this matches the meaning of the configured labels.
            age_binned = pd.cut(df[self.age_col_for_fe], 
                                bins=self.age_bins_for_fe, 
                                labels=self.age_bin_labels_for_fe, 
                                right=False, include_lowest=True)
            # Fill BEFORE converting to str: astype(str) would turn NaN into the literal
            # 'nan', after which fillna has nothing left to replace.
            df[age_col_binned_name] = age_binned.astype(object).fillna(self.cat_impute_constant).astype(str)
            engineered_categoricals_temp.append(age_col_binned_name)
            print(f"      FE: Created binned age feature: {age_col_binned_name}")

        if is_fitting_phase:
            # dict.fromkeys de-duplicates while keeping creation order, so the recorded
            # feature order is the same on every run (set ordering depends on hash seeding).
            self.feature_engineering_details['engineered_categorical_cols'] = list(dict.fromkeys(engineered_categoricals_temp))
            self.feature_engineering_details['engineered_numerical_cols'] = list(dict.fromkeys(engineered_numericals_temp))
        return df

    def _create_interaction_features_post_te(self, df_input: pd.DataFrame, is_fitting_phase: bool) -> pd.DataFrame:
        """Add premium x encoded-category interaction columns to a copy of ``df_input``.

        The premium operand is the log1p premium feature when present, otherwise the
        raw premium column. ``_engineer_features`` names the log feature
        ``fe_premium_log1p``; the previously looked-up name ``fe_log_<premium_col>``
        never matched it and is kept only as a secondary candidate.

        Interactions are created with the binned-age column and with
        ``customer_segment`` (when configured as a raw categorical), both of which
        are expected to be numeric already at this point.

        Args:
            df_input: Frame to enrich; not modified.
            is_fitting_phase: When True, the created column names are recorded in
                ``self.feature_engineering_details['engineered_interaction_cols']``.

        Returns:
            The enriched copy (unchanged copy when no premium column is available).
        """
        df = df_input.copy()
        print("    Feature Engineering Step 4: Creating interaction features (post-target encoding)...")
        newly_created_interactions_temp = []

        # Use log premium if available, else original premium (ensure it exists)
        premium_col_for_interact = self.premium_col_for_fe
        for log_candidate in ('fe_premium_log1p', f'fe_log_{self.premium_col_for_fe}'):
            if log_candidate in df.columns:
                premium_col_for_interact = log_candidate
                break
        if premium_col_for_interact not in df.columns:
            print(f"      FE Interaction: Base premium column '{premium_col_for_interact}' for interactions not found. Skipping.")
            if is_fitting_phase: self.feature_engineering_details['engineered_interaction_cols'] = []
            return df
        
        # Ensure premium column for interaction is numeric
        df[premium_col_for_interact] = pd.to_numeric(df[premium_col_for_interact], errors='coerce').fillna(0)

        # Example: Interaction with target-encoded binned age
        # The original binned age (e.g., 'fe_age_col_binned') was target encoded.
        # Its name remains the same after TE by category_encoders.TargetEncoder.
        age_binned_col_name = f'fe_{self.age_col_for_fe}_binned'
        if age_binned_col_name in df.columns: # This column is now numeric (target-encoded)
            interaction_col_name = f'fe_inter_{premium_col_for_interact}_x_{age_binned_col_name}'
            df[interaction_col_name] = df[premium_col_for_interact] * df[age_binned_col_name]
            newly_created_interactions_temp.append(interaction_col_name)
            print(f"      FE Interaction: Created {interaction_col_name}")
        
        # Example: Interaction with target-encoded raw categorical 'customer_segment'
        # (Assuming 'customer_segment' is in self.raw_categorical_cols_config)
        customer_segment_col_name = "customer_segment" # Example from raw_categorical_cols_config
        if customer_segment_col_name in self.raw_categorical_cols_config and customer_segment_col_name in df.columns:
            interaction_col_name_seg = f'fe_inter_{premium_col_for_interact}_x_{customer_segment_col_name}'
            df[interaction_col_name_seg] = df[premium_col_for_interact] * df[customer_segment_col_name] # Assumes customer_segment is now numeric post-TE
            newly_created_interactions_temp.append(interaction_col_name_seg)
            print(f"      FE Interaction: Created {interaction_col_name_seg}")

        if is_fitting_phase:
            # Order-preserving de-duplication keeps the recorded feature order reproducible.
            self.feature_engineering_details['engineered_interaction_cols'] = list(dict.fromkeys(newly_created_interactions_temp))
        return df


    def fit(self, train_pdf: pd.DataFrame):
        print("    Fitting PandasFeatureEngineeringPreprocessor...")
        if self.label_col_for_te_fitting not in train_pdf.columns:
            raise ValueError(f"Label column '{self.label_col_for_te_fitting}' for Target Encoder fitting not found in training DataFrame.")
        
        # --- Step 0: Initial Custom Formatting ---
        X_fit_custom_formatted = self._apply_initial_custom_formatting(
            train_pdf.drop(columns=[self.label_col_for_te_fitting], errors='ignore')
        )
        y_fit_series = train_pdf[self.label_col_for_te_fitting].astype(float).copy() # Used by TargetEncoder

        # --- Step 1: Create Base Engineered Features ---
        X_fit_fe_engineered = self._engineer_features(X_fit_custom_formatted, is_fitting_phase=True)
        
        # --- Define columns for imputation based on raw and engineered ---
        # Valid columns present after initial FE
        current_cols_in_X = X_fit_fe_engineered.columns.tolist()
        active_numerical_cols = self._get_valid_cols(X_fit_fe_engineered, self.raw_numerical_cols_config + self.feature_engineering_details.get('engineered_numerical_cols', []))
        active_categorical_cols = self._get_valid_cols(X_fit_fe_engineered, self.raw_categorical_cols_config + self.feature_engineering_details.get('engineered_categorical_cols', []))
        
        self.fitted_components['active_numerical_cols_for_impute'] = active_numerical_cols
        self.fitted_components['active_categorical_cols_for_impute'] = active_categorical_cols

        # --- Step 2: Impute Numerical Features ---
        if active_numerical_cols:
            num_imputer = SimpleImputer(strategy=self.num_impute_strategy)
            X_fit_fe_engineered[active_numerical_cols] = num_imputer.fit_transform(X_fit_fe_engineered[active_numerical_cols])
            self.fitted_components['numerical_imputer'] = num_imputer
            print(f"      Fitted Numerical Imputer for: {active_numerical_cols}")

        # --- Step 3: Impute Categorical Features ---
        if active_categorical_cols:
            cat_imputer = SimpleImputer(strategy="constant", fill_value=self.cat_impute_constant)
            X_fit_fe_engineered[active_categorical_cols] = cat_imputer.fit_transform(X_fit_fe_engineered[active_categorical_cols])
            self.fitted_components['categorical_imputer'] = cat_imputer
            print(f"      Fitted Categorical Imputer for: {active_categorical_cols}")
            
        # --- Step 4: Target Encoding ---
        # Target encode all active categorical columns (original raw + newly engineered categoricals)
        cols_for_te_fit = list(active_categorical_cols) # Use the imputed ones
        target_encoded_output_names = [] # Will be same as input names for category_encoders.TargetEncoder
        if cols_for_te_fit:
            for col in cols_

In [None]:
# <-------------------- CELL 1: IMPORTS -------------------->
print("Cell 1: Imports - Executing...")
import mlflow
import mlflow.pyfunc

import pandas as pd
import numpy as np
import os
import joblib
import time
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import category_encoders as ce

from pyspark.sql import SparkSession

import warnings
# Quieten the warning categories that clutter notebook output
for _noisy_category in (FutureWarning, UserWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)
del _noisy_category  # do not leave the loop variable behind in the notebook namespace
warnings.filterwarnings('ignore', message="Previously subsetted data...")

# Reuse the notebook-provided `spark` handle when there is one; otherwise build a session
if 'spark' in locals():
    print("SparkSession already exists.")
else:
    spark = SparkSession.builder.appName("Pandas_FE_Preprocessing_UC_Volumes").getOrCreate()
    print("SparkSession created.")

print("Imports successful for Pandas Preprocessing Pipeline with Feature Engineering (UC Volumes).")
print("-" * 50)

# <-------------------- CELL 2: INIT CELL - GLOBAL CONFIGURATIONS FOR PREPROCESSING -------------------->
print("\nCell 2: Global Configurations for Preprocessing - Executing...")

# --- MLflow Configuration ---
PREPROCESSING_EXPERIMENT_PATH = "/Users/your_username@example.com/MVP_Classification_FE_Preprocessing_UCV" # !!! CHANGE THIS !!!

# --- Data Paths (Unity Catalog Volumes) ---
# All paths are direct /Volumes/ paths (no /dbfs prefix).
# NOTE(review): the expected shape is /Volumes/<catalog>/<schema>/<volume>/ but the value
# below has a single component after /Volumes - confirm the full volume path.
# The trailing "/" is required: FULL_RAW_DATA_PARQUET_PATH is built by string concatenation.
UC_BASE_DATA_PATH = "/Volumes/delfos/" # !!! From your input: /Volumes/<catalog>/<schema>/<volume>/ !!!

FULL_RAW_DATA_PARQUET_PATH = f"{UC_BASE_DATA_PATH}raw_data_full/full_dataset_generic.parquet" # !!! UPDATE !!!

DATE_COLUMN_FOR_SPLIT = "date_col" # !!! GENERIC NAME - UPDATE !!!
YOUR_TARGET_COLUMN_NAME = "target_binary" # !!! GENERIC NAME - UPDATE (0 or 1) !!!

CATEGORICAL_COLUMNS_RAW = ["cat_feat_1", "cat_feat_2", "region_col"] # !!! UPDATE !!!
NUMERICAL_COLUMNS_RAW = ["num_feat_1", "age_col", "premium_col", "interactions_col"] # !!! UPDATE !!!

# --- Output Paths for INTERMEDIATE RAW SPLIT Data (Optional) ---
RAW_SPLITS_DIR_UCV = os.path.join(UC_BASE_DATA_PATH, "raw_splits_v1_fe_ucv") # Direct UC Volume path
RAW_TRAIN_SPLIT_PATH_UCV = os.path.join(RAW_SPLITS_DIR_UCV, "raw_train_split.parquet")
RAW_TEST_SPLIT_PATH_UCV = os.path.join(RAW_SPLITS_DIR_UCV, "raw_test_split.parquet")

# --- Output Paths for FINAL PROCESSED Data (Parquet with Named Columns) ---
PROCESSED_DATA_VERSION_FE_UCV = "v1_pandas_fe_ucv_parquet_only"
PROCESSED_DATA_DIR_BASE_FE_UCV = os.path.join(UC_BASE_DATA_PATH, "processed_data")
PROCESSED_DATA_DIR_VERSIONED_FE_UCV = os.path.join(PROCESSED_DATA_DIR_BASE_FE_UCV, PROCESSED_DATA_VERSION_FE_UCV)

SHARED_PROCESSED_TRAIN_PATH = os.path.join(PROCESSED_DATA_DIR_VERSIONED_FE_UCV, "train_processed_named_cols.parquet")
SHARED_PROCESSED_TEST_PATH = os.path.join(PROCESSED_DATA_DIR_VERSIONED_FE_UCV, "test_processed_named_cols.parquet")

# --- Persistence of the FITTED PREPROCESSING COMPONENTS ---
# Design note: no separate joblib path is configured here. The PythonModel keeps its
# fitted components as instance attributes, and the working assumption is that
# `mlflow.pyfunc.log_model` pickles the instance together with that state.
# Saving components separately (e.g. joblib.dump to a driver-local or /Volumes/ path
# and loading them from MLflow artifacts) would need a different setup.

MLFLOW_PYFUNC_PREPROCESSOR_ARTIFACT_PATH = "pandas_classification_fe_preprocessor_ucv" # Name in MLflow artifacts

TEST_SET_SPLIT_RATIO = 0.20 # Fraction of each month's rows sent to the test split
TARGET_ENCODING_SMOOTHING = 20.0
CATEGORICAL_IMPUTE_CONSTANT = "__MISSING_CAT__"
NUMERICAL_IMPUTE_STRATEGY = "median"
GLOBAL_SEED = 117

# Source columns for feature engineering and the age-binning scheme
# (one label per interval, i.e. len(AGE_BIN_LABELS) == len(AGE_BINS) - 1).
AGE_COL_FOR_FE = "age_col"
PREMIUM_COL_FOR_FE = "premium_col"
INTERACTIONS_COL_FOR_FE = "interactions_col"
AGE_BINS = [0, 18, 30, 45, 60, 120]
AGE_BIN_LABELS = ['0-18', '19-30', '31-45', '46-60', '60+']

# Create both output directories directly under /Volumes/ with os.makedirs.
# This relies on the Databricks environment permitting plain os operations on /Volumes/;
# a failure is reported as a warning and does not stop the notebook.
try:
    os.makedirs(RAW_SPLITS_DIR_UCV, exist_ok=True)
    os.makedirs(PROCESSED_DATA_DIR_VERSIONED_FE_UCV, exist_ok=True)
    print(
        f"Checked/created raw splits directory: {RAW_SPLITS_DIR_UCV}",
        f"Checked/created processed data directory: {PROCESSED_DATA_DIR_VERSIONED_FE_UCV}",
        sep="\n",
    )
except Exception as dir_err:
    print(
        f"Warning: Could not create directories using os.makedirs with /Volumes/ paths. Error: {dir_err}",
        "Ensure the UC Volume path is correct, mounted, and you have write permissions from the driver.",
        sep="\n",
    )

# Echo the effective configuration, one setting per line
print("\n".join([
    "--- Preprocessing Global Configurations (UC Volumes Exclusive) Initialized ---",
    f"MLflow Experiment Path for Preprocessing: {PREPROCESSING_EXPERIMENT_PATH}",
    f"Full Raw Data Input Path: {FULL_RAW_DATA_PARQUET_PATH}",
    f"  Output Processed Train Data Path (Parquet Named Cols): {SHARED_PROCESSED_TRAIN_PATH}",
    f"  Output Processed Test Data Path (Parquet Named Cols): {SHARED_PROCESSED_TEST_PATH}",
    f"  MLflow Pyfunc Preprocessor Artifact Path: {MLFLOW_PYFUNC_PREPROCESSOR_ARTIFACT_PATH}",
    "-" * 50,
]))


# <-------------------- CELL 3: PREPROCESSING LOGIC & MLFLOW PYFUNC MODEL CLASS -------------------->
print("\nCell 3: Preprocessing Logic & Pyfunc Model Class - Defining (UC Volumes Exclusive)...")

# --- MLflow Utility (get_or_create_experiment - same as before) ---
def get_or_create_experiment(experiment_name_param, spark_session_param=None):
    """Look up an MLflow experiment by name, creating it if it does not exist.

    Args:
        experiment_name_param: Name/path of the MLflow experiment.
        spark_session_param: Accepted for call-site compatibility; not used here.

    Returns:
        The experiment ID, or None when the lookup or creation raised
        (the error is printed rather than propagated).
    """
    try:
        existing = mlflow.get_experiment_by_name(experiment_name_param)
        if not existing:
            print(f"MLflow experiment '{experiment_name_param}' not found. Creating.")
            created_id = mlflow.create_experiment(name=experiment_name_param)
            print(f"MLflow experiment '{experiment_name_param}' created with ID: {created_id}")
            return created_id

        print(f"MLflow experiment '{experiment_name_param}' found with ID: {existing.experiment_id}")
        return existing.experiment_id
    except Exception as lookup_err:
        # Best-effort: report the problem and let the caller deal with a missing ID.
        print(f"Error in get_or_create_experiment for '{experiment_name_param}': {lookup_err}")
        return None

# --- Data Splitting Function (split_by_month_and_stratify - same as before) ---
def split_by_month_and_stratify(full_pdf: pd.DataFrame, date_col: str, target_col: str, test_size: float, random_state: int) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split a DataFrame into train/test sets month by month, stratified on the target.

    Rows are grouped by the calendar month of ``date_col`` and each group is split
    with ``train_test_split``. Stratification on ``target_col`` is used when every
    class in the group has at least two rows; otherwise (or when scikit-learn
    rejects the stratified split) a plain random split is used. Groups with a
    single row go to train.

    Rows whose date is missing (NaT) are kept and handled as their own group
    labelled ``"NaT"``; previously they were silently dropped because ``groupby``
    excludes NA keys by default.

    Args:
        full_pdf: Input data; not modified.
        date_col: Column convertible with ``pd.to_datetime``.
        target_col: Label column used for stratification.
        test_size: Test fraction passed to ``train_test_split``.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        ``(train_df, test_df)``. ``date_col`` is datetime-typed in both outputs.

    Raises:
        ValueError: If a required column is missing or the date column cannot be
            converted to datetimes.
    """
    print(f"  Starting stratified split by month from column '{date_col}' and target '{target_col}'...")
    if date_col not in full_pdf.columns:
        raise ValueError(f"Date column '{date_col}' not found.")
    if target_col not in full_pdf.columns:
        raise ValueError(f"Target column '{target_col}' not found.")

    try:
        temp_df = full_pdf.copy()
        temp_df[date_col] = pd.to_datetime(temp_df[date_col])
        # String keys ("YYYY-MM") sort chronologically and turn NaT into the literal
        # "NaT", so undated rows form a regular group instead of being dropped.
        temp_df['year_month_group'] = temp_df[date_col].dt.to_period('M').astype(str)
    except Exception as e:
        raise ValueError(f"Could not process date column '{date_col}'. Error: {e}") from e

    train_dfs_list, test_dfs_list = [], []
    # For train_test_split to be able to place each class on both sides of the split
    min_samples_per_class_needed = 2

    for month_period, group_data in temp_df.groupby('year_month_group'):
        if len(group_data) < 2:
            # groupby never yields empty groups, so this is always a single row: keep it in train.
            train_dfs_list.append(group_data)
            continue

        target_counts = group_data[target_col].value_counts()
        can_stratify = len(target_counts) >= 2 and target_counts.min() >= min_samples_per_class_needed
        if can_stratify:
            try:
                month_train_df, month_test_df = train_test_split(
                    group_data, test_size=test_size, random_state=random_state,
                    stratify=group_data[target_col], shuffle=True
                )
            except ValueError:
                # e.g. test split smaller than the number of classes: fall back to a random split
                month_train_df, month_test_df = train_test_split(
                    group_data, test_size=test_size, random_state=random_state, shuffle=True
                )
        else:
            month_train_df, month_test_df = train_test_split(
                group_data, test_size=test_size, random_state=random_state, shuffle=True
            )

        train_dfs_list.append(month_train_df)
        if not month_test_df.empty:
            test_dfs_list.append(month_test_df)

    # Fall back to an empty frame with the input's columns when a side received nothing.
    final_train_df = pd.concat(train_dfs_list).drop(columns=['year_month_group']) if train_dfs_list else pd.DataFrame(columns=full_pdf.columns)
    final_test_df = pd.concat(test_dfs_list).drop(columns=['year_month_group']) if test_dfs_list else pd.DataFrame(columns=full_pdf.columns)
    print(f"  Splitting complete. Train shape: {final_train_df.shape}, Test shape: {final_test_df.shape}")
    return final_train_df, final_test_df

# --- Pandas Preprocessor as an mlflow.pyfunc.PythonModel (with Feature Engineering) ---
class PandasFeatureEngineeringPreprocessor(mlflow.pyfunc.PythonModel):
    """Pandas feature-engineering + preprocessing pipeline packaged as an MLflow pyfunc model.

    ``fit`` learns, on the training frame, a numerical imputer, a categorical
    imputer, a target encoder and a standard scaler, and stores them in
    ``self.fitted_components`` together with the ordered output column list
    ``self.final_feature_columns_in_order``. ``predict`` replays the same steps
    on new data using only that stored state, so pickling the fitted instance
    (as ``mlflow.pyfunc.log_model`` does) preserves the whole pipeline.
    """

    def __init__(self, raw_categorical_cols, raw_numerical_cols, date_col_for_fe, age_col_for_fe, premium_col_for_fe, interactions_col_for_fe, age_bins_for_fe, age_bin_labels_for_fe, label_col_for_te_fitting, te_smoothing_factor, cat_impute_constant, num_impute_strategy, global_seed):
        """Store the configuration; nothing is fitted here."""
        self.raw_categorical_cols_config = list(raw_categorical_cols)
        self.raw_numerical_cols_config = list(raw_numerical_cols)
        self.date_col_for_fe = date_col_for_fe
        self.age_col_for_fe = age_col_for_fe
        self.premium_col_for_fe = premium_col_for_fe
        self.interactions_col_for_fe = interactions_col_for_fe
        self.age_bins_for_fe = age_bins_for_fe
        self.age_bin_labels_for_fe = age_bin_labels_for_fe
        self.label_col_for_te_fitting = label_col_for_te_fitting
        self.te_smoothing_factor = te_smoothing_factor
        self.cat_impute_constant = cat_impute_constant
        self.num_impute_strategy = num_impute_strategy
        self.global_seed = global_seed
        # State populated by fit().
        self.fitted_components = {}
        self.feature_engineering_details = {}
        self.final_feature_columns_in_order = []

    @staticmethod
    def _unique_in_order(names):
        """Drop duplicates while keeping first-seen order.

        Used instead of ``list(set(...))`` so the resulting column order is the
        same on every run (set iteration order of strings varies per process).
        """
        return list(dict.fromkeys(names))

    def _get_valid_cols(self, df, col_list):
        """Return the entries of ``col_list`` present in ``df``, excluding the label column."""
        return [col for col in col_list if col in df.columns and col != self.label_col_for_te_fitting]

    def _apply_initial_custom_formatting(self, df_input: pd.DataFrame) -> pd.DataFrame:
        """Hook for project-specific cleaning; currently returns an unmodified copy."""
        df = df_input.copy()
        print("    Custom Preprocessing Step 0: Applying initial custom data formatting (User to implement)...")
        # !!! REPLACE THIS WITH YOUR LOGIC !!!
        # Example: if 'example_col_to_clean' in df.columns: df['example_col_to_clean'] = df['example_col_to_clean'].str.strip().str.lower()
        return df

    def _engineer_features(self, df_input: pd.DataFrame, is_fitting_phase: bool) -> pd.DataFrame:
        """Add date-part, log1p/square and age-bin features to a copy of ``df_input``.

        When ``is_fitting_phase`` is true, the names of the columns created are
        recorded in ``self.feature_engineering_details``.
        """
        df = df_input.copy()
        print("    Feature Engineering Step 1: Creating base new features...")
        engineered_categoricals_temp, engineered_numericals_temp = [], []

        # Date parts, emitted as strings so they are treated as categoricals downstream.
        if self.date_col_for_fe and self.date_col_for_fe in df.columns:
            try:
                s_date = pd.to_datetime(df[self.date_col_for_fe], errors='coerce')
                if not s_date.isnull().all():
                    day_of_week = s_date.dt.dayofweek
                    # Unparseable dates become the sentinel "-1".
                    df['fe_month'] = s_date.dt.month.fillna(-1).astype(int).astype(str)
                    df['fe_day_of_week'] = day_of_week.fillna(-1).astype(int).astype(str)
                    df['fe_is_weekend'] = day_of_week.isin([5, 6]).astype(int).astype(str)
                    engineered_categoricals_temp.extend(['fe_month', 'fe_day_of_week', 'fe_is_weekend'])
            except Exception as e_date:
                # Best effort: date features are skipped rather than failing the pipeline.
                print(f"      Warning: Date FE error for {self.date_col_for_fe}: {e_date}")

        # log1p and square transforms of the configured numeric columns.
        numeric_sources = [
            (self.premium_col_for_fe, "fe_premium"),
            (self.age_col_for_fe, "fe_age"),
            (self.interactions_col_for_fe, "fe_interactions"),
        ]
        for col_name, new_col_prefix in numeric_sources:
            if col_name and col_name in df.columns:
                df[col_name] = pd.to_numeric(df[col_name], errors='coerce')
                filled = df[col_name].fillna(0)
                # Negative values are clipped to 0 before log1p.
                df[f'{new_col_prefix}_log1p'] = np.log1p(filled.clip(lower=0))
                df[f'{new_col_prefix}_sq'] = filled ** 2
                engineered_numericals_temp.extend([f'{new_col_prefix}_log1p', f'{new_col_prefix}_sq'])

        # Age binning.
        if self.age_col_for_fe and self.age_col_for_fe in df.columns and self.age_bins_for_fe and self.age_bin_labels_for_fe:
            age_binned_name = f'fe_{self.age_col_for_fe}_binned'
            df[self.age_col_for_fe] = pd.to_numeric(df[self.age_col_for_fe], errors='coerce')
            binned = pd.cut(df[self.age_col_for_fe], bins=self.age_bins_for_fe, labels=self.age_bin_labels_for_fe, right=False, include_lowest=True)
            # Fill missing bins BEFORE casting to str: astype(str) would turn NaN into
            # the literal 'nan' and make a later fillna a no-op.
            df[age_binned_name] = binned.astype(object).where(binned.notna(), self.cat_impute_constant).astype(str)
            engineered_categoricals_temp.append(age_binned_name)

        if is_fitting_phase:
            self.feature_engineering_details['engineered_categorical_cols'] = self._unique_in_order(engineered_categoricals_temp)
            self.feature_engineering_details['engineered_numerical_cols'] = self._unique_in_order(engineered_numericals_temp)
        return df

    def _create_interaction_features_post_te(self, df_input: pd.DataFrame, is_fitting_phase: bool) -> pd.DataFrame:
        """Create premium x (target-encoded) categorical interaction columns.

        Must run after target encoding, since the categorical operands are
        coerced with ``pd.to_numeric``. When ``is_fitting_phase`` is true the
        created column names are recorded in ``self.feature_engineering_details``.
        """
        df = df_input.copy()
        print("    Feature Engineering Step 3: Creating interaction features (post-target encoding)...")
        newly_created_interactions_temp = []

        # NOTE(review): _engineer_features names its log feature 'fe_premium_log1p', so this
        # 'fe_log_<premium>' lookup only matches if such a column already exists in the
        # input; otherwise the raw premium column is used. Confirm which one is intended.
        log_premium_name = f'fe_log_{self.premium_col_for_fe}'
        premium_col_for_interact = log_premium_name if log_premium_name in df.columns else self.premium_col_for_fe
        if premium_col_for_interact not in df.columns:
            if is_fitting_phase:
                self.feature_engineering_details['engineered_interaction_cols'] = []
            return df

        df[premium_col_for_interact] = pd.to_numeric(df[premium_col_for_interact], errors='coerce').fillna(0)

        age_binned_col_name = f'fe_{self.age_col_for_fe}_binned'
        if age_binned_col_name in df.columns:
            inter_name = f'fe_inter_{premium_col_for_interact}_x_age_binned'
            df[inter_name] = df[premium_col_for_interact] * pd.to_numeric(df[age_binned_col_name], errors='coerce').fillna(0)
            newly_created_interactions_temp.append(inter_name)

        customer_segment_col_name = "customer_segment"  # Example
        if customer_segment_col_name in self.raw_categorical_cols_config and customer_segment_col_name in df.columns:
            inter_name = f'fe_inter_{premium_col_for_interact}_x_{customer_segment_col_name}'
            df[inter_name] = df[premium_col_for_interact] * pd.to_numeric(df[customer_segment_col_name], errors='coerce').fillna(0)
            newly_created_interactions_temp.append(inter_name)

        if is_fitting_phase:
            self.feature_engineering_details['engineered_interaction_cols'] = self._unique_in_order(newly_created_interactions_temp)
        return df

    def fit(self, train_pdf: pd.DataFrame):
        """Fit all preprocessing components on ``train_pdf`` (which must contain the label column).

        Returns ``self`` so the call can be chained.

        Raises:
            KeyError: if the configured label column is not in ``train_pdf``.
        """
        print("    Fitting PandasFeatureEngineeringPreprocessor...")
        if self.label_col_for_te_fitting not in train_pdf.columns:
            raise KeyError(f"Label column '{self.label_col_for_te_fitting}' not found in training DataFrame.")

        # Start from a clean slate so a re-fit cannot inherit stale components.
        self.fitted_components = {}
        self.feature_engineering_details = {}
        self.final_feature_columns_in_order = []

        X_fit_no_label = train_pdf.drop(columns=[self.label_col_for_te_fitting], errors='ignore').copy()
        y_fit_series = train_pdf[self.label_col_for_te_fitting].astype(float).copy()

        X_fit_custom = self._apply_initial_custom_formatting(X_fit_no_label)
        X_fit_fe = self._engineer_features(X_fit_custom, is_fitting_phase=True)

        active_num_cols = self._get_valid_cols(
            X_fit_fe,
            self._unique_in_order(self.raw_numerical_cols_config + self.feature_engineering_details.get('engineered_numerical_cols', [])),
        )
        active_cat_cols = self._get_valid_cols(
            X_fit_fe,
            self._unique_in_order(self.raw_categorical_cols_config + self.feature_engineering_details.get('engineered_categorical_cols', [])),
        )
        self.fitted_components['active_numerical_cols_for_impute'] = active_num_cols
        self.fitted_components['active_categorical_cols_for_impute'] = active_cat_cols

        # Step 2a: imputation.
        if active_num_cols:
            num_imputer = SimpleImputer(strategy=self.num_impute_strategy)
            X_fit_fe[active_num_cols] = num_imputer.fit_transform(X_fit_fe[active_num_cols])
            self.fitted_components['numerical_imputer'] = num_imputer
        if active_cat_cols:
            cat_imputer = SimpleImputer(strategy="constant", fill_value=self.cat_impute_constant)
            X_fit_fe[active_cat_cols] = cat_imputer.fit_transform(X_fit_fe[active_cat_cols])
            self.fitted_components['categorical_imputer'] = cat_imputer

        # Step 2b: target encoding (encoded values replace the categorical columns in place).
        cols_for_te_fit = list(active_cat_cols)
        target_encoded_output_names = []
        if cols_for_te_fit:
            for col in cols_for_te_fit:
                X_fit_fe[col] = X_fit_fe[col].astype('category')
            target_encoder = ce.TargetEncoder(cols=cols_for_te_fit, smoothing=self.te_smoothing_factor, handle_unknown='value', handle_missing='value')
            X_fit_te_transformed = target_encoder.fit_transform(X_fit_fe[cols_for_te_fit], y_fit_series)
            for col in cols_for_te_fit:
                X_fit_fe[col] = X_fit_te_transformed[col]
            self.fitted_components['target_encoder'] = target_encoder
            target_encoded_output_names = list(cols_for_te_fit)
            self.fitted_components['target_encoded_cols_list'] = target_encoded_output_names

        # Step 3: interactions on the encoded frame.
        X_fit_interactions = self._create_interaction_features_post_te(X_fit_fe, is_fitting_phase=True)

        # Step 4: final column list (deterministic order) and scaling.
        self.all_numerical_cols_for_scaling = self._unique_in_order(
            active_num_cols + target_encoded_output_names + self.feature_engineering_details.get('engineered_interaction_cols', [])
        )
        self.final_feature_columns_in_order = [col for col in self.all_numerical_cols_for_scaling if col in X_fit_interactions.columns]

        final_impute_medians_before_scale = {}
        if self.final_feature_columns_in_order:
            for col in self.final_feature_columns_in_order:
                X_fit_interactions[col] = pd.to_numeric(X_fit_interactions[col], errors='coerce')
                if X_fit_interactions[col].isnull().any():
                    # Remember the median so predict() fills the same value.
                    median_val = X_fit_interactions[col].median()
                    X_fit_interactions[col] = X_fit_interactions[col].fillna(median_val)
                    final_impute_medians_before_scale[col] = median_val
            if final_impute_medians_before_scale:
                self.fitted_components['final_impute_medians_before_scale'] = final_impute_medians_before_scale
            scaler = StandardScaler()
            X_fit_interactions[self.final_feature_columns_in_order] = scaler.fit_transform(X_fit_interactions[self.final_feature_columns_in_order])
            self.fitted_components['scaler'] = scaler

        print("    PandasFeatureEngineeringPreprocessor fitting complete.")
        return self

    def predict(self, context, model_input_pdf: pd.DataFrame):
        """Transform raw features into the fitted, scaled feature frame.

        ``context`` is the MLflow pyfunc context and is not used. The returned
        DataFrame has exactly ``self.final_feature_columns_in_order`` as columns
        and keeps the input index.

        Raises:
            RuntimeError: if the preprocessor has not been fitted.
        """
        print(f"  Pyfunc Preprocessor: Applying full transformation to input DataFrame with shape {model_input_pdf.shape}...")
        if not hasattr(self, 'final_feature_columns_in_order') or not self.final_feature_columns_in_order:
            raise RuntimeError("Preprocessor not fitted.")

        X_data_pd_custom = self._apply_initial_custom_formatting(model_input_pdf.copy())
        X_data_pd_fe = self._engineer_features(X_data_pd_custom, is_fitting_phase=False)

        active_num_cols = self._get_valid_cols(X_data_pd_fe, self.fitted_components.get('active_numerical_cols_for_impute', []))
        if active_num_cols and self.fitted_components.get('numerical_imputer'):
            X_data_pd_fe[active_num_cols] = self.fitted_components['numerical_imputer'].transform(X_data_pd_fe[active_num_cols])

        active_cat_cols = self._get_valid_cols(X_data_pd_fe, self.fitted_components.get('active_categorical_cols_for_impute', []))
        if active_cat_cols and self.fitted_components.get('categorical_imputer'):
            X_data_pd_fe[active_cat_cols] = self.fitted_components['categorical_imputer'].transform(X_data_pd_fe[active_cat_cols])

        te_cols_to_transform = self._get_valid_cols(X_data_pd_fe, self.fitted_components.get('target_encoded_cols_list', []))
        if te_cols_to_transform and self.fitted_components.get('target_encoder'):
            for col in te_cols_to_transform:
                X_data_pd_fe[col] = X_data_pd_fe[col].astype('category')
            transformed_te_cols = self.fitted_components['target_encoder'].transform(X_data_pd_fe[te_cols_to_transform])
            for col in te_cols_to_transform:
                X_data_pd_fe[col] = transformed_te_cols[col]

        X_data_pd_interactions = self._create_interaction_features_post_te(X_data_pd_fe, is_fitting_phase=False)

        scaler = self.fitted_components.get('scaler')
        if scaler:
            final_impute_medians = self.fitted_components.get('final_impute_medians_before_scale', {})
            temp_df_for_scaling = pd.DataFrame(index=X_data_pd_interactions.index)
            for col in self.final_feature_columns_in_order:
                # Columns that had no NaN at fit time have no stored median; they fall back to 0.
                fill_value = final_impute_medians.get(col, 0)
                if col in X_data_pd_interactions.columns:
                    temp_df_for_scaling[col] = pd.to_numeric(X_data_pd_interactions[col], errors='coerce').fillna(fill_value)
                else:
                    # A fitted column missing from the input is synthesised as a constant.
                    temp_df_for_scaling[col] = fill_value
            scaled_values = scaler.transform(temp_df_for_scaling[self.final_feature_columns_in_order])
            output_df_features_only = pd.DataFrame(scaled_values, columns=self.final_feature_columns_in_order, index=X_data_pd_interactions.index)
        else:
            output_df_features_only = X_data_pd_interactions[self.final_feature_columns_in_order]

        print(f"  Pyfunc Preprocessor: Transformation complete. Output shape {output_df_features_only.shape}")
        return output_df_features_only

# --- Helper to Save Processed Outputs (Parquet Only) ---
def save_processed_pandas_outputs_parquet_only(
    processed_features_pdf: pd.DataFrame, # Contains ONLY processed features
    original_label_series: pd.Series,     # Original labels corresponding to the features
    parquet_file_path: str,               # Full /Volumes/... path for saving
    label_col_name_in_output: str
    ):
    """Save processed features (plus the label, when given) as a single Parquet file.

    Args:
        processed_features_pdf: Processed feature columns only; it is copied, not mutated.
        original_label_series: Labels to append positionally, or ``None`` to save
            features only. If its length differs from the feature frame, a warning
            is printed and the label column is NOT added.
        parquet_file_path: Destination file path. Its parent directory is created
            when the path has a directory component.
        label_col_name_in_output: Column name used for the appended label.

    Raises:
        Exception: any error from directory creation or ``DataFrame.to_parquet``
            is printed and re-raised unchanged.
    """
    df_to_save_parquet = processed_features_pdf.copy()

    # Add label back if it was provided (e.g., for train/test sets)
    if original_label_series is not None:
        # Labels are attached by position (.values), so only the lengths have to agree.
        if len(df_to_save_parquet) == len(original_label_series):
            df_to_save_parquet[label_col_name_in_output] = original_label_series.values
        else:
            print(f"  Warning: Length mismatch between processed features ({len(df_to_save_parquet)}) and labels ({len(original_label_series)}). Label not added to Parquet.")

    print(f"  Saving named cols Parquet data (shape: {df_to_save_parquet.shape}) to: {parquet_file_path}")
    try:
        # The /Volumes/... path is used directly for pandas I/O (no /dbfs prefix).
        # Ensure the parent directory exists. A bare filename has an empty dirname,
        # and os.makedirs('') raises FileNotFoundError, so skip it in that case.
        parent_dir = os.path.dirname(parquet_file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        df_to_save_parquet.to_parquet(parquet_file_path, index=False)
        print(f"    Named cols Parquet data saved successfully to {parquet_file_path}")
    except Exception as e:
        print(f"    ERROR saving named cols Parquet to {parquet_file_path}: {e}")
        print(f"    Attempted path was: {parquet_file_path}. If this fails, ensure the path is accessible for direct Python I/O or try prefixing with /dbfs for UC Volumes.")
        raise
            
# Progress marker printed once the class and helper definitions above have executed.
print("--- Pandas Preprocessing Pyfunc Model Class and Helpers (with Full FE & Parquet Only Save, UCV Paths) Defined ---")
print("-" * 50)

# <-------------------- CELL 4: MAIN PREPROCESSING ORCHESTRATION (with FE & Parquet Only Save, UCV Paths) -------------------->
print("\nCell 4: Main Preprocessing Orchestration - Executing (UC Volumes Exclusive)...")

# Cell-local imports: `sys` supplies the interpreter version for the logged conda env;
# `sklearn` is imported as a module so `sklearn.__version__` resolves here even if the
# file's import cell only uses `from sklearn.<submodule> import ...` forms.
import sys
import sklearn

# Orchestration flow:
#   1. resolve/set the MLflow experiment,
#   2. load the full raw dataset and split it by month (stratified on the target),
#   3. fit the preprocessor on the train split and log it as a pyfunc model,
#   4. transform train/test with the fitted instance and save them as Parquet.
main_preprocessing_mlflow_experiment_id_ucv = None
try:
    main_preprocessing_mlflow_experiment_id_ucv = get_or_create_experiment(PREPROCESSING_EXPERIMENT_PATH, spark)
    if main_preprocessing_mlflow_experiment_id_ucv:
        mlflow.set_experiment(experiment_id=main_preprocessing_mlflow_experiment_id_ucv)
        print(f"MLflow experiment '{PREPROCESSING_EXPERIMENT_PATH}' for preprocessing is set with ID: {main_preprocessing_mlflow_experiment_id_ucv}")
    else:
        raise Exception("Preprocessing MLflow experiment could not be set. Halting.")
except Exception as e:
    # Not re-raised: the `if` below skips the whole pipeline when the ID is unset.
    print(f"CRITICAL: Could not initialize MLflow experiment for preprocessing. Error: {e}")
    # dbutils.notebook.exit("MLflow experiment setup failed") # Halt

if main_preprocessing_mlflow_experiment_id_ucv:
    print(f"\nLoading FULL RAW data for Pandas preprocessing from: {FULL_RAW_DATA_PARQUET_PATH}")

    def load_raw_data_pandas_ucv(uc_volume_parquet_path: str) -> pd.DataFrame:
        """Read a Parquet file from a UC Volume path into pandas; errors are printed and re-raised."""
        print(f"  Attempting to load Pandas DataFrame directly from UC Volume path: {uc_volume_parquet_path}")
        try:
            # Forcing use of /Volumes/ path directly for pandas
            pdf = pd.read_parquet(uc_volume_parquet_path)
            print(f"    Successfully loaded. Shape: {pdf.shape}")
            return pdf
        except Exception as e:
            print(f"    ERROR loading Parquet from {uc_volume_parquet_path}: {e}")
            print(f"    Ensure path is correct and accessible. If issues persist, try prepending /dbfs to the path for pandas.")
            raise
    full_raw_pdf = load_raw_data_pandas_ucv(FULL_RAW_DATA_PARQUET_PATH)

    print("\nPerforming stratified train-test split by month and target...")
    raw_train_pdf, raw_test_pdf = split_by_month_and_stratify(
        full_pdf=full_raw_pdf, date_col=DATE_COLUMN_FOR_SPLIT,
        target_col=YOUR_TARGET_COLUMN_NAME, test_size=TEST_SET_SPLIT_RATIO,
        random_state=GLOBAL_SEED
    )

    # Optional: Save intermediate raw splits to UC Volumes using direct /Volumes/ path
    if not raw_train_pdf.empty:
        os.makedirs(os.path.dirname(RAW_TRAIN_SPLIT_PATH_UCV), exist_ok=True)
        raw_train_pdf.to_parquet(RAW_TRAIN_SPLIT_PATH_UCV, index=False)
        print(f"  Intermediate raw train split ({raw_train_pdf.shape}) saved to: {RAW_TRAIN_SPLIT_PATH_UCV}")
    if not raw_test_pdf.empty:
        os.makedirs(os.path.dirname(RAW_TEST_SPLIT_PATH_UCV), exist_ok=True)
        raw_test_pdf.to_parquet(RAW_TEST_SPLIT_PATH_UCV, index=False)
        print(f"  Intermediate raw test split ({raw_test_pdf.shape}) saved to: {RAW_TEST_SPLIT_PATH_UCV}")


    pyfunc_model_uri_saved_final = None
    fitted_preprocessor_to_log = None

    with mlflow.start_run(run_name="Pandas_FullFE_Pyfunc_UCV_FitLog") as preproc_run_final:
        run_id_main_preproc_final = preproc_run_final.info.run_id
        print(f"\nFitting and Logging Full FE Pandas Pyfunc Preprocessor. MLflow Run ID: {run_id_main_preproc_final}")

        # Log key configurations
        mlflow.log_params({
            "label_column": YOUR_TARGET_COLUMN_NAME, "raw_cat_cols": ", ".join(CATEGORICAL_COLUMNS_RAW),
            "raw_num_cols": ", ".join(NUMERICAL_COLUMNS_RAW), "date_col_fe": DATE_COLUMN_FOR_SPLIT,
            "age_col_fe": AGE_COL_FOR_FE, "premium_col_fe": PREMIUM_COL_FOR_FE,
            "interactions_col_fe": INTERACTIONS_COL_FOR_FE, "age_bins_config": str(AGE_BINS),
            "te_smoothing": TARGET_ENCODING_SMOOTHING, "cat_impute_const": CATEGORICAL_IMPUTE_CONSTANT,
            "num_impute_strat": NUMERICAL_IMPUTE_STRATEGY, "test_split_ratio": TEST_SET_SPLIT_RATIO,
            "data_version_proc": PROCESSED_DATA_VERSION_FE_UCV
        })
        mlflow.set_tag("GLOBAL_SEED", GLOBAL_SEED); mlflow.set_tag("preprocessing_type", "pandas_pyfunc_full_fe_ucv")
        mlflow.log_param("raw_train_split_path_used", RAW_TRAIN_SPLIT_PATH_UCV) # Log /Volumes/ path

        try:
            preprocessor_instance_final = PandasFeatureEngineeringPreprocessor(
                raw_categorical_cols=CATEGORICAL_COLUMNS_RAW, raw_numerical_cols=NUMERICAL_COLUMNS_RAW,
                date_col_for_fe=DATE_COLUMN_FOR_SPLIT, age_col_for_fe=AGE_COL_FOR_FE,
                premium_col_for_fe=PREMIUM_COL_FOR_FE, interactions_col_for_fe=INTERACTIONS_COL_FOR_FE,
                age_bins_for_fe=AGE_BINS, age_bin_labels_for_fe=AGE_BIN_LABELS,
                label_col_for_te_fitting=YOUR_TARGET_COLUMN_NAME,
                te_smoothing_factor=TARGET_ENCODING_SMOOTHING,
                cat_impute_constant=CATEGORICAL_IMPUTE_CONSTANT,
                num_impute_strategy=NUMERICAL_IMPUTE_STRATEGY,
                global_seed=GLOBAL_SEED
            )

            fitted_preprocessor_to_log = preprocessor_instance_final.fit(raw_train_pdf)
            print("  Full FE Preprocessor instance fitted successfully.")

            conda_env_final_fe = {
                'channels': ['conda-forge', 'defaults'],
                'dependencies': [
                    # Pin the running interpreter's major.minor (not the pandas version).
                    f'python={sys.version_info.major}.{sys.version_info.minor}', 'pip',
                    {'pip': [
                        f'mlflow>={mlflow.__version__}', f'pandas>={pd.__version__}',
                        f'numpy>={np.__version__}', f'scikit-learn>={sklearn.__version__}',
                        f'category-encoders>={ce.__version__}', f'joblib>={joblib.__version__}',
                        'pyarrow'
                    ],},
                ],'name': 'pandas_full_fe_preprocessor_env_ucv'
            }

            input_example_df_final = raw_train_pdf.drop(columns=[YOUR_TARGET_COLUMN_NAME], errors='ignore').head(5) if not raw_train_pdf.empty else None
            signature_final = None
            if input_example_df_final is not None and not input_example_df_final.empty:
                try:
                    output_example_for_pyfunc_final = fitted_preprocessor_to_log.predict(None, input_example_df_final)
                    signature_final = mlflow.models.infer_signature(input_example_df_final, output_example_for_pyfunc_final)
                    print("  Signature inferred for Pyfunc model.")
                except Exception as sig_e: print(f"  Warning: Could not infer signature for Pyfunc model. Error: {sig_e}")

            mlflow.pyfunc.log_model(
                artifact_path=MLFLOW_PYFUNC_PREPROCESSOR_ARTIFACT_PATH,
                python_model=fitted_preprocessor_to_log,
                conda_env=conda_env_final_fe,
                # Use the example/signature built just above (the previous names were undefined).
                input_example=input_example_df_final,
                signature=signature_final
            )
            pyfunc_model_uri_saved_final = f"runs:/{run_id_main_preproc_final}/{MLFLOW_PYFUNC_PREPROCESSOR_ARTIFACT_PATH}"
            mlflow.set_tag("pyfunc_model_uri", pyfunc_model_uri_saved_final)

            if hasattr(fitted_preprocessor_to_log, 'final_feature_columns_in_order'):
                 mlflow.log_param("final_feature_names_count", len(fitted_preprocessor_to_log.final_feature_columns_in_order))
                 # To log all features (might be long):
                 # mlflow.log_text(",".join(fitted_preprocessor_to_log.final_feature_columns_in_order), "final_feature_names.txt")
            mlflow.set_tag("status_fit_log_pyfunc", "success")
            print(f"  Fitted Pyfunc Preprocessor with Full FE saved to MLflow: {pyfunc_model_uri_saved_final}")

        except Exception as e:
            print(f"  ERROR during Pyfunc Preprocessor (Full FE) fitting or logging: {e}")
            import traceback; traceback.print_exc()
            mlflow.log_param("error_fit_pyfunc", str(e)[:250])
            mlflow.set_tag("status_fit_log_pyfunc", "failed")
            raise

    # --- 4. Transform Train and Test Data using the FITTED instance & Save Outputs (Parquet Only) ---
    if fitted_preprocessor_to_log:
        print("\nTransforming TRAIN data using fitted preprocessor instance (Full FE)...")
        try:
            # The predict method of our Pyfunc model returns only the feature DataFrame
            processed_train_features_pdf = fitted_preprocessor_to_log.predict(None, raw_train_pdf.drop(columns=[YOUR_TARGET_COLUMN_NAME], errors='ignore'))

            # The save helper re-attaches the label positionally and warns on a length
            # mismatch, so the features and the label series are passed separately.
            save_processed_pandas_outputs_parquet_only(
                processed_features_pdf=processed_train_features_pdf,
                original_label_series=raw_train_pdf[YOUR_TARGET_COLUMN_NAME],
                parquet_file_path=SHARED_PROCESSED_TRAIN_PATH, # Path from Init Cell (now direct /Volumes/)
                label_col_name_in_output=YOUR_TARGET_COLUMN_NAME
            )
            # Re-open the (already finished) fit run by ID to attach the output path.
            with mlflow.start_run(run_id=run_id_main_preproc_final, nested=False):
                 mlflow.set_tag("output_train_parquet_path", SHARED_PROCESSED_TRAIN_PATH) # Log direct /Volumes/
            print(f"  Processed TRAIN data with Full FE saved as Parquet to: {SHARED_PROCESSED_TRAIN_PATH}")
        except Exception as e:
            print(f"  ERROR during TRAIN data transformation or saving (Full FE): {e}")
            with mlflow.start_run(run_id=run_id_main_preproc_final, nested=False): mlflow.log_param("error_transform_train_fe", str(e)[:250])
            raise

        print("\nTransforming TEST data using fitted preprocessor instance (Full FE)...")
        try:
            processed_test_features_pdf = fitted_preprocessor_to_log.predict(None, raw_test_pdf.drop(columns=[YOUR_TARGET_COLUMN_NAME], errors='ignore'))
            # The test split may legitimately lack the label column; save features only then.
            test_label_series = raw_test_pdf[YOUR_TARGET_COLUMN_NAME] if YOUR_TARGET_COLUMN_NAME in raw_test_pdf.columns else None

            save_processed_pandas_outputs_parquet_only(
                processed_features_pdf=processed_test_features_pdf,
                original_label_series=test_label_series,
                parquet_file_path=SHARED_PROCESSED_TEST_PATH,
                label_col_name_in_output=YOUR_TARGET_COLUMN_NAME
            )
            with mlflow.start_run(run_id=run_id_main_preproc_final, nested=False):
                 mlflow.set_tag("output_test_parquet_path", SHARED_PROCESSED_TEST_PATH)
            print(f"  Processed TEST data with Full FE saved as Parquet to: {SHARED_PROCESSED_TEST_PATH}")
        except Exception as e:
            print(f"  ERROR during TEST data transformation or saving (Full FE): {e}")
            with mlflow.start_run(run_id=run_id_main_preproc_final, nested=False): mlflow.log_param("error_transform_test_fe", str(e)[:250])
            raise
    else:
        print("CRITICAL: Preprocessor (Full FE) fitting/logging failed. Cannot transform train/test data.")

    print("\n--- Pandas Preprocessing Orchestration with Full Feature Engineering (UC Volumes Exclusive) Completed ---")
    if pyfunc_model_uri_saved_final: print(f"Fitted Pyfunc Preprocessor MLflow URI: {pyfunc_model_uri_saved_final}")
    print(f"Named Parquet processed training data saved at: {SHARED_PROCESSED_TRAIN_PATH}")
    print(f"Named Parquet processed test data saved at: {SHARED_PROCESSED_TEST_PATH}")
else:
    print("Halting script because MLflow experiment for preprocessing could not be set.")
print("-" * 50)

# <-------------------- CELL 5: EXAMPLE USAGE OF SAVED PANDAS PREPROCESSOR FOR NEW DATA (INFERENCE) -------------------->
# This cell is conceptual and shows how to use the logged Pyfunc model.
# print("\nCell 5: Example - Using Saved Pandas Preprocessor for New Data (Inference)...")

# # --- 1. Define Path to New Raw Data and MLflow URI of the Preprocessor ---
# # pyfunc_preprocessor_uri_to_load = pyfunc_model_uri_saved_final # From Cell 4 output if run in same session
# # OR "runs:/<RUN_ID_FROM_CELL_4>/pandas_classification_fe_preprocessor_ucv" 
# # NEW_RAW_DATA_FOR_INFERENCE_UCV_PATH = "/Volumes/delfos/new_inference_data/new_predict_data.parquet" # !!! REPLACE !!!

# # --- 2. Load the Pyfunc Preprocessor Model ---
# # print(f"Loading Pyfunc preprocessor from: {pyfunc_preprocessor_uri_to_load}")
# # try:
# #     loaded_pyfunc_preprocessor_instance = mlflow.pyfunc.load_model(pyfunc_preprocessor_uri_to_load)
# #     print("Pyfunc preprocessor loaded successfully for inference.")
# # except Exception as e:
# #     print(f"CRITICAL ERROR: Could not load Pyfunc preprocessor model. {e}")
# #     # dbutils.notebook.exit("Failed to load Pyfunc preprocessor")

# # if 'loaded_pyfunc_preprocessor_instance' in locals():
# #     # --- 3. Load New Raw Data for Prediction ---
# #     print(f"Loading new raw data for prediction from {NEW_RAW_DATA_FOR_INFERENCE_UCV_PATH}...")
# #     try:
# #         new_raw_pdf_to_predict = load_raw_data_pandas_ucv(NEW_RAW_DATA_FOR_INFERENCE_UCV_PATH) # Function defined in Cell 4
# #     except Exception as e:
# #         print(f"CRITICAL ERROR: Could not load new raw data for prediction. {e}")
# #         # dbutils.notebook.exit("Failed to load new raw data")

# #     if 'new_raw_pdf_to_predict' in locals() and not new_raw_pdf_to_predict.empty:
# #         # --- 4. Apply Preprocessing to New Data ---
# #         print("Applying Pyfunc preprocessing to new data for prediction...")
# #         try:
# #             # The .predict() method of our pyfunc model takes the raw Pandas DF
# #             # (it expects features only, so drop label if present in new_raw_pdf_to_predict, though usually not)
# #             raw_features_for_pyfunc_predict = new_raw_pdf_to_predict.drop(columns=[YOUR_TARGET_COLUMN_NAME], errors='ignore')
            
# #             processed_features_for_prediction_pdf = loaded_pyfunc_preprocessor_instance.predict(raw_features_for_pyfunc_predict)
            
# #             print(f"Preprocessing of new data complete. Processed feature DataFrame shape: {processed_features_for_prediction_pdf.shape}")
# #             print("First 5 rows of processed features for prediction:")
# #             print(processed_features_for_prediction_pdf.head())

# #             # This processed_features_for_prediction_pdf (or its .values) can then be fed to your trained ML model.
# #             # For example:
# #             # final_ml_model = mlflow.sklearn.load_model("runs:/<your_ml_model_run_id>/model")
# #             # final_predictions = final_ml_model.predict(processed_features_for_prediction_pdf.values)
# #             # print(f"Sample final predictions on new data: {final_predictions[:5]}")

# #         except Exception as e:
# #             print(f"ERROR during preprocessing or prediction on new data: {e}")
# #             import traceback
# #             traceback.print_exc()
# #     else:
# #         print("New raw data for prediction is empty or failed to load.")
# # else:
# #     print("Pyfunc preprocessor was not loaded. Cannot proceed with inference example.")
# print("-" * 50)

In [None]:
# <-------------------- CELL 1: IMPORTS -------------------->
print("Cell 1: Imports - Executing...")
import mlflow
import mlflow.pyfunc

import pandas as pd
import numpy as np
import os
import joblib # For MLflow pyfunc to pickle the model instance with its components
import time
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import category_encoders as ce

from pyspark.sql import SparkSession

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', message="Previously subsetted data...")
warnings.filterwarnings('ignore', message="Downcasting behavior in `replace` is deprecated*")


# Reuse the notebook-provided `spark` handle when one is already bound in this
# namespace; otherwise build (or attach to) a session under this pipeline's app name.
if 'spark' in locals():
    print("SparkSession already exists.")
else:
    spark = SparkSession.builder.appName("Pandas_FE_Preprocessing_Interactions_MVP").getOrCreate()
    print("SparkSession created.")

print("Imports successful for Pandas Preprocessing Pipeline with Interactions.", "-" * 50, sep="\n")

# <-------------------- CELL 2: INIT CELL - GLOBAL CONFIGURATIONS FOR PREPROCESSING -------------------->
print("\nCell 2: Global Configurations for Preprocessing - Executing...")

# --- MLflow experiment and Unity Catalog locations (placeholders; set these for your workspace) ---
PREPROCESSING_EXPERIMENT_PATH = "/Users/your_username@example.com/MVP_Classification_FE_Interactions_Preprocessing" # !!! CHANGE THIS !!!
UC_BASE_DATA_PATH = "/Volumes/delfos/" # !!! UPDATE: UC Volume root; keep the trailing slash because the f-string below appends to it directly !!!
FULL_RAW_DATA_PARQUET_PATH = f"{UC_BASE_DATA_PATH}raw_data_full/full_dataset_generic.parquet" # !!! UPDATE !!!

# --- Column roles in the raw dataset ---
DATE_COLUMN_FOR_SPLIT = "date_col" # !!! GENERIC NAME - UPDATE !!! Used to group rows by calendar month for the train/test split.
YOUR_TARGET_COLUMN_NAME = "target_binary" # !!! GENERIC NAME - UPDATE (0 or 1) !!!

CATEGORICAL_COLUMNS_RAW = ["cat_feat_1", "cat_feat_2", "region_col"] # !!! UPDATE !!!
NUMERICAL_COLUMNS_RAW = ["num_feat_1", "premium_col", "generic_numerical_col"] # !!! UPDATE !!! ('age_col' and 'interactions_col' from earlier versions are no longer listed)

# --- Output locations for the raw train/test splits ---
RAW_SPLITS_SUBDIR = "raw_splits_v3_interactions"
RAW_SPLITS_DIR_UCV = os.path.join(UC_BASE_DATA_PATH, RAW_SPLITS_SUBDIR)
RAW_TRAIN_SPLIT_PATH_UCV = os.path.join(RAW_SPLITS_DIR_UCV, "raw_train_split.parquet")
RAW_TEST_SPLIT_PATH_UCV = os.path.join(RAW_SPLITS_DIR_UCV, "raw_test_split.parquet")

# --- Output locations for the processed data; the version string keeps each pipeline variant in its own folder ---
PROCESSED_DATA_VERSION_FE = "v3_pandas_fe_interactions"
PROCESSED_DATA_SUBDIR = "processed_data"
PROCESSED_DATA_DIR_VERSIONED_FE_UCV = os.path.join(UC_BASE_DATA_PATH, PROCESSED_DATA_SUBDIR, PROCESSED_DATA_VERSION_FE)
SHARED_PROCESSED_TRAIN_PATH = os.path.join(PROCESSED_DATA_DIR_VERSIONED_FE_UCV, "train_processed_final.parquet")
SHARED_PROCESSED_TEST_PATH = os.path.join(PROCESSED_DATA_DIR_VERSIONED_FE_UCV, "test_processed_final.parquet")

# Artifact path for the logged pyfunc preprocessor (presumably used when the model is logged later in the notebook - confirm).
MLFLOW_PYFUNC_PREPROCESSOR_ARTIFACT_PATH = "pandas_classification_fe_interactions_preprocessor"

# --- Split and preprocessing hyperparameters ---
TEST_SET_SPLIT_RATIO = 0.20 # Passed as test_size to split_by_month_and_stratify.
TARGET_ENCODING_SMOOTHING = 20.0 # Presumably the target-encoder smoothing factor handed to the preprocessor - confirm at the instantiation site.
CATEGORICAL_IMPUTE_CONSTANT = "__UNKNOWN_OR_MISSING_CAT__"
NUMERICAL_IMPUTE_STRATEGY = "median"
GLOBAL_SEED = 117 # Passed as random_state to split_by_month_and_stratify.

# Feature Engineering Base Columns (must be in NUMERICAL_COLUMNS_RAW or CATEGORICAL_COLUMNS_RAW or DATE_COLUMN_FOR_SPLIT)
PREMIUM_COL_FOR_FE = "premium_col" # Must be in NUMERICAL_COLUMNS_RAW
# The old single-column INTERACTIONS_COL_FOR_FE setting is gone; interactions are configured by the list below.

# Columns to multiply with the premium feature (assuming this list is what gets passed to the
# preprocessor as cols_to_interact_with_premium_config - confirm at the instantiation site).
# How the preprocessor treats the entries:
#   * The interaction base is f"fe_log_{PREMIUM_COL_FOR_FE}" when that column exists, otherwise the raw premium column.
#   * Interactions are built AFTER target encoding, so a categorical entry contributes its encoded (numeric) value.
#   * An entry equal to the base column is skipped (no self-interaction).
#   * An entry that is not present in the DataFrame at that stage is skipped with a warning.
COLS_TO_INTERACT_WITH_PREMIUM = [
    "cat_feat_1",         # Raw categorical; interacted via its target-encoded value
    "region_col",         # Raw categorical; interacted via its target-encoded value
    "fe_month",           # Engineered categorical derived from the date column; target-encoded before interaction
    "num_feat_1",         # Raw numerical; interacted after imputation
    f"fe_log_{PREMIUM_COL_FOR_FE}" # Same column as the interaction base whenever the log feature exists, so the
                                   # preprocessor skips it. Listing it has no effect in that case; remove it or
                                   # replace it with another feature you want interacted with log-premium.
]
# Example: to interact log-premium with only num_feat_1 and the target-encoded cat_feat_1:
# COLS_TO_INTERACT_WITH_PREMIUM = ["num_feat_1", "cat_feat_1"]

# Best effort: make sure both output folders exist. A failure is reported but does
# not stop the notebook, so later writes decide whether the path is usable.
try:
    os.makedirs(RAW_SPLITS_DIR_UCV, exist_ok=True)
    os.makedirs(PROCESSED_DATA_DIR_VERSIONED_FE_UCV, exist_ok=True)
    print(
        f"Checked/created raw splits directory: {RAW_SPLITS_DIR_UCV}",
        f"Checked/created processed data directory: {PROCESSED_DATA_DIR_VERSIONED_FE_UCV}",
        sep="\n",
    )
except Exception as dir_error:
    print(f"Warning: Could not create directories using os.makedirs with /Volumes/ paths. Error: {dir_error}")

# Echo the effective configuration so the cell output documents the run.
print(
    "--- Preprocessing Global Configurations (with Interactions) Initialized ---",
    f"Output Processed Train Data Path (Parquet Named Cols): {SHARED_PROCESSED_TRAIN_PATH}",
    f"Output Processed Test Data Path (Parquet Named Cols): {SHARED_PROCESSED_TEST_PATH}",
    f"Columns to interact with premium: {COLS_TO_INTERACT_WITH_PREMIUM}",
    "-" * 50,
    sep="\n",
)


# <-------------------- CELL 3: PREPROCESSING LOGIC & MLFLOW PYFUNC MODEL CLASS -------------------->
print("\nCell 3: Preprocessing Logic & Pyfunc Model Class - Defining (with Interactions)...")

# --- MLflow Utility (get_or_create_experiment - same as before) ---
def get_or_create_experiment(experiment_name_param, spark_session_param=None):
    """Look up an MLflow experiment by name, creating it when it does not exist.

    Args:
        experiment_name_param: Name (workspace path) of the MLflow experiment.
        spark_session_param: Accepted for call-site compatibility; not used here.

    Returns:
        The experiment ID, or ``None`` when the lookup or creation fails for any
        reason (the error is printed rather than raised).
    """
    try:
        existing = mlflow.get_experiment_by_name(experiment_name_param)
        if not existing:
            print(f"MLflow experiment '{experiment_name_param}' not found. Creating.")
            created_id = mlflow.create_experiment(name=experiment_name_param)
            print(f"MLflow experiment '{experiment_name_param}' created with ID: {created_id}")
            return created_id
        print(f"MLflow experiment '{experiment_name_param}' found with ID: {existing.experiment_id}")
        return existing.experiment_id
    except Exception as lookup_error:
        # Callers test the returned ID for falsiness, so failures surface as None.
        print(f"Error in get_or_create_experiment for '{experiment_name_param}': {lookup_error}")
        return None

# --- Data Splitting Function (split_by_month_and_stratify - same as before) ---
def split_by_month_and_stratify(full_pdf: pd.DataFrame, date_col: str, target_col: str, test_size: float, random_state: int) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split a DataFrame into train/test month by month, stratifying on the target when possible.

    Each calendar month (derived from ``date_col``) is split independently so
    that both splits cover every month. Within a month the split is stratified
    on ``target_col`` when at least two classes each have two or more rows;
    otherwise it falls back to a plain shuffled split. A month with a single
    row goes entirely to train.

    Rows whose date is missing or cannot be parsed belong to no month and are
    excluded from BOTH splits; the number of such rows is reported.

    Args:
        full_pdf: Full dataset; not modified.
        date_col: Column parsed with ``pd.to_datetime(errors='coerce')``.
        target_col: Column used for stratification.
        test_size: Fraction of each month assigned to the test split.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(train_df, test_df)``. ``date_col`` is returned as datetime in both.

    Raises:
        ValueError: If either column is missing or the date column cannot be processed.
    """
    print(f"  Starting stratified split by month from column '{date_col}' and target '{target_col}'...")
    if date_col not in full_pdf.columns:
        raise ValueError(f"Date column '{date_col}' not found.")
    if target_col not in full_pdf.columns:
        raise ValueError(f"Target column '{target_col}' not found.")

    group_col = 'year_month_group'
    try:
        temp_df = full_pdf.copy()
        temp_df[date_col] = pd.to_datetime(temp_df[date_col], errors='coerce')
        temp_df[group_col] = temp_df[date_col].dt.to_period('M')
    except Exception as e:
        raise ValueError(f"Could not process date column '{date_col}'. Error: {e}") from e

    # groupby drops NaT keys, so undated rows would otherwise vanish without a trace.
    undated_row_count = int(temp_df[date_col].isna().sum())
    if undated_row_count:
        print(f"  Warning: {undated_row_count} row(s) have a missing or unparseable '{date_col}' and are excluded from both splits.")

    min_samples_per_class_needed = 2
    train_dfs_list, test_dfs_list = [], []
    for _, group_data in temp_df.groupby(group_col):
        if len(group_data) < 2:
            # A single-row month cannot be split; keep it for training.
            train_dfs_list.append(group_data.copy())
            continue

        target_counts = group_data[target_col].value_counts()
        can_stratify = len(target_counts) >= 2 and target_counts.min() >= min_samples_per_class_needed

        month_train_df = month_test_df = None
        if can_stratify:
            try:
                month_train_df, month_test_df = train_test_split(
                    group_data, test_size=test_size, random_state=random_state,
                    stratify=group_data[target_col], shuffle=True)
            except ValueError:
                # e.g. the test share is too small to hold one row per class.
                month_train_df = None
        if month_train_df is None:
            month_train_df, month_test_df = train_test_split(
                group_data, test_size=test_size, random_state=random_state, shuffle=True)

        train_dfs_list.append(month_train_df)
        if not month_test_df.empty:
            test_dfs_list.append(month_test_df)

    def _assemble(parts):
        # Concatenate the monthly pieces, or return an empty frame with the input's columns.
        if parts:
            return pd.concat(parts).drop(columns=[group_col])
        return pd.DataFrame(columns=full_pdf.columns).drop(columns=[group_col], errors='ignore')

    final_train_df = _assemble(train_dfs_list)
    final_test_df = _assemble(test_dfs_list)
    print(f"  Splitting complete. Train shape: {final_train_df.shape}, Test shape: {final_test_df.shape}")
    return final_train_df, final_test_df

# --- Plotting Function for Split Distribution (plot_split_distribution_by_month) ---
# matplotlib is imported here, next to the only function in this cell that uses it.
import matplotlib.pyplot as plt; import matplotlib.ticker as mtick # Ensure imported
def plot_split_distribution_by_month(train_df: pd.DataFrame, test_df: pd.DataFrame, date_col: str, target_col: str, title: str = "Train-Test Split Distribution and Target Rate by Month") -> plt.Figure:
    """Plot monthly sample counts and target rates for the train and test splits.

    Stacked bars (left axis) show how many rows each split has per month; lines
    (right axis, in percent) show the mean of ``target_col`` per month for each
    split. A split that is empty, or lacks ``date_col`` or ``target_col``, is
    left out. When both splits contribute, months missing from one of them are
    filled with 0 for that split's count and rate.

    Args:
        train_df: Training split.
        test_df: Test split.
        date_col: Column parsed as dates and bucketed by calendar month.
        target_col: Column whose monthly mean is plotted as the target rate.
        title: Figure title.

    Returns:
        The matplotlib figure. It is closed before being returned (presumably so
        notebooks do not render it a second time); an empty figure is returned
        when neither split yields statistics.
    """
    print(f"  Generating plot: {title}")

    split_frames = {}
    if not train_df.empty:
        split_frames['train'] = train_df.copy()
    if not test_df.empty:
        split_frames['test'] = test_df.copy()

    # One per-month table (count + target rate) for every usable split.
    per_split_tables = []
    for split_name, split_frame in split_frames.items():
        working = split_frame.copy()
        if date_col not in working.columns:
            continue
        if target_col not in working.columns and split_name in ['train', 'test']:
            continue
        try:
            working[date_col] = pd.to_datetime(working[date_col], errors='coerce')
            working['year_month'] = working[date_col].dt.to_period('M')
        except Exception as e:
            print(f"    Warning: Could not process date column '{date_col}' in {split_name} DF. Error: {e}. Skipping.")
            continue

        by_month = working.groupby('year_month')
        month_counts = by_month.size().rename(f'{split_name}_count')
        if target_col in working.columns:
            month_rates = by_month[target_col].mean().rename(f'{split_name}_target_rate')
            month_table = pd.concat([month_counts, month_rates], axis=1)
        else:
            month_table = month_counts.to_frame()
            month_table[f'{split_name}_target_rate'] = np.nan
        per_split_tables.append(month_table)

    if len(per_split_tables) not in (1, 2):
        # Nothing to plot: hand back an empty, already-closed figure.
        fig, ax = plt.subplots()
        plt.close(fig)
        return fig

    if len(per_split_tables) == 2:
        combined_stats_df = per_split_tables[0].join(per_split_tables[1], how='outer').fillna(0)
    else:
        combined_stats_df = per_split_tables[0]
    combined_stats_df.sort_index(inplace=True)
    month_labels = combined_stats_df.index.strftime('%Y-%m').tolist()

    fig, count_axis = plt.subplots(figsize=(15, 7))
    fig.suptitle(title, fontsize=16)

    # Left axis: stacked bars, test counts on top of train counts.
    zero_counts = pd.Series(0, index=combined_stats_df.index)
    train_counts = combined_stats_df.get('train_count', zero_counts)
    test_counts = combined_stats_df.get('test_count', pd.Series(0, index=combined_stats_df.index))
    count_axis.bar(month_labels, train_counts, 0.8, label='Train Count', color='skyblue')
    count_axis.bar(month_labels, test_counts, 0.8, bottom=train_counts, label='Test Count', color='lightcoral')
    count_axis.set_xlabel("Month-Year")
    count_axis.set_ylabel("Sample Count", color='dimgray')
    count_axis.tick_params(axis='y', labelcolor='dimgray')
    count_axis.tick_params(axis='x', rotation=45)
    count_axis.legend(loc='upper left')

    # Right axis: target rate per split, expressed in percent.
    rate_axis = count_axis.twinx()
    rate_line_specs = [
        ('train_target_rate', dict(color='steelblue', marker='o', linestyle='-', linewidth=2, label='Train Target Rate (%)')),
        ('test_target_rate', dict(color='darkred', marker='x', linestyle='--', linewidth=2, label='Test Target Rate (%)')),
    ]
    any_rate_line_drawn = False
    for rate_column, line_style in rate_line_specs:
        if rate_column in combined_stats_df.columns:
            rate_axis.plot(month_labels, combined_stats_df[rate_column] * 100, **line_style)
            any_rate_line_drawn = True
    rate_axis.set_ylabel("Target Rate (%)", color='dimgray')
    rate_axis.tick_params(axis='y', labelcolor='dimgray')
    rate_axis.yaxis.set_major_formatter(mtick.PercentFormatter())
    rate_axis.set_ylim(0, 105)
    if any_rate_line_drawn:
        rate_axis.legend(loc='upper right')

    plt.xticks(rotation=45, ha="right")
    plt.grid(True, linestyle='--', alpha=0.7)
    fig.tight_layout(rect=[0, 0, 1, 0.96])
    plt.close(fig)
    print(f"  Plot '{title}' generated.")
    return fig

# --- Pandas Preprocessor as an mlflow.pyfunc.PythonModel (with Interactions FE) ---
class PandasFeatureEngineeringPreprocessorWithInteractions(mlflow.pyfunc.PythonModel):
    """Pandas preprocessing pipeline exposed as an ``mlflow.pyfunc.PythonModel``.

    ``fit`` and ``predict`` run the same steps in the same order:
      0. user-defined formatting hook (``_apply_initial_custom_formatting``)
      1. base feature engineering (date parts; log1p and square of the premium)
      2. numerical imputation, constant categorical imputation, target encoding
      3. interaction features: premium base column x each configured column
      4. standard scaling of every numeric output column

    ``predict`` returns only the scaled feature columns, in the order stored
    in ``final_feature_columns_in_order`` by ``fit``.
    """

    def __init__(self,
                 raw_categorical_cols, raw_numerical_cols,
                 date_col_for_fe, premium_col_for_fe,
                 cols_to_interact_with_premium_config,
                 label_col_for_te_fitting,
                 te_smoothing_factor,
                 cat_impute_constant, num_impute_strategy,
                 global_seed):
        """Store the configuration; nothing is fitted here.

        Args:
            raw_categorical_cols: Raw columns to impute with a constant and target-encode.
            raw_numerical_cols: Raw columns to impute and scale.
            date_col_for_fe: Date column from which ``fe_month``, ``fe_day_of_week``
                and ``fe_is_weekend`` are derived.
            premium_col_for_fe: Numeric column that receives log1p / squared features
                and serves as the base of the interaction features.
            cols_to_interact_with_premium_config: Columns multiplied with the premium base.
            label_col_for_te_fitting: Target column; used to fit the target encoder and
                never treated as a feature.
            te_smoothing_factor: ``smoothing`` argument of ``ce.TargetEncoder``.
            cat_impute_constant: Fill value for missing categorical values.
            num_impute_strategy: ``strategy`` argument of the numerical ``SimpleImputer``.
            global_seed: Stored for reference; no method of this class reads it.
        """
        self.raw_categorical_cols_config = list(raw_categorical_cols)
        self.raw_numerical_cols_config = list(raw_numerical_cols)
        self.date_col_for_fe = date_col_for_fe
        self.premium_col_for_fe = premium_col_for_fe
        self.cols_to_interact_with_premium_config = list(cols_to_interact_with_premium_config)

        self.label_col_for_te_fitting = label_col_for_te_fitting
        self.te_smoothing_factor = te_smoothing_factor
        self.cat_impute_constant = cat_impute_constant
        self.num_impute_strategy = num_impute_strategy
        self.global_seed = global_seed

        # Fitted transformers and the column lists they were fitted on.
        self.fitted_components = {}
        # Names of the columns created by each feature-engineering step during fit.
        self.feature_engineering_details = {
            'engineered_categorical_cols': [],
            'engineered_numerical_cols': [],
            'engineered_interaction_cols': []
        }
        # Output columns of predict(), in scaler order.
        self.final_feature_columns_in_order = []

    @staticmethod
    def _unique_in_order(items):
        """Drop duplicates while keeping first-seen order (unlike ``set``, stable across runs)."""
        return list(dict.fromkeys(items))

    @staticmethod
    def _add_missing_columns(df, expected_cols, as_object):
        """Add every absent expected column as all-missing so fitted transformers see the fitted layout."""
        for col in expected_cols:
            if col not in df.columns:
                print(f"    Warning: Expected column '{col}' not found in input. Adding it as all-missing so it gets imputed.")
                df[col] = pd.Series(np.nan, index=df.index, dtype=object if as_object else float)
        return df

    def _get_valid_cols(self, df, col_list):
        """Return the entries of ``col_list`` that exist in ``df`` and are not the label column."""
        return [col for col in col_list if col in df.columns and col != self.label_col_for_te_fitting]

    def _apply_initial_custom_formatting(self, df_input: pd.DataFrame) -> pd.DataFrame:
        """Hook for dataset-specific cleaning; the default returns an unchanged copy."""
        df = df_input.copy()
        print("    Custom Preprocessing Step 0: Applying initial custom data formatting (User to implement)...")
        # !!! REPLACE THIS WITH YOUR CUSTOM LOGIC !!!
        return df

    def _engineer_features(self, df_input: pd.DataFrame, is_fitting_phase: bool) -> pd.DataFrame:
        """Create the base engineered features on a copy of ``df_input``.

        Adds string-valued date parts (``fe_month``, ``fe_day_of_week``,
        ``fe_is_weekend``) and ``fe_log_<premium>`` / ``fe_sq_<premium>``. When
        ``is_fitting_phase`` is true the created column names are recorded in
        ``feature_engineering_details``.
        """
        df = df_input.copy()
        print("    Feature Engineering Step 1: Creating base new features...")
        engineered_categoricals_temp, engineered_numericals_temp = [], []

        # Date/Time Features
        if self.date_col_for_fe and self.date_col_for_fe in df.columns:
            try:
                s_date = pd.to_datetime(df[self.date_col_for_fe], errors='coerce')
                # While fitting, a date column with no parseable value yields no date features.
                # At predict time the features are always derived, so a batch whose dates are all
                # missing gets the same "-1" / "0" codes a single missing date gets in a mixed batch.
                if not (is_fitting_phase and s_date.isnull().all()):
                    df['fe_month'] = s_date.dt.month.fillna(-1).astype(int).astype(str)
                    df['fe_day_of_week'] = s_date.dt.dayofweek.fillna(-1).astype(int).astype(str)
                    df['fe_is_weekend'] = s_date.dt.dayofweek.isin([5, 6]).astype(int).astype(str)
                    engineered_categoricals_temp.extend(['fe_month', 'fe_day_of_week', 'fe_is_weekend'])
            except Exception as e_date:
                # Deliberate best effort: date features are optional.
                print(f"      Warning: Date FE error for {self.date_col_for_fe}: {e_date}")

        # Numerical Transformations (e.g., for premium)
        if self.premium_col_for_fe and self.premium_col_for_fe in df.columns:
            df[self.premium_col_for_fe] = pd.to_numeric(df[self.premium_col_for_fe], errors='coerce')
            # Missing premiums count as 0 here; negatives are clipped so log1p stays defined.
            df[f'fe_log_{self.premium_col_for_fe}'] = np.log1p(df[self.premium_col_for_fe].fillna(0).clip(lower=0))
            df[f'fe_sq_{self.premium_col_for_fe}'] = df[self.premium_col_for_fe].fillna(0)**2
            engineered_numericals_temp.extend([f'fe_log_{self.premium_col_for_fe}', f'fe_sq_{self.premium_col_for_fe}'])

        # Only the premium has explicit transforms; other raw numericals are just imputed and scaled.

        if is_fitting_phase:
            self.feature_engineering_details['engineered_categorical_cols'] = self._unique_in_order(engineered_categoricals_temp)
            self.feature_engineering_details['engineered_numerical_cols'] = self._unique_in_order(engineered_numericals_temp)
        return df

    def _create_interaction_features_post_te(self, df_input: pd.DataFrame, is_fitting_phase: bool) -> pd.DataFrame:
        """Multiply the premium base column with each configured column.

        The base is ``fe_log_<premium>`` when present, otherwise the raw premium
        column. Must run after target encoding so categorical inputs are numeric.
        New columns are named ``fe_inter_<base>_x_<col>``; when
        ``is_fitting_phase`` is true their names are recorded.
        """
        df = df_input.copy()
        print("    Feature Engineering Step 3: Creating interaction features with premium (post-target encoding)...")
        newly_created_interactions_temp = []

        log_premium_col = f'fe_log_{self.premium_col_for_fe}'
        base_premium_col_for_interaction = log_premium_col if log_premium_col in df.columns else self.premium_col_for_fe

        if base_premium_col_for_interaction not in df.columns:
            print(f"      FE Interaction: Base premium column for interactions ('{log_premium_col}' or '{self.premium_col_for_fe}') not found. Skipping premium interactions.")
            if is_fitting_phase:
                self.feature_engineering_details['engineered_interaction_cols'] = []
            return df

        # Ensure the premium column is numeric before interaction
        df[base_premium_col_for_interaction] = pd.to_numeric(df[base_premium_col_for_interaction], errors='coerce').fillna(0)

        for interact_col_name in self._unique_in_order(self.cols_to_interact_with_premium_config):
            if interact_col_name == base_premium_col_for_interaction:
                print(f"      Skipping self-interaction for {base_premium_col_for_interaction} with itself in this loop.")
                continue

            if interact_col_name in df.columns:
                # Raw numericals are already numeric; categoricals have been target-encoded by now.
                df[interact_col_name] = pd.to_numeric(df[interact_col_name], errors='coerce').fillna(0)

                interaction_feature_name = f'fe_inter_{base_premium_col_for_interaction}_x_{interact_col_name}'
                df[interaction_feature_name] = df[base_premium_col_for_interaction] * df[interact_col_name]
                newly_created_interactions_temp.append(interaction_feature_name)
                print(f"      FE Interaction: Created {interaction_feature_name}")
            else:
                print(f"      Warning: Column '{interact_col_name}' specified for interaction with premium not found in DataFrame. Skipping this interaction.")

        if is_fitting_phase:
            self.feature_engineering_details['engineered_interaction_cols'] = self._unique_in_order(newly_created_interactions_temp)
        return df

    def fit(self, train_pdf: pd.DataFrame):
        """Fit every pipeline component on the training data.

        Args:
            train_pdf: Training rows including the label column.

        Returns:
            ``self``, so the call can be chained.

        Raises:
            KeyError: If the label column is missing from ``train_pdf``.
        """
        print("    Fitting PandasFeatureEngineeringPreprocessorWithInteractions...")
        # A re-fit must not inherit components (e.g. stale medians) from an earlier fit.
        self.fitted_components = {}

        X_fit_no_label = train_pdf.drop(columns=[self.label_col_for_te_fitting], errors='ignore').copy()
        y_fit_series = train_pdf[self.label_col_for_te_fitting].astype(float).copy()

        X_fit_custom = self._apply_initial_custom_formatting(X_fit_no_label)
        X_fit_fe = self._engineer_features(X_fit_custom, is_fitting_phase=True)

        active_num_cols = self._unique_in_order(self._get_valid_cols(
            X_fit_fe,
            self.raw_numerical_cols_config + self.feature_engineering_details.get('engineered_numerical_cols', [])))
        active_cat_cols = self._unique_in_order(self._get_valid_cols(
            X_fit_fe,
            self.raw_categorical_cols_config + self.feature_engineering_details.get('engineered_categorical_cols', [])))

        self.fitted_components['active_numerical_cols_for_impute'] = active_num_cols
        self.fitted_components['active_categorical_cols_for_impute'] = active_cat_cols

        if active_num_cols:
            num_imputer = SimpleImputer(strategy=self.num_impute_strategy)
            X_fit_fe[active_num_cols] = num_imputer.fit_transform(X_fit_fe[active_num_cols])
            self.fitted_components['numerical_imputer'] = num_imputer
        if active_cat_cols:
            cat_imputer = SimpleImputer(strategy="constant", fill_value=self.cat_impute_constant)
            X_fit_fe[active_cat_cols] = cat_imputer.fit_transform(X_fit_fe[active_cat_cols])
            self.fitted_components['categorical_imputer'] = cat_imputer

        # Target encoding replaces each categorical column in place with its encoded value.
        target_encoded_output_names = []
        if active_cat_cols:
            for col in active_cat_cols:
                X_fit_fe[col] = X_fit_fe[col].astype('category')
            target_encoder = ce.TargetEncoder(cols=list(active_cat_cols), smoothing=self.te_smoothing_factor,
                                              handle_unknown='value', handle_missing='value')
            X_fit_te_transformed = target_encoder.fit_transform(X_fit_fe[active_cat_cols], y_fit_series)
            for col in active_cat_cols:
                X_fit_fe[col] = X_fit_te_transformed[col]
            target_encoded_output_names = list(active_cat_cols)
            self.fitted_components['target_encoder'] = target_encoder
            self.fitted_components['target_encoded_cols_list'] = target_encoded_output_names

        X_fit_interactions = self._create_interaction_features_post_te(X_fit_fe, is_fitting_phase=True)

        # Deterministic output order: numericals, then encoded categoricals, then interactions.
        self.all_numerical_cols_for_scaling = self._unique_in_order(
            active_num_cols
            + target_encoded_output_names
            + self.feature_engineering_details.get('engineered_interaction_cols', []))
        self.final_feature_columns_in_order = [col for col in self.all_numerical_cols_for_scaling if col in X_fit_interactions.columns]

        if self.final_feature_columns_in_order:
            # Last-resort imputation for anything still missing; medians are kept for predict().
            final_impute_medians_before_scale = {}
            for col in self.final_feature_columns_in_order:
                X_fit_interactions[col] = pd.to_numeric(X_fit_interactions[col], errors='coerce')
                if X_fit_interactions[col].isnull().any():
                    median_val = X_fit_interactions[col].median()
                    X_fit_interactions[col] = X_fit_interactions[col].fillna(median_val)
                    final_impute_medians_before_scale[col] = median_val
            if final_impute_medians_before_scale:
                self.fitted_components['final_impute_medians_before_scale'] = final_impute_medians_before_scale

            scaler = StandardScaler()
            X_fit_interactions[self.final_feature_columns_in_order] = scaler.fit_transform(X_fit_interactions[self.final_feature_columns_in_order])
            self.fitted_components['scaler'] = scaler
        print("    Pandas Preprocessor with Interactions fitting complete.")
        return self

    def _restore_state_from_context(self, context):
        """Load fitted state from the ``pyfunc_internal_components`` artifact, or raise if unavailable."""
        if context and hasattr(context, 'artifacts') and "pyfunc_internal_components" in context.artifacts:
            # NOTE(review): joblib.load unpickles arbitrary objects; only load artifacts from trusted runs.
            internal_state = joblib.load(context.artifacts["pyfunc_internal_components"])
            self.fitted_components = internal_state.get('fitted_components', {})
            self.feature_engineering_details = internal_state.get('feature_engineering_details', {})
            self.final_feature_columns_in_order = internal_state.get('final_feature_columns_in_order', [])
            # Constructor-level configuration is not restored here; it must already be on the instance.
            print("    Loaded internal components via MLflow context artifacts for predict.")
            if not self.final_feature_columns_in_order:
                raise RuntimeError("Preprocessor state not fully loaded via context or final_feature_columns_in_order is empty.")
        else:
            raise RuntimeError("Preprocessor not fitted or final_feature_columns_in_order is empty, and no context artifacts to load from.")

    def predict(self, context, model_input_pdf: pd.DataFrame):
        """Apply the fitted pipeline to raw rows and return the scaled features.

        Args:
            context: MLflow pyfunc context; only consulted when this instance
                carries no fitted state (see ``_restore_state_from_context``).
            model_input_pdf: Raw feature rows. A label column, if present, is ignored.
                Columns seen during ``fit`` but absent here are added as all-missing
                and imputed, so the fitted transformers always receive the fitted layout.

        Returns:
            DataFrame indexed like the input with exactly the columns of
            ``final_feature_columns_in_order``, in that order.

        Raises:
            RuntimeError: If the instance is not fitted and no state can be loaded.
        """
        print(f"  Pyfunc Preprocessor: Applying full transformation (with Interactions) to input DataFrame with shape {model_input_pdf.shape}...")
        X_data_pd = model_input_pdf.copy()
        if not getattr(self, 'final_feature_columns_in_order', None):
            self._restore_state_from_context(context)

        X_data_pd_custom = self._apply_initial_custom_formatting(X_data_pd)
        X_data_pd_fe = self._engineer_features(X_data_pd_custom, is_fitting_phase=False)

        num_cols = list(self.fitted_components.get('active_numerical_cols_for_impute', []))
        num_imputer = self.fitted_components.get('numerical_imputer')
        if num_cols and num_imputer is not None:
            X_data_pd_fe = self._add_missing_columns(X_data_pd_fe, num_cols, as_object=False)
            X_data_pd_fe[num_cols] = num_imputer.transform(X_data_pd_fe[num_cols])

        cat_cols = list(self.fitted_components.get('active_categorical_cols_for_impute', []))
        cat_imputer = self.fitted_components.get('categorical_imputer')
        if cat_cols and cat_imputer is not None:
            X_data_pd_fe = self._add_missing_columns(X_data_pd_fe, cat_cols, as_object=True)
            X_data_pd_fe[cat_cols] = cat_imputer.transform(X_data_pd_fe[cat_cols])

        te_cols = list(self.fitted_components.get('target_encoded_cols_list', []))
        target_encoder = self.fitted_components.get('target_encoder')
        if te_cols and target_encoder is not None:
            X_data_pd_fe = self._add_missing_columns(X_data_pd_fe, te_cols, as_object=True)
            for col in te_cols:
                X_data_pd_fe[col] = X_data_pd_fe[col].astype('category')
            transformed_te_cols = target_encoder.transform(X_data_pd_fe[te_cols])
            for col in te_cols:
                X_data_pd_fe[col] = transformed_te_cols[col]

        X_data_pd_interactions = self._create_interaction_features_post_te(X_data_pd_fe, is_fitting_phase=False)

        final_cols = self.final_feature_columns_in_order
        scaler = self.fitted_components.get('scaler')
        if scaler is not None:
            final_impute_medians = self.fitted_components.get('final_impute_medians_before_scale', {})
            # Build the scaler input column by column so it always has the fitted width and order.
            temp_df_for_scaling = pd.DataFrame(index=X_data_pd_interactions.index)
            for col in final_cols:
                if col in X_data_pd_interactions.columns:
                    temp_df_for_scaling[col] = pd.to_numeric(X_data_pd_interactions[col], errors='coerce').fillna(final_impute_medians.get(col, 0))
                else:
                    print(f"    Warning: Feature '{col}' expected by scaler not found in input for predict. Filling with 0 before scaling.")
                    temp_df_for_scaling[col] = 0
            scaled_values = scaler.transform(temp_df_for_scaling[final_cols])
            output_df_features_only = pd.DataFrame(scaled_values, columns=final_cols, index=X_data_pd_interactions.index)
        else:
            # No scaler in the restored state: still return the expected columns in the fitted order.
            output_df_features_only = X_data_pd_interactions.reindex(columns=final_cols, fill_value=0)

        print(f"  Pyfunc Preprocessor: Transformation complete. Output shape {output_df_features_only.shape}")
        return output_df_features_only


# --- Helper to Save Processed Outputs (Parquet Only - remains same) ---
def save_processed_pandas_outputs_parquet_only(processed_features_pdf, original_label_series, parquet_file_path, label_col_name_in_output):
    """Write processed features (optionally with their labels) to a Parquet file.

    Args:
        processed_features_pdf: Processed feature DataFrame; not modified.
        original_label_series: Labels to append as a column, or ``None`` to save
            features only. Labels are attached positionally, and are skipped with
            a warning when their length differs from the feature frame's.
        parquet_file_path: Destination file; its parent directory is created if needed.
        label_col_name_in_output: Name of the label column in the saved file.

    Raises:
        Exception: Whatever ``DataFrame.to_parquet`` raises, after the error is printed.
    """
    df_to_save_parquet = processed_features_pdf.copy()
    if original_label_series is not None:
        if len(df_to_save_parquet) == len(original_label_series):
            # .values drops the label index so rows are matched by position, not by index label.
            df_to_save_parquet[label_col_name_in_output] = original_label_series.values
        else:
            print(f"  Warning: Length mismatch for labels ({len(df_to_save_parquet)} vs {len(original_label_series)}). Label not added to Parquet.")

    # A bare file name has an empty dirname, and os.makedirs("") raises; nothing to create then.
    parent_dir = os.path.dirname(parquet_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    print(f"  Saving named cols Parquet data (shape: {df_to_save_parquet.shape}) to: {parquet_file_path}")
    try:
        df_to_save_parquet.to_parquet(parquet_file_path, index=False)
        print(f"    Named cols Parquet data saved to {parquet_file_path}")
    except Exception as e:
        print(f"    ERROR saving Parquet to {parquet_file_path}: {e}")
        raise

# --- Helper to load raw data (same as before) ---
def load_raw_data_pandas_ucv(uc_volume_parquet_path: str) -> pd.DataFrame: # Renamed for clarity
    """Read a Parquet file into a pandas DataFrame, reporting progress on stdout.

    Args:
        uc_volume_parquet_path: Path handed straight to ``pd.read_parquet``.

    Returns:
        The loaded DataFrame.

    Raises:
        Exception: Any error from the read is printed and then re-raised unchanged.
    """
    print(f"  Attempting to load Pandas DataFrame directly from UC Volume path: {uc_volume_parquet_path}")
    try:
        loaded_frame = pd.read_parquet(uc_volume_parquet_path)
        print(f"    Successfully loaded. Shape: {loaded_frame.shape}")
        return loaded_frame
    except Exception as load_error:
        print(f"    ERROR loading Parquet from {uc_volume_parquet_path}: {load_error}")
        raise

# End-of-cell banner: printed once the class and helper definitions above have executed.
print("--- Pandas Preprocessing Pyfunc Model Class and Helpers (with Interactions) Defined ---")
print("-" * 50)

# <-------------------- CELL 4: MAIN PREPROCESSING ORCHESTRATION (with Interactions) -------------------->
# Steps performed by this cell:
#   1. Resolve the MLflow experiment and make it the active one.
#   2. Load the full raw dataset, split it with split_by_month_and_stratify, and persist the raw splits.
#   3. Fit the preprocessor on the raw train split and log it to MLflow as a pyfunc model.
#   4. Transform the train and test splits with the fitted preprocessor and save them as Parquet.
print("\nCell 4: Main Preprocessing Orchestration (with Interactions) - Executing...")

# NOTE: a `global` statement at module / notebook-cell level has no effect; kept unchanged.
global main_preprocessing_mlflow_experiment_id_interactions # Distinct name
main_preprocessing_mlflow_experiment_id_interactions = None
try:
    main_preprocessing_mlflow_experiment_id_interactions = get_or_create_experiment(PREPROCESSING_EXPERIMENT_PATH, spark)
    if main_preprocessing_mlflow_experiment_id_interactions:
        mlflow.set_experiment(experiment_id=main_preprocessing_mlflow_experiment_id_interactions)
    else:
        raise Exception("Preprocessing MLflow experiment could not be set. Halting.")
except Exception as e:
    print(f"CRITICAL: Could not initialize MLflow experiment for preprocessing. Error: {e}")
    # Reset the id so a failure in mlflow.set_experiment (raised after the id was
    # already assigned) cannot let the orchestration below run without an active experiment.
    main_preprocessing_mlflow_experiment_id_interactions = None
    # Consider dbutils.notebook.exit(...) for hard stop

if main_preprocessing_mlflow_experiment_id_interactions:
    print(f"\nLoading FULL RAW data for Pandas preprocessing from: {FULL_RAW_DATA_PARQUET_PATH}")
    full_raw_pdf = load_raw_data_pandas_ucv(FULL_RAW_DATA_PARQUET_PATH)

    print("\nPerforming stratified train-test split by month and target...")
    raw_train_pdf, raw_test_pdf = split_by_month_and_stratify(
        full_pdf=full_raw_pdf, date_col=DATE_COLUMN_FOR_SPLIT,
        target_col=YOUR_TARGET_COLUMN_NAME, test_size=TEST_SET_SPLIT_RATIO,
        random_state=GLOBAL_SEED
    )
    # Persist the raw splits; an empty split is skipped rather than written.
    if not raw_train_pdf.empty:
        os.makedirs(os.path.dirname(RAW_TRAIN_SPLIT_PATH_UCV), exist_ok=True)
        raw_train_pdf.to_parquet(RAW_TRAIN_SPLIT_PATH_UCV, index=False)
        print(f"  Intermediate raw train split ({raw_train_pdf.shape}) saved to: {RAW_TRAIN_SPLIT_PATH_UCV}")
    if not raw_test_pdf.empty:
        os.makedirs(os.path.dirname(RAW_TEST_SPLIT_PATH_UCV), exist_ok=True)
        raw_test_pdf.to_parquet(RAW_TEST_SPLIT_PATH_UCV, index=False)
        print(f"  Intermediate raw test split ({raw_test_pdf.shape}) saved to: {RAW_TEST_SPLIT_PATH_UCV}")

    pyfunc_model_uri_saved_interactions = None
    # Stays None until the preprocessor has been fitted AND logged; the transform
    # section below is gated on it.
    fitted_preprocessor_for_transform_interactions = None

    with mlflow.start_run(run_name="Pandas_InteractionsFE_Pyfunc_FitLog") as preproc_run_interactions:
        run_id_main_preproc_interactions = preproc_run_interactions.info.run_id
        print(f"\nFitting and Logging Interactions FE Pandas Pyfunc Preprocessor. MLflow Run ID: {run_id_main_preproc_interactions}")

        mlflow.log_params({
            "label_column": YOUR_TARGET_COLUMN_NAME,
            "raw_categorical_cols": ", ".join(CATEGORICAL_COLUMNS_RAW),
            "raw_numerical_cols": ", ".join(NUMERICAL_COLUMNS_RAW),
            "date_col_for_fe": DATE_COLUMN_FOR_SPLIT,
            "premium_col_for_fe": PREMIUM_COL_FOR_FE,
            "cols_to_interact_with_premium": ", ".join(COLS_TO_INTERACT_WITH_PREMIUM), # Log the new list
            "target_encoding_smoothing": TARGET_ENCODING_SMOOTHING,
            "cat_impute_constant": CATEGORICAL_IMPUTE_CONSTANT,
            "num_impute_strategy": NUMERICAL_IMPUTE_STRATEGY,
            "test_set_split_ratio": TEST_SET_SPLIT_RATIO,
            "data_version_processed": PROCESSED_DATA_VERSION_FE_UCV
        })
        mlflow.set_tag("GLOBAL_SEED", GLOBAL_SEED)
        mlflow.set_tag("preprocessing_type", "pandas_pyfunc_interactions_fe_ucv")

        try:
            preprocessor_instance_interactions = PandasFeatureEngineeringPreprocessorWithInteractions( # Use the new class name
                raw_categorical_cols=CATEGORICAL_COLUMNS_RAW,
                raw_numerical_cols=NUMERICAL_COLUMNS_RAW,
                date_col_for_fe=DATE_COLUMN_FOR_SPLIT,
                premium_col_for_fe=PREMIUM_COL_FOR_FE,
                cols_to_interact_with_premium_config=COLS_TO_INTERACT_WITH_PREMIUM, # Pass the new list
                # Removed age_col_for_fe and its binning configs
                label_col_for_te_fitting=YOUR_TARGET_COLUMN_NAME,
                te_smoothing_factor=TARGET_ENCODING_SMOOTHING,
                cat_impute_constant=CATEGORICAL_IMPUTE_CONSTANT,
                num_impute_strategy=NUMERICAL_IMPUTE_STRATEGY,
                global_seed=GLOBAL_SEED
            )

            fitted_preprocessor_to_log_interactions = preprocessor_instance_interactions.fit(raw_train_pdf)
            print("  Interactions FE Preprocessor instance fitted successfully.")

            # Local imports needed for the environment spec below:
            #  - `sys` supplies the interpreter version for the `python=` pin.
            #  - `sklearn` must be imported by name; `from sklearn.<submodule> import ...`
            #    (the only form used in the import cell) does not bind `sklearn`.
            import sys
            import sklearn

            # Pin the interpreter to the version running this notebook. This was
            # previously (incorrectly) derived from the pandas version string.
            python_version_spec = f'python={sys.version_info.major}.{sys.version_info.minor}'
            conda_env_interactions_fe = { # Conda env definition
                'channels': ['conda-forge', 'defaults'],
                'dependencies': [
                    python_version_spec, 'pip',
                    {'pip': [
                        f'mlflow>={mlflow.__version__}', f'pandas>={pd.__version__}',
                        f'numpy>={np.__version__}', f'scikit-learn>={sklearn.__version__}',
                        f'category-encoders>={ce.__version__}', f'joblib>={joblib.__version__}',
                        'pyarrow', 'matplotlib'
                    ],},
                ],
                'name': 'pandas_interactions_fe_preprocessor_env'
            }

            # Use a few label-free raw rows as the model's input example.
            input_example_df_interactions = raw_train_pdf.drop(columns=[YOUR_TARGET_COLUMN_NAME], errors='ignore').head(5) if not raw_train_pdf.empty else None
            signature_interactions = None
            if input_example_df_interactions is not None and not input_example_df_interactions.empty:
                try:
                    output_example_interactions = fitted_preprocessor_to_log_interactions.predict(None, input_example_df_interactions)
                    signature_interactions = mlflow.models.infer_signature(input_example_df_interactions, output_example_interactions)
                except Exception as sig_e:
                    # Best effort: the model is still logged, just without a signature.
                    print(f"  Warning: Could not infer signature. Error: {sig_e}")

            # Log the fitted preprocessor as a pyfunc model
            # The fitted_components are now attributes of fitted_preprocessor_to_log_interactions
            # and will be pickled by mlflow.pyfunc.log_model
            mlflow.pyfunc.log_model(
                artifact_path=MLFLOW_PYFUNC_PREPROCESSOR_ARTIFACT_PATH,
                python_model=fitted_preprocessor_to_log_interactions,
                conda_env=conda_env_interactions_fe,
                input_example=input_example_df_interactions,
                signature=signature_interactions
            )
            pyfunc_model_uri_saved_interactions = f"runs:/{run_id_main_preproc_interactions}/{MLFLOW_PYFUNC_PREPROCESSOR_ARTIFACT_PATH}"
            mlflow.set_tag("pyfunc_model_uri", pyfunc_model_uri_saved_interactions)

            if hasattr(fitted_preprocessor_to_log_interactions, 'final_feature_columns_in_order'):
                mlflow.log_param("final_feature_names_count", len(fitted_preprocessor_to_log_interactions.final_feature_columns_in_order))
                # Log actual final feature names as an artifact if too long for param
                final_features_text = "\n".join(fitted_preprocessor_to_log_interactions.final_feature_columns_in_order)
                mlflow.log_text(final_features_text, "final_feature_names.txt")

            mlflow.set_tag("status_fit_log_pyfunc", "success")
            print(f"  Fitted Pyfunc Preprocessor with Interactions FE saved to MLflow: {pyfunc_model_uri_saved_interactions}")

            # Fit and logging both succeeded: release the preprocessor for the transform step.
            fitted_preprocessor_for_transform_interactions = fitted_preprocessor_to_log_interactions

        except Exception as e:
            print(f"  ERROR during Pyfunc Preprocessor (Interactions FE) fitting or logging: {e}")
            import traceback
            traceback.print_exc()
            mlflow.log_param("error_fit_pyfunc", str(e)[:250])
            mlflow.set_tag("status_fit_log_pyfunc", "failed")
            raise

    # --- Transform Train and Test Data & Save Outputs (Parquet Only) ---
    # The fit/log run above has ended here, so each tag/param below re-opens it by run_id.
    if fitted_preprocessor_for_transform_interactions is not None:
        print("\nTransforming TRAIN data (Interactions FE)...")
        try:
            processed_train_features_pdf = fitted_preprocessor_for_transform_interactions.predict(None, raw_train_pdf.drop(columns=[YOUR_TARGET_COLUMN_NAME], errors='ignore'))
            # Re-attach original label
            processed_train_pdf_to_save = processed_train_features_pdf.copy()
            if YOUR_TARGET_COLUMN_NAME in raw_train_pdf.columns:
                processed_train_pdf_to_save[YOUR_TARGET_COLUMN_NAME] = raw_train_pdf[YOUR_TARGET_COLUMN_NAME].values

            save_processed_pandas_outputs_parquet_only(
                processed_features_pdf=processed_train_features_pdf, # Send features only
                original_label_series=raw_train_pdf[YOUR_TARGET_COLUMN_NAME] if YOUR_TARGET_COLUMN_NAME in raw_train_pdf else None,
                parquet_file_path=SHARED_PROCESSED_TRAIN_PATH,
                label_col_name_in_output=YOUR_TARGET_COLUMN_NAME
            )
            with mlflow.start_run(run_id=run_id_main_preproc_interactions, nested=False):
                mlflow.set_tag("output_train_parquet_path", SHARED_PROCESSED_TRAIN_PATH)
            print(f"  Processed TRAIN data with Interactions FE saved as Parquet to: {SHARED_PROCESSED_TRAIN_PATH}")
        except Exception as e:
            print(f"  ERROR during TRAIN data transformation (Interactions FE): {e}")
            with mlflow.start_run(run_id=run_id_main_preproc_interactions, nested=False):
                mlflow.log_param("error_transform_train_fe", str(e)[:250])
            raise

        print("\nTransforming TEST data (Interactions FE)...")
        try:
            processed_test_features_pdf = fitted_preprocessor_for_transform_interactions.predict(None, raw_test_pdf.drop(columns=[YOUR_TARGET_COLUMN_NAME], errors='ignore'))
            processed_test_pdf_to_save = processed_test_features_pdf.copy()
            original_test_label_series = None
            if YOUR_TARGET_COLUMN_NAME in raw_test_pdf.columns:
                processed_test_pdf_to_save[YOUR_TARGET_COLUMN_NAME] = raw_test_pdf[YOUR_TARGET_COLUMN_NAME].values
                original_test_label_series = raw_test_pdf[YOUR_TARGET_COLUMN_NAME]

            save_processed_pandas_outputs_parquet_only(
                processed_features_pdf=processed_test_features_pdf,
                original_label_series=original_test_label_series,
                parquet_file_path=SHARED_PROCESSED_TEST_PATH,
                label_col_name_in_output=YOUR_TARGET_COLUMN_NAME
            )
            with mlflow.start_run(run_id=run_id_main_preproc_interactions, nested=False):
                mlflow.set_tag("output_test_parquet_path", SHARED_PROCESSED_TEST_PATH)
            print(f"  Processed TEST data with Interactions FE saved as Parquet to: {SHARED_PROCESSED_TEST_PATH}")
        except Exception as e:
            print(f"  ERROR during TEST data transformation (Interactions FE): {e}")
            with mlflow.start_run(run_id=run_id_main_preproc_interactions, nested=False):
                mlflow.log_param("error_transform_test_fe", str(e)[:250])
            raise
    else:
        print("CRITICAL: Preprocessor (Interactions FE) fitting/logging failed. Cannot transform.")

    print("\n--- Pandas Preprocessing Orchestration with Full Interactions FE (UC Volumes Exclusive) Completed ---")
    if pyfunc_model_uri_saved_interactions:
        print(f"Fitted Pyfunc Preprocessor MLflow URI: {pyfunc_model_uri_saved_interactions}")
    # ... (print other output paths) ...
else:
    print("Halting script because MLflow experiment for preprocessing could not be set.")
print("-" * 50)