In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier,
    AdaBoostClassifier, VotingClassifier, StackingClassifier
)

# SVC import is removed
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA # Keep PCA if used in FE
# TruncatedSVD import removed (can be added back if needed)
from category_encoders import TargetEncoder # Keep if used (e.g., for job title)
# CatBoostEncoder import removed (can be added back if needed)
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb # Added LightGBM
import optuna
import warnings
import joblib
import os
import time
import json
from datetime import datetime
import shutil
import logging
import subprocess
import math
from sklearn.calibration import CalibratedClassifierCV # <-- Added for Step 2/3 plan

# Configure logging (must run before `logger` is used by any later cell).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Global logger instance used by the helper functions defined in later cells.
logger = logging.getLogger(__name__)
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Seed NumPy and TensorFlow so repeated runs are comparable.
np.random.seed(42)
tf.random.set_seed(42)
# Optional: set the LOKY env var if needed for Windows parallelism issues with joblib.
try:
    cpu_count = os.cpu_count()
    if cpu_count:
        os.environ["LOKY_MAX_CPU_COUNT"] = str(cpu_count)
        logger.info(f"Set LOKY_MAX_CPU_COUNT to {os.environ.get('LOKY_MAX_CPU_COUNT')}")
    else:
        # os.cpu_count() can return None; do not claim the variable was set in that case.
        logger.warning(
            "os.cpu_count() returned no value; LOKY_MAX_CPU_COUNT left as "
            f"{os.environ.get('LOKY_MAX_CPU_COUNT')}"
        )
except Exception as e:
    logger.warning(f"Could not set LOKY_MAX_CPU_COUNT: {e}")

2025-04-28 15:37:17,051 - INFO - Set LOKY_MAX_CPU_COUNT to 20


In [2]:
import os
import logging

# Assume logger is defined globally
# logger = logging.getLogger(__name__)

def create_directory_structure():
    """Create the project's output directories relative to the working directory.

    Missing directories are created; directories that already exist are left
    untouched (logged at DEBUG level only).

    Raises:
        Exception: Any error raised while creating a directory is logged and
            re-raised, because later pipeline steps write into these folders.
    """
    directories = ['models', 'features', 'results', 'submissions', 'logs', 'plots', 'optuna_trials', 'scalers', 'calibrated_models'] # Added calibrated_models
    logger.info("Creating directory structure...")
    for directory in directories:
        try:
            if os.path.isdir(directory):
                logger.debug(f"Directory already exists: {directory}")
            else:
                # exist_ok avoids a failure if the directory appears between the
                # check above and this call.
                os.makedirs(directory, exist_ok=True)
                logger.info(f"Created directory: {directory}")
        except Exception as e:
            logger.error(f"Error creating directory {directory}: {e}")
            # Re-raise the exception to halt execution if directory creation fails,
            # as it's likely critical for the pipeline.
            raise
    logger.info("Directory structure verified/created.")

In [3]:
from datetime import datetime

def get_timestamp():
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``."""
    now = datetime.now()
    return f"{now:%Y%m%d_%H%M%S}"

In [4]:

# Assume logger is defined globally
# logger = logging.getLogger(__name__)

def save_feature_importance(model, feature_names, timestamp, model_name):
    """
    Saves feature importances for compatible models (Tree-based, Linear).
    Logs a message for models where standard importance isn't directly available.

    Importances are read, in order of preference, from ``feature_importances_``,
    the absolute value of ``coef_`` (averaged over axis 0 when 2-D), the
    ``estimator_`` attribute's ``feature_importances_``, or a LightGBM booster.
    Keras, Voting/Stacking and MLP models are skipped with an info message.

    Args:
        model: Fitted estimator to inspect.
        feature_names: Sequence of feature names (list, array or pandas Index),
            one per importance value.
        timestamp: String embedded in the output file names.
        model_name: Label used in log messages, the plot title and file names.

    Returns:
        None. On success a PNG is written to ``plots/`` and a CSV to
        ``results/``; failures while plotting/saving are logged, not raised.
    """
    # Explicit None/length check: `not feature_names` raises ValueError for
    # NumPy arrays and pandas Index objects (ambiguous truth value).
    if feature_names is None or len(feature_names) == 0:
        logger.warning(f"No feature names provided for {model_name}. Skipping feature importance.")
        return
    feature_names = list(feature_names)

    importances = None
    importance_type = None

    # --- Model Type Specific Handling ---
    if isinstance(model, KerasClassifier):
        try:
            _ = model.model_ # Check if internal model exists
            logger.info(f"Standard feature importance plot not generated for Keras model {model_name}.")
            logger.info("Consider using techniques like Permutation Importance or SHAP.")
        except AttributeError:
            logger.warning(f"Keras model {model_name} not fitted. Skip importance.")
        return # Exit for Keras models

    elif isinstance(model, (VotingClassifier, StackingClassifier)):
        logger.info(f"Importance plot not generated for ensemble {model_name}.")
        return # Exit for ensembles

    elif isinstance(model, MLPClassifier):
        logger.info(f"Standard feature importance not directly available for MLPClassifier {model_name}.")
        return # Exit for MLP

    # Check for standard attributes AFTER handling special cases
    elif hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        importance_type = 'Importance'
    elif hasattr(model, 'coef_'):
        if model.coef_.ndim > 1:
            # One coefficient row per class: average magnitudes across rows.
            importances = np.abs(model.coef_).mean(axis=0)
        else:
            importances = np.abs(model.coef_)
        importance_type = 'Coefficient Magnitude'
    elif hasattr(model, 'estimator_') and hasattr(model.estimator_, 'feature_importances_'):
        # Handle cases like AdaBoost where the base estimator holds importance
        logger.info(f"Using importance from base estimator ({model.estimator_.__class__.__name__}) of {model_name}.")
        importances = model.estimator_.feature_importances_
        importance_type = 'Base Estimator Importance'

    # Add check for LightGBM specifically if feature_importances_ isn't present on fitted model sometimes
    elif isinstance(model, lgb.LGBMClassifier) and hasattr(model, 'booster_'):
        try:
            importances = model.booster_.feature_importance(importance_type='gain') # Or 'split'
            importance_type = 'LGBM Gain'
            logger.info(f"Using booster_.feature_importance() for {model_name}.")
        except Exception as lgbm_imp_err:
            logger.warning(f"Could not get LGBM importance via booster_: {lgbm_imp_err}")
            return
    else:
        logger.info(f"Model {model_name} ({model.__class__.__name__}) lacks standard importance attributes (feature_importances_, coef_, relevant estimator_).")
        return

    # --- Process and Save Importances (if found) ---
    if importances is None:
        logger.warning(f"Could not retrieve importances for {model_name}.")
        return

    # Normalise lists / matrices / other array-likes to a plain ndarray.
    importances = np.asarray(importances)

    if importances.ndim > 1:
        logger.warning(f"Importances shape {importances.shape} for {model_name}. Taking mean over axis 0.")
        importances = importances.mean(axis=0)

    if len(importances) != len(feature_names):
        logger.warning(f"Importance length ({len(importances)}) vs names ({len(feature_names)}) mismatch for {model_name}.")
        return

    # --- Create DataFrame and Plot ---
    fig = None
    try:
        importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
        # Handle potential NaN/Inf in importance values before sorting
        importance_df = importance_df.replace([np.inf, -np.inf], np.nan)
        importance_df = importance_df.dropna(subset=['Importance'])
        if importance_df.empty:
            logger.warning(f"Importance DataFrame became empty after dropping NaN/Inf for {model_name}.")
            return

        importance_df = importance_df.sort_values('Importance', ascending=False)

        fig = plt.figure(figsize=(12, 8))
        top_n = min(30, len(importance_df))
        sns.barplot(x='Importance', y='Feature', data=importance_df.head(top_n), palette='viridis')
        plt.title(f'Top {top_n} Feature Importances - {model_name}')
        plt.xlabel(f'Relative {importance_type}')
        plt.tight_layout()

        # Ensure directories exist before saving
        plot_dir = 'plots'
        results_dir = 'results'
        os.makedirs(plot_dir, exist_ok=True)
        os.makedirs(results_dir, exist_ok=True)

        plot_filename = os.path.join(plot_dir, f'{model_name}_feature_importance_{timestamp}.png')
        plt.savefig(plot_filename)
        logger.info(f"Saved importance plot: {plot_filename}")

        csv_filename = os.path.join(results_dir, f'{model_name}_feature_importance_{timestamp}.csv')
        importance_df.to_csv(csv_filename, index=False)
        logger.info(f"Saved importance csv: {csv_filename}")

    except Exception as e:
        logger.warning(f"Could not save importance plot/CSV for {model_name}: {e}", exc_info=True)
    finally:
        # Always release the figure, even when plotting or saving failed.
        if fig is not None:
            plt.close(fig)

In [5]:
# Add to imports at the top of your file
from imblearn.combine import SMOTETomek
from collections import Counter

def apply_smote_tomek(X, y, random_state=42):
    """
    Rebalance a dataset with SMOTETomek and log the class counts before/after.

    Args:
        X: Feature matrix (array-like or DataFrame)
        y: Target labels
        random_state: Seed passed to the SMOTETomek resampler

    Returns:
        (X_resampled, y_resampled). When X is a DataFrame the resampled
        features are returned as a DataFrame with X's column labels.
    """
    logger.info(f"Original class distribution: {Counter(y)}")

    # Build the resampler and run it in one step.
    X_bal, y_bal = SMOTETomek(random_state=random_state).fit_resample(X, y)

    logger.info(f"Resampled class distribution: {Counter(y_bal)}")
    logger.info(f"Original shape: {X.shape}, Resampled shape: {X_bal.shape}")

    # Non-DataFrame input: hand back whatever the resampler produced.
    if not isinstance(X, pd.DataFrame):
        return X_bal, y_bal

    # DataFrame input: keep the original column labels.
    return pd.DataFrame(X_bal, columns=X.columns), y_bal

In [6]:
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt

def optimize_feature_selection(X, y, n_jobs=1, cv=5):
    """
    Performs optimal feature selection using RFECV (Recursive Feature Elimination
    with Cross-Validation) to find the optimal number of features.

    A balanced RandomForestClassifier is used as the ranking estimator, one
    feature is removed per step and at least 5 features are always kept. A plot
    of the mean CV score against the number of features is written to ``plots/``.

    Args:
        X: Feature DataFrame
        y: Target vector
        n_jobs: Number of parallel jobs
        cv: Number of cross-validation folds

    Returns:
        selected_features: pandas Index with the selected feature names
        selector: Fitted RFECV object
    """
    logger.info("Starting RFECV feature selection optimization...")

    min_features = 5  # Don't go below 5 features

    # Create a base estimator (Random Forest)
    base_estimator = RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42,
        n_jobs=n_jobs
    )

    # Create RFECV selector
    selector = RFECV(
        estimator=base_estimator,
        step=1,  # Remove one feature at a time
        cv=cv,   # Cross-validation folds
        scoring='accuracy',
        min_features_to_select=min_features,
        n_jobs=n_jobs,
        verbose=1
    )

    # Fit the selector
    logger.info("Fitting RFECV feature selector (this may take time)...")
    selector.fit(X, y)

    # Get selected features
    selected_features = X.columns[selector.support_]

    # Mean CV score per candidate feature count. `grid_scores_` was removed from
    # scikit-learn in 1.2, so prefer `cv_results_` and only fall back to the
    # legacy attribute on old installations.
    cv_results = getattr(selector, 'cv_results_', None)
    if cv_results is not None:
        mean_scores = np.asarray(cv_results['mean_test_score'], dtype=float)
    else:
        mean_scores = np.asarray(selector.grid_scores_, dtype=float)

    # Feature count belonging to each score. With step=1 the scores start at the
    # minimum number of features kept, not at 1.
    if cv_results is not None and 'n_features' in cv_results:
        n_features_axis = np.asarray(cv_results['n_features'])
    else:
        first_count = min(min_features, X.shape[1])
        n_features_axis = np.arange(first_count, first_count + len(mean_scores))

    # Plot number of features vs. performance
    os.makedirs('plots', exist_ok=True)
    timestamp = get_timestamp()
    plt_path = f'plots/rfecv_feature_selection_{timestamp}.png'
    fig = plt.figure(figsize=(10, 6))
    try:
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross-validation score")
        plt.plot(n_features_axis, mean_scores)
        plt.title('Optimal Feature Selection with RFECV')
        plt.savefig(plt_path)
    finally:
        # Release the figure even if saving fails.
        plt.close(fig)

    logger.info(f"Optimal number of features: {selector.n_features_}")
    logger.info(f"Selected {len(selected_features)} features from {X.shape[1]} original features")
    logger.info(f"Best cross-validation score: {mean_scores.max():.5f}")
    logger.info(f"Feature selection plot saved to: {plt_path}")

    return selected_features, selector

In [7]:


def build_keras_model(n_features, n_classes, optimizer='adam', learning_rate=0.001,
                      hidden_units=[128, 64], dropout_rate=0.3, activation='relu', l2_reg=1e-4):
    """Assemble and compile a softmax MLP classifier for tabular input.

    The network is: Input -> BatchNorm -> for each entry of ``hidden_units``
    (Dense with L2 penalty -> BatchNorm -> Activation -> Dropout) -> Dense
    softmax output. It is compiled with categorical cross-entropy and accuracy.
    ``optimizer`` may be 'adam' or 'sgd' (case-insensitive); anything else
    falls back to Adam with a warning.
    """
    net = keras.Sequential(name="keras_mlp_tabular")
    net.add(layers.Input(shape=(n_features,)))

    # Normalise raw inputs before the first hidden layer.
    net.add(layers.BatchNormalization())

    # One Dense/BatchNorm/Activation/Dropout group per requested width.
    for width in hidden_units:
        hidden_block = (
            layers.Dense(width, kernel_regularizer=keras.regularizers.l2(l2_reg)),
            layers.BatchNormalization(),
            layers.Activation(activation),
            layers.Dropout(dropout_rate),
        )
        for layer in hidden_block:
            net.add(layer)

    # Class-probability head.
    net.add(layers.Dense(n_classes, activation='softmax'))

    # Resolve the optimizer; SGD is the only non-Adam choice supported.
    optimizer_key = optimizer.lower()
    if optimizer_key == 'sgd':
        opt = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
    else:
        if optimizer_key != 'adam':
            logger.warning(f"Unsupported optimizer '{optimizer}'. Defaulting to Adam.")
        opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    net.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

    # Short description at INFO, full layer summary at DEBUG.
    logger.info(f"Keras model built: Input({n_features}), Hidden({len(hidden_units)} layers, units={hidden_units}), Output({n_classes})")
    logger.info(f" Activation: {activation}, Dropout: {dropout_rate}, L2 Reg: {l2_reg}, Optimizer: {optimizer}, LR: {learning_rate}")
    summary_lines = []
    net.summary(print_fn=lambda line: summary_lines.append(line))
    short_model_summary = "\n".join(summary_lines)
    logger.debug(f"Keras Model Summary:\n{short_model_summary}")

    return net

In [8]:


def preprocess_data(df, all_states, all_feature1, timestamp, is_training=True, feature_columns_to_use=None):
    """
    Preprocesses data using combined FE logic and robust categorical handling.
    Warns about constant columns in training data but does not drop them.

    In training mode the fitted artifacts (label encoder, target mapping, job
    title target encoder, PCA, final feature column list, head of the processed
    features) are written to ``features/`` with ``timestamp`` in the file name.
    In test mode those artifacts are loaded again (exact timestamp first where
    applicable, otherwise the last file by sorted name) and the output columns
    are aligned to the training feature list.

    Args:
        df: Raw input DataFrame; it is copied, not modified.
        all_states: Iterable of 'job_state' values to one-hot encode.
        all_feature1: Iterable of 'feature_1' values to one-hot encode.
        timestamp: String used in the names of saved/loaded artifacts.
        is_training: True to fit and save artifacts, False to load and apply them.
        feature_columns_to_use: Optional feature list for test-mode alignment;
            when None the latest saved ``feature_columns_*`` file is loaded.

    Returns:
        Tuple ``(X, y, feature_columns, le)``. ``y`` is the label-encoded target
        in training mode and None in test mode; ``le`` is the LabelEncoder
        (None in test mode if it could not be loaded).

    Raises:
        ValueError: Training mode and the 'salary_category' column is missing.
        FileNotFoundError: Test mode, no feature list given and none saved.
    """
    logger.info(f"Starting preprocessing (Combined Logic). Is training: {is_training}")
    start_time = time.time()
    data = df.copy()
    y = None
    le = None
    target_column = 'salary_category'

    # 1. Handle Target Variable
    if target_column in data.columns and is_training:
        target = data[target_column]
        le = LabelEncoder()
        y = le.fit_transform(target)
        logger.info(f"Target '{target_column}' label encoded.")
        # Save encoder and mapping
        os.makedirs('features', exist_ok=True)
        joblib.dump(le, f'features/label_encoder_{timestamp}.joblib')
        mapping = {int(v): k for k, v in zip(le.classes_, le.transform(le.classes_))}
        mapping_file = f'features/target_mapping_{timestamp}.json'
        with open(mapping_file, 'w') as f:
            json.dump(mapping, f, indent=4)
        logger.info("Saved label encoder and mapping.")
    elif not is_training:
        # Load encoder (last file by sorted name)
        try:
            encoder_files = sorted([f for f in os.listdir('features') if f.startswith('label_encoder_')])
            if encoder_files:
                le = joblib.load(f'features/{encoder_files[-1]}')
                logger.info(f"Loaded LE: {encoder_files[-1]}")
            else:
                logger.warning("No LE file found!")
                le = None
        except Exception as e:
            logger.error(f"Failed load LE: {e}")
            le = None
    elif is_training: # Training mode but no target
        logger.error(f"Target column '{target_column}' missing in training data!")
        raise ValueError(f"Target column '{target_column}' not found.")

    # 2. Define Feature Groups
    boolean_features = [f for f in ['feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_10', 'feature_11'] if f in data.columns]
    numerical_features = [f for f in ['feature_2', 'feature_9', 'feature_12'] if f in data.columns]
    job_desc_cols = [col for col in data.columns if col.startswith('job_desc_')]
    all_numerical_features = numerical_features + job_desc_cols

    # 3. Initial Cleaning
    logger.info("Initial cleaning: Numerical and Boolean Features...")
    for col in all_numerical_features:
        if col in data.columns:
            if data[col].dtype == 'object':
                data[col] = data[col].replace(['', ' ', 'NA', 'None', 'NULL'], np.nan)
            data[col] = pd.to_numeric(data[col], errors='coerce')
            # Impute with the column median; fall back to 0 for all-NaN columns.
            median_val = data[col].median()
            fill_value = median_val if not pd.isna(median_val) else 0
            data[col] = data[col].fillna(fill_value)
    for col in boolean_features:
        if col in data.columns:
            numeric_view = pd.to_numeric(data[col], errors='coerce')
            is_boolean_like = numeric_view.dropna().isin([0, 1]).all()
            if is_boolean_like:
                data[col] = numeric_view.fillna(0).astype(int)
            else:
                num_non_bool = numeric_view.dropna().loc[~numeric_view.dropna().isin([0, 1])].count()
                logger.warning(f"Col '{col}' has non-0/1 vals ({num_non_bool}). Treat as numeric, impute median.")
                median_val = numeric_view.median()
                fill_value = median_val if not pd.isna(median_val) else 0
                data[col] = numeric_view.fillna(fill_value)

    logger.info("Starting Feature Engineering...")
    engineered_feature_names = []
    target_encoded_title = 'job_title_encoded' # Define expected name

    # --- Feature Engineering Steps (Combined logic from analysis) ---
    if 'job_title' in data.columns:
        data['job_title'] = data['job_title'].fillna('Unknown')
        title_flags = ['is_senior', 'is_junior', 'is_developer', 'is_specialist']
        title_lower = data['job_title'].str.lower()
        data['is_senior'] = title_lower.str.contains('senior|sr|lead|principal').fillna(False).astype(int)
        data['is_junior'] = title_lower.str.contains('junior|jr|associate|entry').fillna(False).astype(int)
        data['is_developer'] = title_lower.str.contains('develop|programmer|coder|engineer').fillna(False).astype(int)
        data['is_specialist'] = title_lower.str.contains('special|expert|consult').fillna(False).astype(int)
        engineered_feature_names.extend(title_flags)
        # Titles seen fewer than 10 times in this frame are pooled into one bucket.
        title_counts = data['job_title'].value_counts()
        rare_titles = title_counts[title_counts < 10].index
        data['job_title_grouped'] = data['job_title'].apply(lambda x: 'Other_Title' if x in rare_titles else x)
        title_encoder_col = 'job_title_grouped'
        engineered_feature_names.append(target_encoded_title)
        if is_training:
            job_encoder = TargetEncoder(cols=[title_encoder_col], handle_missing='value', handle_unknown='value')
            data[target_encoded_title] = job_encoder.fit_transform(data[[title_encoder_col]], y) # Use encoded y
            joblib.dump(job_encoder, f'features/job_title_encoder_{timestamp}.joblib')
            logger.info(f"Fit/saved TE for {title_encoder_col}")
        else:
            encoder_path = f'features/job_title_encoder_{timestamp}.joblib'
            fallback_files = sorted([f for f in os.listdir('features') if f.startswith('job_title_encoder_')])
            loaded = False

            if os.path.exists(encoder_path):
                try:
                    job_encoder = joblib.load(encoder_path)
                    data[target_encoded_title] = job_encoder.transform(data[[title_encoder_col]])
                    loaded = True
                    logger.info(f"Loaded TE: {encoder_path}")
                except Exception as e:
                    logger.error(f"Failed load TE '{encoder_path}': {e}. Try fallback.")

            if not loaded and fallback_files:
                latest_encoder = fallback_files[-1]
                try:
                    job_encoder = joblib.load(f'features/{latest_encoder}')
                    data[target_encoded_title] = job_encoder.transform(data[[title_encoder_col]])
                    loaded = True
                    logger.info(f"Loaded fallback TE: {latest_encoder}")
                except Exception as e_fb:
                    logger.error(f"Fallback TE failed: {e_fb}. Fill 0.5")
            if not loaded:
                logger.error("No TE found. Fill 0.5")
                data[target_encoded_title] = 0.5
        data = data.drop(['job_title', 'job_title_grouped'], axis=1, errors='ignore')
        logger.info("Processed 'job_title'.")

    if 'job_posted_date' in data.columns:
        data['job_posted_date'] = data['job_posted_date'].fillna('2000/01')

        def extract_year(d):
            # First four characters are taken as the year; 2000 on any parse failure.
            try:
                return int(str(d)[:4])
            except Exception:
                return 2000

        def extract_month(d):
            # Text after the first '/' is taken as the month; 1 on any parse failure.
            try:
                return int(str(d).split('/')[1])
            except Exception:
                return 1

        data['job_posted_year'] = data['job_posted_date'].apply(extract_year)
        data['job_posted_month'] = data['job_posted_date'].apply(extract_month)
        data['job_posted_month'] = data['job_posted_month'].clip(1, 12)
        date_features = ['month_sin', 'month_cos', 'job_recency', 'job_posted_year_norm']
        # Cyclical month encoding plus a monotonically increasing month counter.
        data['month_sin'] = np.sin(2 * np.pi * data['job_posted_month'] / 12)
        data['month_cos'] = np.cos(2 * np.pi * data['job_posted_month'] / 12)
        data['job_recency'] = data['job_posted_year'] * 12 + data['job_posted_month']
        mean_year = 2022
        data['job_posted_year_norm'] = data['job_posted_year'] - mean_year
        engineered_feature_names.extend(date_features)
        data = data.drop(['job_posted_date', 'job_posted_year', 'job_posted_month'], axis=1, errors='ignore')
        logger.info("Processed 'job_posted_date'.")

    num_transform_features = []
    # Feature 9 processing
    if 'feature_9' in data.columns:
        # Quantile bins first; equal-width bins only if qcut fails (the fallback
        # must stay inside the except, otherwise it always overwrites qcut).
        try:
            data['feature_9_bin'] = pd.qcut(data['feature_9'].rank(method='first'), q=5, labels=[0, 1, 2, 3, 4], duplicates='drop').astype(int)
        except ValueError:
            logger.warning("qcut fail f9, use cut.")
            try:
                data['feature_9_bin'] = pd.cut(data['feature_9'], bins=5, labels=[0, 1, 2, 3, 4], include_lowest=True, duplicates='drop').astype(int)
            except Exception as e_cut:
                logger.error(f"Cut fail f9: {e_cut}. Bin 0.")
                data['feature_9_bin'] = 0
        data['feature_9_bin'] = data['feature_9_bin'].fillna(int(data['feature_9_bin'].median())) # Fill potential NaNs from cut
        num_transform_features.append('feature_9_bin')
        logger.info("Added bin f9.")
        if 'feature_2' in data.columns:
            interaction_name = 'feature_2_9_interaction'
            data[interaction_name] = data['feature_2'] * data['feature_9']
            num_transform_features.append(interaction_name)
            logger.info(f"Added: {interaction_name}")

    # Feature 2 processing
    if 'feature_2' in data.columns:
        # Log transform (handle potential zeros/negatives if necessary)
        data['feature_2_log'] = np.log1p(data['feature_2']) # Add log transform
        num_transform_features.append('feature_2_log')

        data['feature_2_squared'] = data['feature_2'] ** 2
        data['feature_2_sqrt'] = np.sqrt(np.abs(data['feature_2']))
        num_transform_features.extend(['feature_2_squared', 'feature_2_sqrt'])
        try:
            data['feature_2_bin'] = pd.qcut(data['feature_2'].rank(method='first'), q=5, labels=[0, 1, 2, 3, 4], duplicates='drop').astype(int)
        except ValueError:
            logger.warning("qcut fail f2, use cut.")
            try:
                data['feature_2_bin'] = pd.cut(data['feature_2'], bins=5, labels=[0, 1, 2, 3, 4], include_lowest=True, duplicates='drop').astype(int)
            except Exception as e_cut:
                logger.error(f"Cut fail f2: {e_cut}. Bin 0.")
                data['feature_2_bin'] = 0
        data['feature_2_bin'] = data['feature_2_bin'].fillna(int(data['feature_2_bin'].median())) # Fill potential NaNs from cut
        num_transform_features.append('feature_2_bin')
        logger.info("Added transforms f2 (sq, sqrt, bin).")
    engineered_feature_names.extend(num_transform_features)

    bool_agg_features = []
    actual_boolean_cols = [col for col in boolean_features if col in data.columns]
    if actual_boolean_cols:
        data['boolean_sum'] = data[actual_boolean_cols].sum(axis=1)
        data['boolean_sum_squared'] = data['boolean_sum'] ** 2
        bool_agg_features.extend(['boolean_sum', 'boolean_sum_squared'])
        logger.info("Added bool aggs.")
    else:
        data['boolean_sum'] = 0
        data['boolean_sum_squared'] = 0
        logger.info("No bool features for agg.")
    engineered_feature_names.extend(bool_agg_features)

    if 'feature_10' in data.columns and 'feature_8' in data.columns:
        interaction_name = 'feature_10_8_interaction'
        data[interaction_name] = data['feature_10'] * data['feature_8']
        engineered_feature_names.append(interaction_name)
        logger.info(f"Added: {interaction_name}")

    # --- NEW Interactions based on Analysis ---
    new_interactions = []
    if 'feature_2' in data.columns:
        if target_encoded_title in data.columns:
            int_name = f'feat2_{target_encoded_title}'
            data[int_name] = data['feature_2'] * data[target_encoded_title]
            new_interactions.append(int_name)
        if 'boolean_sum' in data.columns:
            int_name = 'feat2_boolsum'
            data[int_name] = data['feature_2'] * data['boolean_sum']
            new_interactions.append(int_name)
        if 'job_recency' in data.columns:
            int_name = 'feat2_recency'
            data[int_name] = data['feature_2'] * data['job_recency']
            new_interactions.append(int_name)
        # Interaction with top PCA component (assuming pca_0 exists)
        # NOTE(review): 'job_desc_pca_0' is only created further down in this
        # function, so this branch fires only if the input already contains
        # that column. Left as-is to keep the saved feature schema unchanged.
        if 'job_desc_pca_0' in data.columns:
            int_name = 'feat2_pca0'
            data[int_name] = data['feature_2'] * data['job_desc_pca_0']
            new_interactions.append(int_name)
    if target_encoded_title in data.columns and 'job_recency' in data.columns:
        int_name = f'{target_encoded_title}_recency'
        data[int_name] = data[target_encoded_title] * data['job_recency']
        new_interactions.append(int_name)
    if new_interactions:
        logger.info(f"Added new interactions: {new_interactions}")
        engineered_feature_names.extend(new_interactions)
    # --- End New Interactions ---

    job_desc_eng_features = []
    if job_desc_cols:
        desc_agg = ['job_desc_mean', 'job_desc_std', 'job_desc_min', 'job_desc_max', 'job_desc_sum', 'job_desc_q25', 'job_desc_q75', 'job_desc_iqr']
        # Row-wise statistics over the original job_desc_* columns only.
        desc_values = data[job_desc_cols]
        data['job_desc_mean'] = desc_values.mean(axis=1)
        data['job_desc_std'] = desc_values.std(axis=1).fillna(0)
        data['job_desc_min'] = desc_values.min(axis=1)
        data['job_desc_max'] = desc_values.max(axis=1)
        data['job_desc_sum'] = desc_values.sum(axis=1)
        data['job_desc_q25'] = desc_values.quantile(0.25, axis=1)
        data['job_desc_q75'] = desc_values.quantile(0.75, axis=1)
        data['job_desc_iqr'] = data['job_desc_q75'] - data['job_desc_q25']
        job_desc_eng_features.extend(desc_agg)
        n_pca_components = 15 # Keep default or adjust based on experiments
        if len(job_desc_cols) > n_pca_components:
            logger.info(f"Applying PCA (n={n_pca_components}) to job desc...")
            pca_names = [f'job_desc_pca_{i}' for i in range(n_pca_components)]
            job_desc_eng_features.extend(pca_names)
            job_desc_pca_result = None
            if is_training:
                pca = PCA(n_components=n_pca_components, random_state=42)
                job_desc_pca_result = pca.fit_transform(data[job_desc_cols])
                joblib.dump(pca, f'features/job_desc_pca_{timestamp}.joblib')
                logger.info("Fit/saved PCA.")
            else:
                pca_path = f'features/job_desc_pca_{timestamp}.joblib'
                fallback_files = sorted([f for f in os.listdir('features') if f.startswith('job_desc_pca_')])
                pca_loaded = False
                pca = None
                if os.path.exists(pca_path):
                    try:
                        pca = joblib.load(pca_path)
                        pca_loaded = True
                        logger.info(f"Loaded PCA: {pca_path}")
                    except Exception as e:
                        logger.error(f"Fail load PCA: {e}. Try fallback.")
                if not pca_loaded and fallback_files:
                    latest_pca = fallback_files[-1]
                    try:
                        pca = joblib.load(f'features/{latest_pca}')
                        pca_loaded = True
                    except Exception as e_fb:
                        logger.error(f"Fallback PCA load failed: {e_fb}.")
                if pca_loaded and pca is not None:
                    try:
                        job_desc_pca_result = pca.transform(data[job_desc_cols])
                    except Exception as e_trans:
                        logger.error(f"PCA transform fail: {e_trans}. Fill 0.")
                # Zero-fill ONLY when no PCA projection could be produced; a
                # successful transform must not be overwritten.
                if job_desc_pca_result is None:
                    logger.error("PCA result None. Fill 0.")
                    job_desc_pca_result = np.zeros((data.shape[0], n_pca_components))
            for i in range(min(n_pca_components, job_desc_pca_result.shape[1])):
                data[pca_names[i]] = job_desc_pca_result[:, i]
        else:
            logger.warning("Skip PCA: Not enough features.")
        data = data.drop(columns=job_desc_cols, errors='ignore')
        logger.info("Finished job desc features.")
    else:
        logger.info("No job desc features.")
    engineered_feature_names.extend(job_desc_eng_features)

    # --- Robust Categorical Handling ---
    if 'job_state' in data.columns:
        data['job_state'] = data['job_state'].fillna('Unknown')
    if 'feature_1' in data.columns:
        data['feature_1'] = data['feature_1'].fillna('Unknown')

    # One indicator column per value supplied by the caller, so train and test
    # get the same columns regardless of which values occur in each frame.
    manual_ohe_features = []
    logger.info(f"Applying manual OHE for 'job_state' ({len(all_states)} unique).")
    if 'job_state' in data.columns:
        for state in all_states:
            col_name = f'state_{state}'
            data[col_name] = (data['job_state'] == state).astype(int)
            manual_ohe_features.append(col_name)
        data = data.drop('job_state', axis=1, errors='ignore')
    else:
        logger.warning("'job_state' not found for OHE.")

    logger.info(f"Applying manual OHE for 'feature_1' ({len(all_feature1)} unique).")
    if 'feature_1' in data.columns:
        for feat in all_feature1:
            col_name = f'feat1_{feat}'
            data[col_name] = (data['feature_1'] == feat).astype(int)
            manual_ohe_features.append(col_name)
        data = data.drop('feature_1', axis=1, errors='ignore')
    else:
        logger.warning("'feature_1' not found for OHE.")
    engineered_feature_names.extend(manual_ohe_features)
    # --- End FE ---

    # 5. Final Cleanup and Column Management
    logger.info("Final cleanup and column alignment...")
    columns_to_exclude = ['obs']
    if is_training and target_column in df.columns:
        columns_to_exclude.append(target_column)
    potential_feature_cols = [col for col in data.columns if col not in columns_to_exclude]

    inf_cols_handled = []
    nan_cols_handled = []
    for col in potential_feature_cols:
        if pd.api.types.is_numeric_dtype(data[col]):
            # Inf -> NaN first, then every remaining NaN -> 0.
            if np.isinf(data[col]).any():
                inf_cols_handled.append(col)
                data[col] = data[col].replace([np.inf, -np.inf], np.nan)
            if data[col].isnull().any():
                nan_cols_handled.append(col)
                data[col] = data[col].fillna(0)
    if inf_cols_handled:
        logger.warning(f"Replaced Inf: {inf_cols_handled}")
    final_nan_cols = list(set(nan_cols_handled) - set(inf_cols_handled))
    if final_nan_cols:
        logger.info(f"Filled NaNs with 0: {final_nan_cols}")
    for col in potential_feature_cols:
        if data[col].dtype == 'bool':
            data[col] = data[col].astype(int)

    if is_training:
        # Report (but keep) constant and very low-cardinality columns.
        for col in potential_feature_cols:
            nunique_val = data[col].nunique(dropna=False)
            if nunique_val <= 1:
                logger.warning(f"Col '{col}' constant (nunique={nunique_val}) in train. Engineered: {col in engineered_feature_names}. Keeping col.")
            elif nunique_val <= 3 and col in engineered_feature_names:
                logger.info(f"Eng col '{col}' low card (nunique={nunique_val}) in train.")

        final_feature_columns = potential_feature_cols
        joblib.dump(final_feature_columns, f'features/feature_columns_{timestamp}.joblib')
        logger.info(f"Saved {len(final_feature_columns)} feature names (const cols NOT dropped).")
        X = data[final_feature_columns]
        logger.info(f"Preprocessing train done. Shape: {X.shape}. Time: {time.time() - start_time:.2f}s")
        try:
            X.head().to_csv(f'features/processed_features_head_{timestamp}.csv', index=False)
        except Exception as e:
            logger.warning(f"Could not save head: {e}")
        return X, y, final_feature_columns, le
    else: # Test Data
        if feature_columns_to_use is None:
            try:
                col_files = sorted([f for f in os.listdir('features') if f.startswith('feature_columns_')])
                if col_files:
                    latest_col = col_files[-1]
                    feature_columns_to_use = joblib.load(f'features/{latest_col}')
                    logger.info(f"Loaded {len(feature_columns_to_use)} cols from: {latest_col}")
                else:
                    logger.error("CRITICAL: No feature_columns file.")
                    raise FileNotFoundError("feature_columns_*.joblib missing.")
            except Exception as e:
                logger.error(f"Failed to load feature columns: {e}.")
                raise

        extra_cols = list(set(data.columns) - set(feature_columns_to_use) - set(columns_to_exclude))
        missing_cols = [col for col in feature_columns_to_use if col not in data.columns]
        # Align to the training schema in one step: keeps the row index, orders
        # the columns as in training and fills columns absent from the test
        # data with 0 (column-by-column assignment into an empty frame could
        # leave NaN in columns assigned before the index was established).
        X = data.reindex(columns=feature_columns_to_use, fill_value=0)
        if missing_cols:
            logger.warning(f"Cols missing in test (filled 0): {missing_cols}")
        if extra_cols:
            logger.warning(f"Cols extra in test (dropped align): {extra_cols}")
        logger.info(f"Preprocessing test done. Shape: {X.shape}. Time: {time.time() - start_time:.2f}s")
        return X, y, feature_columns_to_use, le # y is None

In [None]:
import numpy as np
import pandas as pd
import logging
import time
import json
import os
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight  # Added compute_sample_weight
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from scikeras.wrappers import KerasClassifier
# Imports for models used within the function
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier,
                              ExtraTreesClassifier, AdaBoostClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
import joblib  # For saving models
from sklearn.calibration import CalibratedClassifierCV  # For calibration step

# Names this cell expects to exist already (defined in other cells):
#   - build_keras_model: assumed to be defined elsewhere
#   - logger: assumed to be configured globally, e.g.
#       logger = logging.getLogger(__name__)
#   - save_feature_importance: assumed to be defined
# Early-stopping patience (in boosting rounds) used inside optimize_model:
# passed as CatBoost's `early_stopping_rounds` fit argument and to the
# `lgb.early_stopping` callback for LightGBM.
BOOSTING_EARLY_STOPPING_PATIENCE = 50
def optimize_model(X, y, timestamp, model_type, n_trials=30, n_jobs_optuna=1):
    """
    Optimizes hyperparameters for a given model type using Optuna,
    then trains and saves the final model with best parameters.
    Includes class_weight='balanced' or equivalent strategies.
    Correctly handles Optuna-specific trial parameters during final instantiation.
    Attempts calibration after successful model saving.
    """
    logger.info(f"Starting {model_type} optimization ({n_trials} trials)...")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    if not isinstance(y, (np.ndarray, pd.Series)):
        y = np.array(y)
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X)  # Ensure DataFrame for consistent .iloc

    n_classes = len(np.unique(y))
    n_features = X.shape[1]
    y_keras = to_categorical(y, num_classes=n_classes) if model_type == 'keras_mlp' else y

    KERAS_EPOCHS = 150  # Reduced Keras epochs slightly
    KERAS_PATIENCE = 25  # Increased Keras patience slightly
    OPTUNA_TIMEOUT_PER_MODEL = 3600  # Default 1 hour
    # Increase timeout for complex models
    if model_type in ['xgboost', 'catboost', 'randomforest', 'gradientboosting', 'extratrees', 'lightgbm']:
        OPTUNA_TIMEOUT_PER_MODEL = 7200  # Increase to 2 hours
    logger.info(f"Optuna timeout for {model_type}: {OPTUNA_TIMEOUT_PER_MODEL}s.")

    # --- Optuna Objective Function ---
    def objective(trial):
        """
        Optuna objective: build one candidate model of `model_type` from
        trial-suggested hyperparameters and score it with stratified CV.

        Reads from the enclosing scope: `model_type`, `n_classes`,
        `n_jobs_optuna`, `skf`, `X`, `y`, `y_keras`.

        Args:
            trial: Optuna trial used to sample hyperparameters and to store
                user attributes (class-weight choice, AdaBoost base depth, ...).

        Returns:
            float: mean accuracy over the `skf` folds, or 0.0 if any fold
            fails (the failure is logged rather than raised).

        Raises:
            ValueError: if `model_type` is not one of the supported types.
        """

        def _take_rows(obj, idx):
            # Positional row selection that works for pandas objects and for
            # plain arrays. X and y are handled independently because X may be
            # a DataFrame while y is still a numpy array.
            if isinstance(obj, (pd.DataFrame, pd.Series)):
                return obj.iloc[idx]
            return obj[idx]

        model = None
        fit_params = {}
        # No Keras model is built in this objective; the flag only keeps the
        # label/prediction handling below generic.
        is_keras = False

        # NOTE(review): several branches below pass dicts as categorical
        # choices. Optuna documents categorical choices as None/bool/int/float/str
        # for persistent storage, and dict keys can come back as strings; the
        # parameter names and choices are kept unchanged here so that code
        # consuming `best_params` keeps working.

        # --- Model Definitions for Optuna Trial (Includes custom weights/params) ---
        if model_type == 'xgboost':
            tree_method = trial.suggest_categorical('tree_method', ['hist', 'gpu_hist'])
            param = {
                'objective': 'multi:softprob',  # probabilities for every class
                'num_class': n_classes,
                'eval_metric': 'mlogloss',
                'n_estimators': trial.suggest_int('n_estimators', 300, 5000, step=100),
                'max_depth': trial.suggest_int('max_depth', 4, 26, step=1),
                'learning_rate': trial.suggest_float('learning_rate', 0.007, 0.15, log=True),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),  # regularization
                'gamma': trial.suggest_float('gamma', 1e-7, 1.0, log=True),  # regularization
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-7, 20.0, log=True),  # regularization
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-7, 20.0, log=True),  # regularization
                'random_state': 42,
                'booster': 'gbtree',
                'tree_method': tree_method,
            }
            if tree_method == 'gpu_hist':
                param['gpu_id'] = 0
            else:
                # Single CPU thread per model for stability inside Optuna.
                param['n_jobs'] = 1

            model = XGBClassifier(**param)
            # NOTE(review): no early stopping is configured for XGBoost; the
            # eval_set passed in the CV loop is only evaluated, never used to
            # stop training.
            fit_params = {}

        elif model_type == 'catboost':
            task_type = trial.suggest_categorical('task_type', ['CPU', 'GPU'])
            # Class-weight strategies: none, CatBoost's automatic balancing, or
            # explicit per-class multipliers keyed by encoded label.
            class_weight_options = [
                None,
                'Balanced',
                {0: 1.0, 1: 1.1, 2: 1.1},  # classes 1 and 2 up-weighted by 10%
                {0: 1.0, 1: 1.0, 2: 1.2},  # class 2 up-weighted by 20%
                {0: 1.0, 1: 1.0, 2: 1.3},  # class 2 up-weighted by 30%
            ]
            chosen_class_weight_config = trial.suggest_categorical("class_weight_config",class_weight_options)

            param = {
                'iterations': trial.suggest_int('iterations', 300, 5500, step=100),
                'depth': trial.suggest_int('depth', 4, 16),
                'learning_rate': trial.suggest_float('learning_rate', 0.007, 0.15, log=True),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.5, 30.0, log=True),  # regularization
                'random_strength': trial.suggest_float('random_strength', 1e-3, 10.0, log=True),
                'border_count': trial.suggest_categorical('border_count', [32, 64, 128, 254]),
                'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 0.9),
                'loss_function': 'MultiClass',
                'eval_metric': 'Accuracy',
                'random_seed': 42,
                'thread_count': -1,  # only kept for task_type='CPU' (removed below for GPU)
                'verbose': False,
                'task_type': task_type,
            }
            # Apply the chosen class-weight strategy and record it on the trial.
            if isinstance(chosen_class_weight_config, dict):
                param['class_weights'] = chosen_class_weight_config
                trial.set_user_attr("class_weight_info", chosen_class_weight_config)
            elif chosen_class_weight_config == 'Balanced':
                param['auto_class_weights'] = 'Balanced'
                trial.set_user_attr("class_weight_info", 'Balanced')
            else:  # None case
                trial.set_user_attr("class_weight_info", 'None')

            if task_type == 'GPU':
                param['devices'] = '0'
                param.pop('thread_count', None)  # not passed for GPU runs

            model = CatBoostClassifier(**param)
            # Early stopping is requested through fit(); eval_set is added per fold.
            fit_params = {'early_stopping_rounds': BOOSTING_EARLY_STOPPING_PATIENCE, 'verbose': False}

        elif model_type == 'randomforest':
            # sklearn's built-in balancing modes plus explicit multipliers for class 2.
            class_weight_choices = [
                'balanced',
                'balanced_subsample',
                {0: 1.0, 1: 1.0, 2: 1.1},
                {0: 1.0, 1: 1.0, 2: 1.2},
                {0: 1.0, 1: 1.0, 2: 1.3},
            ]
            class_weight = trial.suggest_categorical('class_weight', class_weight_choices)
            param = {
                'n_estimators': trial.suggest_int('n_estimators', 300, 5500, step=100),
                'max_depth': trial.suggest_int('max_depth', 10, 100, step=2),  # deep trees, rely on leaf constraints
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 50),  # regularization
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 30),  # regularization
                'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.6, 0.8]),
                'bootstrap': True,
                'class_weight': class_weight,
                'random_state': 42,
                'n_jobs': n_jobs_optuna,
                'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
                'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 0.05),  # slight pruning
            }
            model = RandomForestClassifier(**param)
            fit_params = {}

        elif model_type == 'extratrees':
            class_weight_choices = [
                'balanced',
                'balanced_subsample',
                {0: 1.0, 1: 1.0, 2: 1.1},
                {0: 1.0, 1: 1.0, 2: 1.2},
                {0: 1.0, 1: 1.0, 2: 1.4},
            ]
            class_weight = trial.suggest_categorical('class_weight', class_weight_choices)
            param = {
                'n_estimators': trial.suggest_int('n_estimators', 300, 5500, step=500),
                'max_depth': trial.suggest_int('max_depth', 10, 80, step=2),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 40),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 30),
                'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.6, 0.8]),
                'bootstrap': trial.suggest_categorical('bootstrap', [False, True]),  # tuned for ET
                'class_weight': class_weight,
                'random_state': 42,
                'n_jobs': n_jobs_optuna,
                'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
                'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 0.05),  # slight pruning
            }
            model = ExtraTreesClassifier(**param)
            fit_params = {}

        elif model_type == 'gradientboosting':
            param = {
                'n_estimators': trial.suggest_int('n_estimators', 200, 3500, step=100),
                'learning_rate': trial.suggest_float('learning_rate', 0.007, 0.15, log=True),
                'max_depth': trial.suggest_int('max_depth', 3, 14),
                'min_samples_split': trial.suggest_int('min_samples_split', 5, 50),  # regularization
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 3, 40),  # regularization
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.7, 0.9]),
                'random_state': 42,
                'loss': 'log_loss',  # keeps predict_proba available
                'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.2),
            }
            model = GradientBoostingClassifier(**param)
            # Class balancing is done with per-fold sample weights in the CV loop.
            fit_params = {}

        elif model_type == 'adaboost':
            base_depth = trial.suggest_int('base_estimator_max_depth', 1, 6)
            class_weights_options = [
                'balanced',
                {0: 1.0, 1: 1.0, 2: 1.1},
                {0: 1.0, 1: 1.0, 2: 1.2},
                {0: 1.0, 1: 1.0, 2: 1.4},
            ]
            weight_choice = trial.suggest_categorical('class_weight_choice', class_weights_options)
            param_ada = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 5000, step=50),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 2.0, log=True),
                'algorithm':'SAMME',  # fixed; not tuned
                'random_state': 42
            }
            # The class-weight choice is applied on the base decision tree.
            base_est = DecisionTreeClassifier(max_depth=base_depth, random_state=42, class_weight=weight_choice)
            model = AdaBoostClassifier(estimator=base_est, **param_ada)
            # Stored on the trial so the final model can be reconstructed later.
            trial.set_user_attr("base_estimator_max_depth", base_depth)
            trial.set_user_attr("class_weight_info", weight_choice if isinstance(weight_choice, dict) else 'balanced' if weight_choice=='balanced' else 'None')
            trial.set_user_attr("algorithm", param_ada['algorithm'])
            fit_params = {}

        elif model_type == 'lightgbm':
            class_weight_options = [
                None,
                'balanced',
                {0: 1.0, 1: 1.0, 2: 1.1},  # class 2 up-weighted by 10%
                {0: 1.0, 1: 1.0, 2: 1.2},  # class 2 up-weighted by 20%
                {0: 1.0, 1: 1.0, 2: 1.4},  # class 2 up-weighted by 40%
            ]
            class_weight = trial.suggest_categorical('class_weight_option', class_weight_options)
            param = {
                'objective': 'multiclass',
                'num_class': n_classes,
                'metric': 'multi_logloss',
                'n_estimators': trial.suggest_int('n_estimators', 300, 5500, step=100),
                'learning_rate': trial.suggest_float('learning_rate', 0.007, 0.15, log=True),
                'num_leaves': trial.suggest_int('num_leaves', 20, 500, step=5),
                'max_depth': trial.suggest_int('max_depth', 5, 50),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 20.0, log=True),  # regularization
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 20.0, log=True),  # regularization
                'min_child_samples': trial.suggest_int('min_child_samples', 3, 60),  # regularization
                'class_weight': class_weight,
                'random_state': 42,
                'n_jobs': n_jobs_optuna,
                'verbose': -1,
                'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart'])
            }
            model = lgb.LGBMClassifier(**param)
            # LightGBM takes early stopping as a callback passed to fit().
            fit_params = {'callbacks': [lgb.early_stopping(BOOSTING_EARLY_STOPPING_PATIENCE, verbose=False)]}

        else:
            logger.error(f"Unsupported model type: {model_type}")
            raise ValueError(f"Unsupported: {model_type}")

        # --- Cross-validation ---
        scores = []
        try:
            for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
                # Select rows by position; X and y are each indexed according
                # to their own type (previously y.iloc was used whenever X was
                # a DataFrame, which failed for numpy-array labels).
                X_train_fold = _take_rows(X, train_idx)
                X_valid_fold = _take_rows(X, valid_idx)
                y_train_fold = y_keras[train_idx] if is_keras else _take_rows(y, train_idx)
                y_valid_fold_orig = _take_rows(y, valid_idx)
                current_fit_params = fit_params.copy()

                # --- Handle Sample Weights for Models That Need It in Fit ---
                if model_type in ['gradientboosting']:
                    # Balanced weights, then an extra multiplier per encoded class.
                    sample_weight = compute_sample_weight('balanced', y=y_train_fold)
                    emphasis_weights = {0: 1.0, 1: 1.0, 2: 1.2}
                    # Plain array for boolean masking (hoisted out of the loop).
                    y_train_fold_np = y_train_fold.values if isinstance(y_train_fold, pd.Series) else y_train_fold
                    for cls_idx, weight_multiplier in emphasis_weights.items():
                        sample_weight[y_train_fold_np == cls_idx] *= weight_multiplier
                    current_fit_params['sample_weight'] = sample_weight
                    logger.debug(f"Trial {trial.number} Fold {fold+1}: Applied sample weights for {model_type}")

                try:
                    # Validation fold handed to the boosting libraries for
                    # evaluation / early stopping.
                    eval_set = [(X_valid_fold, y_valid_fold_orig)]

                    if model_type == 'xgboost':
                        model.fit(X_train_fold, y_train_fold,
                                  eval_set=eval_set,
                                  verbose=False)
                    elif model_type == 'lightgbm':
                        # The early-stopping callback is already in fit_params.
                        current_fit_params['eval_set'] = eval_set
                        current_fit_params['eval_metric'] = 'multi_logloss'
                        model.fit(X_train_fold, y_train_fold, **current_fit_params)
                    elif model_type == 'catboost':
                        # early_stopping_rounds/verbose come from fit_params.
                        current_fit_params['eval_set'] = eval_set
                        model.fit(X_train_fold, y_train_fold, **current_fit_params)
                    else:
                        # Carries sample_weight when it was set above (gradient boosting).
                        model.fit(X_train_fold, y_train_fold, **current_fit_params)

                    # Predict and score
                    y_pred = model.predict(X_valid_fold)
                    if is_keras and y_pred.ndim > 1 and y_pred.shape[1] > 1:
                        y_pred = np.argmax(y_pred, axis=1)
                    score = accuracy_score(y_valid_fold_orig, y_pred)
                    scores.append(score)
                    logger.debug(f"Trial {trial.number} Fold {fold+1} Score: {score:.5f}")

                except ValueError as ve:
                    # Typically an invalid parameter combination: give the trial
                    # the worst score instead of aborting the study.
                    logger.warning(f"CV fold {fold+1} VAL ERROR {model_type} trial {trial.number}: {ve}")
                    return 0.0
                except Exception as e:
                    # Full traceback is logged; partial scores are discarded so
                    # the trial is reported as failed below.
                    logger.error(f"CV fold {fold+1} EXCEPTION {model_type} trial {trial.number}: {e}", exc_info=True)
                    scores = []
                    break
        except Exception as outer_e:
            logger.error(f"Outer CV error {model_type} trial {trial.number}: {outer_e}", exc_info=True)
            return 0.0
        if not scores:
            logger.error(f"Cross-validation failed completely for {model_type} trial {trial.number}")
            return 0.0
        mean_score = np.mean(scores)
        logger.debug(f"Trial {trial.number} ({model_type}) completed. Avg CV Score: {mean_score:.5f}")
        return mean_score







    # --- Run Optuna Study ---
    study_name = f"{model_type}_opt_{timestamp}"
    storage_name = f"sqlite:///optuna_trials/{study_name}.db"
    study = optuna.create_study(direction='maximize', study_name=study_name, storage=storage_name, 
                              load_if_exists=True, pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
    completed_trials = len([t for t in study.trials if t.state==optuna.trial.TrialState.COMPLETE])
    trials_to_run = n_trials-completed_trials
    
    if trials_to_run > 0:
        logger.info(f"Setting Optuna timeout {OPTUNA_TIMEOUT_PER_MODEL}s.")
        try:
            study.optimize(objective, n_trials=trials_to_run, timeout=OPTUNA_TIMEOUT_PER_MODEL, n_jobs=1)
        except Exception as opt_e:
            logger.error(f"Optuna optimize fail {model_type}: {opt_e}", exc_info=True)
            return None, -1, {}
    else:
        logger.info(f"Study {study_name} has {completed_trials} trials. Skip optimize.")

    # --- Retrieve Results ---
    try:
        if not any(t.state == optuna.trial.TrialState.COMPLETE for t in study.trials):
            logger.error(f"Optuna study {model_type} no successful trials.")
            return None, -1, {}
        best_trial = study.best_trial
        best_params = best_trial.params
        best_cv_score = best_trial.value
    except ValueError:
        logger.error(f"Optuna study {model_type} no best trial.")
        return None, -1, {}
    except Exception as res_e:
        logger.error(f"Error get Optuna results {model_type}: {res_e}", exc_info=True)
        return None, -1, {}
    logger.info(f"Opt complete {model_type}. Best CV score: {best_cv_score:.5f}. Best params: {best_params}")

    # --- Save Study Summary ---
    try:
        summary_file = f'optuna_trials/{model_type}_study_summary_{timestamp}.txt'
        params_json = best_params.copy()
        if model_type=='adaboost' and "base_estimator_max_depth" in best_trial.user_attrs:
            params_json['base_estimator_max_depth'] = best_trial.user_attrs["base_estimator_max_depth"]
            params_json['class_weight_info'] = best_trial.user_attrs.get("class_weight_info", "N/A")
        if model_type=='xgboost' and 'tree_method' in best_params:
            params_json['tree_method'] = best_params['tree_method']
        if model_type=='catboost' and 'task_type' in best_params:
            params_json['task_type'] = best_params['task_type']
        with open(summary_file, 'w') as f:
            f.write(f"Optuna Summary: {model_type}\nTS: {timestamp}\nBest Trial: {best_trial.number}\nScore: {best_cv_score:.5f}\n\nParams:\n")
            json.dump(params_json, f, indent=4)
        logger.info(f"Saved Optuna summary: {summary_file}")
    except Exception as file_e:
        logger.warning(f"Could not save Optuna summary {model_type}: {file_e}")

    # --- Train final model ---
    final_model = None
    final_fit_params = {}  # Reset for final fit
    try:
        logger.info(f"Instantiating final {model_type} model...")
        # Clean best_params from Optuna-specific args before final instantiation
        params_for_final = best_params.copy()
        optuna_internal_params = ['class_weight_option', 'class_weight_choice', 'class_weight_idx', 
                                 'class_weight_strategy', 'use_smote', 'smote_k', 
                                 'use_focal_loss', 'focal_gamma']  # Params used only in objective logic
        for p in optuna_internal_params:
            params_for_final.pop(p, None)

        # Inside optimize_model, after Optuna, in elif model_type == 'adaboost':
        if model_type == 'adaboost':
            # Clean best_params from Optuna-specific args before final instantiation
            params_for_final = best_params.copy()
            # List internal params used only during Optuna trials
            optuna_internal_params = ['class_weight_choice', 'base_estimator_max_depth']
            for p in optuna_internal_params:
                # --- THESE LINES REMOVE THE BAD PARAMETERS ---
                params_for_final.pop(p, None)
                # --- END OF REMOVAL ---

            # Retrieve correct values from Optuna trial attributes
            best_d = best_trial.user_attrs.get('base_estimator_max_depth', 1)
            weight_info_raw = best_trial.user_attrs.get("class_weight_info", 'balanced')
            
            # --- *** ADDED: Convert dictionary keys if needed *** ---
            weight_info_processed = weight_info_raw
            if isinstance(weight_info_raw, dict):
                try:
                    # Convert string keys ('0', '1', ...) to integers (0, 1, ...)
                    weight_info_processed = {int(k): v for k, v in weight_info_raw.items()}
                    logger.info(f"Converted AdaBoost class_weight keys to int: {weight_info_processed}")
                except ValueError as e:
                     logger.error(f"Error converting AdaBoost class_weight keys: {e}. Using raw: {weight_info_raw}")
                     weight_info_processed = weight_info_raw # Fallback to raw if conversion fails
            # --- *** END KEY CONVERSION *** ---
            
            logger.info(f"Reconstruct AdaBoost DT(max_depth={best_d}, class_weight={weight_info_processed}) using SAMME")
            # Create the base estimator correctly
            base_est_inst = DecisionTreeClassifier(max_depth=best_d, random_state=42, class_weight=weight_info_processed)

            final_p_ada = params_for_final # Use the cleaned dictionary for AdaBoost itself
            final_p_ada['algorithm'] = 'SAMME'
            # --- FINAL MODEL TRAINING WILL STILL HAPPEN USING base_est_inst and final_p_ada ---
            final_model = AdaBoostClassifier(estimator=base_est_inst, **final_p_ada)

        elif model_type == 'xgboost':
            final_params_xgb = params_for_final.copy()
            final_params_xgb['objective'] = 'multi:softprob'
            final_params_xgb['num_class'] = n_classes
            final_params_xgb['n_jobs'] = 1
            logger.info("XGBoost final model - balancing via sample_weight in fit.")
            final_model = XGBClassifier(**final_params_xgb)
            # Prepare sample weights for fit step
            sample_weights_xgb = compute_sample_weight('balanced', y=y)  # Start with balanced
            emphasis_weights = {0: 1.0, 1: 1.0, 2: 1.2}  # Emphasize High/Medium
            for cls_idx, weight_multiplier in emphasis_weights.items():
                sample_weights_xgb[y == cls_idx] *= weight_multiplier
            final_fit_params['sample_weight'] = sample_weights_xgb

        elif model_type == 'catboost':
            final_params_cat = params_for_final.copy()
            final_params_cat['loss_function'] = 'MultiClass'
            final_params_cat['verbose'] = False
            
            # Remove the problematic parameter that's causing the error
            if 'class_weight_config' in final_params_cat:
                final_params_cat.pop('class_weight_config')
            
            # Re-apply class weight strategy based on best trial's choice
            chosen_weight = best_params.get('class_weight_config')  # Use 'class_weight_config' to match the Optuna trial parameter
            
            # Handle dictionary class weights with proper key conversion
            if isinstance(chosen_weight, dict):
                try:
                    # Convert string keys ('0', '1', ...) to integers (0, 1, ...)
                    chosen_weight_processed = {int(k): v for k, v in chosen_weight.items()}
                    logger.info(f"Converted CatBoost class_weights keys to int: {chosen_weight_processed}")
                    final_params_cat['class_weights'] = chosen_weight_processed
                except ValueError as e:
                    logger.error(f"Error converting CatBoost class_weights keys: {e}. Using raw: {chosen_weight}")
                    final_params_cat['class_weights'] = chosen_weight  # Fallback to raw if conversion fails
                logger.info(f"CatBoost using custom weights: {final_params_cat['class_weights']}")
            elif chosen_weight == 'Balanced':
                final_params_cat['auto_class_weights'] = 'Balanced'
                logger.info("CatBoost using auto_class_weights=Balanced")
            else:
                logger.info("CatBoost using default balancing or no weights.")
            
            final_model = CatBoostClassifier(**final_params_cat)

        elif model_type == 'randomforest':
            final_params_rf = params_for_final.copy()
            final_params_rf['n_jobs'] = n_jobs_optuna
            
            # --- *** ADDED: Process class_weight dictionary keys *** ---
            class_weight_raw = best_params.get('class_weight', 'balanced')
            class_weight_processed = class_weight_raw
            if isinstance(class_weight_raw, dict):
                 try:
                     # Convert string keys ('0', '1', ...) to integers (0, 1, ...)
                     class_weight_processed = {int(k): v for k, v in class_weight_raw.items()}
                     logger.info(f"Converted RF class_weight keys to int: {class_weight_processed}")
                 except ValueError as e:
                      logger.error(f"Error converting RF class_weight keys: {e}. Using raw: {class_weight_raw}")
                      class_weight_processed = class_weight_raw # Fallback
            # --- *** END KEY CONVERSION *** ---
            
            final_params_rf['class_weight'] = class_weight_processed
            logger.info(f"RF final model using class_weight={final_params_rf['class_weight']}")
            final_model = RandomForestClassifier(**final_params_rf)

        elif model_type == 'extratrees':
            final_params_et = params_for_final.copy()
            final_params_et['n_jobs'] = n_jobs_optuna
            
            # --- *** ADDED: Process class_weight dictionary keys *** ---
            class_weight_raw = best_params.get('class_weight', 'balanced')
            class_weight_processed = class_weight_raw
            if isinstance(class_weight_raw, dict):
                 try:
                     # Convert string keys ('0', '1', ...) to integers (0, 1, ...)
                     class_weight_processed = {int(k): v for k, v in class_weight_raw.items()}
                     logger.info(f"Converted ET class_weight keys to int: {class_weight_processed}")
                 except ValueError as e:
                      logger.error(f"Error converting ET class_weight keys: {e}. Using raw: {class_weight_raw}")
                      class_weight_processed = class_weight_raw # Fallback
            # --- *** END KEY CONVERSION *** ---
            
            final_params_et['class_weight'] = best_params.get('class_weight', 'balanced')  # Use optimized or default balanced
            logger.info(f"ET final model using class_weight={final_params_et['class_weight']}")
            final_model = ExtraTreesClassifier(**final_params_et)

        elif model_type == 'gradientboosting':
            final_params_gb = params_for_final.copy()
            logger.info("GradientBoosting final model - applying sample_weight in fit")
            final_model = GradientBoostingClassifier(**final_params_gb)
            sample_weights_gb = compute_sample_weight('balanced', y=y)
            emphasis_weights = {0: 1.0, 1: 1.0, 2: 1.2}
            for cls_idx, mult in emphasis_weights.items():
                sample_weights_gb[y == cls_idx] *= mult
            final_fit_params['sample_weight'] = sample_weights_gb

        # Inside optimize_model, after Optuna, in elif model_type == 'knn':

        elif model_type == 'lightgbm':
            final_params_lgbm = params_for_final.copy()
            final_params_lgbm['objective'] = 'multiclass'
            final_params_lgbm['num_class'] = n_classes
            final_params_lgbm['n_jobs'] = n_jobs_optuna
            
            # --- *** ADDED: Refined Key Conversion for LGBM *** ---
            class_weight_value_to_use = 'balanced' # Default
            # Use 'class_weight_option' key from Optuna params for LGBM
            if 'class_weight_option' in best_params:
                class_weight_raw = best_params['class_weight_option'] # Get raw value from Optuna result
                class_weight_processed = class_weight_raw

                if isinstance(class_weight_raw, dict):
                    logger.info(f"Raw class_weight dict found for LGBM: {class_weight_raw}")
                    try:
                        if all(isinstance(k, int) for k in class_weight_raw.keys()):
                            logger.info("LGBM class_weight keys appear to be integers already.")
                            class_weight_processed = class_weight_raw
                        else:
                            logger.info("Attempting conversion of LGBM class_weight keys to int...")
                            class_weight_processed = {int(k): v for k, v in class_weight_raw.items()}
                            logger.info(f"Successfully converted LGBM class_weight keys to int: {class_weight_processed}")
                    except Exception as e_gen:
                         logger.error(f"Error processing LGBM class_weight dict: {e_gen}. Using 'balanced'.")
                         class_weight_processed = 'balanced'
                class_weight_value_to_use = class_weight_processed
            else:
                 logger.info("No 'class_weight_option' found in best_params for LGBM, using default 'balanced'.")
                 class_weight_value_to_use = 'balanced'
            
            final_params_lgbm['class_weight'] = class_weight_value_to_use
            logger.info(f"LGBM final model using class_weight={final_params_lgbm['class_weight']}")
            final_model = lgb.LGBMClassifier(**final_params_lgbm)

        # --- Fit the final model ---
        if final_model is not None:
            logger.info(f"Fitting final {model_type} model...")
            start_fit_time = time.time()
            model_fitted_successfully = False
            try:
                # Fit using specific params if they exist (like sample_weight)
                if final_fit_params:
                    logger.info(f"Fitting {model_type} with additional fit parameters: {list(final_fit_params.keys())}")
                    final_model.fit(X, y, **final_fit_params)  # Pass original y and weights dict
                else:
                    final_model.fit(X, y)  # Fit standard models

                fit_duration = time.time() - start_fit_time
                logger.info(f"Final {model_type} fitted in {fit_duration:.2f}s.")
                model_fitted_successfully = True

            except Exception as fit_e:
                logger.error(f"Error during final fit for {model_type}: {fit_e}", exc_info=True)
                # Keep going to return score/params, but model will be None

            # --- Save model and importance only if fit succeeded ---
            if model_fitted_successfully:
                model_path = f'models/{model_type}_{timestamp}.joblib'
                logger.info(f"Saving final {model_type} model...")
                try:
                    if isinstance(final_model, KerasClassifier):
                        tf_model_save_path = f'models/{model_type}_tfmodel_{timestamp}'
                        try:
                            final_model.model_.save(tf_model_save_path)
                            logger.info(f"Saved Keras TF model: {tf_model_save_path}")
                        except Exception as k_save_err:
                            logger.warning(f"Keras TF save fail ({k_save_err}), try joblib...")
                            joblib.dump(final_model, model_path)
                            logger.info(f"Saved Keras wrapper: {model_path}")
                    else:
                        joblib.dump(final_model, model_path)
                        logger.info(f"Saved final {model_type} via joblib: {model_path}")
                except Exception as save_err:
                    logger.error(f"Failed save model {model_type}: {save_err}", exc_info=True)

                # --- Attempt Calibration AFTER saving base model ---
                if model_type not in ['knn', 'mlp']:  # Models less suitable or needing sample_weight for calibration fit
                    try:
                        logger.info(f"Attempting calibration for {model_type}...")
                        # Use 'estimator' argument, not 'base_estimator'
                        calibrated_model = CalibratedClassifierCV(
                            estimator=final_model,
                            cv=3,
                            method='isotonic',
                            n_jobs=n_jobs_optuna,
                            ensemble=False
                        )
                        calibrated_model.fit(X, y)  # Calibrate on the full training data
                        calibrated_path = f'calibrated_models/{model_type}_calibrated_{timestamp}.joblib'
                        if not os.path.exists('calibrated_models'):
                            os.makedirs('calibrated_models')
                        joblib.dump(calibrated_model, calibrated_path)
                        logger.info(f"Saved calibrated model: {calibrated_path}")
                    except Exception as cal_err:
                        logger.warning(f"Calibration failed for {model_type}: {cal_err}", exc_info=False)

                # --- Save Importance ---
                feat_names = list(X.columns) if isinstance(X, pd.DataFrame) else None
                if feat_names:
                    logger.info(f"Saving importance {model_type}...")
                    save_feature_importance(final_model, feat_names, timestamp, model_type)
                else:
                    logger.warning(f"No feat names for importance {model_type}.")

            else:  # Fit failed
                final_model = None  # Ensure model is None if fit failed

        else:  # Instantiation failed
            logger.error(f"Could not instantiate final model {model_type}.")
            return None, best_cv_score, best_params

    except Exception as final_e:
        logger.error(f"Failed final instantiate/fit/save process {model_type}: {final_e}", exc_info=True)
        # Return score/params from Optuna, but model is None
        return None, best_cv_score, best_params

    # Return potentially None model if fit/save failed, but score/params if Optuna succeeded
    return final_model, best_cv_score, best_params

In [10]:
# DynamicEnsemble must be defined before create_ensemble (next cell), which instantiates it.
class DynamicEnsemble:
    """
    Custom ensemble that uses a specialized model for each class.

    For every class index the ensemble holds one model; the score for that
    class is taken from the matching column of that model's ``predict_proba``
    output. The per-class scores are then combined into a single matrix.

    Args:
        class_models (dict): Maps class index -> ``(name, model)``. Each model
            must expose ``predict_proba(X)`` returning an array with at least
            ``class_idx + 1`` columns. Classes without an entry keep a score
            of zero.
        n_classes (int): Number of columns in the combined score matrix.
            Defaults to 3, the value that was previously hard-coded.
    """
    def __init__(self, class_models, n_classes=3):
        self.class_models = class_models
        self.n_classes = n_classes

    def _num_classes(self):
        """Class count; falls back to 3 for instances pickled before ``n_classes`` existed."""
        return getattr(self, 'n_classes', 3)

    @staticmethod
    def _num_rows(X):
        """Row count for array-likes with ``.shape`` as well as plain sequences."""
        return X.shape[0] if hasattr(X, 'shape') else len(X)

    def _class_scores(self, X):
        """Assemble the (n_rows, n_classes) matrix of per-class scores.

        Column ``c`` is column ``c`` of the ``predict_proba`` output of the
        model registered for class ``c``. The rows are NOT normalized.
        """
        scores = np.zeros((self._num_rows(X), self._num_classes()))
        for class_idx, (_, model) in self.class_models.items():
            probs = np.asarray(model.predict_proba(X))
            scores[:, class_idx] = probs[:, class_idx]
        return scores

    def predict(self, X):
        """Return, per row, the index of the class with the highest score.

        With no registered models every row is predicted as class 0; the
        result is an integer array so it remains usable as a label index.
        """
        if not self.class_models:
            return np.zeros(self._num_rows(X), dtype=int)
        return np.argmax(self._class_scores(X), axis=1)

    def predict_proba(self, X):
        """Return per-class scores normalized so each row sums to 1.

        Rows whose scores sum to zero cannot be normalized (division by zero
        used to yield NaN); they fall back to a uniform distribution. With no
        registered models an all-zero matrix is returned, as before.
        """
        n_classes = self._num_classes()
        if not self.class_models:
            return np.zeros((self._num_rows(X), n_classes))

        scores = self._class_scores(X)
        row_sums = scores.sum(axis=1, keepdims=True)
        has_mass = row_sums > 0
        # Divide by 1 where the sum is zero so no NaN/inf is ever produced;
        # those rows are replaced by the uniform fallback below.
        safe_sums = np.where(has_mass, row_sums, 1.0)
        uniform = np.full_like(scores, 1.0 / n_classes)
        return np.where(has_mass, scores / safe_sums, uniform)

In [11]:
import numpy as np
import joblib
import logging
import os
from sklearn.ensemble import VotingClassifier, StackingClassifier
# Meta-learner imports
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb # For new default meta-learner
# Calibration import (needed if implementing Step 3)
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold
from scikeras.wrappers import KerasClassifier # For type checking

# Assume logger is configured globally
# logger = logging.getLogger(__name__)

def create_ensemble(qualified_models_with_scores, X_train_ensemble, y_train_ensemble, timestamp, n_jobs_ensemble=1):
    """
    Creates Voting, Dynamic (per-class) and Stacking ensembles from qualified models.

    Steps, in order:
      1. Rank the qualified models by CV score (descending).
      2. Drop Keras wrappers whose saved files cannot be found on disk.
      3. Calibrate each remaining model (isotonic, cv=3) and save the
         calibrated copy under ``calibrated_models/``. The calibrated copies
         are saved as artifacts only; the ensembles below are built from the
         original estimators.
      4. Fit and save a soft VotingClassifier weighted by shifted CV scores,
         falling back to hard voting if soft voting is impossible or fails.
      5. Pick, per class (0, 1, 2), the model with the best one-vs-rest AUC
         and wrap the winners in a DynamicEnsemble.
      6. Fit and save a StackingClassifier with a LightGBM meta-learner.
      7. Write a text summary under ``results/``.

    Args:
        qualified_models_with_scores (list): List of tuples: (name, fitted_model, cv_score).
                                             Assumes models were potentially trained with class weights.
        X_train_ensemble (pd.DataFrame or np.ndarray): Training features for fitting ensembles.
        y_train_ensemble (pd.Series or np.ndarray): Training target for fitting ensembles.
        timestamp (str): Timestamp string for saving files.
        n_jobs_ensemble (int): Number of parallel jobs for ensemble fitting.

    Returns:
        tuple: (fitted_voting_classifier, fitted_stacking_classifier,
                best_individual_model_object, dynamic_ensemble)
               Any element can be None if its creation failed or was skipped.
               Every exit path returns four values so callers can always
               unpack the result the same way.
    """
    # Function-scope import (previously re-executed inside the innermost loop).
    from sklearn.metrics import roc_auc_score

    def _take_rows(data, positions):
        """Positional row selection that works for pandas objects and numpy arrays."""
        return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]

    logger.info("Attempting ensemble creation...")

    if not qualified_models_with_scores:
        logger.error("No qualified models provided for ensemble creation.")
        return None, None, None, None

    # Ranking by CV score. This name must not be reused further down: it
    # decides the returned "best individual model" and the summary file.
    sorted_models = sorted(qualified_models_with_scores, key=lambda x: x[2], reverse=True)
    logger.info(f"Qualified models for ensembling (Name, CV Score): {[(m[0], f'{m[2]:.5f}') for m in sorted_models]}")
    N_ens = len(sorted_models)

    if N_ens < 2:
        logger.warning(f"Less than 2 qualified models ({N_ens}). Skipping ensembles.")
        if N_ens == 1:
            n, m, s = sorted_models[0]
            logger.info(f"Returning single best model: {n} (CV: {s:.5f})")
            return None, None, m, None
        return None, None, None, None

    # Output directories used below; create them so joblib.dump/open cannot
    # fail merely because a folder is missing.
    for out_dir in ('models', 'calibrated_models', 'results'):
        os.makedirs(out_dir, exist_ok=True)

    # Filter usable models: a Keras wrapper is kept only if at least one of
    # its expected save files exists.
    estimators_valid_for_ensemble = []
    keras_models_excluded = []
    for name, model, score in sorted_models:
        is_keras_wrapper = isinstance(model, KerasClassifier)
        model_saved_path = f'models/{name}_{timestamp}.joblib'
        tf_model_path = f'models/{name}_tfmodel_{timestamp}'
        if is_keras_wrapper and not os.path.exists(tf_model_path) and not os.path.exists(model_saved_path):
            logger.warning(f"Keras model {name} save files missing. Exclude.")
            keras_models_excluded.append(name)
        else:
            estimators_valid_for_ensemble.append((name, model))

    if len(estimators_valid_for_ensemble) < 2:
        logger.warning(f"<2 models usable for ensemble. Skipping.")
        non_keras_models = [(n, m, s) for n, m, s in sorted_models if n not in keras_models_excluded]
        if non_keras_models:
            best_n, best_m, best_s = non_keras_models[0]
            logger.info(f"Return best non-excluded: {best_n} ({best_s:.5f})")
            return None, None, best_m, None
        elif sorted_models:
            best_n, best_m, best_s = sorted_models[0]
            logger.info(f"Return original best: {best_n} ({best_s:.5f})")
            return None, None, best_m, None
        else:
            return None, None, None, None

    logger.info(f"Using {len(estimators_valid_for_ensemble)} models for ensemble: {[n for n,m in estimators_valid_for_ensemble]}")
    est_ens = estimators_valid_for_ensemble

    # --- Calibration step: calibrated copies are saved as artifacts only ---
    n_calibrated = 0
    logger.info("Calibrating base models for ensemble...")
    for name, model in est_ens:
        try:
            # Isotonic calibration with internal 3-fold CV; ensemble=False
            # produces a single calibrated estimator.
            calibrated_model = CalibratedClassifierCV(model, method='isotonic', cv=3, n_jobs=n_jobs_ensemble, ensemble=False)
            calibrated_model.fit(X_train_ensemble, y_train_ensemble)
            calibrated_model_path = f'calibrated_models/{name}_calibrated_{timestamp}.joblib'
            joblib.dump(calibrated_model, calibrated_model_path)
            n_calibrated += 1
            logger.info(f"Calibrated model {name} and saved to {calibrated_model_path}")
        except Exception as cal_err:
            logger.error(f"Failed to calibrate model {name}: {cal_err}. Skipping calibration for this model.", exc_info=True)
    # The ensembles are built from the original estimators, so the log says
    # so explicitly instead of claiming the calibrated ones are used.
    est_ens_to_use = est_ens
    logger.info(
        f"Calibrated and saved {n_calibrated}/{len(est_ens)} base models; "
        f"using the {len(est_ens_to_use)} original (uncalibrated) estimators for the ensembles."
    )
    # --- End calibration step ---

    # --- Voting Classifier ---
    vote_clf = None
    can_soft = all(hasattr(m, 'predict_proba') for _, m in est_ens_to_use)
    weights_used = None

    if can_soft:
        logger.info("Attempting Soft Voting...")
        # Weights come from the CV scores, looked up by base name (a
        # '_calibrated' suffix, if any, is stripped first).
        scores_map = {name: score for name, model, score in qualified_models_with_scores}
        scores = [
            scores_map.get(name.replace('_calibrated', ''))
            for name, model in est_ens_to_use
            if name.replace('_calibrated', '') in scores_map
        ]

        if scores and len(scores) == len(est_ens_to_use):  # Ensure weights align
            # Shift so the weakest model gets a tiny positive weight, then normalize.
            min_s = min(scores)
            shift_s = [s - min_s + 1e-6 for s in scores]
            tot_s = sum(shift_s)
            weights_used = [s / tot_s for s in shift_s] if tot_s > 0 else None
            logger.info(f"Weights:{list(np.round(weights_used,3)) if weights_used else 'Uniform'}")
        else:
            logger.warning("Could not align scores for weighting. Using uniform weights.")
            weights_used = None

        try:
            vote_clf = VotingClassifier(estimators=est_ens_to_use, voting='soft', weights=weights_used, n_jobs=n_jobs_ensemble)
            vote_clf.fit(X_train_ensemble, y_train_ensemble)
            vote_path = f'models/voting_ensemble_soft_{timestamp}.joblib'
            joblib.dump(vote_clf, vote_path)
            logger.info(f"Saved Soft Voting Ensemble: {vote_path}")
        except Exception as e:
            logger.error(f"Failed Soft Voting: {e}", exc_info=True)
            vote_clf = None
            can_soft = False  # triggers the hard-voting fallback below
    else:
        logger.warning("Not all base models support predict_proba for Soft Voting.")

    if not can_soft:  # Fallback or initial choice
        logger.warning("Attempting Hard Voting...")
        weights_used = None  # No weights for hard voting
        try:
            vote_clf = VotingClassifier(estimators=est_ens_to_use, voting='hard', n_jobs=n_jobs_ensemble)
            vote_clf.fit(X_train_ensemble, y_train_ensemble)
            vote_path = f'models/voting_ensemble_hard_{timestamp}.joblib'
            joblib.dump(vote_clf, vote_path)
            logger.info(f"Saved Hard Voting Ensemble: {vote_path}")
        except Exception as e:
            logger.error(f"Failed Hard Voting: {e}", exc_info=True)
            vote_clf = None

    # --- Dynamic (per-class) Ensemble ---
    dynamic_ensemble = None
    try:
        if len(est_ens_to_use) >= 2:
            logger.info("Creating dynamic ensemble...")

            class_ids = [0, 1, 2]
            skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            # The folds do not depend on the model, so compute them once.
            val_folds = [val_idx for _, val_idx in skf.split(X_train_ensemble, y_train_ensemble)]
            class_performance = {c: [] for c in class_ids}  # class -> [(name, model, mean AUC)]

            # NOTE(review): the models are scored with predict_proba as
            # passed in (no refit per fold); if they were trained on this same
            # data the fold AUCs are in-sample -- confirm this is intended.
            for name, model in est_ens_to_use:
                # Skip if model doesn't have predict_proba
                if not hasattr(model, 'predict_proba'):
                    continue

                class_scores = {c: [] for c in class_ids}

                for val_idx in val_folds:
                    X_fold = _take_rows(X_train_ensemble, val_idx)
                    y_fold = _take_rows(y_train_ensemble, val_idx)

                    try:
                        y_probs = model.predict_proba(X_fold)

                        for class_idx in class_ids:
                            # One-vs-rest AUC for this class
                            binary_y_true = (y_fold == class_idx).astype(int)
                            binary_y_score = y_probs[:, class_idx]
                            try:
                                auc = roc_auc_score(binary_y_true, binary_y_score)
                                class_scores[class_idx].append(auc)
                            except Exception as auc_err:
                                logger.warning(f"AUC calculation failed for {name}, class {class_idx}: {auc_err}")
                                class_scores[class_idx].append(0.5)  # Default if calculation fails
                    except Exception as pred_err:
                        logger.warning(f"Prediction failed for {name}: {pred_err}")
                        continue

                # Average scores across folds
                for class_idx in class_ids:
                    if class_scores[class_idx]:
                        avg_score = sum(class_scores[class_idx]) / len(class_scores[class_idx])
                        class_performance[class_idx].append((name, model, avg_score))

            # Select best model for each class. A dedicated variable is used
            # so the CV-score ranking in `sorted_models` is left untouched.
            best_models = {}
            for class_idx in class_ids:
                ranked_for_class = sorted(class_performance[class_idx], key=lambda x: x[2], reverse=True)
                if ranked_for_class:
                    best_name, best_model, best_score = ranked_for_class[0]
                    best_models[class_idx] = (best_name, best_model)
                    logger.info(f"Best model for class {class_idx}: {best_name} (AUC: {best_score:.5f})")

            # Create the dynamic ensemble with best models for each class
            if best_models:
                dynamic_ensemble = DynamicEnsemble(best_models)
                dynamic_path = f'models/dynamic_ensemble_{timestamp}.joblib'
                joblib.dump(dynamic_ensemble, dynamic_path)
                logger.info(f"Saved Dynamic Ensemble: {dynamic_path}")

    except Exception as e:
        logger.error(f"Failed to create Dynamic Ensemble: {e}", exc_info=True)
        dynamic_ensemble = None

    # --- Stacking Classifier ---
    stack_clf = None
    # Check predict_proba on the models actually used in the ensemble list
    can_stack = all(hasattr(m, 'predict_proba') for _, m in est_ens_to_use)
    meta_learner = None

    if can_stack:
        logger.info("Attempting Stacking...")
        try:
            # LightGBM meta-learner with balanced class weights
            meta_learner = lgb.LGBMClassifier(
                n_estimators=150,
                learning_rate=0.05,
                num_leaves=20,
                class_weight='balanced',
                random_state=42,
                n_jobs=1
            )
            # Alternative: Simpler Logistic Regression
            # meta_learner = LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear', C=1.0, n_jobs=1)

            logger.info(f"Using {meta_learner.__class__.__name__} as stacking meta-learner.")
            stack_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
            stack_clf = StackingClassifier(
                estimators=est_ens_to_use,
                final_estimator=meta_learner,
                cv=stack_cv,
                stack_method='predict_proba',
                n_jobs=1,
                passthrough=False
            )
            stack_clf.fit(X_train_ensemble, y_train_ensemble)
            stack_path = f'models/stacking_ensemble_{timestamp}.joblib'
            joblib.dump(stack_clf, stack_path)
            logger.info(f"Saved Stacking Ensemble (Meta: {meta_learner.__class__.__name__}): {stack_path}")
        except Exception as e:
            logger.error(f"Failed Stacking: {e}", exc_info=True)
            stack_clf = None
    else:
        logger.warning("Cannot create Stacking Ensemble (requires predict_proba).")

    # Determine best individual model from the original qualified list
    # (highest CV score).
    best_ind_q_model = None
    best_n = "N/A"
    best_s = -1.0
    if sorted_models:
        best_n, best_m, best_s = sorted_models[0]
        best_ind_q_model = best_m
        logger.info(f"Best individual qualified model identified: {best_n} (CV Score: {best_s:.5f})")

    # Save summary
    try:
        summary_path = f'results/ensemble_creation_summary_{timestamp}.txt'
        with open(summary_path, 'w') as f:
            f.write("Ensemble Summary\n=============\nQualified Models:\n")
            for n, _, s in sorted_models:
                f.write(f"- {n}: CV {s:.5f} {'(Excl.)' if n in keras_models_excluded else '(Incl.)'}\n")
            f.write(f"\nEnsembles ({len(est_ens_to_use)} models):\n")  # Reflect models used
            vote_t = 'Soft' if vote_clf and vote_clf.voting == 'soft' else ('Hard' if vote_clf and vote_clf.voting == 'hard' else 'N/A')
            f.write(f"- Voting ({vote_t}): {'Saved' if vote_clf else 'Failed/Skipped'}.\n")
            meta_name = meta_learner.__class__.__name__ if meta_learner and stack_clf else 'N/A'
            f.write(f"- Stacking (Meta:{meta_name}): {'Saved' if stack_clf else 'Failed/Skipped'}.\n")
            f.write(f"- Dynamic: {'Saved' if dynamic_ensemble else 'Failed/Skipped'}.\n")
            if keras_models_excluded:
                f.write(f"\nKeras Excluded: {', '.join(keras_models_excluded)}\n")
            if best_ind_q_model:
                f.write(f"\nBest individual model overall (from qualified list): {best_n}\n")
        logger.info(f"Saved ensemble summary: {summary_path}")
    except Exception as file_e:
        logger.warning(f"Could not save ensemble summary: {file_e}")

    return vote_clf, stack_clf, best_ind_q_model, dynamic_ensemble





In [12]:
import numpy as np
import pandas as pd
import logging
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scikeras.wrappers import KerasClassifier # Keep for isinstance check
import joblib # Keep if fallback LE loading is needed
import os # For checking plot/results dirs

# Assume logger is configured globally
# logger = logging.getLogger(__name__)

def evaluate_model(model, X_eval, y_eval, model_name, timestamp, le):
    """Evaluates a trained model on a given dataset (e.g., validation or hold-out).

    Computes accuracy, a classification report and a confusion matrix, writes
    a text summary to ``results/`` and a confusion-matrix heatmap to
    ``plots/``.

    Args:
        model: Fitted estimator exposing ``predict``. ``None`` skips evaluation.
        X_eval: Features to predict on.
        y_eval (pd.Series or np.ndarray): True targets, compared directly
            against the (integer) predictions.
        model_name (str): Used in log messages and output file names.
        timestamp (str): Used in output file names.
        le: Label encoder used to turn encoded targets back into class names.
            If ``None``, the alphabetically last ``features/label_encoder_*``
            file is loaded as a fallback.

    Returns:
        tuple: ``(accuracy, report_dict)``, or ``(None, None)`` if the model or
        encoder is unavailable or any step of the evaluation raises.
    """
    if model is None:
        logger.warning(f"Skip eval {model_name}: model None.")
        return None, None  # Return None for accuracy and report dictionary

    # Handle LabelEncoder loading if missing
    if le is None:
        logger.warning(f"Skip eval {model_name}: LabelEncoder (le) is None. Attempting fallback.")
        try:
            encoder_files = sorted([f for f in os.listdir('features') if f.startswith('label_encoder_')])
            if encoder_files:
                le = joblib.load(f'features/{encoder_files[-1]}')
                logger.info(f"Loaded fallback LE for evaluation: {encoder_files[-1]}")
            else:
                logger.error("Evaluation cannot proceed without LabelEncoder.")
                return None, None
        except Exception as load_err:
            logger.error(f"Evaluation failed: Could not load fallback LE: {load_err}")
            return None, None

    logger.info(f"Evaluating model '{model_name}'...")
    # Ensure y_eval is a numpy array for consistency
    if isinstance(y_eval, pd.Series):
        y_eval = y_eval.values

    try:
        # Make predictions; asarray guarantees .dtype/.ndim are available
        y_pred = np.asarray(model.predict(X_eval))

        # --- Handle CatBoost prediction type ---
        # CatBoost may return non-integer (e.g. string) labels; map them back
        # to the encoded integer classes.
        is_catboost = 'CatBoostClassifier' in str(type(model))  # Simple check
        if is_catboost and not np.issubdtype(y_pred.dtype, np.integer):
            logger.warning(f"CatBoost predictions seem non-integer ({y_pred.dtype}): {y_pred[:5]}. Attempting mapping to encoded labels.")
            # Mapping from original labels -> encoded integers
            label_to_encoded = {label: i for i, label in enumerate(le.classes_)}
            try:
                # Attempt direct mapping (if CatBoost predicts original labels like 'low')
                y_pred_mapped = np.array([label_to_encoded.get(str(p), -1) for p in y_pred.flatten()])
                if np.any(y_pred_mapped == -1):  # Check if direct mapping failed
                    logger.warning("Direct label mapping failed, trying string-to-int mapping (assuming '0', '1', '2').")
                    # Attempt string-to-int mapping (if CatBoost predicts '0', '1', '2')
                    y_pred_mapped = np.array([int(p) for p in y_pred.flatten()])
                y_pred = y_pred_mapped  # Use the mapped integer predictions
                logger.info(f"Successfully mapped CatBoost predictions to integers: {y_pred[:5]}")
            except Exception as map_err:
                logger.error(f"Failed to map CatBoost predictions to integers: {map_err}. Evaluation/Prediction may fail.")
                # Let the original y_pred pass through, error will likely occur later
        # --- END CatBoost Handling ---

        # Reduce predictions to a 1-D vector of class indices
        if isinstance(model, KerasClassifier) and y_pred.ndim > 1 and y_pred.shape[1] > 1:
            # Per-class scores -> index of the highest one
            y_pred = np.argmax(y_pred, axis=1)
        elif y_pred.ndim == 2 and y_pred.shape[1] == 1:
            # Column vector of labels -> flat vector
            y_pred = y_pred.ravel()
        if not np.issubdtype(y_pred.dtype, np.integer):
            logger.error(f"Predictions for {model_name} are not integers after processing: {y_pred.dtype}. Evaluation may fail.")

        accuracy = accuracy_score(y_eval, y_pred)

        # `report_labels` holds the label VALUES present in the data that is
        # passed to sklearn; `target_names` holds their display strings. They
        # are kept separate because in the fallback branch the values are
        # numeric while the display names are strings.
        try:
            y_eval_labels = le.inverse_transform(y_eval)
            y_pred_labels = le.inverse_transform(y_pred)
            report_labels = list(le.classes_)  # class names in encoder order
            target_names = [str(c) for c in report_labels]
        except Exception as le_error:
            logger.warning(f"LabelEncoder inverse_transform failed for '{model_name}': {le_error}. Using numeric labels for report.")
            y_eval_labels = y_eval
            y_pred_labels = y_pred
            # Include classes that appear only in the predictions as well.
            report_labels = np.unique(np.concatenate([np.ravel(y_eval), np.ravel(y_pred)])).tolist()
            target_names = [str(v) for v in report_labels]

        # Classification Report. Passing `labels` keeps the report aligned
        # with `target_names` even when a class is absent from this split.
        report_str = classification_report(
            y_eval_labels, y_pred_labels, labels=report_labels,
            target_names=target_names, zero_division=0
        )
        report_dict = classification_report(
            y_eval_labels, y_pred_labels, labels=report_labels,
            target_names=target_names, output_dict=True, zero_division=0
        )

        # Confusion Matrix (rows/columns ordered like report_labels)
        cm = confusion_matrix(y_eval_labels, y_pred_labels, labels=report_labels)

        logger.info(f"Evaluation Results for '{model_name}':")
        logger.info(f"  Accuracy: {accuracy:.5f}")

        # Ensure results directory exists
        results_dir = 'results'
        os.makedirs(results_dir, exist_ok=True)

        # Save evaluation results to a file
        eval_filename = os.path.join(results_dir, f'{model_name}_evaluation_{timestamp}.txt')
        with open(eval_filename, 'w') as f:
            f.write(f"Model Evaluation Summary\n=========================\n")
            f.write(f"Model Name: {model_name}\nTimestamp: {timestamp}\nAccuracy: {accuracy:.5f}\n\n")
            f.write("Classification Report:\n")
            f.write(report_str)
            f.write("\n\nConfusion Matrix:\n")
            f.write(np.array2string(cm))
        logger.info(f"Saved evaluation summary: {eval_filename}")

        # Ensure plots directory exists
        plot_dir = 'plots'
        os.makedirs(plot_dir, exist_ok=True)

        # Save confusion matrix plot; always close the figure, even if
        # drawing or saving raises, so repeated calls do not leak figures.
        plot_filename = os.path.join(plot_dir, f'{model_name}_confusion_matrix_{timestamp}.png')
        plt.figure(figsize=(8, 6))
        try:
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
            plt.xlabel('Predicted Label')
            plt.ylabel('True Label')
            plt.title(f'Confusion Matrix - {model_name} (Accuracy: {accuracy:.3f})')
            plt.tight_layout()
            plt.savefig(plot_filename)
        finally:
            plt.close()
        logger.info(f"Saved confusion matrix plot: {plot_filename}")

        return accuracy, report_dict

    except AttributeError as ae:
        if 'predict' in str(ae):
            logger.error(f"Evaluation error '{model_name}': Model not fitted? AttrErr: {ae}", exc_info=True)
        else:
            logger.error(f"AttributeError during eval '{model_name}': {ae}", exc_info=True)
        return None, None
    except Exception as e:
        logger.error(f"Unexpected error during eval '{model_name}': {e}", exc_info=True)
        return None, None

In [13]:
import pandas as pd
import numpy as np
import logging
import joblib
import os
from scikeras.wrappers import KerasClassifier # Keep for isinstance check

# Assume logger is configured globally
# logger = logging.getLogger(__name__)

def make_test_predictions(model, X_test, test_obs, timestamp, model_name, le):
    """Predict on the test set, decode the labels and write the result files.

    Two files are written on success:
      * ``submissions/solution_<safe_model_name>_<timestamp>.csv`` with the
        columns ``obs`` and ``salary_category``;
      * ``results/<safe_model_name>_test_prediction_summary_<timestamp>.txt``
        with the predicted class distribution.

    Args:
        model: Fitted estimator exposing ``predict``. ``None`` aborts early.
        X_test: Feature matrix handed unchanged to ``model.predict``.
        test_obs: Observation identifiers, one per row of ``X_test``.
        timestamp: Run identifier embedded in the output file names.
        model_name: Human-readable name used in logs and (sanitised) file names.
        le: Fitted label encoder used to decode predictions. When ``None`` the
            newest ``features/label_encoder_*`` file (by sorted name) is loaded.

    Returns:
        The submission ``DataFrame`` on success, or ``None`` when the model or
        encoder is unavailable or any step raises.
    """
    logger.info(f"Generating test predictions using {model_name}...")

    if model is None:
        logger.error(f"Predict fail {model_name}: model None.")
        return None

    # Handle missing LabelEncoder (with fallback to the newest saved one)
    if le is None:
        logger.warning(f"LE None for {model_name}. Try fallback...")
        try:
            encoder_files = sorted(f for f in os.listdir('features') if f.startswith('label_encoder_'))
            if not encoder_files:
                logger.error(f"Predict fail: LE None, no fallback.")
                return None
            # NOTE(review): joblib.load unpickles the file; only load artifacts
            # written by this pipeline.
            le = joblib.load(f'features/{encoder_files[-1]}')
            logger.info(f"Loaded fallback LE: {encoder_files[-1]}.")
        except Exception as load_e:
            logger.error(f"LE None, fallback load fail:{load_e}. No predict.")
            return None

    try:
        # Make predictions (potentially encoded integers or strings from CatBoost)
        y_pred_raw = np.asarray(model.predict(X_test))

        # --- Handle CatBoost prediction type ---
        y_pred_enc = y_pred_raw  # Default assumption: already encoded integers
        is_catboost = 'CatBoostClassifier' in str(type(model))
        if is_catboost and not np.issubdtype(y_pred_raw.dtype, np.integer):
            logger.warning(f"CatBoost test predictions non-integer ({y_pred_raw.dtype}): {y_pred_raw[:5]}. Attempting mapping.")
            label_to_encoded = {label: i for i, label in enumerate(le.classes_)}
            try:
                flat_preds = y_pred_raw.flatten()
                y_pred_mapped = np.array([label_to_encoded.get(str(p), -1) for p in flat_preds])
                if np.any(y_pred_mapped == -1):
                    logger.warning("Direct label mapping failed, trying string-to-int mapping.")
                    y_pred_mapped = np.array([int(p) for p in flat_preds])
                y_pred_enc = y_pred_mapped  # Use mapped integer predictions
                logger.info(f"Mapped CatBoost test predictions to integers: {y_pred_enc[:5]}")
            except Exception as map_err:
                logger.error(f"Failed map CatBoost test predictions: {map_err}. Prediction may fail.")
                # Use original predictions; inverse_transform will likely fail
                y_pred_enc = y_pred_raw
        # --- END CatBoost Handling ---

        # Reduce 2-D outputs to a 1-D vector of class indices.
        if isinstance(model, KerasClassifier) and y_pred_enc.ndim > 1 and y_pred_enc.shape[1] > 1:
            # Per-class scores from Keras -> index of the best class
            y_pred_enc = np.argmax(y_pred_enc, axis=1)
        elif y_pred_enc.ndim == 2 and y_pred_enc.shape[1] == 1:
            # Column vector (n, 1) -> (n,), avoids a conversion warning downstream
            y_pred_enc = y_pred_enc.ravel()

        if not np.issubdtype(y_pred_enc.dtype, np.integer):
            # Whole-number floats (e.g. 0.0, 1.0) are safe to cast; anything else
            # cannot be used as an index into the encoder's classes.
            is_whole_float = (
                np.issubdtype(y_pred_enc.dtype, np.floating)
                and bool(np.all(np.isfinite(y_pred_enc)))
                and bool(np.all(y_pred_enc == np.round(y_pred_enc)))
            )
            if is_whole_float:
                logger.warning(f"Test predictions for {model_name} are whole-number floats ({y_pred_enc.dtype}); casting to int.")
                y_pred_enc = y_pred_enc.astype(np.int64)
            else:
                logger.error(f"Test predictions for {model_name} not integers after processing: {y_pred_enc.dtype}. Inverse transform likely to fail.")

        # Inverse transform to get original salary category labels
        predicted_labels = le.inverse_transform(y_pred_enc)

        # Create submission DataFrame
        submission_df = pd.DataFrame({'obs': test_obs, 'salary_category': predicted_labels})

        # Sanitize model name for filename
        safe_model_name = model_name.replace("/", "_").replace("\\", "_").replace(":", "_").replace(" ", "_")

        # Ensure directories exist (exist_ok avoids a check-then-create race)
        submission_dir = 'submissions'
        results_dir = 'results'
        os.makedirs(submission_dir, exist_ok=True)
        os.makedirs(results_dir, exist_ok=True)

        # Save submission file
        submission_path = os.path.join(submission_dir, f'solution_{safe_model_name}_{timestamp}.csv')
        submission_df.to_csv(submission_path, index=False)
        logger.info(f"Saved submission: {submission_path}")

        # Log value counts of predictions for analysis
        pred_value_counts = submission_df['salary_category'].value_counts().to_dict()
        logger.info(f"Test prediction distribution for '{model_name}': {pred_value_counts}")

        # Save prediction summary
        total_preds = len(predicted_labels)
        summary_filename = os.path.join(results_dir, f'{safe_model_name}_test_prediction_summary_{timestamp}.txt')
        with open(summary_filename, 'w', encoding='utf-8') as f:
            f.write(f"Test Prediction Summary\n======================\n")
            f.write(f"Model Name: {model_name}\n")
            f.write(f"Model Class: {type(model).__name__}\n")
            f.write(f"Timestamp: {timestamp}\nTotal Predictions: {total_preds}\n\nDistribution:\n")
            if total_preds > 0:
                for label, count in sorted(pred_value_counts.items()):
                    percentage = (count / total_preds) * 100
                    f.write(f"- {label}: {count} ({percentage:.2f}%)\n")
            else:
                f.write("- No predictions were generated.\n")
        logger.info(f"Saved test prediction summary: {summary_filename}")

        return submission_df

    except AttributeError as ae:
        logger.error(f"AttributeError during predict/inverse_transform '{model_name}': {ae}.", exc_info=True)
        return None
    except Exception as e:
        logger.error(f"Unexpected error during test prediction '{model_name}': {e}", exc_info=True)
        return None

In [14]:
import time
import logging
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier # Keep for FS
# Imports for models used within the pipeline are assumed to be at the top of the file
# Imports for utility functions are assumed to be at the top of the file
import os

# Assume logger is configured globally
# logger = logging.getLogger(__name__)
# Assume utility functions (create_directory_structure, get_timestamp) are defined
# Assume preprocessing function (preprocess_data) is defined
# Assume model optimization function (optimize_model) is defined
# Assume ensemble function (create_ensemble) is defined
# Assume evaluation function (evaluate_model) is defined
# Assume prediction function (make_test_predictions) is defined

def run_complete_pipeline(perform_feature_selection=False, min_cv_score_threshold=0.72, fs_threshold='mean', n_jobs_sklearn=1):
    """
    Run the complete model training pipeline end to end.

    Stages: set up directories and a per-run log file, load ``train.csv`` and
    ``test.csv``, preprocess, split 80/20 (stratified), standard-scale,
    optionally select features with a random forest, optimize each base model,
    build ensembles from the qualified models, pick the candidate with the
    best hold-out accuracy and write the final submission.

    Args:
        perform_feature_selection: When True, fit a RandomForest and keep only
            the features chosen by ``SelectFromModel``.
        min_cv_score_threshold: Minimum CV score returned by ``optimize_model``
            for a model to qualify for ensembling / final selection.
        fs_threshold: ``threshold`` argument forwarded to ``SelectFromModel``.
        n_jobs_sklearn: Parallelism forwarded to the feature-selection forest,
            to ``optimize_model`` (as ``n_jobs_optuna``) and to
            ``create_ensemble`` (as ``n_jobs_ensemble``).

    Returns:
        bool: True only when the FINAL submission file was generated; False
        when no model qualified, the final prediction failed, or any
        unexpected error occurred (errors are logged, not raised).
    """
    timestamp = get_timestamp()
    main_log_file = None
    file_handler = None
    pipeline_success = False  # Track overall success

    try:
        # 1. Setup
        print("--- Starting Complete Pipeline Run (Class Weights, LGBM added) ---")
        create_directory_structure()
        main_log_file = f'logs/pipeline_run_{timestamp}.log'
        file_handler = logging.FileHandler(main_log_file)
        file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        # Avoid attaching a second handler that writes to the same file
        attached_log_files = [h.baseFilename for h in logger.handlers if isinstance(h, logging.FileHandler)]
        if file_handler.baseFilename not in attached_log_files:
            logger.addHandler(file_handler)

        logger.info(f"--- Starting Complete Pipeline Run --- Timestamp: {timestamp} ---")
        logger.info(f"Pipeline Config: Combined FE, Scaling=True, FeatSelect={perform_feature_selection} (Thresh={fs_threshold}), CV Thresh={min_cv_score_threshold}, n_jobs={n_jobs_sklearn} used, Class Weights Enabled, Const Cols Kept")
        logger.info(f"Logging detailed output to: {main_log_file}")

        # 2. Load Data
        logger.info("Loading data...")
        try:
            train_df = pd.read_csv('train.csv')
            test_df = pd.read_csv('test.csv')
            logger.info(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")
        except FileNotFoundError as e:
            logger.error(f"Data load error: {e}.")
            raise
        if 'salary_category' not in train_df.columns:
            logger.error("Target missing.")
            raise ValueError("Missing target")
        if train_df.empty or test_df.empty:
            logger.error("Data empty.")
            raise ValueError("Empty data")
        if 'obs' not in test_df.columns:
            logger.error("Column 'obs' missing in test.csv.")
            raise ValueError("Missing obs")
        # Category vocabularies are taken over train AND test and passed to preprocess_data
        all_states = set(train_df['job_state'].dropna().unique()).union(set(test_df['job_state'].dropna().unique()))
        all_feature1 = set(train_df['feature_1'].dropna().unique()).union(set(test_df['feature_1'].dropna().unique()))
        logger.info(f"Found {len(all_states)} unique states and {len(all_feature1)} unique feature_1 values.")

        # 3. Preprocess Training Data
        logger.info("Preprocessing training data (using combined logic)...")
        X_train_orig, y_train_orig, feature_cols_initial, label_encoder = preprocess_data(train_df, all_states, all_feature1, timestamp, is_training=True)
        if X_train_orig is None or y_train_orig is None or label_encoder is None or feature_cols_initial is None:
            logger.error("Train preprocess failed.")
            raise RuntimeError("Preprocessing train failed")
        logger.info(f"Train preprocess done. Initial Feats: {X_train_orig.shape[1]}")
        # Align the target with the feature index POSITIONALLY. A bare
        # pd.Series(y) would get a fresh RangeIndex, and the label-based
        # re-indexing after the split would then yield NaNs whenever the
        # feature frame does not carry a default index.
        y_train_orig = pd.Series(np.asarray(y_train_orig), index=X_train_orig.index)

        # 4. Train/Validation Split
        logger.info("Splitting data (80/20)...")
        X_train_full, X_val, y_train_full, y_val = train_test_split(X_train_orig, y_train_orig, test_size=0.20, random_state=42, stratify=y_train_orig)
        logger.info(f"Train (Pre-scale): {X_train_full.shape}, Val (Pre-scale): {X_val.shape}")
        y_train_full = pd.Series(y_train_full, index=X_train_full.index)
        y_val = pd.Series(y_val, index=X_val.index)

        # 5. SCALING STEP (scaler is fitted on the training partition only)
        logger.info("Applying StandardScaler...")
        scaler = StandardScaler()
        X_train_full_scaled = scaler.fit_transform(X_train_full[feature_cols_initial])
        X_val_scaled = scaler.transform(X_val[feature_cols_initial])
        X_train_full_scaled = pd.DataFrame(X_train_full_scaled, index=X_train_full.index, columns=feature_cols_initial)
        X_val_scaled = pd.DataFrame(X_val_scaled, index=X_val.index, columns=feature_cols_initial)
        scaler_path = f'scalers/scaler_{timestamp}.joblib'
        joblib.dump(scaler, scaler_path)
        logger.info(f"Scaler saved: {scaler_path}")

        # 6. Preprocess & Scale Test Data
        logger.info("Preprocessing test data (using combined logic)...")
        X_test_orig, _, _, _ = preprocess_data(test_df, all_states, all_feature1, timestamp, is_training=False, feature_columns_to_use=feature_cols_initial)
        if X_test_orig is None:
            logger.error("Test preprocess failed.")
            raise RuntimeError("Preprocessing test failed")
        try:
            X_test_aligned = X_test_orig[feature_cols_initial]
            logger.info("Test columns aligned.")
        except KeyError as ke:
            logger.error(f"Test col mismatch after preprocess: {ke}.")
            raise RuntimeError(f"Test column mismatch: {ke}")
        logger.info("Scaling test data...")
        X_test_scaled = scaler.transform(X_test_aligned)
        X_test_scaled = pd.DataFrame(X_test_scaled, index=X_test_aligned.index, columns=feature_cols_initial)
        logger.info(f"Test preprocess & scale done. Shape: {X_test_scaled.shape}")

        # Define Data Partitions
        X_opt_train = X_train_full_scaled.copy()
        y_opt_train = y_train_full.copy()
        X_holdout_val = X_val_scaled.copy()
        y_holdout_val = y_val.copy()
        X_final_test = X_test_scaled.copy()
        current_feature_cols = list(feature_cols_initial)

        # 7. Optional Feature Selection
        if perform_feature_selection:
            logger.info(f"Performing feature selection (Threshold: {fs_threshold})...")
            try:
                selector_model = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=n_jobs_sklearn, class_weight='balanced', max_depth=20)
                logger.info("Fitting RF for feature selection...")
                selector_model.fit(X_opt_train, y_opt_train)
                selector = SelectFromModel(selector_model, threshold=fs_threshold, prefit=True)
                selected_mask = selector.get_support()
                selected_features = X_opt_train.columns[selected_mask]
                num_orig = X_opt_train.shape[1]
                num_sel = len(selected_features)
                if num_sel == 0:
                    # Caught by the handler below, which falls back to all scaled features
                    logger.error("FS removed ALL features!")
                    raise RuntimeError("FS removed all features.")
                elif num_sel < num_orig:
                    num_removed = num_orig - num_sel
                    logger.info(f"Feat selection removed {num_removed} features. Selected {num_sel}.")
                    current_feature_cols = list(selected_features)
                    X_opt_train = X_opt_train[current_feature_cols]
                    X_holdout_val = X_holdout_val[current_feature_cols]
                    X_final_test = X_final_test[current_feature_cols]
                    logger.info(f"Selection applied to train/val/test partitions.")
                    joblib.dump(current_feature_cols, f'features/selected_feature_columns_{timestamp}.joblib')
                else:
                    logger.info(f"Feature selection removed no features with threshold '{fs_threshold}'.")
                    perform_feature_selection = False
            except Exception as e:
                logger.error(f"Error feature selection: {e}. Use all scaled.", exc_info=True)
                perform_feature_selection = False
                current_feature_cols = list(feature_cols_initial)
                X_opt_train = X_train_full_scaled[current_feature_cols]
                X_holdout_val = X_val_scaled[current_feature_cols]
                X_final_test = X_test_scaled[current_feature_cols]
        else:
            logger.info("Skipping feature selection.")
            X_opt_train = X_opt_train[current_feature_cols]
            X_holdout_val = X_holdout_val[current_feature_cols]
            X_final_test = X_final_test[current_feature_cols]

        logger.info(f"Data shapes post-scaling/selection: Train={X_opt_train.shape}, Val={X_holdout_val.shape}, Test={X_final_test.shape}")
        logger.info(f"Number of features used in modeling: {len(current_feature_cols)}")

        # 8. Optimize, Train Base Models & Make Individual Predictions
        # (model key, number of Optuna trials)
        models_to_optimize = [
            ('adaboost', 100),
            ('catboost', 100), ('xgboost', 100),
            ('lightgbm', 100),
            ('randomforest', 100), ('extratrees', 100),
            ('gradientboosting', 100)
        ]
        qualified_models_with_scores = []  # Stores (name, fitted_model, holdout acc or CV score)
        optimized_params_all = {}  # Stores best params found by Optuna

        logger.info(f"--- Optimizing models (Class Weights Enabled Where Applicable) ---")
        logger.info(f"Minimum CV score threshold for qualification: {min_cv_score_threshold}")
        logger.info(f"Optimization Order: {[m[0] for m in models_to_optimize]}")

        for model_name, n_trials in models_to_optimize:
            try:
                logger.info(f"--- Optimizing {model_name} ({n_trials} trials) ---")
                candidate_model, best_cv_score, best_params = optimize_model(
                    X_opt_train, y_opt_train, timestamp, model_name,
                    n_trials=n_trials, n_jobs_optuna=n_jobs_sklearn
                )

                # Log details for qualification debugging
                log_score = best_cv_score if best_cv_score is not None else -1.0
                meets_threshold = best_cv_score is not None and best_cv_score >= min_cv_score_threshold
                logger.info(f"Qualification Check for {model_name}: "
                            f"final_model exists? {candidate_model is not None}, "
                            f"best_cv_score={log_score:.7f}, "
                            f"threshold={min_cv_score_threshold}, "
                            f"comparison result: {meets_threshold}")

                # Qualification Check
                if candidate_model is not None and meets_threshold:
                    logger.info(f"+++ QUALIFIED: {model_name} (CV Score: {best_cv_score:.5f})")
                    # Evaluate on Holdout
                    holdout_acc, _ = evaluate_model(candidate_model, X_holdout_val, y_holdout_val, f"{model_name}_qualified_holdout_eval", timestamp, label_encoder)
                    if holdout_acc is not None:
                        logger.info(f"Hold-out Acc ({model_name}): {holdout_acc:.5f}")
                        # Store model with HOLD-OUT score for potential ensemble weighting later
                        qualified_models_with_scores.append((model_name, candidate_model, holdout_acc))
                    else:
                        logger.warning(f"Hold-out Eval failed for {model_name}. Cannot use its score for weighting.")
                        # Fallback: keep the model, weighted by its CV score instead
                        qualified_models_with_scores.append((model_name, candidate_model, best_cv_score))

                    if best_params:
                        optimized_params_all[model_name] = best_params

                    # Generate Individual Predictions
                    logger.info(f"--- Generating individual predictions for {model_name} ---")
                    indiv_sub_df = make_test_predictions(candidate_model, X_final_test, test_df['obs'], timestamp, f"{model_name}_qual_individual_pred", label_encoder)
                    if indiv_sub_df is None:
                        logger.error(f"Failed individual predictions for {model_name}.")
                    else:
                        logger.info(f"Individual prediction file saved for {model_name}.")

                elif best_cv_score is not None:  # Didn't meet threshold or final fit failed
                    logger.info(f"--- NOT QUALIFIED: {model_name} (CV Score: {best_cv_score:.5f} {' - Final model fit/save failed' if candidate_model is None else ''}) ---")
                    if best_params:
                        optimized_params_all[model_name] = best_params  # Still save params
                else:  # Optuna itself failed or returned None score
                    logger.warning(f"Optimization failed or returned invalid score for {model_name}. Skip.")

            except Exception as e:
                # One failing model must not abort the remaining optimizations
                logger.error(f"Error in main optimization loop for {model_name}: {e}", exc_info=True)

        # --- Post-Optimization Check ---
        logger.info("--- Model Optimization Phase Complete ---")
        if not qualified_models_with_scores:
            logger.error(f"CRITICAL: NO models met CV threshold {min_cv_score_threshold}. Aborting.")
            return False
        logger.info(f"--- {len(qualified_models_with_scores)} models qualified. ---")
        logger.info(f"Qualified Models (Name, Holdout Acc/CV Score): {[(m[0], f'{m[2]:.5f}') for m in qualified_models_with_scores]}")

        # 9. Create Ensembles & Select FINAL Best Model
        final_model = None
        final_model_name = "N/A"
        # All ensemble handles start as None so the summary below works even
        # when only one model qualified and no ensemble is ever built.
        vote_ens = None
        stack_ens = None
        dynamic_ens = None
        best_ind_q_model = None
        best_ind_name_from_list = None

        if len(qualified_models_with_scores) == 1:
            final_model_name, final_model, final_holdout_score = qualified_models_with_scores[0]
            logger.warning(f"Only 1 qualified: {final_model_name} (Holdout Acc: {final_holdout_score:.5f}). Select it.")
            best_ind_q_model = final_model
        elif len(qualified_models_with_scores) > 1:
            logger.info(f"--- Creating and Evaluating Ensembles (using balanced models if applicable) ---")
            # The third return value (best individual per create_ensemble) is
            # not used; the best individual is re-derived from stored scores below.
            vote_ens, stack_ens, _, dynamic_ens = create_ensemble(
                qualified_models_with_scores, X_opt_train, y_opt_train, timestamp, n_jobs_ensemble=n_jobs_sklearn)

            # Evaluate candidate ensembles and best individual on HOLD-OUT set
            logger.info("--- Evaluating candidate final models on HOLD-OUT validation set ---")
            candidates = {}  # Store {name: (holdout_accuracy, model_object)}

            if vote_ens:
                vote_model_name = f"voting_ensemble_{vote_ens.voting}_qualified"
                logger.info(f"--- Eval {vote_model_name} ---")
                val_acc, _ = evaluate_model(vote_ens, X_holdout_val, y_holdout_val, f"{vote_model_name}_holdout_eval", timestamp, label_encoder)
                if val_acc is not None:
                    candidates[vote_model_name] = (val_acc, vote_ens)
                    logger.info(f"Hold-out Acc ({vote_model_name}): {val_acc:.5f}")
                else:
                    logger.warning(f"Eval fail: {vote_model_name}")

            if stack_ens:
                stack_meta_name = stack_ens.final_estimator_.__class__.__name__
                stack_model_name = f"stacking_ensemble_{stack_meta_name}_qualified"
                logger.info(f"--- Eval {stack_model_name} ---")
                val_acc, _ = evaluate_model(stack_ens, X_holdout_val, y_holdout_val, f"{stack_model_name}_holdout_eval", timestamp, label_encoder)
                if val_acc is not None:
                    candidates[stack_model_name] = (val_acc, stack_ens)
                    logger.info(f"Hold-out Acc ({stack_model_name}): {val_acc:.5f}")
                else:
                    logger.warning(f"Eval fail: {stack_model_name}")

            if dynamic_ens:
                dynamic_model_name = "dynamic_ensemble_qualified"
                logger.info(f"--- Eval {dynamic_model_name} ---")
                val_acc, _ = evaluate_model(dynamic_ens, X_holdout_val, y_holdout_val, f"{dynamic_model_name}_holdout_eval", timestamp, label_encoder)
                if val_acc is not None:
                    candidates[dynamic_model_name] = (val_acc, dynamic_ens)
                    logger.info(f"Hold-out Acc ({dynamic_model_name}): {val_acc:.5f}")
                else:
                    logger.warning(f"Eval fail: {dynamic_model_name}")

            # Best individual model by stored score (holdout acc or fallback CV score)
            best_ind_name_from_list, best_ind_q_model, best_ind_stored_score = max(
                qualified_models_with_scores, key=lambda item: item[2])
            logger.info(f"--- Eval Best Individual ({best_ind_name_from_list}, Stored Score: {best_ind_stored_score:.5f}) ---")
            eval_name = f"{best_ind_name_from_list}_best_qual_holdout_eval"
            val_acc, _ = evaluate_model(best_ind_q_model, X_holdout_val, y_holdout_val, eval_name, timestamp, label_encoder)
            if val_acc is not None:
                cand_name = f"{best_ind_name_from_list}_best_qualified"
                candidates[cand_name] = (val_acc, best_ind_q_model)
                logger.info(f"Hold-out Acc ({best_ind_name_from_list}): {val_acc:.5f}")
            else:
                logger.warning(f"Hold-out Eval failed for {best_ind_name_from_list}")

            # Select FINAL model based on highest HOLD-OUT accuracy among candidates
            if candidates:
                final_model_name = max(candidates, key=lambda k: candidates[k][0])
                final_val_score, final_model = candidates[final_model_name]
                logger.info(f"--- FINAL MODEL SELECTION ---")
                logger.info(f"Selected '{final_model_name}' as FINAL model (Hold-Out Acc: {final_val_score:.5f})")
            else:
                logger.error("Hold-out evaluation failed for all candidates.")
                # Fallback: Use best individual based on stored score (holdout or CV)
                if best_ind_q_model is not None and best_ind_name_from_list:
                    final_model = best_ind_q_model
                    final_model_name = f"{best_ind_name_from_list}_best_qualified_fallback"
                    logger.warning(f"FALLBACK: Using best individual model '{final_model_name}' based on its stored score.")
                else:
                    logger.error("Could not determine final model even as fallback. Aborting.")
                    raise RuntimeError("Final model selection failed.")

        # Explicit None check: estimator truthiness may go through __len__
        if final_model is None:
            logger.error("No final model could be selected. Aborting.")
            raise RuntimeError("Final model selection failed.")

        # 10. Make FINAL Test Predictions
        logger.info(f"--- Generating FINAL predictions using: {final_model_name} ---")
        final_sub_df = make_test_predictions(final_model, X_final_test, test_df['obs'], timestamp, f"{final_model_name}_FINAL", label_encoder)
        if final_sub_df is None:
            logger.error(f"Failed FINAL submission with {final_model_name}.")
            pipeline_success = False  # Mark as failed if submission fails
        else:
            logger.info(f"FINAL submission file generated with {final_model_name}.")
            pipeline_success = True  # Mark successful only if prediction works

        # 11. Final Summary
        logger.info("--- Pipeline Run Summary ---")
        logger.info(f"Timestamp: {timestamp}")
        logger.info(f"Config: Combined FE, Scaling=True, FeatSelect={perform_feature_selection} (Thresh={fs_threshold}), CV Thresh={min_cv_score_threshold}, n_jobs={n_jobs_sklearn}, Class Weights Enabled, Const Cols Kept")
        logger.info(f"Final # Features: {len(current_feature_cols)}")
        logger.info("Models Optimized: " + ", ".join([m[0] for m in models_to_optimize]))
        qual_details = [f"{name}({score:.5f})" for name, _, score in qualified_models_with_scores] or ["None"]
        logger.info("Models Qualified (Name, Holdout Acc/CV Score): " + ", ".join(qual_details))
        logger.info(f"Ensembles Created: Voting={'Yes' if vote_ens else 'No'}, Stacking={'Yes' if stack_ens else 'No'} (Meta: {stack_ens.final_estimator_.__class__.__name__ if stack_ens else 'N/A'}), Dynamic={'Yes' if dynamic_ens else 'No'}")
        logger.info(f"Final model selected: {final_model_name}")
        logger.info("Individual predictions saved for qualified models.")
        if final_sub_df is not None:
            # Mirrors the file name built inside make_test_predictions
            safe_final_n = final_model_name.replace("/", "_").replace("\\", "_").replace(":", "_").replace(" ", "_")
            final_sub_path = f"submissions/solution_{safe_final_n}_FINAL_{timestamp}.csv"
            logger.info(f"Final submission file: {final_sub_path}")
        else:
            logger.warning("No FINAL submission file was generated.")
        logger.info(f"Logs in: {main_log_file}")
        logger.info(f"--- Pipeline {'Completed Successfully' if pipeline_success else 'Completed with Errors'} ---")

        return pipeline_success

    # --- Main Exception Handling ---
    except Exception as e:
        logger.error(f"--- Pipeline Failed Critically --- Error Type: {type(e).__name__} ---")
        logger.error(f"Error Message: {e}", exc_info=True)
        return False  # Indicate critical failure

    finally:
        # Single cleanup point for every exit path (success, early abort,
        # failure). removeHandler is a no-op if the handler was never attached.
        if file_handler is not None:
            logger.removeHandler(file_handler)
            file_handler.close()

In [None]:
import os
import time
import logging

# Assume all necessary functions (run_complete_pipeline, etc.) are defined above
# Assume logger is configured globally
# logger = logging.getLogger(__name__)


def resolve_n_cores(total_cores, reserved_cores=4):
    """Return how many worker cores to use, leaving `reserved_cores` free.

    Args:
        total_cores: Core count as reported by ``os.cpu_count()``. May be
            ``None`` (the documented return value when the count cannot be
            determined); that case is treated as a single core.
        reserved_cores: Number of cores to leave unused.

    Returns:
        int: ``total_cores - reserved_cores``, but never less than 1.
    """
    # `or 1` guards against None, which would otherwise raise a TypeError
    # on the subtraction below.
    return max(1, (total_cores or 1) - reserved_cores)


# --- Execution Block ---
if __name__ == "__main__":
    # --- Configuration ---
    # Feature Selection Settings
    PERFORM_FEATURE_SELECTION = True # Set to True or False
    FS_THRESHOLD = 'mean' # Threshold ('mean', 'median', or float like 1e-5)

    # Model Qualification Threshold
    MIN_CV_SCORE_THRESHOLD = 0.72 # Minimum average CV score to qualify a model

    # Parallelism Setting for Sklearn Models (used in Optuna objective and Ensemble)
    # Uses all but 4 cores (minimum 1). If parallel runs are unstable, set
    # N_CORES_TO_USE = 1 instead and increase carefully.
    N_CORES_TO_USE = resolve_n_cores(os.cpu_count(), reserved_cores=4)

    # --- Run the Pipeline ---
    pipeline_start_time = time.time()

    success = run_complete_pipeline(
        perform_feature_selection=PERFORM_FEATURE_SELECTION,
        fs_threshold=FS_THRESHOLD,
        min_cv_score_threshold=MIN_CV_SCORE_THRESHOLD,
        n_jobs_sklearn=N_CORES_TO_USE # Pass the core count
        )

    pipeline_end_time = time.time()
    pipeline_duration = pipeline_end_time - pipeline_start_time

    # --- Final Status Output ---
    status_msg = f"Pipeline execution {'succeeded' if success else 'failed'}."
    duration_msg = f"Total time: {pipeline_duration:.2f} seconds ({pipeline_duration / 60:.2f} minutes)."

    print(f"\n{'='*30}\n{status_msg}")
    print(duration_msg)
    print(f"{'='*30}")

    # Log final status if possible
    try:
        logger.info(status_msg)
        logger.info(duration_msg)
    except Exception as log_final_e:
        # This might happen if the logger itself failed earlier
        print(f"Note: Final status logging failed: {log_final_e}")

2025-04-28 15:37:17,369 - INFO - Creating directory structure...
2025-04-28 15:37:17,369 - INFO - Created directory: models
2025-04-28 15:37:17,369 - INFO - Created directory: features
2025-04-28 15:37:17,369 - INFO - Created directory: results
2025-04-28 15:37:17,369 - INFO - Created directory: submissions
2025-04-28 15:37:17,369 - INFO - Created directory: logs
2025-04-28 15:37:17,369 - INFO - Created directory: plots
2025-04-28 15:37:17,369 - INFO - Created directory: optuna_trials
2025-04-28 15:37:17,369 - INFO - Created directory: scalers
2025-04-28 15:37:17,369 - INFO - Created directory: calibrated_models
2025-04-28 15:37:17,369 - INFO - Directory structure verified/created.
2025-04-28 15:37:17,384 - INFO - --- Starting Complete Pipeline Run --- Timestamp: 20250428_153717 ---
2025-04-28 15:37:17,385 - INFO - Pipeline Config: Combined FE, Scaling=True, FeatSelect=True (Thresh=mean), CV Thresh=0.72, n_jobs=16 used, Class Weights Enabled, Const Cols Kept
2025-04-28 15:37:17,385 - I

--- Starting Complete Pipeline Run (Class Weights, LGBM added) ---


2025-04-28 15:37:17,557 - INFO - Processed 'job_title'.
2025-04-28 15:37:17,565 - INFO - Processed 'job_posted_date'.
2025-04-28 15:37:17,568 - INFO - Added bin f9.
2025-04-28 15:37:17,568 - INFO - Added: feature_2_9_interaction
2025-04-28 15:37:17,568 - INFO - Added transforms f2 (sq, sqrt, bin).
2025-04-28 15:37:17,568 - INFO - Added bool aggs.
2025-04-28 15:37:17,568 - INFO - Added: feature_10_8_interaction
2025-04-28 15:37:17,568 - INFO - Added new interactions: ['feat2_job_title_encoded', 'feat2_boolsum', 'feat2_recency', 'job_title_encoded_recency']
2025-04-28 15:37:17,618 - INFO - Applying PCA (n=15) to job desc...
2025-04-28 15:37:17,668 - INFO - Fit/saved PCA.
2025-04-28 15:37:17,678 - INFO - Finished job desc features.
2025-04-28 15:37:17,678 - INFO - Applying manual OHE for 'job_state' (39 unique).
2025-04-28 15:37:17,687 - INFO - Applying manual OHE for 'feature_1' (5 unique).
2025-04-28 15:37:17,696 - INFO - Final cleanup and column alignment...
2025-04-28 15:37:17,702 - I