In [1]:
# Fix for wmic error in Windows
import os
os.environ["LOKY_MAX_CPU_COUNT"] = str(os.cpu_count())
print(f"Setting max CPU count to: {os.environ['LOKY_MAX_CPU_COUNT']}")

# For older joblib versions, you might also need:
os.environ["JOBLIB_TEMP_FOLDER"] = os.path.join(os.path.expanduser("~"), "temp_joblib")
if not os.path.exists(os.environ["JOBLIB_TEMP_FOLDER"]):
    os.makedirs(os.environ["JOBLIB_TEMP_FOLDER"])

Setting max CPU count to: 20


In [5]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

Looking in indexes: https://download.pytorch.org/whl/cu126
Collecting torch
  Downloading https://download.pytorch.org/whl/cu126/torch-2.7.0%2Bcu126-cp313-cp313-win_amd64.whl.metadata (29 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu126/torchvision-0.22.0%2Bcu126-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu126/torchaudio-2.7.0%2Bcu126-cp313-cp313-win_amd64.whl.metadata (6.8 kB)
Collecting filelock (from torch)
  Using cached https://download.pytorch.org/whl/filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading https://download.pytorch.org/whl/sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached https://download.pytorch.org/whl/networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting jinja2 (from torch)
  Using cached https://download.pytorch.org/whl/Jinja2-3.1.4-py3-none-any.whl.metadata (2.6 k

In [2]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    print("GPU is available")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("GPU is not available")

GPU is available
GPU Name: NVIDIA GeForce RTX 4060 Laptop GPU


In [2]:
!pip3 install pandas numpy matplotlib seaborn scikit-learn category_encoders xgboost catboost optuna joblib tensorflow scikeras

Collecting matplotlib
  Using cached matplotlib-3.10.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting category_encoders
  Using cached category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Collecting xgboost
  Using cached xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting catboost
  Using cached catboost-1.2.8-cp312-cp312-win_amd64.whl.metadata (1.5 kB)
Collecting optuna
  Using cached optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting tensorflow
  Using cached tensorflow-2.19.0-cp312-cp312-win_amd64.whl.metadata (4.1 kB)
Collecting scikeras
  Using cached scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.2-cp312-cp312-win_amd64.whl.metadata (5.5 kB)
Collecting patsy>=0.5.1 (from category_encode

In [3]:
pip install pandas numpy matplotlib seaborn scikit-learn category_encoders xgboost catboost optuna joblib scikeras 

Note: you may need to restart the kernel to use updated packages.


In [4]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier,
    AdaBoostClassifier, VotingClassifier, StackingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA, TruncatedSVD
from category_encoders import TargetEncoder, CatBoostEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna
import warnings
import joblib
import os
import time
import json
from datetime import datetime
import shutil
import logging
import subprocess

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
warnings.filterwarnings('ignore')
# Set seeds
np.random.seed(42)
tf.random.set_seed(42)

# --- Utility Functions ---
def create_directory_structure():
    directories = ['models', 'features', 'results', 'submissions', 'logs', 'plots', 'optuna_trials', 'scalers']
    for directory in directories:
        try:
            if not os.path.exists(directory):
                os.makedirs(directory)
                logger.info(f"Created directory: {directory}")
        except Exception as e:
            logger.error(f"Error creating directory {directory}: {e}")
            raise

def get_timestamp():
    return datetime.now().strftime("%Y%m%d_%H%M%S")

def save_feature_importance(model, feature_names, timestamp, model_name):
    if not feature_names:
        logger.warning(f"No feature names provided for {model_name}. Skipping feature importance plot.")
        return

    importances = None
    importance_type = None
    is_fitted = True # Assume fitted unless Keras check fails

    if isinstance(model, KerasClassifier):
        try:
            _ = model.model_ # Check if internal model exists
        except AttributeError:
            is_fitted = False
            logger.warning(f"Keras model {model_name} not fitted. Skip importance.")
            return
        if not is_fitted: return
        logger.info(f"Importance plot not directly available for Keras model {model_name}.")
        return
    elif hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        importance_type = 'Importance'
    elif hasattr(model, 'coef_'):
        if model.coef_.ndim > 1:
            importances = np.abs(model.coef_).mean(axis=0)
        else:
            importances = np.abs(model.coef_)
        importance_type = 'Coefficient Magnitude'
    else:
        # Handling for ensembles or models without standard importance
        if isinstance(model, (VotingClassifier, StackingClassifier)):
            logger.info(f"Importance plot not generated for ensemble {model_name}.")
            return
        elif isinstance(model, MLPClassifier):
            logger.info(f"Importance plot not directly available for MLPClassifier {model_name}.")
            return
        elif hasattr(model, 'estimator_') and hasattr(model.estimator_, 'feature_importances_'): # e.g., AdaBoost
            logger.info(f"Using importance from base estimator of {model_name}.")
            importances = model.estimator_.feature_importances_
            importance_type = 'Base Estimator Importance'
        elif hasattr(model, 'estimators_') and model.estimators_: # e.g., RF, ET (already covered by feature_importances_) or others
            try:
                all_importances = [est.feature_importances_ for est in model.estimators_ if hasattr(est, 'feature_importances_')]
                if all_importances:
                    importances = np.mean(all_importances, axis=0)
                    importance_type = 'Mean Base Importance'
                    logger.info(f"Averaged importance from base estimators for {model_name}.")
                else:
                    logger.info(f"No base estimators with importance found for {model_name}.")
                    return
            except Exception as avg_imp_e:
                logger.warning(f"Could not average base importances for {model_name}: {avg_imp_e}.")
                return
        else:
            logger.info(f"Model {model_name} ({model.__class__.__name__}) lacks importance attributes.")
            return

    if importances is None:
        logger.warning(f"Could not retrieve importances for {model_name}.")
        return
    if importances.ndim > 1: # Ensure 1D array
        logger.warning(f"Importances shape {importances.shape} for {model_name}. Mean over axis 0.")
        importances = importances.mean(axis=0)
    if len(importances) != len(feature_names):
        logger.warning(f"Importance len ({len(importances)}) vs names ({len(feature_names)}) mismatch for {model_name}.")
        return

    try:
        importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values('Importance', ascending=False)
        plt.figure(figsize=(12, 8))
        top_n = min(30, len(importance_df))
        sns.barplot(x='Importance', y='Feature', data=importance_df.head(top_n), palette='viridis')
        plt.title(f'Top {top_n} Feat Importances - {model_name}')
        plt.xlabel(f'Relative {importance_type}')
        plt.tight_layout()
        plot_filename = f'plots/{model_name}_feature_importance_{timestamp}.png'
        plt.savefig(plot_filename)
        plt.close()
        logger.info(f"Saved importance plot: {plot_filename}")
        csv_filename = f'results/{model_name}_feature_importance_{timestamp}.csv'
        importance_df.to_csv(csv_filename, index=False)
        logger.info(f"Saved importance csv: {csv_filename}")
    except Exception as e:
        logger.warning(f"Could not save importance plot/CSV {model_name}: {e}", exc_info=True)

def build_keras_model(n_features, n_classes, optimizer='adam', learning_rate=0.001,
                      hidden_units=[128, 64], dropout_rate=0.3, activation='relu'):
    model = keras.Sequential(name="keras_mlp_tabular")
    model.add(layers.Input(shape=(n_features,)))
    for units in hidden_units:
        model.add(layers.Dense(units))
        model.add(layers.BatchNormalization())
        model.add(layers.Activation(activation))
        model.add(layers.Dropout(dropout_rate))
    model.add(layers.Dense(n_classes, activation='softmax'))
    if optimizer.lower() == 'adam':
        opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    elif optimizer.lower() == 'sgd':
        opt = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
    else:
        logger.warning(f"Unsupported optimizer '{optimizer}'. Defaulting to Adam.")
        opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# --- MODIFIED preprocess_data Function ---
def preprocess_data(df, all_states, all_feature1, timestamp, is_training=True, feature_columns_to_use=None):
    """Preprocesses data incorporating FE from simpler model and robust categorical handling."""
    logger.info(f"Starting preprocessing (Combined Logic). Is training: {is_training}")
    start_time = time.time()
    data = df.copy()
    y = None
    le = None
    target_column = 'salary_category'

    # 1. Handle Target Variable (Training Only)
    if target_column in data.columns and is_training:
        target = data[target_column] # Keep target for TargetEncoding later
        le = LabelEncoder()
        y = le.fit_transform(target)
        logger.info(f"Target variable '{target_column}' found and label encoded.")
        logger.info(f"Target distribution (Encoded): {np.bincount(y)}")
        joblib.dump(le, f'features/label_encoder_{timestamp}.joblib')
        mapping = {int(v): k for k, v in zip(le.classes_, le.transform(le.classes_))}
        mapping_file = f'features/target_mapping_{timestamp}.json'
        with open(mapping_file, 'w') as f:
            json.dump(mapping, f, indent=4)
        logger.info(f"Saved label encoder and target mapping: {mapping_file}")
    elif not is_training:
        # Load encoder
        try:
            encoder_files = sorted([f for f in os.listdir('features') if f.startswith('label_encoder_')])
            if encoder_files:
                latest_encoder_file = encoder_files[-1]
                le = joblib.load(f'features/{latest_encoder_file}')
                logger.info(f"Loaded latest label encoder: {latest_encoder_file}")
            else:
                logger.warning("No label encoder file found for test data!")
                le = None
        except Exception as e:
            logger.error(f"Failed to load label encoder: {e}")
            le = None
    else: # is_training is True but target column is missing
        logger.error(f"Target column '{target_column}' missing in training data!")
        raise ValueError(f"Target column '{target_column}' not found in training data.")

    # 2. Define Feature Groups
    boolean_features_potential = ['feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_10', 'feature_11']
    boolean_features = [f for f in boolean_features_potential if f in data.columns]
    numerical_features = [f for f in ['feature_2', 'feature_9', 'feature_12'] if f in data.columns] # Base numerical
    job_desc_cols = [col for col in data.columns if col.startswith('job_desc_')]
    all_numerical_features = numerical_features + job_desc_cols

    # 3. Initial Cleaning (Numerical & Boolean)
    logger.info("Initial cleaning: Numerical and Boolean Features...")
    for col in all_numerical_features:
        if col in data.columns:
            if data[col].dtype == 'object':
                data[col] = data[col].replace(['', ' ', 'NA', 'None', 'NULL'], np.nan)
            data[col] = pd.to_numeric(data[col], errors='coerce')
            median_val = data[col].median()
            fill_value = median_val if not pd.isna(median_val) else 0
            data[col] = data[col].fillna(fill_value)

    for col in boolean_features:
        if col in data.columns:
            numeric_view = pd.to_numeric(data[col], errors='coerce')
            is_boolean_like = numeric_view.dropna().isin([0, 1]).all()
            if is_boolean_like:
                data[col] = numeric_view.fillna(0).astype(int)
            else:
                num_non_bool = numeric_view.dropna().loc[~numeric_view.dropna().isin([0, 1])].count()
                logger.warning(f"Column '{col}' contains non-0/1 values ({num_non_bool} instances). Treating as numerical and imputing with median.")
                median_val = numeric_view.median()
                fill_value = median_val if not pd.isna(median_val) else 0
                data[col] = numeric_view.fillna(fill_value)

    logger.info("Starting Feature Engineering (using logic from simpler model)...")
    engineered_feature_names = [] # Track engineered features

    # --- Feature Engineering Steps ---
    if 'job_title' in data.columns:
        data['job_title'] = data['job_title'].fillna('Unknown')
        title_flags = ['is_senior', 'is_junior', 'is_developer', 'is_specialist']
        data['is_senior'] = data['job_title'].str.lower().str.contains('senior|sr|lead|principal').fillna(False).astype(int)
        data['is_junior'] = data['job_title'].str.lower().str.contains('junior|jr|associate|entry').fillna(False).astype(int)
        data['is_developer'] = data['job_title'].str.lower().str.contains('develop|programmer|coder|engineer').fillna(False).astype(int)
        data['is_specialist'] = data['job_title'].str.lower().str.contains('special|expert|consult').fillna(False).astype(int)
        engineered_feature_names.extend(title_flags)
        title_counts = data['job_title'].value_counts()
        rare_titles = title_counts[title_counts < 10].index
        data['job_title_grouped'] = data['job_title'].apply(lambda x: 'Other_Title' if x in rare_titles else x)
        title_encoder_col = 'job_title_grouped'
        target_encoded_title = 'job_title_encoded'
        engineered_feature_names.append(target_encoded_title)
        if is_training:
            logger.info(f"Applying Target Encoding to '{title_encoder_col}'...")
            job_encoder = TargetEncoder(cols=[title_encoder_col], handle_missing='value', handle_unknown='value')
            data[target_encoded_title] = job_encoder.fit_transform(data[[title_encoder_col]], y)
            joblib.dump(job_encoder, f'features/job_title_encoder_{timestamp}.joblib')
            logger.info(f"Fit and saved TargetEncoder for {title_encoder_col}")
        else:
            encoder_path = f'features/job_title_encoder_{timestamp}.joblib'
            encoder_files_fallback = sorted([f for f in os.listdir('features') if f.startswith('job_title_encoder_')])
            loaded_encoder = False
            if os.path.exists(encoder_path):
                try:
                    job_encoder = joblib.load(encoder_path)
                    data[target_encoded_title] = job_encoder.transform(data[[title_encoder_col]])
                    logger.info(f"Loaded and applied TargetEncoder: {encoder_path}")
                    loaded_encoder = True
                except Exception as e:
                    logger.error(f"Failed to load/apply specific TargetEncoder '{encoder_path}': {e}. Trying fallback.")
            if not loaded_encoder and encoder_files_fallback:
                 latest_encoder_file = encoder_files_fallback[-1]
                 try:
                     job_encoder = joblib.load(f'features/{latest_encoder_file}')
                     data[target_encoded_title] = job_encoder.transform(data[[title_encoder_col]])
                     logger.info(f"Loaded and applied fallback TargetEncoder: {latest_encoder_file}")
                     loaded_encoder = True
                 except Exception as e_fb:
                      logger.error(f"Fallback TargetEncoder failed: {e_fb}. Filling with 0.5")
                      data[target_encoded_title] = 0.5
            if not loaded_encoder:
                 logger.error("No TargetEncoder file found. Filling with 0.5")
                 data[target_encoded_title] = 0.5
        data = data.drop(['job_title', 'job_title_grouped'], axis=1, errors='ignore')
        logger.info("Processed 'job_title' (flags, grouping, target encoding).")

    if 'job_posted_date' in data.columns:
        data['job_posted_date'] = data['job_posted_date'].fillna('2000/01')
        def extract_year(date_str):
            try: return int(str(date_str)[:4])
            except: return 2000
        def extract_month(date_str):
            try: return int(str(date_str).split('/')[1])
            except: return 1
        data['job_posted_year'] = data['job_posted_date'].apply(extract_year)
        data['job_posted_month'] = data['job_posted_date'].apply(extract_month)
        data['job_posted_month'] = data['job_posted_month'].clip(1, 12)
        date_features = ['month_sin', 'month_cos', 'job_recency', 'job_posted_year_norm']
        data['month_sin'] = np.sin(2 * np.pi * data['job_posted_month'] / 12)
        data['month_cos'] = np.cos(2 * np.pi * data['job_posted_month'] / 12)
        data['job_recency'] = data['job_posted_year'] * 12 + data['job_posted_month']
        mean_year = 2022
        data['job_posted_year_norm'] = data['job_posted_year'] - mean_year
        engineered_feature_names.extend(date_features)
        data = data.drop(['job_posted_date', 'job_posted_year', 'job_posted_month'], axis=1, errors='ignore')
        logger.info("Processed 'job_posted_date' (cyclical, recency, norm year).")

    num_transform_features = []
    if 'feature_9' in data.columns:
        try:
            data['feature_9_bin'] = pd.qcut(data['feature_9'].rank(method='first'), q=5, labels=[0, 1, 2, 3, 4]).astype(int)
        except ValueError:
            logger.warning("qcut failed for feature_9, using pd.cut fallback.")
            try:
                data['feature_9_bin'] = pd.cut(data['feature_9'], bins=5, labels=[0, 1, 2, 3, 4], include_lowest=True).astype(int)
            except Exception as e_cut:
                 logger.error(f"pd.cut also failed for feature_9: {e_cut}. Setting bin to 0.")
                 data['feature_9_bin'] = 0
        num_transform_features.append('feature_9_bin')
        logger.info("Added binned feature for feature_9.")
        if 'feature_2' in data.columns:
            interaction_name = 'feature_2_9_interaction'
            data[interaction_name] = data['feature_2'] * data['feature_9']
            num_transform_features.append(interaction_name)
            logger.info(f"Added interaction: {interaction_name}")

    if 'feature_2' in data.columns:
        data['feature_2_squared'] = data['feature_2'] ** 2
        data['feature_2_sqrt'] = np.sqrt(np.abs(data['feature_2']))
        num_transform_features.extend(['feature_2_squared', 'feature_2_sqrt'])
        try:
            data['feature_2_bin'] = pd.qcut(data['feature_2'].rank(method='first'), q=5, labels=[0, 1, 2, 3, 4]).astype(int)
        except ValueError:
            logger.warning("qcut failed for feature_2, using pd.cut fallback.")
            try:
                data['feature_2_bin'] = pd.cut(data['feature_2'], bins=5, labels=[0, 1, 2, 3, 4], include_lowest=True).astype(int)
            except Exception as e_cut:
                 logger.error(f"pd.cut also failed for feature_2: {e_cut}. Setting bin to 0.")
                 data['feature_2_bin'] = 0
        num_transform_features.append('feature_2_bin')
        logger.info("Added squared, sqrt, and binned features for feature_2.")
    engineered_feature_names.extend(num_transform_features)

    bool_agg_features = []
    actual_boolean_cols = [col for col in boolean_features if col in data.columns]
    if actual_boolean_cols:
        data['boolean_sum'] = data[actual_boolean_cols].sum(axis=1)
        data['boolean_sum_squared'] = data['boolean_sum'] ** 2
        bool_agg_features.extend(['boolean_sum', 'boolean_sum_squared'])
        logger.info("Added boolean sum and squared sum features.")
    else:
        data['boolean_sum'] = 0
        data['boolean_sum_squared'] = 0
        logger.info("No boolean features found for aggregation.")
    engineered_feature_names.extend(bool_agg_features)

    if 'feature_10' in data.columns and 'feature_8' in data.columns:
        interaction_name = 'feature_10_8_interaction'
        data[interaction_name] = data['feature_10'] * data['feature_8']
        engineered_feature_names.append(interaction_name)
        logger.info(f"Added interaction: {interaction_name}")

    job_desc_eng_features = []
    if job_desc_cols:
        desc_agg = ['job_desc_mean', 'job_desc_std', 'job_desc_min', 'job_desc_max', 'job_desc_sum', 'job_desc_q25', 'job_desc_q75', 'job_desc_iqr']
        data['job_desc_mean'] = data[job_desc_cols].mean(axis=1)
        data['job_desc_std'] = data[job_desc_cols].std(axis=1).fillna(0)
        data['job_desc_min'] = data[job_desc_cols].min(axis=1)
        data['job_desc_max'] = data[job_desc_cols].max(axis=1)
        data['job_desc_sum'] = data[job_desc_cols].sum(axis=1)
        data['job_desc_q25'] = data[job_desc_cols].quantile(0.25, axis=1)
        data['job_desc_q75'] = data[job_desc_cols].quantile(0.75, axis=1)
        data['job_desc_iqr'] = data['job_desc_q75'] - data['job_desc_q25']
        job_desc_eng_features.extend(desc_agg)
        n_pca_components = 15
        if len(job_desc_cols) > n_pca_components:
            logger.info(f"Applying PCA (n={n_pca_components}) to job description features...")
            pca_names = [f'job_desc_pca_{i}' for i in range(n_pca_components)]
            job_desc_eng_features.extend(pca_names)
            job_desc_pca_result = None # Initialize
            if is_training:
                pca = PCA(n_components=n_pca_components, random_state=42)
                job_desc_pca_result = pca.fit_transform(data[job_desc_cols])
                joblib.dump(pca, f'features/job_desc_pca_{timestamp}.joblib')
                logger.info("Fit and saved PCA model for job description.")
            else:
                pca_path = f'features/job_desc_pca_{timestamp}.joblib'
                pca_files_fallback = sorted([f for f in os.listdir('features') if f.startswith('job_desc_pca_')])
                pca_loaded = False
                pca = None # Define pca before try block
                if os.path.exists(pca_path):
                    try:
                        pca = joblib.load(pca_path)
                        pca_loaded=True
                        logger.info(f"Loaded specific PCA model: {pca_path}")
                    except Exception as e: logger.error(f"Failed load specific PCA: {e}. Try fallback.")
                if not pca_loaded and pca_files_fallback:
                     latest_pca_file = pca_files_fallback[-1]
                     try:
                         pca = joblib.load(f'features/{latest_pca_file}')
                         pca_loaded=True
                         logger.info(f"Loaded fallback PCA model: {latest_pca_file}")
                     except Exception as e_fb: logger.error(f"Fallback PCA load failed: {e_fb}.")
                if pca_loaded and pca is not None: # Check if pca object exists
                     try: job_desc_pca_result = pca.transform(data[job_desc_cols])
                     except Exception as e_trans: logger.error(f"PCA transform failed: {e_trans}. Filling PCA features with 0.")
                # Fallback if loading/transform failed or no model found
                if job_desc_pca_result is None:
                    logger.error("PCA result not generated. Filling PCA features with 0.")
                    job_desc_pca_result = np.zeros((data.shape[0], n_pca_components))

            # Add PCA features to dataframe
            for i in range(min(n_pca_components, job_desc_pca_result.shape[1])):
                data[pca_names[i]] = job_desc_pca_result[:, i]
        else:
            logger.warning(f"Skipping PCA for job description: Not enough features ({len(job_desc_cols)}) for {n_pca_components} components.")
        data = data.drop(columns=job_desc_cols, errors='ignore')
        logger.info("Finished processing job description features (aggregates and PCA).")
    else:
        logger.info("No job description features found.")
    engineered_feature_names.extend(job_desc_eng_features)

    # --- Robust Categorical Handling ---
    if 'job_state' in data.columns:
        data['job_state'] = data['job_state'].fillna('Unknown')
    if 'feature_1' in data.columns:
        data['feature_1'] = data['feature_1'].fillna('Unknown')

    manual_ohe_features = []
    logger.info(f"Applying manual One-Hot Encoding for 'job_state' using {len(all_states)} total unique states.")
    if 'job_state' in data.columns:
        for state in all_states:
            col_name = f'state_{state}' # Consistent naming
            data[col_name] = (data['job_state'] == state).astype(int)
            manual_ohe_features.append(col_name)
        data = data.drop('job_state', axis=1, errors='ignore') # Drop original
    else:
        logger.warning("'job_state' column not found for manual OHE.")

    logger.info(f"Applying manual One-Hot Encoding for 'feature_1' using {len(all_feature1)} total unique values.")
    if 'feature_1' in data.columns:
        for feat in all_feature1:
            col_name = f'feat1_{feat}' # Consistent naming
            data[col_name] = (data['feature_1'] == feat).astype(int)
            manual_ohe_features.append(col_name)
        data = data.drop('feature_1', axis=1, errors='ignore') # Drop original
    else:
        logger.warning("'feature_1' column not found for manual OHE.")
    engineered_feature_names.extend(manual_ohe_features)
    # --- End FE ---

    # 5. Final Cleanup and Column Management
    logger.info("Final cleanup and column alignment...")
    columns_to_exclude = ['obs']
    if is_training and target_column in df.columns:
        columns_to_exclude.append(target_column)
    potential_feature_cols = [col for col in data.columns if col not in columns_to_exclude]

    inf_cols_handled = []
    nan_cols_handled = []
    for col in potential_feature_cols:
        if pd.api.types.is_numeric_dtype(data[col]):
            if np.isinf(data[col]).any():
                inf_cols_handled.append(col)
                data[col] = data[col].replace([np.inf, -np.inf], np.nan)
            if data[col].isnull().any():
                nan_cols_handled.append(col)
                data[col] = data[col].fillna(0) # Simple fill with 0
    if inf_cols_handled: logger.warning(f"Replaced Inf values with NaN in: {inf_cols_handled}")
    final_nan_cols = list(set(nan_cols_handled) - set(inf_cols_handled))
    if final_nan_cols: logger.info(f"Filled NaN values with 0 in columns: {final_nan_cols}")

    for col in potential_feature_cols:
        if data[col].dtype == 'bool': data[col] = data[col].astype(int)

    # --- Constant Column Handling (Warn only) ---
    if is_training:
        constant_cols_found = []
        for col in potential_feature_cols:
            nunique_val = data[col].nunique(dropna=False)
            if nunique_val <= 1:
                is_engineered = col in engineered_feature_names
                logger.warning(f"Column '{col}' identified as constant (nunique={nunique_val}) in training data. Engineered: {is_engineered}. Keeping column.")
                constant_cols_found.append(col)
            elif nunique_val <= 3 and col in engineered_feature_names:
                logger.info(f"Engineered column '{col}' has low cardinality (nunique={nunique_val}) in training data.")

        final_feature_columns = potential_feature_cols
        joblib.dump(final_feature_columns, f'features/feature_columns_{timestamp}.joblib')
        logger.info(f"Saved {len(final_feature_columns)} final feature column names (constant columns NOT dropped).")
        X = data[final_feature_columns]
        logger.info(f"Preprocessing train done. Shape: {X.shape}. Time: {time.time() - start_time:.2f}s")
        try: X.head().to_csv(f'features/processed_features_head_{timestamp}.csv', index=False)
        except Exception as e: logger.warning(f"Could not save head: {e}")
        return X, y, final_feature_columns, le
    else: # Test Data Processing (Alignment)
        if feature_columns_to_use is None:
            try:
                col_files = sorted([f for f in os.listdir('features') if f.startswith('feature_columns_')])
                if col_files:
                    latest_col_file = col_files[-1]
                    feature_columns_to_use = joblib.load(f'features/{latest_col_file}')
                    logger.info(f"Loaded {len(feature_columns_to_use)} feature columns from: {latest_col_file}")
                else:
                    logger.error("CRITICAL: No feature_columns file found."); raise FileNotFoundError("feature_columns_*.joblib missing.")
            except Exception as e: logger.error(f"Failed load feature columns: {e}."); raise

        X = pd.DataFrame(columns=feature_columns_to_use)
        missing_cols_in_test = []
        processed_test_cols = list(data.columns)
        extra_cols_in_test = list(set(processed_test_cols) - set(feature_columns_to_use) - set(columns_to_exclude))

        for col in feature_columns_to_use:
            if col in data.columns:
                X[col] = data[col]
            else:
                X[col] = 0 # Add missing column, fill with 0
                missing_cols_in_test.append(col)

        if missing_cols_in_test: logger.warning(f"Cols missing in test (filled 0): {missing_cols_in_test}")
        if extra_cols_in_test: logger.warning(f"Cols extra in test (dropped during align): {extra_cols_in_test}")

        X = X[feature_columns_to_use] # Reorder to match training
        logger.info(f"Preprocessing test done. Shape: {X.shape}. Time: {time.time() - start_time:.2f}s")
        return X, y, feature_columns_to_use, le # y is None

# --- optimize_model ---
def optimize_model(X, y, timestamp, model_type, n_trials=30, n_jobs_optuna=18): # Allow passing n_jobs
    logger.info(f"Starting {model_type} optimization ({n_trials} trials)...")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    if not isinstance(y, (np.ndarray, pd.Series)): y = np.array(y)
    if isinstance(X, np.ndarray): X = pd.DataFrame(X)

    n_classes = len(np.unique(y))
    n_features = X.shape[1]
    y_keras = to_categorical(y, num_classes=n_classes) if model_type == 'keras_mlp' else y

    KERAS_EPOCHS = 150
    KERAS_PATIENCE = 20
    OPTUNA_TIMEOUT_PER_MODEL = 3600
    if model_type in ['xgboost', 'catboost', 'randomforest', 'gradientboosting', 'keras_mlp', 'mlp', 'extratrees']:
        OPTUNA_TIMEOUT_PER_MODEL = 5400
    logger.info(f"Optuna timeout for {model_type}: {OPTUNA_TIMEOUT_PER_MODEL}s.")

    def objective(trial):
        model = None
        fit_params = {}
        use_gpu = False
        is_keras = False
        # --- Model Definitions ---
        if model_type == 'xgboost':
            tree_method = trial.suggest_categorical('tree_method', ['hist', 'gpu_hist'])
            param = { 'objective':'multi:softmax', 'num_class':n_classes, 'eval_metric':'mlogloss','n_estimators':trial.suggest_int('n_estimators',200,2000,step=100), 'max_depth':trial.suggest_int('max_depth',3,15),'learning_rate':trial.suggest_float('learning_rate',0.005,0.3,log=True), 'subsample':trial.suggest_float('subsample',0.5,1.0),'colsample_bytree':trial.suggest_float('colsample_bytree',0.4,1.0), 'min_child_weight':trial.suggest_int('min_child_weight',1,12),'gamma':trial.suggest_float('gamma',1e-8,1.0,log=True), 'reg_alpha':trial.suggest_float('reg_alpha',1e-8,15.0,log=True),'reg_lambda':trial.suggest_float('reg_lambda',1e-8,15.0,log=True), 'random_state':42,'n_jobs': 1,'booster':'gbtree', 'tree_method':tree_method } # n_jobs=1 for wrapper
            if tree_method == 'gpu_hist':
                param['gpu_id'] = 0; use_gpu = True; param.pop('n_jobs', None)
            else: param.pop('gpu_id', None)
            model = XGBClassifier(**param); fit_params = {'verbose': False}
        elif model_type == 'catboost':
            task_type = trial.suggest_categorical('task_type', ['CPU', 'GPU'])
            param = { 'iterations':trial.suggest_int('iterations',200,2000,step=100), 'depth':trial.suggest_int('depth',4,14),'learning_rate':trial.suggest_float('learning_rate',0.005,0.3,log=True), 'l2_leaf_reg':trial.suggest_float('l2_leaf_reg',1,20,log=True),'random_strength':trial.suggest_float('random_strength',1e-3,10.0,log=True), 'border_count':trial.suggest_categorical('border_count',[32,64,128,254]),'bagging_temperature':trial.suggest_float('bagging_temperature',0.0,1.0), 'loss_function':'MultiClass', 'eval_metric':'Accuracy','random_seed':42, 'thread_count':-1,'verbose':False, 'task_type':task_type }
            if task_type == 'GPU': param['devices'] = '0'; use_gpu = True
            model = CatBoostClassifier(**param); fit_params = {'early_stopping_rounds': KERAS_PATIENCE, 'verbose': False}
        elif model_type == 'keras_mlp':
            is_keras = True; optimizer_name = trial.suggest_categorical('optimizer', ['adam']); lr = trial.suggest_float('learning_rate', 1e-4, 5e-3, log=True); dropout = trial.suggest_float('dropout_rate', 0.1, 0.6); activation = trial.suggest_categorical('activation', ['relu', 'tanh', 'swish']); n_layers = trial.suggest_int('n_layers', 2, 4); units_list = []; last_units = n_features
            for i in range(n_layers): max_units = max(32, int(last_units / 1.5)); min_units = max(16, int(last_units / 4)); units = trial.suggest_int(f'n_units_l{i}', min_units, max_units, step=4, log=True); units_list.append(units); last_units = units
            model = KerasClassifier(model=build_keras_model, n_features=n_features, n_classes=n_classes, optimizer=optimizer_name, learning_rate=lr, hidden_units=units_list, dropout_rate=dropout, activation=activation, epochs=KERAS_EPOCHS, batch_size=trial.suggest_categorical('batch_size', [64, 128, 256, 512]), verbose=0)
            keras_callbacks = [ EarlyStopping(monitor='val_accuracy', patience=KERAS_PATIENCE, restore_best_weights=True, verbose=0), ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=KERAS_PATIENCE // 2, min_lr=1e-6, verbose=0) ]; fit_params = {'callbacks': keras_callbacks, 'validation_split': 0.15}
        elif model_type == 'mlp':
            layer_choices = [(100,), (50, 50), (100, 50), (64, 32, 16), (128, 64), (256, 128)]; layers = trial.suggest_categorical('hidden_layer_sizes', layer_choices)
            param = {'hidden_layer_sizes': layers, 'activation': trial.suggest_categorical('activation', ['relu', 'tanh']), 'solver': trial.suggest_categorical('solver', ['adam']), 'alpha': trial.suggest_float('alpha', 1e-6, 1e-2, log=True), 'learning_rate': 'adaptive', 'learning_rate_init': trial.suggest_float('learning_rate_init', 1e-4, 1e-2, log=True), 'max_iter': trial.suggest_int('max_iter', 400, 1200), 'early_stopping': True, 'n_iter_no_change': KERAS_PATIENCE + 10, 'validation_fraction': 0.15, 'batch_size': trial.suggest_categorical('batch_size', [64, 128, 256]), 'random_state': 42, 'warm_start': False }; model = MLPClassifier(**param)
        elif model_type == 'randomforest':
             param={'n_estimators':trial.suggest_int('n_estimators',100,2000,step=100), 'max_depth':trial.suggest_int('max_depth',5,40,step=5), 'min_samples_split':trial.suggest_int('min_samples_split',2,25), 'min_samples_leaf':trial.suggest_int('min_samples_leaf',1,20), 'max_features':trial.suggest_categorical('max_features',['sqrt','log2',0.6,0.8]), 'bootstrap':trial.suggest_categorical('bootstrap',[True,False]), 'class_weight':trial.suggest_categorical('class_weight',['balanced','balanced_subsample',None]), 'random_state':42, 'n_jobs':n_jobs_optuna}; model=RandomForestClassifier(**param)
        elif model_type == 'extratrees':
             param={'n_estimators':trial.suggest_int('n_estimators',100,2000,step=100), 'max_depth':trial.suggest_int('max_depth',5,45,step=5), 'min_samples_split':trial.suggest_int('min_samples_split',2,25), 'min_samples_leaf':trial.suggest_int('min_samples_leaf',1,20), 'max_features':trial.suggest_categorical('max_features',['sqrt','log2',0.6,0.8]), 'bootstrap':trial.suggest_categorical('bootstrap',[True,False]), 'class_weight':trial.suggest_categorical('class_weight',['balanced','balanced_subsample',None]), 'random_state':42, 'n_jobs':n_jobs_optuna}; model=ExtraTreesClassifier(**param)
        elif model_type == 'logistic':
            param={'C':trial.suggest_float('C',1e-4,1e3,log=True), 'penalty':trial.suggest_categorical('penalty',['l1','l2']),'solver':'liblinear','class_weight':'balanced', 'max_iter':trial.suggest_int('max_iter',100,1000), 'random_state':42}; model=LogisticRegression(**param)
        elif model_type == 'gradientboosting':
             param={'n_estimators':trial.suggest_int('n_estimators',100,1500,step=100), 'learning_rate':trial.suggest_float('learning_rate',0.005,0.3,log=True), 'max_depth':trial.suggest_int('max_depth',3,12), 'min_samples_split':trial.suggest_int('min_samples_split',2,25), 'min_samples_leaf':trial.suggest_int('min_samples_leaf',1,20), 'subsample':trial.suggest_float('subsample',0.5,1.0), 'max_features':trial.suggest_categorical('max_features',['sqrt','log2',None]),'random_state':42}; model=GradientBoostingClassifier(**param)
        elif model_type == 'adaboost':
            base_depth=trial.suggest_int('base_estimator_max_depth',1,6); param_ada={'n_estimators':trial.suggest_int('n_estimators',50,800,step=50), 'learning_rate':trial.suggest_float('learning_rate',0.01,1.5,log=True), 'algorithm':'SAMME', 'random_state':42}; base_est=DecisionTreeClassifier(max_depth=base_depth,random_state=42); model=AdaBoostClassifier(estimator=base_est,**param_ada); trial.set_user_attr("base_estimator_max_depth",base_depth)
        elif model_type == 'knn':
            metric=trial.suggest_categorical('metric',['minkowski','manhattan','chebyshev']); param={'n_neighbors':trial.suggest_int('n_neighbors',3,65,step=2), 'weights':trial.suggest_categorical('weights',['uniform','distance']), 'metric':metric,'n_jobs':n_jobs_optuna};
            if metric=='minkowski': param['p']=trial.suggest_int('p',1,3)
            model=KNeighborsClassifier(**param)
        elif model_type == 'svc':
             kernel=trial.suggest_categorical('kernel',['rbf','poly','sigmoid', 'linear']); param={'C':trial.suggest_float('C',1e-3,1e4,log=True), 'kernel':kernel, 'probability':True, 'class_weight':'balanced', 'random_state':42, 'cache_size':700 };
             if kernel=='poly': param['degree']=trial.suggest_int('degree',2,5)
             if kernel in ['rbf','poly','sigmoid']: param['gamma']=trial.suggest_float('gamma',1e-5,1e1,log=True)
             model=SVC(**param)
        else: logger.error(f"Unsupported model type: {model_type}"); raise ValueError(f"Unsupported: {model_type}")

        # --- Cross-validation ---
        scores = []; is_dataframe = isinstance(X, pd.DataFrame)
        try:
            for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
                X_train = X.iloc[train_idx] if is_dataframe else X[train_idx]; X_valid = X.iloc[valid_idx] if is_dataframe else X[valid_idx]
                y_train_fold = y_keras[train_idx] if is_keras else (y.iloc[train_idx] if isinstance(y, pd.Series) else y[train_idx])
                y_valid_fold_orig = y.iloc[valid_idx] if isinstance(y, pd.Series) else y[valid_idx]
                current_fit_params = fit_params.copy()
                try:
                    if model_type == 'xgboost': eval_set_xgb = [(X_valid, y_valid_fold_orig)]; model.fit(X_train, y_train_fold, eval_set=eval_set_xgb, early_stopping_rounds=KERAS_PATIENCE, verbose=False)
                    elif model_type == 'catboost': current_fit_params['eval_set'] = [(X_valid, y_valid_fold_orig)]; model.fit(X_train, y_train_fold, **current_fit_params)
                    elif model_type == 'keras_mlp': model.fit(X_train, y_train_fold, **current_fit_params)
                    elif model_type == 'mlp': model.fit(X_train, y_train_fold)
                    else: model.fit(X_train, y_train_fold, **current_fit_params)
                    y_pred = model.predict(X_valid)
                    if is_keras and y_pred.ndim > 1 and y_pred.shape[1] > 1: y_pred = np.argmax(y_pred, axis=1)
                    score = accuracy_score(y_valid_fold_orig, y_pred); scores.append(score)
                except ValueError as ve: logger.warning(f"CV fold {fold+1} VAL ERROR {model_type} trial {trial.number}: {ve}"); return 0.0
                except Exception as e: logger.warning(f"CV fold {fold+1} fail {model_type} trial {trial.number} (GPU:{use_gpu}, Keras:{is_keras}): {e}", exc_info=False); scores = []; break
        except Exception as outer_e: logger.error(f"Outer CV error {model_type} trial {trial.number} (GPU:{use_gpu}, Keras:{is_keras}): {outer_e}", exc_info=True); return 0.0
        if not scores: logger.error(f"Cross-validation failed completely for {model_type} trial {trial.number}"); return 0.0
        mean_score = np.mean(scores); logger.debug(f"Trial {trial.number} ({model_type}) completed. Avg CV Score: {mean_score:.5f}"); return mean_score

    # --- Run Optuna Study ---
    study_name = f"{model_type}_opt_{timestamp}"; storage_name = f"sqlite:///optuna_trials/{study_name}.db"; study = optuna.create_study(direction='maximize', study_name=study_name, storage=storage_name, load_if_exists=True, pruner=optuna.pruners.MedianPruner(n_warmup_steps=5)); completed_trials = len([t for t in study.trials if t.state==optuna.trial.TrialState.COMPLETE]); trials_to_run = n_trials-completed_trials

    if trials_to_run > 0:
        logger.info(f"Setting Optuna timeout for {model_type} to {OPTUNA_TIMEOUT_PER_MODEL} seconds.")
        try:
            study.optimize(objective, n_trials=trials_to_run, timeout=OPTUNA_TIMEOUT_PER_MODEL, n_jobs=1) # Use n_jobs=1 for stability
        except Exception as opt_e:
            logger.error(f"Optuna optimize call failed for {model_type}: {opt_e}", exc_info=True)
            return None, -1, {} # Indicate failure
    else:
        logger.info(f"Study '{study_name}' already has {completed_trials} completed trials (target was {n_trials}). Skipping optimization run.")

    # --- Retrieve Results ---
    try:
        if not any(t.state == optuna.trial.TrialState.COMPLETE for t in study.trials):
            logger.error(f"Optuna study '{study_name}' for {model_type} has no successfully completed trials.")
            return None, -1, {}
        best_trial = study.best_trial
        best_params = best_trial.params
        best_cv_score = best_trial.value
    except ValueError:
        logger.error(f"Optuna study '{study_name}' for {model_type} has no best trial available.")
        return None, -1, {}
    except Exception as res_e:
        logger.error(f"Error retrieving Optuna results for {model_type}: {res_e}", exc_info=True)
        return None, -1, {}

    logger.info(f"Optimization complete for {model_type}.")
    logger.info(f"Best CV score: {best_cv_score:.5f}")
    logger.info(f"Best params: {best_params}")

    # --- Save Study Summary ---
    try:
        summary_file = f'optuna_trials/{model_type}_study_summary_{timestamp}.txt'
        with open(summary_file, 'w') as f:
            f.write(f"Optuna Summary: {model_type}\nTS: {timestamp}\nBest Trial: {best_trial.number}\nScore: {best_cv_score:.5f}\n\nParams:\n"); params_json = best_params.copy()
            if model_type=='adaboost' and "base_estimator_max_depth" in best_trial.user_attrs:
                params_json['base_estimator_max_depth'] = best_trial.user_attrs["base_estimator_max_depth"]; params_json['algorithm'] = 'SAMME'
            if model_type=='xgboost' and 'tree_method' in best_params:
                params_json['tree_method'] = best_params['tree_method']
            if model_type=='catboost' and 'task_type' in best_params:
                params_json['task_type'] = best_params['task_type']
            if model_type=='keras_mlp' or model_type=='mlp':
                 if 'hidden_layer_sizes' in params_json:
                     params_json['hidden_layer_sizes'] = str(params_json['hidden_layer_sizes'])
                 if 'n_layers' in best_params:
                     units_list = [best_params.get(f'n_units_l{i}') for i in range(best_params['n_layers']) if best_params.get(f'n_units_l{i}')]
                     params_json['hidden_units_structure'] = str(units_list)
                 params_json = {k: v for k, v in params_json.items() if not k.startswith('n_units_l')}
            json.dump(params_json, f, indent=4); logger.info(f"Saved Optuna summary: {summary_file}")
    except Exception as file_e: logger.warning(f"Could not save Optuna summary {model_type}: {file_e}")

    # --- Train final model ---
    final_model = None; final_fit_params = {}
    try:
        logger.info(f"Instantiating final {model_type} model...")
        # --- Instantiate model correctly based on its type (as previously corrected) ---
        if model_type == 'adaboost':
            best_d = best_trial.user_attrs.get('base_estimator_max_depth', 1); logger.info(f"Reconstruct AdaBoost DT(max_depth={best_d}) using SAMME")
            base_est_inst = DecisionTreeClassifier(max_depth=best_d, random_state=42); final_p_ada = {k:v for k,v in best_params.items() if k != 'base_estimator_max_depth'}; final_p_ada['algorithm'] = 'SAMME'; final_model = AdaBoostClassifier(estimator=base_est_inst, **final_p_ada)
        elif model_type == 'xgboost': final_params_xgb = best_params.copy(); final_params_xgb['objective']='multi:softmax'; final_params_xgb['num_class']=n_classes; final_params_xgb['n_jobs']=1; final_model = XGBClassifier(**final_params_xgb) # Use n_jobs=1 for wrapper
        elif model_type == 'catboost': final_params_cat = best_params.copy(); final_params_cat['loss_function']='MultiClass'; final_params_cat['verbose']=False; final_model = CatBoostClassifier(**final_params_cat)
        elif model_type == 'mlp': final_model = MLPClassifier(**best_params)
        elif model_type == 'keras_mlp':
            keras_build_params = { 'n_features': n_features, 'n_classes': n_classes, 'optimizer': best_params.get('optimizer', 'adam'), 'learning_rate': best_params.get('learning_rate', 0.001), 'dropout_rate': best_params.get('dropout_rate', 0.3), 'activation': best_params.get('activation', 'relu') }
            if 'n_layers' in best_params:
                hidden_units = []
                for i in range(best_params['n_layers']):
                   unit_val = best_params.get(f'n_units_l{i}')
                   if unit_val is not None: hidden_units.append(unit_val)
                keras_build_params['hidden_units'] = hidden_units if hidden_units else [64]
            else:
                hidden_units_final = best_params.get('hidden_layer_sizes', [128, 64])
                if isinstance(hidden_units_final, str): 
                    try: 
                        hidden_units_final = eval(hidden_units_final)
                    except: 
                        hidden_units_final = [128, 64]
                keras_build_params['hidden_units'] = list(hidden_units_final)
            final_model = KerasClassifier( model=build_keras_model, model__n_features=keras_build_params['n_features'], model__n_classes=keras_build_params['n_classes'], model__optimizer=keras_build_params['optimizer'], model__learning_rate=keras_build_params['learning_rate'], model__hidden_units=keras_build_params['hidden_units'], model__dropout_rate=keras_build_params['dropout_rate'], model__activation=keras_build_params['activation'], epochs=KERAS_EPOCHS, batch_size=best_params.get('batch_size', 128), verbose=0 )
            final_callbacks = [ EarlyStopping(monitor='val_accuracy', patience=KERAS_PATIENCE, restore_best_weights=True, verbose=0), ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=KERAS_PATIENCE // 2, min_lr=1e-6, verbose=0) ]; final_fit_params = {'callbacks': final_callbacks, 'validation_split': 0.15}
        elif model_type == 'randomforest': final_params_rf = best_params.copy(); final_params_rf['n_jobs'] = n_jobs_optuna; final_model = RandomForestClassifier(**final_params_rf) # Use defined n_jobs
        elif model_type == 'extratrees': final_params_et = best_params.copy(); final_params_et['n_jobs'] = n_jobs_optuna; final_model = ExtraTreesClassifier(**final_params_et) # Use defined n_jobs
        elif model_type == 'logistic': final_p_log = best_params.copy(); final_p_log['solver'] = 'liblinear'; 
        if 'class_weight' not in final_p_log: final_p_log['class_weight'] = 'balanced'; final_model = LogisticRegression(**final_p_log)
        elif model_type == 'gradientboosting': final_model = GradientBoostingClassifier(**best_params)
        elif model_type == 'knn': final_params_knn = best_params.copy(); final_params_knn['n_jobs'] = n_jobs_optuna; final_model = KNeighborsClassifier(**final_params_knn) # Use defined n_jobs
        elif model_type == 'svc': svc_p = best_params.copy(); svc_p['probability'] = True; 
        if 'class_weight' not in svc_p: svc_p['class_weight'] = 'balanced'; final_model = SVC(**svc_p)

        # --- Fit the final model (Check existence first) ---
        if final_model is not None:
            logger.info(f"Fitting final {model_type} model...")
            start_fit_time = time.time()
            try:
                if model_type == 'keras_mlp':
                    logger.info("Using validation_split for Keras final fit...")
                    final_model.fit(X, y_keras, **final_fit_params)
                else:
                    final_model.fit(X, y)
                fit_duration = time.time() - start_fit_time
                logger.info(f"Final {model_type} fitted in {fit_duration:.2f}s.")

                # --- Save model and importance (AFTER fitting) ---
                model_path = f'models/{model_type}_{timestamp}.joblib'
                logger.info(f"Saving final {model_type} model...")
                try:
                    if isinstance(final_model, KerasClassifier):
                        tf_model_save_path = f'models/{model_type}_tfmodel_{timestamp}'
                        try:
                            final_model.model_.save(tf_model_save_path)
                            logger.info(f"Saved Keras TF model: {tf_model_save_path}")
                        except Exception as keras_save_err:
                            logger.warning(f"Keras TF save failed ({keras_save_err}), attempt joblib wrapper...")
                            joblib.dump(final_model, model_path)
                            logger.info(f"Saved Keras wrapper via joblib: {model_path}")
                    else:
                        joblib.dump(final_model, model_path)
                        logger.info(f"Saved final {model_type} via joblib: {model_path}")
                except Exception as save_err:
                    logger.error(f"Failed save model {model_type}: {save_err}", exc_info=True)

                feat_names = list(X.columns) if isinstance(X, pd.DataFrame) else None
                if feat_names:
                    logger.info(f"Saving importance {model_type}...")
                    save_feature_importance(final_model, feat_names, timestamp, model_type)
                else:
                    logger.warning(f"No feat names for importance {model_type}.")
            except Exception as fit_save_e:
                logger.error(f"Error during final fit/save {model_type}: {fit_save_e}", exc_info=True)
                return None, best_cv_score, best_params # Return score/params, but no model
        else: # Instantiation failed
            logger.error(f"Could not instantiate final model {model_type}.")
            return None, best_cv_score, best_params
    except Exception as final_e:
        logger.error(f"Failed final instantiate/fit/save process {model_type}: {final_e}", exc_info=True)
        return None, best_cv_score, best_params

    return final_model, best_cv_score, best_params

def create_ensemble(qualified_models_with_scores, X_train_ensemble, y_train_ensemble, timestamp, n_jobs_ensemble=18):
    """
    Create ensemble models (Voting and Stacking) from qualified models.
    """
    logger.info("Attempting ensemble creation...")

    if not qualified_models_with_scores:
        logger.error("No qualified models.")
        return None, None, None

    sorted_models = sorted(qualified_models_with_scores, key=lambda x: x[2], reverse=True)
    logger.info(f"Qualified models: {[(m[0], f'{m[2]:.5f}') for m in sorted_models]}")
    N_ens = len(sorted_models)

    if N_ens < 2:
        logger.warning(f"Less than 2 qualified models. Skipping ensembles.")
        if N_ens == 1:
            n, m, s = sorted_models[0]
            logger.info(f"Returning best individual model: {n} (CV: {s:.5f})")
            return None, None, m
        else:
            return None, None, None

    estimators_valid_for_ensemble = []
    keras_models_excluded = []
    for name, model, score in sorted_models:
        is_keras_wrapper = isinstance(model, KerasClassifier)
        model_saved_path = f'models/{name}_{timestamp}.joblib'
        tf_model_path = f'models/{name}_tfmodel_{timestamp}'
        if is_keras_wrapper and not os.path.exists(tf_model_path) and not os.path.exists(model_saved_path):
            logger.warning(f"Keras model {name} save files missing. Excluding from ensemble.")
            keras_models_excluded.append(name)
        else:
            estimators_valid_for_ensemble.append((name, model))

    if len(estimators_valid_for_ensemble) < 2:
        logger.warning(f"Less than 2 models usable for ensemble. Skipping ensembles.")
        non_keras_models = [(n, m, s) for n, m, s in sorted_models if n not in keras_models_excluded]
        if non_keras_models:
            best_n, best_m, best_s = non_keras_models[0]
            logger.info(f"Returning best non-excluded model: {best_n} (CV: {best_s:.5f})")
            return None, None, best_m
        elif sorted_models:
            best_n, best_m, best_s = sorted_models[0]
            logger.info(f"Returning original best model: {best_n} (CV: {best_s:.5f})")
            return None, None, best_m
        else:
            return None, None, None

    logger.info(f"Using {len(estimators_valid_for_ensemble)} models for ensemble: {[n for n, _ in estimators_valid_for_ensemble]}")

    scores = [s for n, m, s in sorted_models if n in dict(estimators_valid_for_ensemble)]
    min_s = min(scores) if scores else 0
    shift_s = [s - min_s + 1e-6 for s in scores]
    tot_s = sum(shift_s)
    norm_w = [s / tot_s for s in shift_s] if tot_s > 0 else None

    vote_clf = None
    can_soft = all(hasattr(m, 'predict_proba') for _, m in estimators_valid_for_ensemble)

    if can_soft:
        logger.info("Attempting Soft Voting...")
        logger.info(f"Weights: {list(np.round(norm_w, 3)) if norm_w else 'Uniform'}")
        try:
            vote_clf = VotingClassifier(estimators=estimators_valid_for_ensemble, voting='soft', weights=norm_w, n_jobs=n_jobs_ensemble)
            vote_clf.fit(X_train_ensemble, y_train_ensemble)
            vote_path = f'models/voting_ensemble_soft_{timestamp}.joblib'
            joblib.dump(vote_clf, vote_path)
            logger.info(f"Saved Soft Voting Ensemble: {vote_path}")
        except Exception as e:
            logger.error(f"Failed Soft Voting: {e}", exc_info=True)
            vote_clf = None
            can_soft = False

    if not can_soft:
        logger.warning("Attempting Hard Voting...")
        try:
            vote_clf = VotingClassifier(estimators=estimators_valid_for_ensemble, voting='hard', n_jobs=n_jobs_ensemble)
            vote_clf.fit(X_train_ensemble, y_train_ensemble)
            vote_path = f'models/voting_ensemble_hard_{timestamp}.joblib'
            joblib.dump(vote_clf, vote_path)
            logger.info(f"Saved Hard Voting Ensemble: {vote_path}")
        except Exception as e:
            logger.error(f"Failed Hard Voting: {e}", exc_info=True)
            vote_clf = None

    stack_clf = None
    can_stack = all(hasattr(m, 'predict_proba') for _, m in estimators_valid_for_ensemble)
    meta_learner = None
    if can_stack:
        logger.info("Attempting Stacking...")
        try:
            meta_learner = LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear', C=1.0, n_jobs=1)
            stack_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
            stack_clf = StackingClassifier(estimators=estimators_valid_for_ensemble, final_estimator=meta_learner, cv=stack_cv, stack_method='predict_proba', n_jobs=n_jobs_ensemble, passthrough=False)
            stack_clf.fit(X_train_ensemble, y_train_ensemble)
            stack_path = f'models/stacking_ensemble_{timestamp}.joblib'
            joblib.dump(stack_clf, stack_path)
            logger.info(f"Saved Stacking Ensemble: {stack_path}")
        except Exception as e:
            logger.error(f"Failed Stacking: {e}", exc_info=True)
            stack_clf = None
    else:
        logger.warning("Cannot create Stacking Ensemble.")

    best_ind_q_model = None
    best_n = "N/A"
    best_s = -1
    if sorted_models:
        best_n, best_m, best_s = sorted_models[0]
        best_ind_q_model = best_m
        logger.info(f"Best individual qualified model: {best_n} (CV: {best_s:.5f})")

    try:
        summary_path = f'results/ensemble_creation_summary_{timestamp}.txt'
        with open(summary_path, 'w') as f:
            f.write("Ensemble Summary\n=============\nQualified Models:\n")
            for n, _, s in sorted_models:
                f.write(f"- {n}: CV {s:.5f} {'(Excluded)' if n in keras_models_excluded else '(Included)'}\n")
            f.write(f"\nEnsembles ({len(estimators_valid_for_ensemble)} models):\n")
            vote_t = 'Soft' if can_soft and vote_clf and vote_clf.voting == 'soft' else ('Hard' if vote_clf and vote_clf.voting == 'hard' else 'N/A')
            f.write(f"- Voting ({vote_t}): {'Saved' if vote_clf else 'Failed/Skipped'}.\n")
            meta_name = meta_learner.__class__.__name__ if meta_learner and stack_clf else 'N/A'
            f.write(f"- Stacking (Meta: {meta_name}): {'Saved' if stack_clf else 'Failed/Skipped'}.\n")
            if keras_models_excluded:
                f.write(f"\nKeras Excluded: {', '.join(keras_models_excluded)}\n")
            if best_ind_q_model: 
                f.write(f"\nBest individual: {best_n}\n")
        logger.info(f"Saved ensemble summary: {summary_path}")
    except Exception as file_e:
        logger.warning(f"Could not save ensemble summary: {file_e}")

    return vote_clf, stack_clf, best_ind_q_model

def evaluate_model(model, X_eval, y_eval, model_name, timestamp, le):
    if model is None:
        logger.warning(f"Skip eval {model_name}: model None.")
        return None, None
        
    if le is None:
        logger.warning(f"Skip eval {model_name}: LE None.")
        return None, None
        
    logger.info(f"Evaluating {model_name}...")
    
    if isinstance(y_eval, pd.Series):
        y_eval = y_eval.values
        
    is_keras_wrapper = isinstance(model, KerasClassifier)
    
    try:
        y_pred = model.predict(X_eval)
        if is_keras_wrapper and hasattr(model, 'predict_proba') and y_pred.ndim > 1 and y_pred.shape[1] > 1:
            y_pred = np.argmax(y_pred, axis=1)
            
        acc = accuracy_score(y_eval, y_pred)
        
        try:
            y_eval_lbls = le.inverse_transform(y_eval)
            y_pred_lbls = le.inverse_transform(y_pred)
            tg_names = le.classes_
        except Exception as e_le:
            logger.warning(f"LE inverse fail {model_name}:{e_le}. Use numeric.")
            y_eval_lbls = y_eval
            y_pred_lbls = y_pred
            tg_names = [str(i) for i in sorted(np.unique(y_eval))]
            
        rpt_str = classification_report(y_eval_lbls, y_pred_lbls, target_names=tg_names, zero_division=0)
        rpt_dict = classification_report(y_eval_lbls, y_pred_lbls, target_names=tg_names, output_dict=True, zero_division=0)
        cm = confusion_matrix(y_eval_lbls, y_pred_lbls, labels=tg_names)
        
        logger.info(f"{model_name} Eval Acc: {acc:.5f}")
        
        eval_fname = f'results/{model_name}_evaluation_{timestamp}.txt'
        with open(eval_fname, 'w') as f:
            f.write(f"Model Eval Summary\n=============\nModel:{model_name}\nTS:{timestamp}\nAcc:{acc:.5f}\n\nReport:\n{rpt_str}\n\nCM:\n{np.array2string(cm)}\n")
            
        logger.info(f"Saved eval: {eval_fname}")
        
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tg_names, yticklabels=tg_names)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(f'CM - {model_name} (Acc:{acc:.3f})')
        plt.tight_layout()
        plot_fname = f'plots/{model_name}_confusion_matrix_{timestamp}.png'
        plt.savefig(plot_fname)
        plt.close()
        logger.info(f"Saved CM plot: {plot_fname}")
        
        return acc, rpt_dict
        
    except AttributeError as ae:
        logger.error(f"Eval error {model_name}: AttrErr: {ae}", exc_info=True)
        return None, None
        
    except Exception as e:
        logger.error(f"Error eval {model_name}: {e}", exc_info=True)
        return None, None
    
def make_test_predictions(model, X_test, test_obs, timestamp, model_name, le):
    """
    Generate test predictions and save results to CSV and summary files.
    """
    logger.info(f"Generating test predictions using {model_name}...")

    if model is None:
        logger.error(f"Prediction failed for {model_name}: model is None.")
        return None

    if le is None:
        logger.warning(f"LabelEncoder is None for {model_name}. Attempting fallback...")
        enc_fs = sorted([f for f in os.listdir('features') if f.startswith('label_encoder_')])
        if enc_fs:
            try:
                le = joblib.load(f'features/{enc_fs[-1]}')
                logger.info(f"Loaded fallback LabelEncoder: {enc_fs[-1]}.")
            except Exception as load_e:
                logger.error(f"LabelEncoder fallback failed: {load_e}. Prediction aborted.")
                return None
        else:
            logger.error(f"Prediction failed: LabelEncoder is None and no fallback available.")
            return None

    try:
        y_pred_enc = model.predict(X_test)

        # Handle KerasClassifier predictions
        is_keras_wrapper = isinstance(model, KerasClassifier)
        if is_keras_wrapper and hasattr(model, 'predict_proba') and y_pred_enc.ndim > 1 and y_pred_enc.shape[1] > 1:
            y_pred_enc = np.argmax(y_pred_enc, axis=1)

        pred_lbls = le.inverse_transform(y_pred_enc)

        # Create submission DataFrame
        sub = pd.DataFrame({'obs': test_obs, 'salary_category': pred_lbls})
        safe_name = model_name.replace("/", "_").replace("\\", "_").replace(":", "_").replace(" ", "_")
        sub_path = f'submissions/solution_{safe_name}_{timestamp}.csv'
        sub.to_csv(sub_path, index=False)
        logger.info(f"Saved submission: {sub_path}")

        # Log prediction distribution
        val_cts = sub['salary_category'].value_counts().to_dict()
        logger.info(f"Test prediction distribution for {model_name}: {val_cts}")

        # Save prediction summary
        sum_fname = f'results/{safe_name}_test_prediction_summary_{timestamp}.txt'
        with open(sum_fname, 'w') as f:
            f.write(f"Test Prediction Summary - {model_name}\n\n")
            try:
                m_cls = model.__class__.__name__
            except AttributeError:
                m_cls = "N/A"
            f.write(f"Model Class: {m_cls}\n")
            f.write(f"Timestamp: {timestamp}\n")
            f.write(f"Total Predictions: {len(pred_lbls)}\n")
            f.write("Distribution:\n")
            total = len(pred_lbls)
            if total > 0:
                for lbl, cnt in sorted(val_cts.items()):
                    f.write(f"- {lbl}: {cnt} ({cnt / total:.2%})\n")
            else:
                f.write("- No predictions.\n")
        logger.info(f"Saved prediction summary: {sum_fname}")

        return sub

    except AttributeError as ae:
        logger.error(f"LabelEncoder or prediction error for {model_name}: {ae}.", exc_info=True)
        return None
    except Exception as e:
        logger.error(f"Error during prediction for {model_name}: {e}", exc_info=True)
        return None

# --- Main Pipeline (Uses modified preprocess_data) ---
def run_complete_pipeline(perform_feature_selection=False, min_cv_score_threshold=0.72, fs_threshold='mean', n_jobs_sklearn=18):
    """Run the complete model training pipeline with combined FE logic."""
    timestamp = get_timestamp()
    main_log_file = None
    file_handler = None
    try:
        # 1. Setup
        print("--- Starting Complete Pipeline Run (Combined FE) ---")
        create_directory_structure()
        main_log_file = f'logs/pipeline_run_{timestamp}.log'
        file_handler = logging.FileHandler(main_log_file)
        file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        if file_handler not in logger.handlers:
            logger.addHandler(file_handler)

        logger.info(f"--- Starting Complete Pipeline Run --- Timestamp: {timestamp} ---")
        logger.info(f"Pipeline Config: Combined FE, Scaling=True, FeatSelect={perform_feature_selection} (Thresh={fs_threshold}), CV Thresh={min_cv_score_threshold}, n_jobs={n_jobs_sklearn} used for Sklearn, Const Cols Kept in Preproc")
        logger.info(f"Logging detailed output to: {main_log_file}")

        # 2. Load Data
        logger.info("Loading data...")
        try:
            train_df = pd.read_csv('train.csv')
            test_df = pd.read_csv('test.csv')
            logger.info(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")
        except FileNotFoundError as e:
            logger.error(f"Data load error: {e}.")
            return False
        if 'salary_category' not in train_df.columns:
            logger.error("Target missing.")
            return False
        if train_df.empty or test_df.empty:
            logger.error("Data empty.")
            return False
        if 'obs' not in test_df.columns:
            logger.error("Column 'obs' missing in test.csv.")
            return False
            
        # Get unique values for manual OHE (CRITICAL: do this before preprocessing)
        all_states = set(train_df['job_state'].dropna().unique()).union(set(test_df['job_state'].dropna().unique()))
        all_feature1 = set(train_df['feature_1'].dropna().unique()).union(set(test_df['feature_1'].dropna().unique()))
        logger.info(f"Found {len(all_states)} unique states and {len(all_feature1)} unique feature_1 values across train/test.")

        # 3. Preprocess Training Data (Using the NEW preprocess_data)
        logger.info("Preprocessing training data (using combined logic)...")
        X_train_orig, y_train_orig, feature_cols_initial, label_encoder = preprocess_data(
            train_df, all_states, all_feature1, timestamp, is_training=True
        )
        if X_train_orig is None or y_train_orig is None or label_encoder is None or feature_cols_initial is None:
            logger.error("Train preprocess failed.")
            return False
        logger.info(f"Train preprocess done. Initial Feats: {X_train_orig.shape[1]}")
        y_train_orig = pd.Series(y_train_orig)  # Ensure Series

        # 4. Train/Validation Split
        logger.info("Splitting data (80/20)...")
        X_train_full, X_val, y_train_full, y_val = train_test_split(X_train_orig, y_train_orig, test_size=0.20, random_state=42, stratify=y_train_orig)
        logger.info(f"Train (Pre-scale): {X_train_full.shape}, Val (Pre-scale): {X_val.shape}")
        y_train_full = pd.Series(y_train_full, index=X_train_full.index)
        y_val = pd.Series(y_val, index=X_val.index)

        # 5. SCALING STEP
        logger.info("Applying StandardScaler...")
        scaler = StandardScaler()
        # Fit scaler only on the columns that exist in the training partition
        X_train_full_scaled = scaler.fit_transform(X_train_full[feature_cols_initial])
        X_val_scaled = scaler.transform(X_val[feature_cols_initial])
        X_train_full_scaled = pd.DataFrame(X_train_full_scaled, index=X_train_full.index, columns=feature_cols_initial)
        X_val_scaled = pd.DataFrame(X_val_scaled, index=X_val.index, columns=feature_cols_initial)
        scaler_path = f'scalers/scaler_{timestamp}.joblib'
        joblib.dump(scaler, scaler_path)
        logger.info(f"Scaler saved: {scaler_path}")
        logger.info(f"Scaled Train shape: {X_train_full_scaled.shape}, Scaled Val shape: {X_val_scaled.shape}")

        # 6. Preprocess & Scale Test Data (Using the NEW preprocess_data)
        logger.info("Preprocessing test data (using combined logic)...")
        # Pass the *initial* feature list determined during training preprocessing
        X_test_orig, _, _, _ = preprocess_data(
            test_df, all_states, all_feature1, timestamp, is_training=False, feature_columns_to_use=feature_cols_initial
        )
        if X_test_orig is None:
            logger.error("Test preprocess failed.")
            return False
            
        # Ensure columns match (should be handled by preprocess_data now)
        try:
            X_test_aligned = X_test_orig[feature_cols_initial]  # Align columns
            logger.info("Test columns aligned.")
        except KeyError as ke:
            logger.error(f"Test col mismatch after preprocess: {ke}.")
            return False
            
        logger.info("Scaling test data...")
        X_test_scaled = scaler.transform(X_test_aligned)
        X_test_scaled = pd.DataFrame(X_test_scaled, index=X_test_aligned.index, columns=feature_cols_initial)
        logger.info(f"Test preprocess & scale done. Shape: {X_test_scaled.shape}")

        # Define Data Partitions
        X_opt_train = X_train_full_scaled.copy()
        y_opt_train = y_train_full.copy()
        X_holdout_val = X_val_scaled.copy()
        y_holdout_val = y_val.copy()
        X_final_test = X_test_scaled.copy()
        current_feature_cols = list(feature_cols_initial)  # Start with all features

        # 7. Optional Feature Selection (On SCALED data)
        if perform_feature_selection:
            logger.info(f"Performing feature selection (Threshold: {fs_threshold})...")
            try:
                # Note: RF selector might rank constant columns low, effectively removing them here if threshold allows
                selector_model = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=n_jobs_sklearn, class_weight='balanced', max_depth=20)  # Use param
                logger.info("Fitting RF for feature selection...")
                selector_model.fit(X_opt_train, y_opt_train)
                selector = SelectFromModel(selector_model, threshold=fs_threshold, prefit=True)  # Use specified threshold
                selected_mask = selector.get_support()
                selected_features = X_opt_train.columns[selected_mask]
                num_orig = X_opt_train.shape[1]
                num_sel = len(selected_features)
                if num_sel < num_orig:
                    num_removed = num_orig - num_sel
                    logger.info(f"Feat selection removed {num_removed} features. Selected {num_sel}.")
                    current_feature_cols = list(selected_features)
                    X_opt_train = X_opt_train[current_feature_cols]
                    X_holdout_val = X_holdout_val[current_feature_cols]
                    X_final_test = X_final_test[current_feature_cols]
                    logger.info(f"Selection applied to train/val/test.")
                    joblib.dump(current_feature_cols, f'features/selected_feature_columns_{timestamp}.joblib')
                else:
                    logger.info(f"Feature selection removed no features with threshold '{fs_threshold}'.")
                    perform_feature_selection = False  # Update flag
            except Exception as e:
                logger.error(f"Error feature selection: {e}. Use all scaled.", exc_info=True)
                perform_feature_selection = False
                current_feature_cols = list(feature_cols_initial)
                X_opt_train = X_train_full_scaled[current_feature_cols]
                X_holdout_val = X_val_scaled[current_feature_cols]
                X_final_test = X_test_scaled[current_feature_cols]
        else:
            logger.info("Skipping feature selection.")
            X_opt_train = X_opt_train[current_feature_cols]
            X_holdout_val = X_holdout_val[current_feature_cols]
            X_final_test = X_final_test[current_feature_cols]

        logger.info(f"Data shapes post-scaling/selection: Train={X_opt_train.shape}, Val={X_holdout_val.shape}, Test={X_final_test.shape}")
        logger.info(f"Number of features used in modeling: {len(current_feature_cols)}")

        # 8. Optimize, Train Base Models & Make Individual Predictions
        models_to_optimize = [
            ('logistic', 30), ('knn', 25), ('adaboost', 30),
            ('randomforest', 40), ('extratrees', 40),
            ('gradientboosting', 40), ('mlp', 25), ('keras_mlp', 20),
            ('catboost', 40), ('xgboost', 50)
        ]
        qualified_models_with_scores = []
        optimized_params_all = {}
        logger.info(f"--- Optimizing models & Making Individual Predictions (Thresh: {min_cv_score_threshold}) ---")
        for model_name, n_trials in models_to_optimize:
            indiv_sub_df = None
            try:
                logger.info(f"--- Optimizing {model_name} ---")
                final_model, best_cv_score, best_params = optimize_model(
                    X_opt_train, y_opt_train, timestamp, model_name, n_trials=n_trials, n_jobs_optuna=n_jobs_sklearn
                )
                if final_model is not None and best_cv_score is not None and best_cv_score >= min_cv_score_threshold:
                    logger.info(f"+++ QUALIFIED: {model_name} (CV Score: {best_cv_score:.5f})")
                    qualified_models_with_scores.append((model_name, final_model, best_cv_score))
                    if best_params:
                        optimized_params_all[model_name] = best_params
                    logger.info(f"--- Evaluating {model_name} on HOLD-OUT set ---")
                    holdout_acc, _ = evaluate_model(final_model, X_holdout_val, y_holdout_val, f"{model_name}_qualified_holdout_eval", timestamp, label_encoder)
                    if holdout_acc is not None:
                        logger.info(f"Hold-out Acc ({model_name}): {holdout_acc:.5f}")
                    else:
                        logger.warning(f"Hold-out Eval failed for {model_name}.")
                    logger.info(f"--- Generating individual predictions for {model_name} ---")
                    indiv_sub_df = make_test_predictions(final_model, X_final_test, test_df['obs'], timestamp, f"{model_name}_qual_individual_pred", label_encoder)
                    if indiv_sub_df is not None:
                        logger.info(f"Individual prediction file saved for {model_name}.")
                    else:
                        logger.error(f"Failed individual predictions for {model_name}.")
                elif best_cv_score is not None:
                    logger.info(f"--- NOT QUALIFIED: {model_name} (CV Score: {best_cv_score:.5f} {' - Final fit/save failed' if final_model is None else ''}) ---")
                    if best_params:
                        optimized_params_all[model_name] = best_params
                else:
                    logger.warning(f"Optimization failed for {model_name}. Skip.")
            except Exception as e:
                logger.error(f"Error in main loop for {model_name}: {e}", exc_info=True)

        logger.info("--- Model Optimization Phase Complete ---")
        if not qualified_models_with_scores:
            logger.error(f"CRITICAL: NO models met CV threshold {min_cv_score_threshold}. Abort.")
            if file_handler:
                logger.removeHandler(file_handler)
                file_handler.close()
            return False
            
        logger.info(f"--- {len(qualified_models_with_scores)} models qualified. ---")
        logger.info(f"Qualified Models (Name, CV Score): {[(m[0], f'{m[2]:.5f}') for m in qualified_models_with_scores]}")

        # 9. Create Ensembles & Select FINAL Best Model
        final_model = None
        final_model_name = "N/A"
        vote_ens = None
        stack_ens = None
        best_ind_q_model = None
        if len(qualified_models_with_scores) == 1:
            final_model_name, final_model, final_cv_score = qualified_models_with_scores[0]
            logger.warning(f"Only 1 qualified: {final_model_name} (CV:{final_cv_score:.5f}). Select it.")
            best_ind_q_model = final_model
        elif len(qualified_models_with_scores) > 1:
            logger.info(f"--- Creating and Evaluating Ensembles ---")
            vote_ens, stack_ens, best_ind_q_model = create_ensemble(
                qualified_models_with_scores, X_opt_train, y_opt_train, timestamp, n_jobs_ensemble=n_jobs_sklearn  # Pass n_jobs
            )
            logger.info("--- Evaluating candidate final models on HOLD-OUT validation set ---")
            candidates = {}
            best_ind_name = None
            if vote_ens:
                vote_model_name = f"voting_ensemble_{vote_ens.voting}_qualified"
                logger.info(f"--- Eval {vote_model_name} ---")
                val_acc, _ = evaluate_model(vote_ens, X_holdout_val, y_holdout_val, f"{vote_model_name}_holdout_eval", timestamp, label_encoder)
                if val_acc is not None:
                    candidates[vote_model_name] = (val_acc, vote_ens)
                    logger.info(f"Hold-out Acc ({vote_model_name}): {val_acc:.5f}")
                else:
                    logger.warning(f"Eval fail: {vote_model_name}")
            if stack_ens:
                stack_model_name = "stacking_ensemble_qualified"
                logger.info(f"--- Eval {stack_model_name} ---")
                val_acc, _ = evaluate_model(stack_ens, X_holdout_val, y_holdout_val, f"{stack_model_name}_holdout_eval", timestamp, label_encoder)
                if val_acc is not None:
                    candidates[stack_model_name] = (val_acc, stack_ens)
                    logger.info(f"Hold-out Acc ({stack_model_name}): {val_acc:.5f}")
                else:
                    logger.warning(f"Eval fail: {stack_model_name}")
            if best_ind_q_model:
                best_ind_info = next((m for m in qualified_models_with_scores if m[1] == best_ind_q_model), None)
                if best_ind_info:
                    best_ind_name = best_ind_info[0]
                    logger.info(f"--- Eval Best Indiv ({best_ind_name}) ---")
                    eval_name = f"{best_ind_name}_best_qual_holdout_eval"
                    val_acc, _ = evaluate_model(best_ind_q_model, X_holdout_val, y_holdout_val, eval_name, timestamp, label_encoder)
                    if val_acc is not None:
                        cand_name = f"{best_ind_name}_best_qualified"
                        candidates[cand_name] = (val_acc, best_ind_q_model)
                        logger.info(f"Hold-out Acc ({best_ind_name}): {val_acc:.5f}")
                    else:
                        logger.warning(f"Eval fail: {best_ind_name}")
                else:
                    logger.warning("Could not find name for best individual.")
            if candidates:
                final_model_name = max(candidates, key=lambda k: candidates[k][0])
                final_val_score, final_model = candidates[final_model_name]
                logger.info(f"--- FINAL MODEL: '{final_model_name}' (Hold-Out Acc: {final_val_score:.5f}) ---")
            else:
                logger.error("Hold-out eval failed for all candidates.")
                if best_ind_q_model and best_ind_name:
                    final_model = best_ind_q_model
                    final_model_name = f"{best_ind_name}_best_qualified_cv_fallback"
                    logger.warning(f"FALLBACK: Using '{final_model_name}'.")
                else:
                    logger.error("No final model fallback.")
                    final_model = None

        if not final_model:
            logger.error("No final model selected. Abort.")
            if file_handler:
                logger.removeHandler(file_handler)
                file_handler.close()
            return False

        # 10. Make FINAL Test Predictions
        logger.info(f"--- Generating FINAL predictions using: {final_model_name} ---")
        final_sub_df = make_test_predictions(final_model, X_final_test, test_df['obs'], timestamp, f"{final_model_name}_FINAL", label_encoder)
        if final_sub_df is None:
            logger.error(f"Failed FINAL submission with {final_model_name}.")
        else:
            logger.info(f"FINAL submission file generated with {final_model_name}.")

        # 11. Final Summary
        logger.info("--- Pipeline Run Summary ---")
        logger.info(f"Timestamp: {timestamp}")
        logger.info(f"Config: Combined FE, Scaling=True, FeatSelect={perform_feature_selection} (Thresh={fs_threshold}), CV Thresh={min_cv_score_threshold}, n_jobs={n_jobs_sklearn}, ConstCols Kept")
        logger.info(f"Final # Features: {len(current_feature_cols)}")
        logger.info("Models Optimized: " + ", ".join([m[0] for m in models_to_optimize]))
        qual_details = [(m[0], f"{m[2]:.5f}") for m in qualified_models_with_scores] if qualified_models_with_scores else ["None"]
        logger.info("Models Qualified (Name, CV Score): " + ", ".join([f"{n}({s})" for n, s in qual_details]))
        logger.info(f"Ensembles Created: Voting={'Yes' if vote_ens else 'No'}, Stacking={'Yes' if stack_ens else 'No'}")
        logger.info(f"Final model selected: {final_model_name}")
        logger.info("Individual predictions saved for qualified models.")
        if final_sub_df is not None:
            safe_final_n = final_model_name.replace("/", "_").replace("\\", "_").replace(":", "_").replace(" ", "_")
            final_sub_path = f"submissions/solution_{safe_final_n}_FINAL_{timestamp}.csv"
            logger.info(f"Final submission file: {final_sub_path}")
        else:
            logger.warning("No FINAL submission file generated.")
        logger.info(f"Logs in: {main_log_file}")
        logger.info("--- Pipeline Completed Successfully ---")
        if file_handler:
            logger.removeHandler(file_handler)
            file_handler.close()
        return True

    except Exception as e:
        logger.error(f"--- Pipeline Failed Critically --- Error: {e}", exc_info=True)
        if file_handler and file_handler in logger.handlers:
            logger.removeHandler(file_handler)
            file_handler.close()
        return False

# --- Execution ---
if __name__ == "__main__":
    N_CORES_TO_USE = 18 # Define the number of cores to use for scikit-learn tasks (adjust as needed)

    start_time = time.time()
    success = run_complete_pipeline(
        perform_feature_selection=True,
        fs_threshold='mean', # Use 'mean' threshold like simpler model, or keep 'median'
        min_cv_score_threshold=0.72, # Adjust as needed
        n_jobs_sklearn=N_CORES_TO_USE # Pass the core count
        )
    end_time = time.time()
    duration = end_time - start_time
    status_msg = f"Pipeline execution {'succeeded' if success else 'failed'}."
    duration_msg = f"Total time: {duration:.2f} sec ({duration/60:.2f} min)."
    print(f"\n{'='*30}\n{status_msg}")
    print(duration_msg)
    print(f"{'='*30}")
    try: # Log final status if possible
        logger.info(status_msg)
        logger.info(duration_msg)
    except Exception as log_final_e:
        print(f"Final logging failed: {log_final_e}")

  from .autonotebook import tqdm as notebook_tqdm
2025-04-24 12:19:03,809 - INFO - Created directory: models
2025-04-24 12:19:03,811 - INFO - Created directory: features
2025-04-24 12:19:03,814 - INFO - Created directory: results
2025-04-24 12:19:03,816 - INFO - Created directory: submissions
2025-04-24 12:19:03,819 - INFO - Created directory: logs
2025-04-24 12:19:03,821 - INFO - Created directory: plots
2025-04-24 12:19:03,824 - INFO - Created directory: optuna_trials
2025-04-24 12:19:03,827 - INFO - Created directory: scalers
2025-04-24 12:19:03,830 - INFO - --- Starting Complete Pipeline Run --- Timestamp: 20250424_121903 ---
2025-04-24 12:19:03,833 - INFO - Pipeline Config: Combined FE, Scaling=True, FeatSelect=True (Thresh=mean), CV Thresh=0.72, n_jobs=18 used for Sklearn, Const Cols Kept in Preproc
2025-04-24 12:19:03,835 - INFO - Logging detailed output to: logs/pipeline_run_20250424_121903.log
2025-04-24 12:19:03,837 - INFO - Loading data...


--- Starting Complete Pipeline Run (Combined FE) ---


2025-04-24 12:19:04,321 - INFO - Train shape: (1280, 317), Test shape: (854, 316)
2025-04-24 12:19:04,327 - INFO - Found 39 unique states and 5 unique feature_1 values across train/test.
2025-04-24 12:19:04,329 - INFO - Preprocessing training data (using combined logic)...
2025-04-24 12:19:04,336 - INFO - Starting preprocessing (Combined Logic). Is training: True
2025-04-24 12:19:04,343 - INFO - Target variable 'salary_category' found and label encoded.
2025-04-24 12:19:04,349 - INFO - Target distribution (Encoded): [501 419 360]
2025-04-24 12:19:04,358 - INFO - Saved label encoder and target mapping: features/target_mapping_20250424_121903.json
2025-04-24 12:19:04,361 - INFO - Initial cleaning: Numerical and Boolean Features...
2025-04-24 12:19:04,853 - INFO - Starting Feature Engineering (using logic from simpler model)...
2025-04-24 12:19:04,897 - INFO - Applying Target Encoding to 'job_title_grouped'...
2025-04-24 12:19:04,941 - INFO - Fit and saved TargetEncoder for job_title_grou


Pipeline execution failed.
Total time: 6227.75 sec (103.80 min).
