In [7]:
!pip install pandas numpy matplotlib seaborn scikit-learn category_encoders xgboost catboost optuna joblib tensorflow scikeras logger lightbgm




ERROR: Could not find a version that satisfies the requirement lightbgm (from versions: none)
ERROR: No matching distribution found for lightbgm


In [2]:
# Fix for wmic error in Windows
import os

# os.cpu_count() returns None when the CPU count cannot be determined; fall back
# to 1 so the variable always holds an integer string instead of "None".
os.environ["LOKY_MAX_CPU_COUNT"] = str(os.cpu_count() or 1)
print(f"Setting max CPU count to: {os.environ['LOKY_MAX_CPU_COUNT']}")

# For older joblib versions, you might also need:
os.environ["JOBLIB_TEMP_FOLDER"] = os.path.join(os.path.expanduser("~"), "temp_joblib")
# exist_ok=True makes the cell safe to re-run and avoids the exists()/makedirs() race.
os.makedirs(os.environ["JOBLIB_TEMP_FOLDER"], exist_ok=True)

Setting max CPU count to: 20


In [8]:
pip install lightgbm

Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Using cached lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier,
    AdaBoostClassifier, VotingClassifier, StackingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
# SVC import removed
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA, TruncatedSVD
from category_encoders import TargetEncoder, CatBoostEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb # Added LightGBM
import optuna
import warnings
import joblib
import os
import time
import json
from datetime import datetime
import shutil
import logging
import subprocess
import math

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Logging: timestamped INFO-level records, emitted through a module-level logger.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Fix the global TensorFlow and NumPy seeds.
tf.random.set_seed(42)
np.random.seed(42)

In [13]:
def create_directory_structure():
    """Create the working directories used by the notebook, relative to the CWD.

    Directories that already exist are left untouched, so the function is safe
    to call repeatedly. A log line is emitted only for directories that were
    actually created.

    Raises:
        Exception: any error raised while creating a directory (for example an
            ``OSError`` when a regular file already occupies the path) is
            logged and re-raised.

    NOTE(review): this function is defined again in a later cell of the
    notebook; after "Run All" the later definition is the one in effect.
    """
    directories = ['models', 'features', 'results', 'submissions', 'logs', 'plots', 'optuna_trials', 'scalers']
    for directory in directories:
        try:
            # Remember whether the directory was already there so that the
            # "Created" message is only logged for new directories.
            already_present = os.path.isdir(directory)
            # exist_ok=True removes the exists()/makedirs() race and makes a
            # file squatting on the path fail here instead of going unnoticed.
            os.makedirs(directory, exist_ok=True)
            if not already_present:
                logger.info(f"Created directory: {directory}")
        except Exception as e:
            logger.error(f"Error creating directory {directory}: {e}")
            raise

In [14]:
def get_timestamp():
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``.

    NOTE(review): this function is defined again in a later cell of the
    notebook; the later definition shadows this one.
    """
    current_moment = datetime.now()
    return format(current_moment, "%Y%m%d_%H%M%S")

In [15]:
def save_feature_importance(model, feature_names, timestamp, model_name):
    """Save a bar plot and a CSV of a model's feature importances.

    Importances are looked up in this order:

    1. ``model.feature_importances_``;
    2. the absolute value of ``model.coef_`` (averaged over axis 0 when 2-D);
    3. ``model.estimator_.feature_importances_``;
    4. the mean of ``feature_importances_`` over ``model.estimators_``.

    ``KerasClassifier`` models are always skipped. ``VotingClassifier``,
    ``StackingClassifier`` and ``MLPClassifier`` models that expose neither
    ``feature_importances_`` nor ``coef_`` are skipped as well. Every skip is
    logged and the function returns without writing anything.

    Args:
        model: Fitted estimator to inspect.
        feature_names: Sequence of feature names (list, tuple, pandas Index or
            NumPy array); must have one entry per importance value.
        timestamp: String embedded in the output file names.
        model_name: Label used in log messages, the plot title and file names.

    Returns:
        None. On success writes
        ``plots/{model_name}_feature_importance_{timestamp}.png`` (top 30
        features) and ``results/{model_name}_feature_importance_{timestamp}.csv``
        (all features). Plotting/saving failures are logged, not raised.

    NOTE(review): this function is defined again in a later cell of the
    notebook; the later definition shadows this one.
    """
    # ``not feature_names`` raises ValueError for a pandas Index / NumPy array,
    # so normalise to a plain list before testing for emptiness.
    feature_names = list(feature_names) if feature_names is not None else []
    if not feature_names:
        logger.warning(f"No feature names for {model_name}. Skip importance.")
        return

    importances = None
    importance_type = None

    if isinstance(model, KerasClassifier):
        # ``model_`` is only checked to tell fitted from unfitted wrappers;
        # no importance is produced for Keras models in either case.
        if not hasattr(model, 'model_'):
            logger.warning(f"Keras model {model_name} not fitted. Skip importance.")
        else:
            logger.info(f"Importance plot not directly available for Keras model {model_name}.")
        return
    elif hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        importance_type = 'Importance'
    elif hasattr(model, 'coef_'):
        if model.coef_.ndim > 1:
            # One row per class/output: average the magnitudes across rows.
            importances = np.abs(model.coef_).mean(axis=0)
        else:
            importances = np.abs(model.coef_)
        importance_type = 'Coefficient Magnitude'
    else:
        if isinstance(model, (VotingClassifier, StackingClassifier)):
            logger.info(f"Importance plot not generated for ensemble {model_name}.")
            return
        elif isinstance(model, MLPClassifier):
            logger.info(f"Importance plot not directly available for MLPClassifier {model_name}.")
            return
        elif hasattr(model, 'estimator_') and hasattr(model.estimator_, 'feature_importances_'):
            logger.info(f"Using importance from base estimator of {model_name}.")
            importances = model.estimator_.feature_importances_
            importance_type = 'Base Estimator Importance'
        elif hasattr(model, 'estimators_') and model.estimators_:
            try:
                all_importances = [est.feature_importances_ for est in model.estimators_ if hasattr(est, 'feature_importances_')]
                if all_importances:
                    importances = np.mean(all_importances, axis=0)
                    importance_type = 'Mean Base Importance'
                    logger.info(f"Averaged importance from base estimators for {model_name}.")
                else:
                    logger.info(f"No base estimators with importance found for {model_name}.")
                    return
            except Exception as avg_imp_e:
                logger.warning(f"Could not average base importances for {model_name}: {avg_imp_e}.")
                return
        else:
            logger.info(f"Model {model_name} ({model.__class__.__name__}) lacks importance attributes.")
            return

    if importances is None:
        logger.warning(f"Could not retrieve importances for {model_name}.")
        return
    # Guarantee an ndarray so ``.ndim`` / ``.mean`` below work even when an
    # estimator returns a plain sequence.
    importances = np.asarray(importances)
    if importances.ndim > 1:
        logger.warning(f"Importances shape {importances.shape} for {model_name}. Mean over axis 0.")
        importances = importances.mean(axis=0)
    if len(importances) != len(feature_names):
        logger.warning(f"Importance len ({len(importances)}) vs names ({len(feature_names)}) mismatch for {model_name}.")
        return

    fig = None
    try:
        importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values('Importance', ascending=False)
        fig = plt.figure(figsize=(12, 8))
        top_n = min(30, len(importance_df))
        sns.barplot(x='Importance', y='Feature', data=importance_df.head(top_n), palette='viridis')
        plt.title(f'Top {top_n} Feat Importances - {model_name}')
        plt.xlabel(f'Relative {importance_type}')
        plt.tight_layout()
        plot_filename = f'plots/{model_name}_feature_importance_{timestamp}.png'
        plt.savefig(plot_filename)
        logger.info(f"Saved importance plot: {plot_filename}")
        csv_filename = f'results/{model_name}_feature_importance_{timestamp}.csv'
        importance_df.to_csv(csv_filename, index=False)
        logger.info(f"Saved importance csv: {csv_filename}")
    except Exception as e:
        logger.warning(f"Could not save importance plot/CSV {model_name}: {e}", exc_info=True)
    finally:
        # Close the figure even when plotting or saving failed, so repeated
        # calls do not accumulate open figures.
        if fig is not None:
            plt.close(fig)

In [16]:
def build_keras_model(n_features, n_classes, optimizer='adam', learning_rate=0.001,
                      hidden_units=[128, 64], dropout_rate=0.3, activation='relu'):
    """Build and compile a feed-forward softmax classifier.

    The network is an input of width ``n_features``, then one
    Dense -> BatchNormalization -> Activation -> Dropout group per entry of
    ``hidden_units``, then an ``n_classes``-unit softmax layer. It is compiled
    with categorical cross-entropy loss and an accuracy metric.

    Args:
        n_features: Width of the input layer.
        n_classes: Number of units in the softmax output layer.
        optimizer: ``'adam'`` or ``'sgd'`` (case-insensitive). Any other name
            logs a warning and falls back to Adam. SGD uses momentum 0.9.
        learning_rate: Learning rate passed to the optimizer.
        hidden_units: Iterable of hidden-layer widths (only iterated, never
            mutated, so the list default is not shared state).
        dropout_rate: Rate passed to each Dropout layer.
        activation: Activation applied after each BatchNormalization layer.

    Returns:
        The compiled ``keras.Sequential`` model.

    NOTE(review): this function is defined again in a later cell of the
    notebook; the later definition shadows this one.
    """
    net = keras.Sequential(name="keras_mlp_tabular")
    net.add(layers.Input(shape=(n_features,)))

    for width in hidden_units:
        hidden_group = (
            layers.Dense(width),
            layers.BatchNormalization(),
            layers.Activation(activation),
            layers.Dropout(dropout_rate),
        )
        for layer in hidden_group:
            net.add(layer)

    net.add(layers.Dense(n_classes, activation='softmax'))

    choice = optimizer.lower()
    if choice == 'sgd':
        opt = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
    else:
        if choice != 'adam':
            logger.warning(f"Unsupported optimizer '{optimizer}'. Defaulting to Adam.")
        opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    net.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return net

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier,
    AdaBoostClassifier, VotingClassifier, StackingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
# SVC is removed
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA, TruncatedSVD
from category_encoders import TargetEncoder, CatBoostEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb # Added LightGBM
import optuna
import warnings
import joblib
import os
import time
import json
from datetime import datetime
import shutil
import logging
import subprocess
import math

# Logging: timestamped INFO-level records, emitted through a module-level logger.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Fix the global TensorFlow and NumPy seeds.
tf.random.set_seed(42)
np.random.seed(42)

# --- Utility Functions ---
def create_directory_structure():
    """Create the working directories used by the notebook, relative to the CWD.

    Directories that already exist are left untouched, so the function is safe
    to call repeatedly. A log line is emitted only for directories that were
    actually created.

    Raises:
        Exception: any error raised while creating a directory (for example an
            ``OSError`` when a regular file already occupies the path) is
            logged and re-raised.

    NOTE(review): a function with the same name is also defined in an earlier
    cell of the notebook; this definition shadows it.
    """
    directories = ['models', 'features', 'results', 'submissions', 'logs', 'plots', 'optuna_trials', 'scalers']
    for directory in directories:
        try:
            # Remember whether the directory was already there so that the
            # "Created" message is only logged for new directories.
            already_present = os.path.isdir(directory)
            # exist_ok=True removes the exists()/makedirs() race and makes a
            # file squatting on the path fail here instead of going unnoticed.
            os.makedirs(directory, exist_ok=True)
            if not already_present:
                logger.info(f"Created directory: {directory}")
        except Exception as e:
            logger.error(f"Error creating directory {directory}: {e}")
            raise

def get_timestamp():
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``.

    NOTE(review): a function with the same name is also defined in an earlier
    cell of the notebook; this definition shadows it.
    """
    current_moment = datetime.now()
    return format(current_moment, "%Y%m%d_%H%M%S")

def save_feature_importance(model, feature_names, timestamp, model_name):
    """Save a bar plot and a CSV of a model's feature importances.

    Importances are looked up in this order:

    1. ``model.feature_importances_``;
    2. the absolute value of ``model.coef_`` (averaged over axis 0 when 2-D);
    3. ``model.estimator_.feature_importances_``;
    4. the mean of ``feature_importances_`` over ``model.estimators_``.

    ``KerasClassifier`` models are always skipped. ``VotingClassifier``,
    ``StackingClassifier`` and ``MLPClassifier`` models that expose neither
    ``feature_importances_`` nor ``coef_`` are skipped as well. Every skip is
    logged and the function returns without writing anything.

    Args:
        model: Fitted estimator to inspect.
        feature_names: Sequence of feature names (list, tuple, pandas Index or
            NumPy array); must have one entry per importance value.
        timestamp: String embedded in the output file names.
        model_name: Label used in log messages, the plot title and file names.

    Returns:
        None. On success writes
        ``plots/{model_name}_feature_importance_{timestamp}.png`` (top 30
        features) and ``results/{model_name}_feature_importance_{timestamp}.csv``
        (all features). Plotting/saving failures are logged, not raised.

    NOTE(review): a function with the same name is also defined in an earlier
    cell of the notebook; this definition shadows it.
    """
    # ``not feature_names`` raises ValueError for a pandas Index / NumPy array,
    # so normalise to a plain list before testing for emptiness.
    feature_names = list(feature_names) if feature_names is not None else []
    if not feature_names:
        logger.warning(f"No feature names for {model_name}. Skip importance.")
        return

    importances = None
    importance_type = None

    if isinstance(model, KerasClassifier):
        # ``model_`` is only checked to tell fitted from unfitted wrappers;
        # no importance is produced for Keras models in either case.
        if not hasattr(model, 'model_'):
            logger.warning(f"Keras model {model_name} not fitted. Skip importance.")
        else:
            logger.info(f"Importance plot not directly available for Keras model {model_name}.")
        return
    elif hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        importance_type = 'Importance'
    elif hasattr(model, 'coef_'):
        if model.coef_.ndim > 1:
            # One row per class/output: average the magnitudes across rows.
            importances = np.abs(model.coef_).mean(axis=0)
        else:
            importances = np.abs(model.coef_)
        importance_type = 'Coefficient Magnitude'
    else:
        if isinstance(model, (VotingClassifier, StackingClassifier)):
            logger.info(f"Importance plot not generated for ensemble {model_name}.")
            return
        elif isinstance(model, MLPClassifier):
            logger.info(f"Importance plot not directly available for MLPClassifier {model_name}.")
            return
        elif hasattr(model, 'estimator_') and hasattr(model.estimator_, 'feature_importances_'):
            logger.info(f"Using importance from base estimator of {model_name}.")
            importances = model.estimator_.feature_importances_
            importance_type = 'Base Estimator Importance'
        elif hasattr(model, 'estimators_') and model.estimators_:
            try:
                all_importances = [est.feature_importances_ for est in model.estimators_ if hasattr(est, 'feature_importances_')]
                if all_importances:
                    importances = np.mean(all_importances, axis=0)
                    importance_type = 'Mean Base Importance'
                    logger.info(f"Averaged importance from base estimators for {model_name}.")
                else:
                    logger.info(f"No base estimators with importance found for {model_name}.")
                    return
            except Exception as avg_imp_e:
                logger.warning(f"Could not average base importances for {model_name}: {avg_imp_e}.")
                return
        else:
            logger.info(f"Model {model_name} ({model.__class__.__name__}) lacks importance attributes.")
            return

    if importances is None:
        logger.warning(f"Could not retrieve importances for {model_name}.")
        return
    # Guarantee an ndarray so ``.ndim`` / ``.mean`` below work even when an
    # estimator returns a plain sequence.
    importances = np.asarray(importances)
    if importances.ndim > 1:
        logger.warning(f"Importances shape {importances.shape} for {model_name}. Mean over axis 0.")
        importances = importances.mean(axis=0)
    if len(importances) != len(feature_names):
        logger.warning(f"Importance len ({len(importances)}) vs names ({len(feature_names)}) mismatch for {model_name}.")
        return

    fig = None
    try:
        importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values('Importance', ascending=False)
        fig = plt.figure(figsize=(12, 8))
        top_n = min(30, len(importance_df))
        sns.barplot(x='Importance', y='Feature', data=importance_df.head(top_n), palette='viridis')
        plt.title(f'Top {top_n} Feat Importances - {model_name}')
        plt.xlabel(f'Relative {importance_type}')
        plt.tight_layout()
        plot_filename = f'plots/{model_name}_feature_importance_{timestamp}.png'
        plt.savefig(plot_filename)
        logger.info(f"Saved importance plot: {plot_filename}")
        csv_filename = f'results/{model_name}_feature_importance_{timestamp}.csv'
        importance_df.to_csv(csv_filename, index=False)
        logger.info(f"Saved importance csv: {csv_filename}")
    except Exception as e:
        logger.warning(f"Could not save importance plot/CSV {model_name}: {e}", exc_info=True)
    finally:
        # Close the figure even when plotting or saving failed, so repeated
        # calls do not accumulate open figures.
        if fig is not None:
            plt.close(fig)

def build_keras_model(n_features, n_classes, optimizer='adam', learning_rate=0.001,
                      hidden_units=[128, 64], dropout_rate=0.3, activation='relu'):
    """Build and compile a feed-forward softmax classifier.

    The network is an input of width ``n_features``, then one
    Dense -> BatchNormalization -> Activation -> Dropout group per entry of
    ``hidden_units``, then an ``n_classes``-unit softmax layer. It is compiled
    with categorical cross-entropy loss and an accuracy metric.

    Args:
        n_features: Width of the input layer.
        n_classes: Number of units in the softmax output layer.
        optimizer: ``'adam'`` or ``'sgd'`` (case-insensitive). Any other name
            logs a warning and falls back to Adam. SGD uses momentum 0.9.
        learning_rate: Learning rate passed to the optimizer.
        hidden_units: Iterable of hidden-layer widths (only iterated, never
            mutated, so the list default is not shared state).
        dropout_rate: Rate passed to each Dropout layer.
        activation: Activation applied after each BatchNormalization layer.

    Returns:
        The compiled ``keras.Sequential`` model.

    NOTE(review): a function with the same name is also defined in an earlier
    cell of the notebook; this definition shadows it.
    """
    net = keras.Sequential(name="keras_mlp_tabular")
    net.add(layers.Input(shape=(n_features,)))

    for width in hidden_units:
        hidden_group = (
            layers.Dense(width),
            layers.BatchNormalization(),
            layers.Activation(activation),
            layers.Dropout(dropout_rate),
        )
        for layer in hidden_group:
            net.add(layer)

    net.add(layers.Dense(n_classes, activation='softmax'))

    choice = optimizer.lower()
    if choice == 'sgd':
        opt = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
    else:
        if choice != 'adam':
            logger.warning(f"Unsupported optimizer '{optimizer}'. Defaulting to Adam.")
        opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    net.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return net

# --- MODIFIED preprocess_data Function ---
def preprocess_data(df, all_states, all_feature1, timestamp, is_training=True, feature_columns_to_use=None):
    """Preprocesses data incorporating FE from simpler model and robust categorical handling."""
    logger.info(f"Starting preprocessing (Combined Logic). Is training: {is_training}")
    start_time = time.time()
    data = df.copy()
    y = None
    le = None
    target_column = 'salary_category'

    # 1. Handle Target Variable (Training Only)
    if target_column in data.columns and is_training:
        target = data[target_column] # Keep target for TargetEncoding later
        le = LabelEncoder()
        y = le.fit_transform(target)
        logger.info(f"Target variable '{target_column}' found and label encoded.")
        logger.info(f"Target distribution (Encoded): {np.bincount(y)}")
        joblib.dump(le, f'features/label_encoder_{timestamp}.joblib')
        mapping = {int(v): k for k, v in zip(le.classes_, le.transform(le.classes_))}
        mapping_file = f'features/target_mapping_{timestamp}.json'
        with open(mapping_file, 'w') as f:
            json.dump(mapping, f, indent=4)
        logger.info(f"Saved label encoder and target mapping: {mapping_file}")
    elif not is_training:
        # Load encoder
        try:
            encoder_files = sorted([f for f in os.listdir('features') if f.startswith('label_encoder_')])
            if encoder_files:
                latest_encoder_file = encoder_files[-1]
                le = joblib.load(f'features/{latest_encoder_file}')
                logger.info(f"Loaded latest label encoder: {latest_encoder_file}")
            else:
                logger.warning("No label encoder file found for test data!")
                le = None
        except Exception as e:
            logger.error(f"Failed to load label encoder: {e}")
            le = None
    elif is_training: # is_training is True but target column is missing
        logger.error(f"Target column '{target_column}' missing in training data!")
        raise ValueError(f"Target column '{target_column}' not found in training data.")

    # 2. Define Feature Groups
    boolean_features_potential = ['feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_10', 'feature_11']
    boolean_features = [f for f in boolean_features_potential if f in data.columns]
    numerical_features = [f for f in ['feature_2', 'feature_9', 'feature_12'] if f in data.columns] # Base numerical
    job_desc_cols = [col for col in data.columns if col.startswith('job_desc_')]
    all_numerical_features = numerical_features + job_desc_cols

    # 3. Initial Cleaning (Numerical & Boolean)
    logger.info("Initial cleaning: Numerical and Boolean Features...")
    for col in all_numerical_features:
        if col in data.columns:
            if data[col].dtype == 'object':
                data[col] = data[col].replace(['', ' ', 'NA', 'None', 'NULL'], np.nan)
            data[col] = pd.to_numeric(data[col], errors='coerce')
            median_val = data[col].median()
            fill_value = median_val if not pd.isna(median_val) else 0
            data[col] = data[col].fillna(fill_value)

    for col in boolean_features:
        if col in data.columns:
            numeric_view = pd.to_numeric(data[col], errors='coerce')
            is_boolean_like = numeric_view.dropna().isin([0, 1]).all()
            if is_boolean_like:
                data[col] = numeric_view.fillna(0).astype(int)
            else:
                num_non_bool = numeric_view.dropna().loc[~numeric_view.dropna().isin([0, 1])].count()
                logger.warning(f"Column '{col}' contains non-0/1 values ({num_non_bool} instances). Treating as numerical and imputing with median.")
                median_val = numeric_view.median()
                fill_value = median_val if not pd.isna(median_val) else 0
                data[col] = numeric_view.fillna(fill_value)

    logger.info("Starting Feature Engineering (using logic from simpler model)...")
    engineered_feature_names = [] # Track engineered features

    # --- Feature Engineering Steps ---
    if 'job_title' in data.columns:
        data['job_title'] = data['job_title'].fillna('Unknown')
        title_flags = ['is_senior', 'is_junior', 'is_developer', 'is_specialist']
        data['is_senior'] = data['job_title'].str.lower().str.contains('senior|sr|lead|principal').fillna(False).astype(int)
        data['is_junior'] = data['job_title'].str.lower().str.contains('junior|jr|associate|entry').fillna(False).astype(int)
        data['is_developer'] = data['job_title'].str.lower().str.contains('develop|programmer|coder|engineer').fillna(False).astype(int)
        data['is_specialist'] = data['job_title'].str.lower().str.contains('special|expert|consult').fillna(False).astype(int)
        engineered_feature_names.extend(title_flags)
        title_counts = data['job_title'].value_counts()
        rare_titles = title_counts[title_counts < 10].index
        data['job_title_grouped'] = data['job_title'].apply(lambda x: 'Other_Title' if x in rare_titles else x)
        title_encoder_col = 'job_title_grouped'
        target_encoded_title = 'job_title_encoded'
        engineered_feature_names.append(target_encoded_title)
        if is_training:
            logger.info(f"Applying Target Encoding to '{title_encoder_col}'...")
            job_encoder = TargetEncoder(cols=[title_encoder_col], handle_missing='value', handle_unknown='value')
            data[target_encoded_title] = job_encoder.fit_transform(data[[title_encoder_col]], y)
            joblib.dump(job_encoder, f'features/job_title_encoder_{timestamp}.joblib')
            logger.info(f"Fit and saved TargetEncoder for {title_encoder_col}")
        else:
            encoder_path = f'features/job_title_encoder_{timestamp}.joblib'
            encoder_files_fallback = sorted([f for f in os.listdir('features') if f.startswith('job_title_encoder_')])
            loaded_encoder = False
            if os.path.exists(encoder_path):
                try:
                    job_encoder = joblib.load(encoder_path)
                    data[target_encoded_title] = job_encoder.transform(data[[title_encoder_col]])
                    logger.info(f"Loaded and applied TargetEncoder: {encoder_path}")
                    loaded_encoder = True
                except Exception as e:
                    logger.error(f"Failed to load/apply specific TargetEncoder '{encoder_path}': {e}. Trying fallback.")
            if not loaded_encoder and encoder_files_fallback:
                 latest_encoder_file = encoder_files_fallback[-1]
                 try:
                     job_encoder = joblib.load(f'features/{latest_encoder_file}')
                     data[target_encoded_title] = job_encoder.transform(data[[title_encoder_col]])
                     logger.info(f"Loaded and applied fallback TargetEncoder: {latest_encoder_file}")
                     loaded_encoder = True
                 except Exception as e_fb:
                      logger.error(f"Fallback TargetEncoder failed: {e_fb}. Filling with 0.5")
                      data[target_encoded_title] = 0.5
            if not loaded_encoder:
                 logger.error("No TargetEncoder file found ('features/job_title_encoder_*.joblib'). Filling with 0.5")
                 data[target_encoded_title] = 0.5
        data = data.drop(['job_title', 'job_title_grouped'], axis=1, errors='ignore')
        logger.info("Processed 'job_title' (flags, grouping, target encoding).")

    if 'job_posted_date' in data.columns:
        data['job_posted_date'] = data['job_posted_date'].fillna('2000/01')
        def extract_year(date_str):
            try: return int(str(date_str)[:4])
            except: return 2000
        def extract_month(date_str):
            try: return int(str(date_str).split('/')[1])
            except: return 1
        data['job_posted_year'] = data['job_posted_date'].apply(extract_year)
        data['job_posted_month'] = data['job_posted_date'].apply(extract_month)
        data['job_posted_month'] = data['job_posted_month'].clip(1, 12)
        date_features = ['month_sin', 'month_cos', 'job_recency', 'job_posted_year_norm']
        data['month_sin'] = np.sin(2 * np.pi * data['job_posted_month'] / 12)
        data['month_cos'] = np.cos(2 * np.pi * data['job_posted_month'] / 12)
        data['job_recency'] = data['job_posted_year'] * 12 + data['job_posted_month']
        mean_year = 2022
        data['job_posted_year_norm'] = data['job_posted_year'] - mean_year
        engineered_feature_names.extend(date_features)
        data = data.drop(['job_posted_date', 'job_posted_year', 'job_posted_month'], axis=1, errors='ignore')
        logger.info("Processed 'job_posted_date' (cyclical, recency, norm year).")

    num_transform_features = []
    if 'feature_9' in data.columns:
        try: data['feature_9_bin'] = pd.qcut(data['feature_9'].rank(method='first'), q=5, labels=[0, 1, 2, 3, 4]).astype(int)
        except ValueError:
            logger.warning("qcut failed for feature_9, using pd.cut fallback.")
            try: data['feature_9_bin'] = pd.cut(data['feature_9'], bins=5, labels=[0, 1, 2, 3, 4], include_lowest=True).astype(int)
            except Exception as e_cut: logger.error(f"pd.cut also failed for feature_9: {e_cut}. Setting bin to 0."); data['feature_9_bin'] = 0
        num_transform_features.append('feature_9_bin')
        logger.info("Added binned feature for feature_9.")
        if 'feature_2' in data.columns:
            interaction_name = 'feature_2_9_interaction'
            data[interaction_name] = data['feature_2'] * data['feature_9']
            num_transform_features.append(interaction_name)
            logger.info(f"Added interaction: {interaction_name}")

    if 'feature_2' in data.columns:
        data['feature_2_squared'] = data['feature_2'] ** 2
        data['feature_2_sqrt'] = np.sqrt(np.abs(data['feature_2']))
        num_transform_features.extend(['feature_2_squared', 'feature_2_sqrt'])
        try: data['feature_2_bin'] = pd.qcut(data['feature_2'].rank(method='first'), q=5, labels=[0, 1, 2, 3, 4]).astype(int)
        except ValueError:
            logger.warning("qcut failed for feature_2, using pd.cut fallback.")
            try: data['feature_2_bin'] = pd.cut(data['feature_2'], bins=5, labels=[0, 1, 2, 3, 4], include_lowest=True).astype(int)
            except Exception as e_cut: logger.error(f"pd.cut also failed for feature_2: {e_cut}. Setting bin to 0."); data['feature_2_bin'] = 0
        num_transform_features.append('feature_2_bin')
        logger.info("Added squared, sqrt, and binned features for feature_2.")
    engineered_feature_names.extend(num_transform_features)

    bool_agg_features = []
    actual_boolean_cols = [col for col in boolean_features if col in data.columns]
    if actual_boolean_cols:
        data['boolean_sum'] = data[actual_boolean_cols].sum(axis=1)
        data['boolean_sum_squared'] = data['boolean_sum'] ** 2
        bool_agg_features.extend(['boolean_sum', 'boolean_sum_squared'])
        logger.info("Added boolean sum and squared sum features.")
    else:
        data['boolean_sum'] = 0
        data['boolean_sum_squared'] = 0
        logger.info("No boolean features found for aggregation.")
    engineered_feature_names.extend(bool_agg_features)

    if 'feature_10' in data.columns and 'feature_8' in data.columns:
        interaction_name = 'feature_10_8_interaction'
        data[interaction_name] = data['feature_10'] * data['feature_8']
        engineered_feature_names.append(interaction_name)
        logger.info(f"Added interaction: {interaction_name}")

    job_desc_eng_features = []
    if job_desc_cols:
        desc_agg = ['job_desc_mean', 'job_desc_std', 'job_desc_min', 'job_desc_max', 'job_desc_sum', 'job_desc_q25', 'job_desc_q75', 'job_desc_iqr']
        data['job_desc_mean'] = data[job_desc_cols].mean(axis=1)
        data['job_desc_std'] = data[job_desc_cols].std(axis=1).fillna(0)
        data['job_desc_min'] = data[job_desc_cols].min(axis=1)
        data['job_desc_max'] = data[job_desc_cols].max(axis=1)
        data['job_desc_sum'] = data[job_desc_cols].sum(axis=1)
        data['job_desc_q25'] = data[job_desc_cols].quantile(0.25, axis=1)
        data['job_desc_q75'] = data[job_desc_cols].quantile(0.75, axis=1)
        data['job_desc_iqr'] = data['job_desc_q75'] - data['job_desc_q25']
        job_desc_eng_features.extend(desc_agg)
        n_pca_components = 15
        if len(job_desc_cols) > n_pca_components:
            logger.info(f"Applying PCA (n={n_pca_components}) to job description features...")
            pca_names = [f'job_desc_pca_{i}' for i in range(n_pca_components)]
            job_desc_eng_features.extend(pca_names)
            job_desc_pca_result = None # Initialize
            if is_training:
                pca = PCA(n_components=n_pca_components, random_state=42)
                job_desc_pca_result = pca.fit_transform(data[job_desc_cols])
                joblib.dump(pca, f'features/job_desc_pca_{timestamp}.joblib')
                logger.info("Fit and saved PCA model for job description.")
            else:
                pca_path = f'features/job_desc_pca_{timestamp}.joblib'
                pca_files_fallback = sorted([f for f in os.listdir('features') if f.startswith('job_desc_pca_')])
                pca_loaded = False
                pca = None # Define pca before try block
                if os.path.exists(pca_path):
                    try: pca = joblib.load(pca_path); pca_loaded=True; logger.info(f"Loaded specific PCA model: {pca_path}")
                    except Exception as e: logger.error(f"Failed load specific PCA: {e}. Try fallback.")
                if not pca_loaded and pca_files_fallback:
                     latest_pca_file = pca_files_fallback[-1]
                     try: pca = joblib.load(f'features/{latest_pca_file}'); pca_loaded=True; logger.info(f"Loaded fallback PCA model: {latest_pca_file}")
                     except Exception as e_fb: logger.error(f"Fallback PCA load failed: {e_fb}.")
                if pca_loaded and pca is not None: # Check if pca object exists
                     try: job_desc_pca_result = pca.transform(data[job_desc_cols])
                     except Exception as e_trans: logger.error(f"PCA transform failed: {e_trans}. Filling PCA features with 0.")
                # Fallback if loading/transform failed or no model found
                if job_desc_pca_result is None:
                    logger.error("PCA result not generated. Filling PCA features with 0.")
                    job_desc_pca_result = np.zeros((data.shape[0], n_pca_components))

            # Add PCA features to dataframe
            for i in range(min(n_pca_components, job_desc_pca_result.shape[1])):
                data[pca_names[i]] = job_desc_pca_result[:, i]
        else:
            logger.warning(f"Skipping PCA for job description: Not enough features ({len(job_desc_cols)}) for {n_pca_components} components.")
        data = data.drop(columns=job_desc_cols, errors='ignore')
        logger.info("Finished processing job description features (aggregates and PCA).")
    else:
        logger.info("No job description features found.")
    engineered_feature_names.extend(job_desc_eng_features)

    # --- Robust Categorical Handling ---
    if 'job_state' in data.columns: data['job_state'] = data['job_state'].fillna('Unknown')
    if 'feature_1' in data.columns: data['feature_1'] = data['feature_1'].fillna('Unknown')

    manual_ohe_features = []
    logger.info(f"Applying manual One-Hot Encoding for 'job_state' using {len(all_states)} total unique states.")
    if 'job_state' in data.columns:
        for state in all_states:
            col_name = f'state_{state}'
            data[col_name] = (data['job_state'] == state).astype(int)
            manual_ohe_features.append(col_name)
        data = data.drop('job_state', axis=1, errors='ignore')
    else: logger.warning("'job_state' column not found for manual OHE.")

    logger.info(f"Applying manual One-Hot Encoding for 'feature_1' using {len(all_feature1)} total unique values.")
    if 'feature_1' in data.columns:
        for feat in all_feature1:
            col_name = f'feat1_{feat}'
            data[col_name] = (data['feature_1'] == feat).astype(int)
            manual_ohe_features.append(col_name)
        data = data.drop('feature_1', axis=1, errors='ignore')
    else: logger.warning("'feature_1' column not found for manual OHE.")
    engineered_feature_names.extend(manual_ohe_features)
    # --- End FE ---

    # 5. Final Cleanup and Column Management
    logger.info("Final cleanup and column alignment...")
    columns_to_exclude = ['obs']
    if is_training and target_column in df.columns: columns_to_exclude.append(target_column)
    potential_feature_cols = [col for col in data.columns if col not in columns_to_exclude]

    inf_cols_handled = []
    nan_cols_handled = []
    for col in potential_feature_cols:
        if pd.api.types.is_numeric_dtype(data[col]):
            if np.isinf(data[col]).any(): inf_cols_handled.append(col); data[col] = data[col].replace([np.inf, -np.inf], np.nan)
            if data[col].isnull().any(): nan_cols_handled.append(col); data[col] = data[col].fillna(0)
    if inf_cols_handled: logger.warning(f"Replaced Inf values with NaN in: {inf_cols_handled}")
    final_nan_cols = list(set(nan_cols_handled) - set(inf_cols_handled))
    if final_nan_cols: logger.info(f"Filled NaN values with 0 in columns: {final_nan_cols}")
    for col in potential_feature_cols:
        if data[col].dtype == 'bool': data[col] = data[col].astype(int)

    if is_training:
        constant_cols_found = []
        for col in potential_feature_cols:
            nunique_val = data[col].nunique(dropna=False)
            if nunique_val <= 1:
                is_engineered = col in engineered_feature_names
                logger.warning(f"Column '{col}' identified as constant (nunique={nunique_val}) in training data. Engineered: {is_engineered}. Keeping column.")
                constant_cols_found.append(col)
            elif nunique_val <= 3 and col in engineered_feature_names:
                logger.info(f"Engineered column '{col}' has low cardinality (nunique={nunique_val}) in training data.")

        final_feature_columns = potential_feature_cols
        joblib.dump(final_feature_columns, f'features/feature_columns_{timestamp}.joblib')
        logger.info(f"Saved {len(final_feature_columns)} final feature column names (constant columns NOT dropped).")
        X = data[final_feature_columns]
        logger.info(f"Preprocessing train done. Shape: {X.shape}. Time: {time.time() - start_time:.2f}s")
        try: X.head().to_csv(f'features/processed_features_head_{timestamp}.csv', index=False)
        except Exception as e: logger.warning(f"Could not save head: {e}")
        return X, y, final_feature_columns, le
    else: # Test Data
        if feature_columns_to_use is None:
            try:
                col_files = sorted([f for f in os.listdir('features') if f.startswith('feature_columns_')])
                if col_files:
                    latest_col_file = col_files[-1]
                    feature_columns_to_use = joblib.load(f'features/{latest_col_file}')
                    logger.info(f"Loaded {len(feature_columns_to_use)} feature columns from: {latest_col_file}")
                else:
                    logger.error("CRITICAL: No feature_columns file found.")
                    raise FileNotFoundError("feature_columns_*.joblib missing.")
            except Exception as e:
                logger.error(f"Failed load feature columns: {e}.")
                raise

        X = pd.DataFrame(columns=feature_columns_to_use)
        missing_cols_in_test = []
        processed_test_cols = list(data.columns)
        extra_cols_in_test = list(set(processed_test_cols) - set(feature_columns_to_use) - set(columns_to_exclude))

        for col in feature_columns_to_use:
            if col in data.columns:
                X[col] = data[col]
            else:
                X[col] = 0 # Add missing column, fill with 0
                missing_cols_in_test.append(col)

        if missing_cols_in_test:
            logger.warning(f"Cols missing in test (filled 0): {missing_cols_in_test}")
        if extra_cols_in_test:
             logger.warning(f"Cols extra in test (dropped during align): {extra_cols_in_test}")

        X = X[feature_columns_to_use] # Reorder to match training
        logger.info(f"Preprocessing test done. Shape: {X.shape}. Time: {time.time() - start_time:.2f}s")
        return X, y, feature_columns_to_use, le # y is None

In [18]:
def optimize_model(X, y, timestamp, model_type, n_trials=30, n_jobs_optuna=1):
    """Tune one model family with Optuna, then refit the best configuration on all data.

    Every trial builds a model of ``model_type``, scores it with 5-fold stratified
    cross-validation (accuracy) and returns the mean fold score; a trial whose
    cross-validation fails scores 0.0. Trials are stored in
    ``optuna_trials/<model_type>_opt_<timestamp>.db`` and an existing study of the
    same name is resumed, so only the trials still missing are run.

    After the search, a text summary is written to ``optuna_trials/``, the best
    configuration is refit on the full ``X``/``y``, the fitted model is saved under
    ``models/`` and ``save_feature_importance`` is called for it.

    Args:
        X: Feature matrix. A NumPy array is wrapped in a DataFrame.
        y: Class labels (array, Series or any sequence). Presumably integer-encoded,
            since ``to_categorical`` is applied to them for ``keras_mlp``.
        timestamp: Run identifier embedded in the study name and output file names.
        model_type: One of 'xgboost', 'catboost', 'keras_mlp', 'mlp', 'randomforest',
            'extratrees', 'logistic', 'gradientboosting', 'adaboost', 'knn', 'lightgbm'.
        n_trials: Target number of completed trials for the study.
        n_jobs_optuna: ``n_jobs`` handed to the estimators that accept it. The Optuna
            study itself always runs with ``n_jobs=1``.

    Returns:
        ``(final_model, best_cv_score, best_params)``. ``(None, -1, {})`` when the
        search fails or produces no completed trial; ``(None, best_cv_score,
        best_params)`` when the search succeeded but the final build/fit did not.
    """
    import ast  # local import: only used to parse a stringified layer spec safely

    logger.info(f"Starting {model_type} optimization ({n_trials} trials)...")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    if not isinstance(y, (np.ndarray, pd.Series)): y = np.array(y)
    if isinstance(X, np.ndarray): X = pd.DataFrame(X)

    n_classes = len(np.unique(y))
    n_features = X.shape[1]
    # Keras is trained on one-hot targets; every other model uses the raw labels.
    y_keras = to_categorical(y, num_classes=n_classes) if model_type == 'keras_mlp' else y

    KERAS_EPOCHS = 150
    KERAS_PATIENCE = 20  # also reused as the early-stopping patience of the boosters
    OPTUNA_TIMEOUT_PER_MODEL = 3600
    if model_type in ['xgboost', 'catboost', 'randomforest', 'gradientboosting', 'keras_mlp', 'mlp', 'extratrees', 'lightgbm']:
        OPTUNA_TIMEOUT_PER_MODEL = 5400
    logger.info(f"Optuna timeout for {model_type}: {OPTUNA_TIMEOUT_PER_MODEL}s.")

    # Settings that are NOT searched. They are applied both to every trial model and
    # to the final model, so the model that gets saved is the configuration that was
    # actually scored (same seed, same early-stopping behaviour, same metric).
    fixed_params = {
        'xgboost': {'objective': 'multi:softmax', 'num_class': n_classes, 'eval_metric': 'mlogloss',
                    'random_state': 42, 'n_jobs': 1, 'booster': 'gbtree'},
        'catboost': {'loss_function': 'MultiClass', 'eval_metric': 'Accuracy', 'random_seed': 42,
                     'thread_count': -1, 'verbose': False},
        'mlp': {'learning_rate': 'adaptive', 'early_stopping': True,
                'n_iter_no_change': KERAS_PATIENCE + 10, 'validation_fraction': 0.15,
                'random_state': 42, 'warm_start': False},
        'randomforest': {'random_state': 42, 'n_jobs': n_jobs_optuna},
        'extratrees': {'random_state': 42, 'n_jobs': n_jobs_optuna},
        'logistic': {'solver': 'liblinear', 'class_weight': 'balanced', 'random_state': 42},
        'gradientboosting': {'random_state': 42},
        'adaboost': {'algorithm': 'SAMME', 'random_state': 42},
        'knn': {'n_jobs': n_jobs_optuna},
        'lightgbm': {'objective': 'multiclass', 'num_class': n_classes, 'metric': 'multi_logloss',
                     'random_state': 42, 'n_jobs': n_jobs_optuna},
    }
    fixed = fixed_params.get(model_type, {})

    def objective(trial):
        """Build one candidate model from the trial's suggestions and return its mean CV accuracy."""
        model = None
        fit_params = {}
        use_gpu = False
        is_keras = False
        # --- Model Definitions ---
        if model_type == 'xgboost':
            tree_method = trial.suggest_categorical('tree_method', ['hist', 'gpu_hist'])
            param = {
                **fixed,
                'n_estimators': trial.suggest_int('n_estimators', 200, 2000, step=100),
                'max_depth': trial.suggest_int('max_depth', 3, 15),
                'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 12),
                'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 15.0, log=True),
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 15.0, log=True),
                'tree_method': tree_method,
                # Early stopping is configured on the estimator: recent XGBoost releases
                # no longer accept `early_stopping_rounds` as a fit() argument.
                'early_stopping_rounds': KERAS_PATIENCE,
            }
            if tree_method == 'gpu_hist':
                param['gpu_id'] = 0
                use_gpu = True
                param.pop('n_jobs', None)
            model = XGBClassifier(**param)
            fit_params = {'verbose': False}
        elif model_type == 'catboost':
            task_type = trial.suggest_categorical('task_type', ['CPU', 'GPU'])
            param = {
                **fixed,
                'iterations': trial.suggest_int('iterations', 200, 2000, step=100),
                'depth': trial.suggest_int('depth', 4, 14),
                'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 20, log=True),
                'random_strength': trial.suggest_float('random_strength', 1e-3, 10.0, log=True),
                'border_count': trial.suggest_categorical('border_count', [32, 64, 128, 254]),
                'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
                'task_type': task_type,
            }
            if task_type == 'GPU':
                param['devices'] = '0'
                use_gpu = True
            model = CatBoostClassifier(**param)
            fit_params = {'early_stopping_rounds': KERAS_PATIENCE, 'verbose': False}
        elif model_type == 'keras_mlp':
            is_keras = True
            optimizer_name = trial.suggest_categorical('optimizer', ['adam'])
            lr = trial.suggest_float('learning_rate', 1e-4, 5e-3, log=True)
            dropout = trial.suggest_float('dropout_rate', 0.1, 0.6)
            activation = trial.suggest_categorical('activation', ['relu', 'tanh', 'swish'])
            n_layers = trial.suggest_int('n_layers', 2, 4)
            units_list = []
            last_units = n_features
            for i in range(n_layers):
                # Each layer is searched in a range derived from the previous layer's width.
                max_units = max(32, int(last_units / 1.5))
                min_units = max(16, int(last_units / 4))
                # Optuna raises ValueError when `log=True` is combined with `step != 1`,
                # so the log-scaled search is kept and the step is left at its default.
                units = trial.suggest_int(f'n_units_l{i}', min_units, max_units, log=True)
                units_list.append(units)
                last_units = units
            model = KerasClassifier(model=build_keras_model, n_features=n_features, n_classes=n_classes, optimizer=optimizer_name, learning_rate=lr, hidden_units=units_list, dropout_rate=dropout, activation=activation, epochs=KERAS_EPOCHS, batch_size=trial.suggest_categorical('batch_size', [64, 128, 256, 512]), verbose=0)
            keras_callbacks = [
                EarlyStopping(monitor='val_accuracy', patience=KERAS_PATIENCE, restore_best_weights=True, verbose=0),
                ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=KERAS_PATIENCE // 2, min_lr=1e-6, verbose=0),
            ]
            fit_params = {'callbacks': keras_callbacks, 'validation_split': 0.15}
        elif model_type == 'mlp':
            layer_choices = [(100,), (50, 50), (100, 50), (64, 32, 16), (128, 64), (256, 128)]
            # Named `layer_sizes` so the imported `tensorflow.keras.layers` is not shadowed.
            layer_sizes = trial.suggest_categorical('hidden_layer_sizes', layer_choices)
            param = {
                **fixed,
                'hidden_layer_sizes': layer_sizes,
                'activation': trial.suggest_categorical('activation', ['relu', 'tanh']),
                'solver': trial.suggest_categorical('solver', ['adam']),
                'alpha': trial.suggest_float('alpha', 1e-6, 1e-2, log=True),
                'learning_rate_init': trial.suggest_float('learning_rate_init', 1e-4, 1e-2, log=True),
                'max_iter': trial.suggest_int('max_iter', 400, 1200),
                'batch_size': trial.suggest_categorical('batch_size', [64, 128, 256]),
            }
            model = MLPClassifier(**param)
        elif model_type == 'randomforest':
            param = {
                **fixed,
                'n_estimators': trial.suggest_int('n_estimators', 100, 2000, step=100),
                'max_depth': trial.suggest_int('max_depth', 5, 40, step=5),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 25),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
                'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.6, 0.8]),
                'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
                'class_weight': trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None]),
            }
            model = RandomForestClassifier(**param)
        elif model_type == 'extratrees':
            param = {
                **fixed,
                'n_estimators': trial.suggest_int('n_estimators', 100, 2000, step=100),
                'max_depth': trial.suggest_int('max_depth', 5, 45, step=5),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 25),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
                'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.6, 0.8]),
                'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
                'class_weight': trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None]),
            }
            model = ExtraTreesClassifier(**param)
        elif model_type == 'logistic':
            param = {
                **fixed,
                'C': trial.suggest_float('C', 1e-4, 1e3, log=True),
                'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
                'max_iter': trial.suggest_int('max_iter', 100, 1000),
            }
            model = LogisticRegression(**param)
        elif model_type == 'gradientboosting':
            param = {
                **fixed,
                'n_estimators': trial.suggest_int('n_estimators', 100, 1500, step=100),
                'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
                'max_depth': trial.suggest_int('max_depth', 3, 12),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 25),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
            }
            model = GradientBoostingClassifier(**param)
        elif model_type == 'adaboost':
            base_depth = trial.suggest_int('base_estimator_max_depth', 1, 6)
            param_ada = {
                **fixed,
                'n_estimators': trial.suggest_int('n_estimators', 50, 800, step=50),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.5, log=True),
            }
            base_est = DecisionTreeClassifier(max_depth=base_depth, random_state=42)
            model = AdaBoostClassifier(estimator=base_est, **param_ada)
            trial.set_user_attr("base_estimator_max_depth", base_depth)
        elif model_type == 'knn':
            metric = trial.suggest_categorical('metric', ['minkowski', 'manhattan', 'chebyshev'])
            param = {
                **fixed,
                'n_neighbors': trial.suggest_int('n_neighbors', 3, 65, step=2),
                'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
                'metric': metric,
            }
            if metric == 'minkowski':
                param['p'] = trial.suggest_int('p', 1, 3)
            model = KNeighborsClassifier(**param)
        elif model_type == 'lightgbm':
            param = {
                **fixed,
                'n_estimators': trial.suggest_int('n_estimators', 200, 2000, step=100),
                'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
                'num_leaves': trial.suggest_int('num_leaves', 20, 150, step=5),
                'max_depth': trial.suggest_int('max_depth', 5, 16),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
                'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
            }
            model = lgb.LGBMClassifier(**param)
            fit_params = {'callbacks': [lgb.early_stopping(KERAS_PATIENCE, verbose=False)]}
        else:
            logger.error(f"Unsupported model type: {model_type}")
            raise ValueError(f"Unsupported: {model_type}")

        # --- Cross-validation ---
        scores = []
        is_dataframe = isinstance(X, pd.DataFrame)
        try:
            for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
                X_train = X.iloc[train_idx] if is_dataframe else X[train_idx]
                X_valid = X.iloc[valid_idx] if is_dataframe else X[valid_idx]
                y_train_fold = y_keras[train_idx] if is_keras else (y.iloc[train_idx] if isinstance(y, pd.Series) else y[train_idx])
                # Scoring always uses the original (non one-hot) labels.
                y_valid_fold_orig = y.iloc[valid_idx] if isinstance(y, pd.Series) else y[valid_idx]
                current_fit_params = fit_params.copy()
                try:
                    # The boosters get the validation fold as eval_set for early stopping.
                    if model_type in ('lightgbm', 'xgboost', 'catboost'):
                        current_fit_params['eval_set'] = [(X_valid, y_valid_fold_orig)]
                    if model_type == 'lightgbm':
                        current_fit_params['eval_metric'] = 'multi_logloss'
                    model.fit(X_train, y_train_fold, **current_fit_params)
                    y_pred = model.predict(X_valid)
                    if is_keras and y_pred.ndim > 1 and y_pred.shape[1] > 1:
                        y_pred = np.argmax(y_pred, axis=1)
                    score = accuracy_score(y_valid_fold_orig, y_pred)
                    scores.append(score)
                except ValueError as ve:
                    logger.warning(f"CV fold {fold+1} VAL ERROR {model_type} trial {trial.number}: {ve}")
                    return 0.0
                except Exception as e:
                    logger.warning(f"CV fold {fold+1} fail {model_type} trial {trial.number} (GPU:{use_gpu}, Keras:{is_keras}): {e}", exc_info=False)
                    # Discard partial results so the trial is reported as a failure below.
                    scores = []
                    break
        except Exception as outer_e:
            logger.error(f"Outer CV error {model_type} trial {trial.number} (GPU:{use_gpu}, Keras:{is_keras}): {outer_e}", exc_info=True)
            return 0.0
        if not scores:
            logger.error(f"Cross-validation failed completely for {model_type} trial {trial.number}")
            return 0.0
        mean_score = np.mean(scores)
        logger.debug(f"Trial {trial.number} ({model_type}) completed. Avg CV Score: {mean_score:.5f}")
        return mean_score

    # --- Run Optuna Study ---
    study_name = f"{model_type}_opt_{timestamp}"
    storage_name = f"sqlite:///optuna_trials/{study_name}.db"
    # NOTE: the objective never calls trial.report(), so the pruner has nothing to act on.
    study = optuna.create_study(direction='maximize', study_name=study_name, storage=storage_name, load_if_exists=True, pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
    completed_trials = len([t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE])
    trials_to_run = n_trials - completed_trials
    if trials_to_run > 0:
        logger.info(f"Setting Optuna timeout for {model_type} to {OPTUNA_TIMEOUT_PER_MODEL} seconds.")
        try:
            study.optimize(objective, n_trials=trials_to_run, timeout=OPTUNA_TIMEOUT_PER_MODEL, n_jobs=1)
        except Exception as opt_e:
            logger.error(f"Optuna optimize call failed for {model_type}: {opt_e}", exc_info=True)
            return None, -1, {}
    else:
        logger.info(f"Study '{study_name}' already has {completed_trials} completed trials. Skipping optimization run.")

    # --- Retrieve Results ---
    try:
        if not any(t.state == optuna.trial.TrialState.COMPLETE for t in study.trials):
            logger.error(f"Optuna study {model_type} no successful trials.")
            return None, -1, {}
        best_trial = study.best_trial
        best_params = best_trial.params
        best_cv_score = best_trial.value
    except ValueError:
        logger.error(f"Optuna study {model_type} no best trial.")
        return None, -1, {}
    except Exception as res_e:
        logger.error(f"Error get Optuna results {model_type}: {res_e}", exc_info=True)
        return None, -1, {}

    logger.info(f"Optimization complete for {model_type}.")
    logger.info(f"Best CV score: {best_cv_score:.5f}")
    logger.info(f"Best params: {best_params}")

    # --- Save Study Summary ---
    try:
        summary_file = f'optuna_trials/{model_type}_study_summary_{timestamp}.txt'
        with open(summary_file, 'w') as f:
            f.write(f"Optuna Summary: {model_type}\nTS: {timestamp}\nBest Trial: {best_trial.number}\nScore: {best_cv_score:.5f}\n\nParams:\n")
            params_json = best_params.copy()
            if model_type == 'adaboost' and "base_estimator_max_depth" in best_trial.user_attrs:
                params_json['base_estimator_max_depth'] = best_trial.user_attrs["base_estimator_max_depth"]
                params_json['algorithm'] = 'SAMME'
            if model_type == 'keras_mlp' or model_type == 'mlp':
                # Layer specs are stringified so json.dump writes them as one readable value.
                if 'hidden_layer_sizes' in params_json:
                    params_json['hidden_layer_sizes'] = str(params_json['hidden_layer_sizes'])
                if 'n_layers' in best_params:
                    units_list = [best_params.get(f'n_units_l{i}') for i in range(best_params['n_layers']) if best_params.get(f'n_units_l{i}')]
                    params_json['hidden_units_structure'] = str(units_list)
                params_json = {k: v for k, v in params_json.items() if not k.startswith('n_units_l')}
            json.dump(params_json, f, indent=4)
            logger.info(f"Saved Optuna summary: {summary_file}")
    except Exception as file_e:
        logger.warning(f"Could not save Optuna summary {model_type}: {file_e}")

    # --- Train final model ---
    final_model = None
    final_fit_params = {}  # Only populated for Keras

    try:
        logger.info(f"Instantiating final {model_type} model with best parameters...")
        # Searched values plus the fixed settings every trial was evaluated with.
        final_params = {**best_params, **fixed}

        if model_type == 'adaboost':
            best_d = best_trial.user_attrs.get('base_estimator_max_depth', 1)
            logger.info(f"Reconstruct AdaBoost DT(max_depth={best_d}) using SAMME")
            base_est_inst = DecisionTreeClassifier(max_depth=best_d, random_state=42)
            # The tree depth belongs to the base estimator, not to AdaBoost itself.
            final_p_ada = {k: v for k, v in final_params.items() if k != 'base_estimator_max_depth'}
            final_model = AdaBoostClassifier(estimator=base_est_inst, **final_p_ada)

        elif model_type == 'xgboost':
            # No early stopping here: the final fit has no held-out eval_set.
            final_model = XGBClassifier(**final_params)

        elif model_type == 'catboost':
            if final_params.get('task_type') == 'GPU':
                final_params['devices'] = '0'
            final_model = CatBoostClassifier(**final_params)

        elif model_type == 'mlp':
            final_model = MLPClassifier(**final_params)

        elif model_type == 'keras_mlp':
            keras_build_params = {
                'n_features': n_features, 'n_classes': n_classes,
                'optimizer': best_params.get('optimizer', 'adam'),
                'learning_rate': best_params.get('learning_rate', 0.001),
                'dropout_rate': best_params.get('dropout_rate', 0.3),
                'activation': best_params.get('activation', 'relu')
            }
            if 'n_layers' in best_params:
                hidden_units = []
                for i in range(best_params['n_layers']):
                    unit_val = best_params.get(f'n_units_l{i}')
                    if unit_val is not None: hidden_units.append(unit_val)
                keras_build_params['hidden_units'] = hidden_units if hidden_units else [64]
            else:
                hidden_units_final = best_params.get('hidden_layer_sizes', [128, 64])
                if isinstance(hidden_units_final, str):
                    # literal_eval only accepts Python literals, unlike eval().
                    try: hidden_units_final = ast.literal_eval(hidden_units_final)
                    except (ValueError, SyntaxError, TypeError): hidden_units_final = [128, 64]
                keras_build_params['hidden_units'] = list(hidden_units_final)

            final_model = KerasClassifier(
                model=build_keras_model,
                model__n_features=keras_build_params['n_features'],
                model__n_classes=keras_build_params['n_classes'],
                model__optimizer=keras_build_params['optimizer'],
                model__learning_rate=keras_build_params['learning_rate'],
                model__hidden_units=keras_build_params['hidden_units'],
                model__dropout_rate=keras_build_params['dropout_rate'],
                model__activation=keras_build_params['activation'],
                epochs=KERAS_EPOCHS,
                batch_size=best_params.get('batch_size', 128),
                verbose=0
            )
            final_callbacks = [
                EarlyStopping(monitor='val_accuracy', patience=KERAS_PATIENCE, restore_best_weights=True, verbose=0),
                ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=KERAS_PATIENCE // 2, min_lr=1e-6, verbose=0)
            ]
            final_fit_params = {'callbacks': final_callbacks, 'validation_split': 0.15}

        elif model_type == 'randomforest':
            final_model = RandomForestClassifier(**final_params)

        elif model_type == 'extratrees':
            final_model = ExtraTreesClassifier(**final_params)

        elif model_type == 'logistic':
            final_model = LogisticRegression(**final_params)

        elif model_type == 'gradientboosting':
            final_model = GradientBoostingClassifier(**final_params)

        elif model_type == 'knn':
            final_model = KNeighborsClassifier(**final_params)

        elif model_type == 'lightgbm':
            final_model = lgb.LGBMClassifier(**final_params)

        # --- Fit the final model ---
        if final_model is not None:
            logger.info(f"Fitting final {model_type} model...")
            start_fit_time = time.time()
            try:
                if model_type == 'keras_mlp':
                    logger.info("Using validation_split for Keras final fit...")
                    final_model.fit(X, y_keras, **final_fit_params)
                else:
                    final_model.fit(X, y)

                fit_duration = time.time() - start_fit_time
                logger.info(f"Final {model_type} fitted in {fit_duration:.2f}s.")

                # --- Save model and importance (AFTER fitting) ---
                model_path = f'models/{model_type}_{timestamp}.joblib'
                saved_path = model_path  # updated below when the Keras model is saved natively
                logger.info(f"Saving final {model_type} model...")
                try:
                    if isinstance(final_model, KerasClassifier):
                        tf_model_save_path = f'models/{model_type}_tfmodel_{timestamp}'
                        try:
                            final_model.model_.save(tf_model_save_path)
                            saved_path = tf_model_save_path
                        except Exception as keras_save_err:
                            logger.warning(f"Keras TF save failed ({keras_save_err}), attempt joblib wrapper...")
                            joblib.dump(final_model, model_path)
                    else:
                        joblib.dump(final_model, model_path)
                    logger.info(f"Saved final {model_type} model path: {saved_path}")
                except Exception as save_err:
                    # A failed save is logged but the fitted model is still returned.
                    logger.error(f"Failed save model {model_type}: {save_err}", exc_info=True)

                feat_names = list(X.columns) if isinstance(X, pd.DataFrame) else None
                if feat_names:
                    logger.info(f"Saving importance {model_type}...")
                    save_feature_importance(final_model, feat_names, timestamp, model_type)
                else:
                    logger.warning(f"No feat names for importance {model_type}.")

            except Exception as fit_save_e:
                logger.error(f"Error during final fit/save {model_type}: {fit_save_e}", exc_info=True)
                return None, best_cv_score, best_params
        else:
            logger.error(f"Could not instantiate final model {model_type}.")
            return None, best_cv_score, best_params

    except Exception as final_e:
        logger.error(f"Failed final instantiate/fit/save process {model_type}: {final_e}", exc_info=True)
        return None, best_cv_score, best_params

    return final_model, best_cv_score, best_params

In [19]:
def create_ensemble(qualified_models_with_scores, X_train_ensemble, y_train_ensemble, timestamp, n_jobs_ensemble=1):
    """Build a voting and a stacking ensemble from ``(name, model, cv_score)`` triples.

    Models are ranked by CV score. Keras wrappers with neither a saved TF model
    nor a joblib file under ``models/`` are left out. With at least two usable
    members, a soft-voting ensemble (falling back to hard voting) and a stacking
    ensemble with a logistic-regression meta-learner are fitted on the given
    training data and dumped to ``models/``; a text summary goes to ``results/``.

    Returns:
        ``(voting_clf, stacking_clf, best_individual_model)``. The two ensemble
        slots are ``None`` when the ensemble was skipped or failed; with fewer
        than two usable models only the best individual model (if any) is set.
    """
    logger.info("Attempting ensemble creation...")
    if not qualified_models_with_scores:
        logger.error("No qualified models.")
        return None, None, None

    # Highest CV score first.
    ranked = sorted(qualified_models_with_scores, key=lambda entry: entry[2], reverse=True)
    ranked_preview = [(entry[0], f'{entry[2]:.5f}') for entry in ranked]
    logger.info(f"Qualified models: {ranked_preview}")

    if len(ranked) < 2:
        logger.warning("<2 qualified models. Skip ensembles.")
        if len(ranked) != 1:
            return None, None, None
        only_name, only_model, only_score = ranked[0]
        logger.info(f"Return best individual: {only_name} ({only_score:.5f})")
        return None, None, only_model

    # Keep every model except Keras wrappers that have no saved artefact on disk.
    members = []
    excluded_keras = []
    for name, model, _score in ranked:
        joblib_file = f'models/{name}_{timestamp}.joblib'
        tf_dir = f'models/{name}_tfmodel_{timestamp}'
        unsaved_keras = isinstance(model, KerasClassifier) and not (
            os.path.exists(tf_dir) or os.path.exists(joblib_file)
        )
        if unsaved_keras:
            logger.warning(f"Keras model {name} save files missing. Exclude.")
            excluded_keras.append(name)
            continue
        members.append((name, model))

    if len(members) < 2:
        logger.warning("<2 models usable for ensemble. Skipping.")
        kept = [(n, m, s) for n, m, s in ranked if n not in excluded_keras]
        if kept:
            top_name, top_model, top_score = kept[0]
            logger.info(f"Return best non-excluded: {top_name} ({top_score:.5f})")
            return None, None, top_model
        if ranked:
            top_name, top_model, top_score = ranked[0]
            logger.info(f"Return original best: {top_name} ({top_score:.5f})")
            return None, None, top_model
        return None, None, None

    member_names = [name for name, _ in members]
    logger.info(f"Using {len(members)} models for ensemble: {member_names}")

    # Voting weights: each score shifted by the minimum (plus a small epsilon)
    # and normalised to sum to one.
    member_lookup = set(member_names)
    member_scores = [score for name, _, score in ranked if name in member_lookup]
    floor = min(member_scores) if member_scores else 0
    shifted = [score - floor + 1e-6 for score in member_scores]
    shifted_total = sum(shifted)
    if shifted_total > 0:
        weights = [value / shifted_total for value in shifted]
    else:
        weights = None

    def _all_have_proba():
        # True when every member exposes predict_proba.
        return all(hasattr(est, 'predict_proba') for _, est in members)

    # --- Voting: soft first, hard as the fallback ---
    voting = None
    soft_ok = _all_have_proba()

    if soft_ok:
        logger.info("Attempt Soft Voting...")
        weights_repr = list(np.round(weights, 3)) if weights else 'Uniform'
        logger.info(f"Weights:{weights_repr}")
        try:
            voting = VotingClassifier(estimators=members, voting='soft', weights=weights, n_jobs=n_jobs_ensemble)
            voting.fit(X_train_ensemble, y_train_ensemble)
            soft_file = f'models/voting_ensemble_soft_{timestamp}.joblib'
            joblib.dump(voting, soft_file)
            logger.info(f"Saved Soft Voting Ens: {soft_file}")
        except Exception as soft_err:
            logger.error(f"Fail Soft Voting: {soft_err}", exc_info=True)
            voting = None
            soft_ok = False

    if not soft_ok:
        logger.warning("Attempting Hard Voting...")
        try:
            voting = VotingClassifier(estimators=members, voting='hard', n_jobs=n_jobs_ensemble)
            voting.fit(X_train_ensemble, y_train_ensemble)
            hard_file = f'models/voting_ensemble_hard_{timestamp}.joblib'
            joblib.dump(voting, hard_file)
            logger.info(f"Saved Hard Voting Ens: {hard_file}")
        except Exception as hard_err:
            logger.error(f"Fail Hard Voting: {hard_err}", exc_info=True)
            voting = None

    # --- Stacking ---
    stacking = None
    meta = None
    if _all_have_proba():
        logger.info("Attempt Stacking...")
        try:
            meta = LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear', C=1.0, n_jobs=1)
            folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
            stacking = StackingClassifier(estimators=members, final_estimator=meta, cv=folds, stack_method='predict_proba', n_jobs=n_jobs_ensemble, passthrough=False)
            stacking.fit(X_train_ensemble, y_train_ensemble)
            stack_file = f'models/stacking_ensemble_{timestamp}.joblib'
            joblib.dump(stacking, stack_file)
            logger.info(f"Saved Stacking Ens: {stack_file}")
        except Exception as stack_err:
            logger.error(f"Fail Stacking: {stack_err}", exc_info=True)
            stacking = None
    else:
        logger.warning("Cannot create Stacking Ens.")

    # --- Best single model (top of the ranking) ---
    champion = None
    champion_name = "N/A"
    champion_score = -1
    if ranked:
        champion_name, champion, champion_score = ranked[0]
        logger.info(f"Best individual qualified: {champion_name} (CV:{champion_score:.5f})")

    # --- Text summary (best effort) ---
    try:
        summary_file = f'results/ensemble_creation_summary_{timestamp}.txt'
        with open(summary_file, 'w') as out:
            out.write("Ensemble Summary\n=============\nQualified Models:\n")
            for name, _, score in ranked:
                tag = '(Excl.)' if name in excluded_keras else '(Incl.)'
                out.write(f"- {name}: CV {score:.5f} {tag}\n")
            out.write(f"\nEnsembles ({len(members)} models):\n")
            if soft_ok and voting and voting.voting == 'soft':
                voting_kind = 'Soft'
            elif voting and voting.voting == 'hard':
                voting_kind = 'Hard'
            else:
                voting_kind = 'N/A'
            voting_status = 'Saved' if voting else 'Failed/Skipped'
            out.write(f"- Voting ({voting_kind}): {voting_status}.\n")
            meta_label = meta.__class__.__name__ if meta and stacking else 'N/A'
            stacking_status = 'Saved' if stacking else 'Failed/Skipped'
            out.write(f"- Stacking (Meta:{meta_label}): {stacking_status}.\n")
            if excluded_keras:
                out.write(f"\nKeras Excluded: {', '.join(excluded_keras)}\n")
            if champion:
                out.write(f"\nBest individual: {champion_name}\n")
        logger.info(f"Saved ensemble summary: {summary_file}")
    except Exception as summary_err:
        logger.warning(f"Could not save ens summary: {summary_err}")

    return voting, stacking, champion

In [20]:
def evaluate_model(model, X_eval, y_eval, model_name, timestamp, le):
    """Score a fitted classifier on a labelled set and persist the results.

    Writes a text summary (accuracy, classification report, confusion matrix)
    under ``results/`` and a confusion-matrix heatmap under ``plots/``.

    Args:
        model: Estimator exposing ``predict``. ``None`` skips the evaluation.
        X_eval: Features handed to ``model.predict``.
        y_eval: Encoded true labels; a ``pd.Series`` is reduced to its values.
        model_name: Name used in log messages and (sanitised) in file names.
        timestamp: Run identifier appended to the output file names.
        le: Label encoder used to map encoded labels back to class names.
            ``None`` skips the evaluation.

    Returns:
        tuple: ``(accuracy, report_dict)`` on success, ``(None, None)`` when
        the evaluation is skipped or any step fails (the error is logged).
    """
    if model is None:
        logger.warning(f"Skip eval {model_name}: model None.")
        return None, None

    if le is None:
        logger.warning(f"Skip eval {model_name}: LE None.")
        return None, None

    logger.info(f"Evaluating {model_name}...")
    if isinstance(y_eval, pd.Series):
        y_eval = y_eval.values

    is_keras_wrapper = isinstance(model, KerasClassifier)

    try:
        y_pred = model.predict(X_eval)
        # A Keras wrapper may hand back per-class scores; collapse them to class ids.
        if is_keras_wrapper and hasattr(model, 'predict_proba') and y_pred.ndim > 1 and y_pred.shape[1] > 1:
            y_pred = np.argmax(y_pred, axis=1)

        acc = accuracy_score(y_eval, y_pred)

        try:
            y_eval_lbls = le.inverse_transform(y_eval)
            y_pred_lbls = le.inverse_transform(y_pred)
            label_values = le.classes_
        except Exception as e_le:
            logger.warning(f"LE inverse fail {model_name}:{e_le}. Use numeric.")
            y_eval_lbls = y_eval
            y_pred_lbls = y_pred
            # Include classes that only show up in the predictions so the
            # report and the matrix cover every label actually present.
            label_values = sorted(np.unique(np.concatenate([np.ravel(y_eval), np.ravel(y_pred)])))

        # ``label_values`` keeps the dtype of the label arrays (needed for
        # ``labels=``); ``tg_names`` is the string form used for display only.
        tg_names = [str(v) for v in label_values]

        # Passing ``labels`` explicitly keeps ``target_names`` aligned even when
        # a class is absent from both y_true and y_pred.
        rpt_str = classification_report(y_eval_lbls, y_pred_lbls, labels=label_values, target_names=tg_names, zero_division=0)
        rpt_dict = classification_report(y_eval_lbls, y_pred_lbls, labels=label_values, target_names=tg_names, output_dict=True, zero_division=0)
        cm = confusion_matrix(y_eval_lbls, y_pred_lbls, labels=label_values)

        logger.info(f"{model_name} Eval Acc: {acc:.5f}")
        # Same path sanitising as make_test_predictions so both functions
        # accept the same model names.
        safe_name = str(model_name).replace("/", "_").replace("\\", "_").replace(":", "_").replace(" ", "_")
        eval_fname = f'results/{safe_name}_evaluation_{timestamp}.txt'
        with open(eval_fname, 'w') as f:
            f.write(f"Model Eval Summary\n=============\nModel:{model_name}\nTS:{timestamp}\nAcc:{acc:.5f}\n\nReport:\n{rpt_str}\n\nCM:\n{np.array2string(cm)}\n")
        logger.info(f"Saved eval: {eval_fname}")

        fig = plt.figure(figsize=(8, 6))
        try:
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tg_names, yticklabels=tg_names)
            plt.xlabel('Predicted')
            plt.ylabel('True')
            plt.title(f'CM - {model_name} (Acc:{acc:.3f})')
            plt.tight_layout()
            plot_fname = f'plots/{safe_name}_confusion_matrix_{timestamp}.png'
            plt.savefig(plot_fname)
        finally:
            # Always release the figure, even if drawing or saving failed.
            plt.close(fig)
        logger.info(f"Saved CM plot: {plot_fname}")
        return acc, rpt_dict
    except AttributeError as ae:
        logger.error(f"Eval error {model_name}: AttrErr: {ae}", exc_info=True)
        return None, None
    except Exception as e:
        logger.error(f"Error eval {model_name}: {e}", exc_info=True)
        return None, None

In [21]:
def make_test_predictions(model, X_test, test_obs, timestamp, model_name, le):
    """
    Generate test predictions and save results to CSV and summary files.

    Args:
        model: Estimator exposing ``predict``. ``None`` aborts with an error log.
        X_test: Features handed to ``model.predict``.
        test_obs: Observation identifiers written to the ``obs`` column.
        timestamp: Run identifier appended to the output file names.
        model_name: Name used in log messages and (sanitised) in file names.
        le: Label encoder used to decode predictions. When ``None``, the
            lexicographically last ``features/label_encoder_*`` file is loaded
            as a fallback.

    Returns:
        pandas.DataFrame or None: The submission frame (``obs``,
        ``salary_category``) on success, ``None`` on any failure (logged).
    """
    logger.info(f"Generating test predictions using {model_name}...")

    if model is None:
        logger.error(f"Prediction failed for {model_name}: model is None.")
        return None

    if le is None:
        logger.warning(f"LabelEncoder is None for {model_name}. Attempting fallback...")
        try:
            enc_fs = sorted(f for f in os.listdir('features') if f.startswith('label_encoder_'))
        except OSError as list_e:
            # A missing or unreadable 'features' directory is reported like any
            # other fallback failure instead of escaping as an exception.
            logger.error(f"LabelEncoder fallback failed: {list_e}. Prediction aborted.")
            return None
        if enc_fs:
            try:
                # NOTE(review): joblib.load unpickles the file; only artifacts
                # written by this pipeline should live in 'features/'.
                le = joblib.load(f'features/{enc_fs[-1]}')
                logger.info(f"Loaded fallback LabelEncoder: {enc_fs[-1]}.")
            except Exception as load_e:
                logger.error(f"LabelEncoder fallback failed: {load_e}. Prediction aborted.")
                return None
        else:
            logger.error(f"Prediction failed: LabelEncoder is None and no fallback available.")
            return None

    try:
        y_pred_enc = model.predict(X_test)

        # Handle KerasClassifier predictions
        is_keras_wrapper = isinstance(model, KerasClassifier)
        if is_keras_wrapper and hasattr(model, 'predict_proba') and y_pred_enc.ndim > 1 and y_pred_enc.shape[1] > 1:
            y_pred_enc = np.argmax(y_pred_enc, axis=1)

        pred_lbls = le.inverse_transform(y_pred_enc)

        # Create submission DataFrame
        sub = pd.DataFrame({'obs': test_obs, 'salary_category': pred_lbls})
        safe_name = model_name.replace("/", "_").replace("\\", "_").replace(":", "_").replace(" ", "_")
        sub_path = f'submissions/solution_{safe_name}_{timestamp}.csv'
        sub.to_csv(sub_path, index=False)
        logger.info(f"Saved submission: {sub_path}")

        # Log prediction distribution
        val_cts = sub['salary_category'].value_counts().to_dict()
        logger.info(f"Test prediction distribution for {model_name}: {val_cts}")

        # Save prediction summary
        sum_fname = f'results/{safe_name}_test_prediction_summary_{timestamp}.txt'
        total = len(pred_lbls)
        with open(sum_fname, 'w') as f:
            f.write(f"Test Prediction Summary - {model_name}\n\n")
            f.write(f"Model Class: {type(model).__name__}\n")
            f.write(f"Timestamp: {timestamp}\n")
            f.write(f"Total Predictions: {total}\n")
            f.write("Distribution:\n")
            if total > 0:
                for lbl, cnt in sorted(val_cts.items()):
                    f.write(f"- {lbl}: {cnt} ({cnt / total:.2%})\n")
            else:
                f.write("- No predictions.\n")
        logger.info(f"Saved prediction summary: {sum_fname}")

        return sub

    except AttributeError as ae:
        logger.error(f"LabelEncoder or prediction error for {model_name}: {ae}.", exc_info=True)
        return None
    except Exception as e:
        logger.error(f"Error during prediction for {model_name}: {e}", exc_info=True)
        return None

In [22]:
def run_complete_pipeline(perform_feature_selection=False, min_cv_score_threshold=0.72, fs_threshold='mean', n_jobs_sklearn=1):
    """Run the complete model training pipeline with combined FE logic.

    Stages, in order: directory and log-file setup, loading ``train.csv`` and
    ``test.csv``, preprocessing, an 80/20 stratified split, standard scaling,
    optional RandomForest-based feature selection, per-model optimisation with
    a CV qualification threshold, ensemble creation, hold-out based choice of
    the final model, and test-set prediction.

    Args:
        perform_feature_selection: Whether to prune features with
            ``SelectFromModel`` on a RandomForest before modelling.
        min_cv_score_threshold: Minimum CV score a model needs to qualify.
        fs_threshold: Threshold forwarded to ``SelectFromModel``.
        n_jobs_sklearn: Parallelism forwarded to the feature-selection forest,
            to ``optimize_model`` and to ``create_ensemble``.

    Returns:
        bool: ``True`` when a final model was selected and its submission file
        was written; ``False`` on any failure (details are logged).
    """
    timestamp = get_timestamp()
    main_log_file = None
    file_handler = None
    try:
        # 1. Setup
        print("--- Starting Complete Pipeline Run (Combined FE, LightGBM Added) ---")
        create_directory_structure()
        main_log_file = f'logs/pipeline_run_{timestamp}.log'
        file_handler = logging.FileHandler(main_log_file)
        file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        # The handler is created fresh for this run; the ``finally`` clause at
        # the bottom detaches and closes it on every exit path, so reruns in
        # the same session never accumulate handlers.
        logger.addHandler(file_handler)

        logger.info(f"--- Starting Complete Pipeline Run --- Timestamp: {timestamp} ---")
        logger.info(f"Pipeline Config: Combined FE, Scaling=True, FeatSelect={perform_feature_selection} (Thresh={fs_threshold}), CV Thresh={min_cv_score_threshold}, n_jobs={n_jobs_sklearn} used for Sklearn, Const Cols Kept in Preproc")
        logger.info(f"Logging detailed output to: {main_log_file}")

        # 2. Load Data
        logger.info("Loading data...")
        try:
            train_df = pd.read_csv('train.csv')
            test_df = pd.read_csv('test.csv')
            logger.info(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")
        except FileNotFoundError as e:
            logger.error(f"Data load error: {e}.")
            return False
        if 'salary_category' not in train_df.columns:
            logger.error("Target missing.")
            return False
        if train_df.empty or test_df.empty:
            logger.error("Data empty.")
            return False
        if 'obs' not in test_df.columns:
            logger.error("Column 'obs' missing in test.csv.")
            return False
        # Get unique values for manual OHE (CRITICAL: do this before preprocessing)
        all_states = set(train_df['job_state'].dropna().unique()).union(set(test_df['job_state'].dropna().unique()))
        all_feature1 = set(train_df['feature_1'].dropna().unique()).union(set(test_df['feature_1'].dropna().unique()))
        logger.info(f"Found {len(all_states)} unique states and {len(all_feature1)} unique feature_1 values across train/test.")

        # 3. Preprocess Training Data
        logger.info("Preprocessing training data (using combined logic)...")
        X_train_orig, y_train_orig, feature_cols_initial, label_encoder = preprocess_data(
            train_df, all_states, all_feature1, timestamp, is_training=True
        )
        if X_train_orig is None or y_train_orig is None or label_encoder is None or feature_cols_initial is None:
            logger.error("Train preprocess failed.")
            return False
        logger.info(f"Train preprocess done. Initial Feats: {X_train_orig.shape[1]}")
        # Attach the target to X's index positionally. Building a Series from
        # another Series with ``index=`` would reindex by label instead, which
        # silently misaligns the target if X does not carry a default index.
        y_train_orig = pd.Series(np.asarray(y_train_orig), index=X_train_orig.index)

        # 4. Train/Validation Split
        logger.info("Splitting data (80/20)...")
        X_train_full, X_val, y_train_full, y_val = train_test_split(X_train_orig, y_train_orig, test_size=0.20, random_state=42, stratify=y_train_orig)
        logger.info(f"Train (Pre-scale): {X_train_full.shape}, Val (Pre-scale): {X_val.shape}")
        # y_train_full / y_val already share their X partition's index because
        # X and y entered the split with the same index.

        # 5. SCALING STEP
        logger.info("Applying StandardScaler...")
        scaler = StandardScaler()
        # Fit scaler only on the training partition to avoid leaking validation statistics
        X_train_full_scaled = scaler.fit_transform(X_train_full[feature_cols_initial])
        X_val_scaled = scaler.transform(X_val[feature_cols_initial])
        X_train_full_scaled = pd.DataFrame(X_train_full_scaled, index=X_train_full.index, columns=feature_cols_initial)
        X_val_scaled = pd.DataFrame(X_val_scaled, index=X_val.index, columns=feature_cols_initial)
        scaler_path = f'scalers/scaler_{timestamp}.joblib'
        joblib.dump(scaler, scaler_path)
        logger.info(f"Scaler saved: {scaler_path}")
        logger.info(f"Scaled Train shape: {X_train_full_scaled.shape}, Scaled Val shape: {X_val_scaled.shape}")

        # 6. Preprocess & Scale Test Data
        logger.info("Preprocessing test data (using combined logic)...")
        # Pass the *initial* feature list determined during training preprocessing
        X_test_orig, _, _, _ = preprocess_data(
            test_df, all_states, all_feature1, timestamp, is_training=False, feature_columns_to_use=feature_cols_initial
        )
        if X_test_orig is None:
            logger.error("Test preprocess failed.")
            return False
        # Ensure columns match (should be handled by preprocess_data now)
        try:
            X_test_aligned = X_test_orig[feature_cols_initial]  # Align columns
            logger.info("Test columns aligned.")
        except KeyError as ke:
            logger.error(f"Test col mismatch after preprocess: {ke}.")
            return False
        logger.info("Scaling test data...")
        X_test_scaled = scaler.transform(X_test_aligned)
        X_test_scaled = pd.DataFrame(X_test_scaled, index=X_test_aligned.index, columns=feature_cols_initial)
        logger.info(f"Test preprocess & scale done. Shape: {X_test_scaled.shape}")

        # Define Data Partitions
        X_opt_train = X_train_full_scaled.copy()
        y_opt_train = y_train_full.copy()
        X_holdout_val = X_val_scaled.copy()
        y_holdout_val = y_val.copy()
        X_final_test = X_test_scaled.copy()
        current_feature_cols = list(feature_cols_initial)  # Start with all features

        # 7. Optional Feature Selection
        if perform_feature_selection:
            logger.info(f"Performing feature selection (Threshold: {fs_threshold})...")
            try:
                # Note: RF selector might rank constant columns low, effectively removing them here if threshold allows
                selector_model = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=n_jobs_sklearn, class_weight='balanced', max_depth=20)
                logger.info("Fitting RF for feature selection...")
                selector_model.fit(X_opt_train, y_opt_train)
                selector = SelectFromModel(selector_model, threshold=fs_threshold, prefit=True)
                selected_mask = selector.get_support()
                selected_features = X_opt_train.columns[selected_mask]
                num_orig = X_opt_train.shape[1]
                num_sel = len(selected_features)
                if num_sel < num_orig:
                    num_removed = num_orig - num_sel
                    logger.info(f"Feat selection removed {num_removed} features. Selected {num_sel}.")
                    current_feature_cols = list(selected_features)
                    X_opt_train = X_opt_train[current_feature_cols]
                    X_holdout_val = X_holdout_val[current_feature_cols]
                    X_final_test = X_final_test[current_feature_cols]
                    logger.info(f"Selection applied to train/val/test.")
                    joblib.dump(current_feature_cols, f'features/selected_feature_columns_{timestamp}.joblib')
                else:
                    logger.info(f"Feature selection removed no features with threshold '{fs_threshold}'.")
                    perform_feature_selection = False  # Reported in the final summary
            except Exception as e:
                # Selection is optional: fall back to the full scaled feature set.
                logger.error(f"Error feature selection: {e}. Use all scaled.", exc_info=True)
                perform_feature_selection = False
                current_feature_cols = list(feature_cols_initial)
                X_opt_train = X_train_full_scaled[current_feature_cols]
                X_holdout_val = X_val_scaled[current_feature_cols]
                X_final_test = X_test_scaled[current_feature_cols]
        else:
            logger.info("Skipping feature selection.")
            X_opt_train = X_opt_train[current_feature_cols]
            X_holdout_val = X_holdout_val[current_feature_cols]
            X_final_test = X_final_test[current_feature_cols]

        logger.info(f"Data shapes post-scaling/selection: Train={X_opt_train.shape}, Val={X_holdout_val.shape}, Test={X_final_test.shape}")
        logger.info(f"Number of features used in modeling: {len(current_feature_cols)}")

        # 8. Optimize, Train Base Models & Make Individual Predictions
        # (model key, number of optimisation trials)
        models_to_optimize = [
            ('logistic', 30),
            ('knn', 25),
            ('adaboost', 30),
            ('randomforest', 40),
            ('extratrees', 40),
            ('gradientboosting', 40),
            ('mlp', 25),
            ('keras_mlp', 20),
            ('catboost', 40),
            ('xgboost', 50),
            ('lightgbm', 50)
        ]
        qualified_models_with_scores = []
        optimized_params_all = {}
        logger.info(f"--- Optimizing models & Making Individual Predictions (Thresh: {min_cv_score_threshold}) ---")
        logger.info(f"Optimization Order: {[m[0] for m in models_to_optimize]}")

        for model_name, n_trials in models_to_optimize:
            indiv_sub_df = None
            try:
                logger.info(f"--- Optimizing {model_name} ---")
                final_model, best_cv_score, best_params = optimize_model(
                    X_opt_train, y_opt_train, timestamp, model_name, n_trials=n_trials, n_jobs_optuna=n_jobs_sklearn
                )

                # Debug log for qualification check
                log_score = best_cv_score if best_cv_score is not None else -1.0  # Placeholder so the format spec works
                comparison_result = best_cv_score >= min_cv_score_threshold if best_cv_score is not None else False
                logger.info(f"Qualification Check for {model_name}: "
                            f"final_model is None? {final_model is None}, "
                            f"best_cv_score is None? {best_cv_score is None}, "
                            f"best_cv_score={log_score:.7f}, "
                            f"threshold={min_cv_score_threshold}, "
                            f"comparison result: {comparison_result}")

                # Check qualification based on CV score AND successful model fitting
                if final_model is not None and best_cv_score is not None and best_cv_score >= min_cv_score_threshold:
                    logger.info(f"+++ QUALIFIED: {model_name} (CV Score: {best_cv_score:.5f})")
                    qualified_models_with_scores.append((model_name, final_model, best_cv_score))
                    if best_params:
                        optimized_params_all[model_name] = best_params

                    logger.info(f"--- Evaluating {model_name} on HOLD-OUT set ---")
                    holdout_acc, _ = evaluate_model(final_model, X_holdout_val, y_holdout_val, f"{model_name}_qualified_holdout_eval", timestamp, label_encoder)
                    if holdout_acc is not None:
                        logger.info(f"Hold-out Acc ({model_name}): {holdout_acc:.5f}")
                    else:
                        logger.warning(f"Hold-out Eval failed for {model_name}.")

                    logger.info(f"--- Generating individual predictions for {model_name} ---")
                    indiv_sub_df = make_test_predictions(final_model, X_final_test, test_df['obs'], timestamp, f"{model_name}_qual_individual_pred", label_encoder)
                    if indiv_sub_df is not None:
                        logger.info(f"Individual prediction file saved for {model_name}.")
                    else:
                        logger.error(f"Failed individual predictions for {model_name}.")

                elif best_cv_score is not None:
                    logger.info(f"--- NOT QUALIFIED: {model_name} (CV Score: {best_cv_score:.5f} {' - Final model fit/save failed' if final_model is None else ''}) ---")
                    if best_params:
                        optimized_params_all[model_name] = best_params
                else:
                    logger.warning(f"Optimization failed or returned invalid score for {model_name}. Skip.")

            except Exception as e:
                # One failing model must not abort the remaining optimisations.
                logger.error(f"Error in main loop for {model_name}: {e}", exc_info=True)

        # --- Post-Optimization Summary ---
        logger.info("--- Model Optimization Phase Complete ---")
        if not qualified_models_with_scores:
            logger.error(f"CRITICAL: NO models met CV threshold {min_cv_score_threshold}. Abort.")
            return False
        logger.info(f"--- {len(qualified_models_with_scores)} models qualified. ---")
        logger.info(f"Qualified Models (Name, CV Score): {[(m[0], f'{m[2]:.5f}') for m in qualified_models_with_scores]}")

        # 9. Create Ensembles & Select FINAL Best Model
        # Estimators are compared against None explicitly: several estimator
        # classes define __len__, so plain truthiness is not a safe "is set" test.
        final_model = None
        final_model_name = "N/A"
        vote_ens = None
        stack_ens = None
        best_ind_q_model = None
        if len(qualified_models_with_scores) == 1:
            final_model_name, final_model, final_cv_score = qualified_models_with_scores[0]
            logger.warning(f"Only 1 qualified: {final_model_name} (CV:{final_cv_score:.5f}). Select it.")
            best_ind_q_model = final_model
        elif len(qualified_models_with_scores) > 1:
            logger.info(f"--- Creating and Evaluating Ensembles ---")
            vote_ens, stack_ens, best_ind_q_model = create_ensemble(
                qualified_models_with_scores, X_opt_train, y_opt_train, timestamp, n_jobs_ensemble=n_jobs_sklearn
            )
            logger.info("--- Evaluating candidate final models on HOLD-OUT validation set ---")
            candidates = {}  # candidate name -> (hold-out accuracy, model)
            best_ind_name = None
            if vote_ens is not None:
                vote_model_name = f"voting_ensemble_{vote_ens.voting}_qualified"
                logger.info(f"--- Eval {vote_model_name} ---")
                val_acc, _ = evaluate_model(vote_ens, X_holdout_val, y_holdout_val, f"{vote_model_name}_holdout_eval", timestamp, label_encoder)
                if val_acc is not None:
                    candidates[vote_model_name] = (val_acc, vote_ens)
                    logger.info(f"Hold-out Acc ({vote_model_name}): {val_acc:.5f}")
                else:
                    logger.warning(f"Eval fail: {vote_model_name}")
            if stack_ens is not None:
                stack_model_name = "stacking_ensemble_qualified"
                logger.info(f"--- Eval {stack_model_name} ---")
                val_acc, _ = evaluate_model(stack_ens, X_holdout_val, y_holdout_val, f"{stack_model_name}_holdout_eval", timestamp, label_encoder)
                if val_acc is not None:
                    candidates[stack_model_name] = (val_acc, stack_ens)
                    logger.info(f"Hold-out Acc ({stack_model_name}): {val_acc:.5f}")
                else:
                    logger.warning(f"Eval fail: {stack_model_name}")
            if best_ind_q_model is not None:
                # Identity lookup: find the tuple holding this exact model object.
                best_ind_info = next((m for m in qualified_models_with_scores if m[1] is best_ind_q_model), None)
                if best_ind_info:
                    best_ind_name = best_ind_info[0]
                    logger.info(f"--- Eval Best Indiv ({best_ind_name}) ---")
                    eval_name = f"{best_ind_name}_best_qual_holdout_eval"
                    val_acc, _ = evaluate_model(best_ind_q_model, X_holdout_val, y_holdout_val, eval_name, timestamp, label_encoder)
                    if val_acc is not None:
                        cand_name = f"{best_ind_name}_best_qualified"
                        candidates[cand_name] = (val_acc, best_ind_q_model)
                        logger.info(f"Hold-out Acc ({best_ind_name}): {val_acc:.5f}")
                    else:
                        logger.warning(f"Eval fail: {best_ind_name}")
                else:
                    logger.warning("Could not find name for best individual.")
            if candidates:
                final_model_name = max(candidates, key=lambda k: candidates[k][0])
                final_val_score, final_model = candidates[final_model_name]
                logger.info(f"--- FINAL MODEL: '{final_model_name}' (Hold-Out Acc: {final_val_score:.5f}) ---")
            else:
                logger.error("Hold-out eval failed for all candidates.")
                if best_ind_q_model is not None and best_ind_name:
                    final_model = best_ind_q_model
                    final_model_name = f"{best_ind_name}_best_qualified_cv_fallback"
                    logger.warning(f"FALLBACK: Using '{final_model_name}'.")
                else:
                    logger.error("No final model fallback.")
                    final_model = None

        if final_model is None:
            logger.error("No final model selected. Abort.")
            return False

        # 10. Make FINAL Test Predictions
        logger.info(f"--- Generating FINAL predictions using: {final_model_name} ---")
        final_sub_df = make_test_predictions(final_model, X_final_test, test_df['obs'], timestamp, f"{final_model_name}_FINAL", label_encoder)
        if final_sub_df is None:
            logger.error(f"Failed FINAL submission with {final_model_name}.")
        else:
            logger.info(f"FINAL submission file generated with {final_model_name}.")

        # 11. Final Summary
        logger.info("--- Pipeline Run Summary ---")
        logger.info(f"Timestamp: {timestamp}")
        logger.info(f"Config: Combined FE, Scaling=True, FeatSelect={perform_feature_selection} (Thresh={fs_threshold}), CV Thresh={min_cv_score_threshold}, n_jobs={n_jobs_sklearn}, ConstCols Kept")
        logger.info(f"Final # Features: {len(current_feature_cols)}")
        logger.info("Models Optimized: " + ", ".join([m[0] for m in models_to_optimize]))
        qual_summary = ", ".join(f"{name}({score:.5f})" for name, _, score in qualified_models_with_scores) or "None"
        logger.info("Models Qualified (Name, CV Score): " + qual_summary)
        logger.info(f"Ensembles Created: Voting={'Yes' if vote_ens is not None else 'No'}, Stacking={'Yes' if stack_ens is not None else 'No'}")
        logger.info(f"Final model selected: {final_model_name}")
        logger.info("Individual predictions saved for qualified models.")
        if final_sub_df is not None:
            # Mirrors the path built inside make_test_predictions for "<name>_FINAL".
            safe_final_n = final_model_name.replace("/", "_").replace("\\", "_").replace(":", "_").replace(" ", "_")
            final_sub_path = f"submissions/solution_{safe_final_n}_FINAL_{timestamp}.csv"
            logger.info(f"Final submission file: {final_sub_path}")
        else:
            logger.warning("No FINAL submission file generated.")
        logger.info(f"Logs in: {main_log_file}")
        if final_sub_df is None:
            # Do not report success when the final submission is missing.
            logger.error("--- Pipeline Finished WITHOUT a final submission file ---")
            return False
        logger.info("--- Pipeline Completed Successfully ---")
        return True

    except Exception as e:
        logger.error(f"--- Pipeline Failed Critically --- Error: {e}", exc_info=True)
        return False
    finally:
        # Single cleanup point for every exit path (early returns included).
        if file_handler is not None:
            logger.removeHandler(file_handler)
            file_handler.close()

In [23]:
if __name__ == "__main__":
    # Parallelism handed to the pipeline as ``n_jobs_sklearn``; it is forwarded
    # to the feature-selection forest, the optimiser and the ensembles.
    N_CORES_TO_USE = 16
    # perf_counter is monotonic, so the measured duration is immune to
    # system clock adjustments during a multi-hour run.
    start_time = time.perf_counter()
    success = run_complete_pipeline(
        perform_feature_selection=True,
        fs_threshold='mean',  # Feature selection threshold ('mean' or 'median')
        min_cv_score_threshold=0.72,  # CV score a model must reach to qualify
        n_jobs_sklearn=N_CORES_TO_USE  # Overrides the function default of 1
    )
    duration = time.perf_counter() - start_time
    status_msg = f"Pipeline execution {'succeeded' if success else 'failed'}."
    duration_msg = f"Total time: {duration:.2f} sec ({duration/60:.2f} min)."
    print(f"\n{'='*30}\n{status_msg}")
    print(duration_msg)
    print(f"{'='*30}")
    try:  # Log final status if possible
        logger.info(status_msg)
        logger.info(duration_msg)
    except Exception as log_final_e:
        print(f"Final logging failed: {log_final_e}")

2025-04-24 14:42:45,381 - INFO - Created directory: models
2025-04-24 14:42:45,383 - INFO - Created directory: features
2025-04-24 14:42:45,384 - INFO - Created directory: results
2025-04-24 14:42:45,386 - INFO - Created directory: submissions
2025-04-24 14:42:45,387 - INFO - Created directory: logs
2025-04-24 14:42:45,388 - INFO - Created directory: plots
2025-04-24 14:42:45,389 - INFO - Created directory: optuna_trials
2025-04-24 14:42:45,390 - INFO - Created directory: scalers
2025-04-24 14:42:45,391 - INFO - --- Starting Complete Pipeline Run --- Timestamp: 20250424_144245 ---
2025-04-24 14:42:45,391 - INFO - Pipeline Config: Combined FE, Scaling=True, FeatSelect=True (Thresh=mean), CV Thresh=0.72, n_jobs=16 used for Sklearn, Const Cols Kept in Preproc
2025-04-24 14:42:45,392 - INFO - Logging detailed output to: logs/pipeline_run_20250424_144245.log
2025-04-24 14:42:45,392 - INFO - Loading data...
2025-04-24 14:42:45,502 - INFO - Train shape: (1280, 317), Test shape: (854, 316)
202

--- Starting Complete Pipeline Run (Combined FE, LightGBM Added) ---


2025-04-24 14:42:45,567 - INFO - Starting Feature Engineering (using logic from simpler model)...
2025-04-24 14:42:45,579 - INFO - Applying Target Encoding to 'job_title_grouped'...
2025-04-24 14:42:45,590 - INFO - Fit and saved TargetEncoder for job_title_grouped
2025-04-24 14:42:45,591 - INFO - Processed 'job_title' (flags, grouping, target encoding).
2025-04-24 14:42:45,603 - INFO - Processed 'job_posted_date' (cyclical, recency, norm year).
2025-04-24 14:42:45,607 - INFO - Added binned feature for feature_9.
2025-04-24 14:42:45,608 - INFO - Added interaction: feature_2_9_interaction
2025-04-24 14:42:45,611 - INFO - Added squared, sqrt, and binned features for feature_2.
2025-04-24 14:42:45,612 - INFO - Added boolean sum and squared sum features.
2025-04-24 14:42:45,612 - INFO - Added interaction: feature_10_8_interaction
2025-04-24 14:42:45,684 - INFO - Applying PCA (n=15) to job description features...
2025-04-24 14:42:45,700 - INFO - Fit and saved PCA model for job description.
2

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6735
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6696
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008063 seconds

[I 2025-04-24 16:40:23,914] Trial 0 finished with value: 0.6836059301769488 and parameters: {'n_estimators': 1600, 'learning_rate': 0.013128680516299871, 'num_leaves': 25, 'max_depth': 9, 'subsample': 0.545364963459184, 'colsample_bytree': 0.6056866267226301, 'reg_alpha': 0.0008504794146115263, 'reg_lambda': 0.016856448985259634, 'min_child_samples': 31}. Best is trial 0 with value: 0.6836059301769488.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001643 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001876 seconds

[I 2025-04-24 16:40:34,744] Trial 1 finished with value: 0.7002439024390243 and parameters: {'n_estimators': 1700, 'learning_rate': 0.0720985786406418, 'num_leaves': 40, 'max_depth': 13, 'subsample': 0.7237833405595577, 'colsample_bytree': 0.734222878790656, 'reg_alpha': 7.540893624233425, 'reg_lambda': 5.665276051372137e-05, 'min_child_samples': 16}. Best is trial 1 with value: 0.7002439024390243.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001490 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6735
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001518 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6696
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001826 seconds

[I 2025-04-24 16:40:50,533] Trial 2 finished with value: 0.6757867049258728 and parameters: {'n_estimators': 600, 'learning_rate': 0.0380619496987491, 'num_leaves': 50, 'max_depth': 15, 'subsample': 0.8055563162717212, 'colsample_bytree': 0.7051291477688695, 'reg_alpha': 5.659509701399759e-05, 'reg_lambda': 1.2353691729831572e-05, 'min_child_samples': 33}. Best is trial 1 with value: 0.7002439024390243.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001701 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6735
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001719 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6696
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002059 seconds

[I 2025-04-24 16:41:21,732] Trial 3 finished with value: 0.6777618364418938 and parameters: {'n_estimators': 1500, 'learning_rate': 0.01214436497735735, 'num_leaves': 145, 'max_depth': 8, 'subsample': 0.576222695704465, 'colsample_bytree': 0.7811826875094652, 'reg_alpha': 5.7716434421806126e-05, 'reg_lambda': 2.1904895746541808e-08, 'min_child_samples': 44}. Best is trial 1 with value: 0.7002439024390243.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001056 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6735
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001015 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6696
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007248 seconds

[I 2025-04-24 16:41:24,681] Trial 4 finished with value: 0.6738546150167384 and parameters: {'n_estimators': 1300, 'learning_rate': 0.18979737709541747, 'num_leaves': 145, 'max_depth': 7, 'subsample': 0.6631772193465346, 'colsample_bytree': 0.5995604035925317, 'reg_alpha': 3.7547534336744066e-05, 'reg_lambda': 7.264105805431794e-08, 'min_child_samples': 35}. Best is trial 1 with value: 0.7002439024390243.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001729 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001860 seconds

[I 2025-04-24 16:41:49,427] Trial 5 finished with value: 0.7050836920133907 and parameters: {'n_estimators': 1600, 'learning_rate': 0.020835807043676352, 'num_leaves': 135, 'max_depth': 5, 'subsample': 0.5433606242534079, 'colsample_bytree': 0.9256744260804308, 'reg_alpha': 7.378870260063733e-05, 'reg_lambda': 0.09705048191333523, 'min_child_samples': 8}. Best is trial 5 with value: 0.7050836920133907.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001357 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000740 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000681 seconds

[I 2025-04-24 16:41:56,760] Trial 6 finished with value: 0.688493543758967 and parameters: {'n_estimators': 300, 'learning_rate': 0.006147385415604492, 'num_leaves': 120, 'max_depth': 5, 'subsample': 0.7563033670912835, 'colsample_bytree': 0.691187091780846, 'reg_alpha': 2.854981327365896e-05, 'reg_lambda': 6.877185084255217e-05, 'min_child_samples': 13}. Best is trial 5 with value: 0.7050836920133907.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000576 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000809 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000834 seconds

[I 2025-04-24 16:41:58,398] Trial 7 finished with value: 0.7070349115255858 and parameters: {'n_estimators': 1900, 'learning_rate': 0.024897887592654887, 'num_leaves': 50, 'max_depth': 16, 'subsample': 0.6182867365742812, 'colsample_bytree': 0.7543846172485673, 'reg_alpha': 0.2305612739190994, 'reg_lambda': 4.450719872302879e-05, 'min_child_samples': 18}. Best is trial 7 with value: 0.7070349115255858.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000646 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000761 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000837 seconds

[I 2025-04-24 16:42:02,710] Trial 8 finished with value: 0.7089956958393113 and parameters: {'n_estimators': 1400, 'learning_rate': 0.007290178155923585, 'num_leaves': 40, 'max_depth': 10, 'subsample': 0.587764476469911, 'colsample_bytree': 0.7652690535425949, 'reg_alpha': 0.01799231665225685, 'reg_lambda': 2.87496627874108e-08, 'min_child_samples': 21}. Best is trial 8 with value: 0.7089956958393113.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000618 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000705 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000593 seconds

[I 2025-04-24 16:42:03,073] Trial 9 finished with value: 0.7197465327594452 and parameters: {'n_estimators': 1900, 'learning_rate': 0.25123165307386697, 'num_leaves': 20, 'max_depth': 8, 'subsample': 0.8252360525870108, 'colsample_bytree': 0.8612561914770367, 'reg_alpha': 0.0001395557106773657, 'reg_lambda': 3.246905678491774e-05, 'min_child_samples': 13}. Best is trial 9 with value: 0.7197465327594452.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000700 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000616 seconds

[I 2025-04-24 16:42:03,825] Trial 10 finished with value: 0.716805356288857 and parameters: {'n_estimators': 900, 'learning_rate': 0.20395387722669614, 'num_leaves': 90, 'max_depth': 12, 'subsample': 0.9585927451952717, 'colsample_bytree': 0.968666461014053, 'reg_alpha': 3.952783671966224e-08, 'reg_lambda': 3.208829691250939, 'min_child_samples': 7}. Best is trial 9 with value: 0.7197465327594452.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000571 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000607 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000504 seconds

[I 2025-04-24 16:42:04,591] Trial 11 finished with value: 0.725638450502152 and parameters: {'n_estimators': 900, 'learning_rate': 0.2607119005227267, 'num_leaves': 90, 'max_depth': 12, 'subsample': 0.9693927125364629, 'colsample_bytree': 0.9992855565533644, 'reg_alpha': 1.6164573383498186e-08, 'reg_lambda': 9.946915035826285, 'min_child_samples': 5}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000684 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000680 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000672 seconds

[I 2025-04-24 16:42:06,061] Trial 12 finished with value: 0.7187709230033477 and parameters: {'n_estimators': 1000, 'learning_rate': 0.10086205411734901, 'num_leaves': 85, 'max_depth': 12, 'subsample': 0.9927635760597271, 'colsample_bytree': 0.8798145435497242, 'reg_alpha': 1.3207522305330865e-08, 'reg_lambda': 9.071882755764397, 'min_child_samples': 5}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000833 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000594 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000624 seconds

[I 2025-04-24 16:42:06,459] Trial 13 finished with value: 0.7119416547106647 and parameters: {'n_estimators': 2000, 'learning_rate': 0.2937003798954252, 'num_leaves': 85, 'max_depth': 11, 'subsample': 0.891024861803818, 'colsample_bytree': 0.8518023879234559, 'reg_alpha': 3.558337376128268e-07, 'reg_lambda': 0.004582473632628532, 'min_child_samples': 24}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Number of data points in the train set: 820, number of used features: 36
[LightGBM] [Info] Start training from score -0.937863
[LightGBM] [Info] Start training from score -1.118317
[LightGBM] [Info] Start training from score -1.266887
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000559 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] 

[I 2025-04-24 16:42:07,423] Trial 14 finished with value: 0.7158345289335246 and parameters: {'n_estimators': 700, 'learning_rate': 0.10010975436523521, 'num_leaves': 115, 'max_depth': 14, 'subsample': 0.8768660271521603, 'colsample_bytree': 0.9852034616916244, 'reg_alpha': 1.426064825103308e-06, 'reg_lambda': 1.3713402744459648e-06, 'min_child_samples': 11}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000568 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6735
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000582 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6696
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000597 seconds

[I 2025-04-24 16:42:07,861] Trial 15 finished with value: 0.6845911047345767 and parameters: {'n_estimators': 1100, 'learning_rate': 0.13493822779647713, 'num_leaves': 65, 'max_depth': 7, 'subsample': 0.8790546318574548, 'colsample_bytree': 0.8510375941317171, 'reg_alpha': 0.0037750258046644835, 'reg_lambda': 0.001772276654258581, 'min_child_samples': 49}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000519 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6714
[LightGBM] [Info] Number of data points in the train set: 820, number of used features: 35
[LightGBM] [Info] Start training from score -0.937863
[LightGBM] [Info] Start training from score -1.118317
[LightGBM] [Info] Start training from score -1.266887
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000626 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000617 seconds

[I 2025-04-24 16:42:08,778] Trial 16 finished with value: 0.7236585365853658 and parameters: {'n_estimators': 300, 'learning_rate': 0.05993987179422879, 'num_leaves': 70, 'max_depth': 10, 'subsample': 0.8112318402505097, 'colsample_bytree': 0.9078240296243568, 'reg_alpha': 2.25195824413223e-06, 'reg_lambda': 0.2412995835450981, 'min_child_samples': 13}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000259 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000383 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000277 seconds

[I 2025-04-24 16:42:09,575] Trial 17 finished with value: 0.7090243902439024 and parameters: {'n_estimators': 200, 'learning_rate': 0.05863965464613942, 'num_leaves': 100, 'max_depth': 10, 'subsample': 0.9437252142119706, 'colsample_bytree': 0.503116628000742, 'reg_alpha': 6.958907744703763e-07, 'reg_lambda': 0.3647345788471017, 'min_child_samples': 23}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000662 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000779 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000704 seconds

[I 2025-04-24 16:42:11,280] Trial 18 finished with value: 0.7109516977522716 and parameters: {'n_estimators': 500, 'learning_rate': 0.04485110357438605, 'num_leaves': 70, 'max_depth': 12, 'subsample': 0.6992491367326914, 'colsample_bytree': 0.9314256920171211, 'reg_alpha': 1.8684438260261138e-07, 'reg_lambda': 0.9988600929032654, 'min_child_samples': 5}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002370 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000603 seconds

[I 2025-04-24 16:42:11,898] Trial 19 finished with value: 0.7021759923481588 and parameters: {'n_estimators': 400, 'learning_rate': 0.1412568882062612, 'num_leaves': 70, 'max_depth': 14, 'subsample': 0.795381652683619, 'colsample_bytree': 0.9968212720057872, 'reg_alpha': 1.79555988769597e-06, 'reg_lambda': 0.07638183988671823, 'min_child_samples': 17}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000713 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000632 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000795 seconds

[I 2025-04-24 16:42:13,354] Trial 20 finished with value: 0.7129124820659971 and parameters: {'n_estimators': 800, 'learning_rate': 0.03732679651182418, 'num_leaves': 105, 'max_depth': 11, 'subsample': 0.9172942115628563, 'colsample_bytree': 0.9043911744828906, 'reg_alpha': 5.2777802623685706e-08, 'reg_lambda': 0.6071531223336768, 'min_child_samples': 10}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000588 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000653 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000635 seconds

[I 2025-04-24 16:42:13,745] Trial 21 finished with value: 0.7080248684839789 and parameters: {'n_estimators': 1300, 'learning_rate': 0.24089718219685408, 'num_leaves': 20, 'max_depth': 9, 'subsample': 0.8377404534921891, 'colsample_bytree': 0.8242538848970669, 'reg_alpha': 7.134544611326888e-06, 'reg_lambda': 0.0006532486707238218, 'min_child_samples': 13}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000604 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6717
[LightGBM] [Info] Number of data points in the train set: 820, number of used features: 36
[LightGBM] [Info] Start training from score -0.937863
[LightGBM] [Info] Start training from score -1.118317
[LightGBM] [Info] Start training from score -1.266887
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000638 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000637 seconds

[I 2025-04-24 16:42:14,127] Trial 22 finished with value: 0.7050884744141559 and parameters: {'n_estimators': 1200, 'learning_rate': 0.2965964248627459, 'num_leaves': 60, 'max_depth': 7, 'subsample': 0.8430842051100378, 'colsample_bytree': 0.9434804935570428, 'reg_alpha': 0.0005767254993752886, 'reg_lambda': 1.024436415599306e-06, 'min_child_samples': 15}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000562 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6735
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000649 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6696
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000722 seconds

[I 2025-04-24 16:42:14,712] Trial 23 finished with value: 0.6816690578670493 and parameters: {'n_estimators': 1800, 'learning_rate': 0.1369073583782633, 'num_leaves': 80, 'max_depth': 9, 'subsample': 0.761862598615531, 'colsample_bytree': 0.819047187703264, 'reg_alpha': 4.299248805351282e-06, 'reg_lambda': 8.455638726885581, 'min_child_samples': 28}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000592 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000601 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000620 seconds

[I 2025-04-24 16:42:15,328] Trial 24 finished with value: 0.7080439980870397 and parameters: {'n_estimators': 700, 'learning_rate': 0.0860125955675984, 'num_leaves': 30, 'max_depth': 8, 'subsample': 0.8283693696333188, 'colsample_bytree': 0.8863866859357736, 'reg_alpha': 1.2616495195939001e-07, 'reg_lambda': 0.022414869716160406, 'min_child_samples': 9}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000797 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000636 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000622 seconds

[I 2025-04-24 16:42:15,872] Trial 25 finished with value: 0.710961262553802 and parameters: {'n_estimators': 200, 'learning_rate': 0.19865614188706843, 'num_leaves': 100, 'max_depth': 10, 'subsample': 0.6630342914129369, 'colsample_bytree': 0.9700129419449124, 'reg_alpha': 1.521932473936358e-08, 'reg_lambda': 3.2064412552240688e-06, 'min_child_samples': 18}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000734 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000597 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000598 seconds

[I 2025-04-24 16:42:16,327] Trial 26 finished with value: 0.6972931611669058 and parameters: {'n_estimators': 1000, 'learning_rate': 0.16196541819708385, 'num_leaves': 55, 'max_depth': 6, 'subsample': 0.9595914817437685, 'colsample_bytree': 0.9038247616168672, 'reg_alpha': 8.010358697760728e-06, 'reg_lambda': 1.8410499176000985, 'min_child_samples': 21}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000650 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6717
[LightGBM] [Info] Number of data points in the train set: 820, number of used features: 36
[LightGBM] [Info] Start training from score -0.937863
[LightGBM] [Info] Start training from score -1.118317
[LightGBM] [Info] Start training from score -1.266887
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000566 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6735
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000672 seconds

[I 2025-04-24 16:42:17,007] Trial 27 finished with value: 0.6728694404591105 and parameters: {'n_estimators': 500, 'learning_rate': 0.06085782594194191, 'num_leaves': 125, 'max_depth': 13, 'subsample': 0.9969871853220585, 'colsample_bytree': 0.8067984472715672, 'reg_alpha': 0.02127311255447375, 'reg_lambda': 0.0003078807060982213, 'min_child_samples': 38}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000654 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000643 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000644 seconds

[I 2025-04-24 16:42:18,649] Trial 28 finished with value: 0.7138928742228599 and parameters: {'n_estimators': 800, 'learning_rate': 0.024785146071625058, 'num_leaves': 35, 'max_depth': 8, 'subsample': 0.7990130279068306, 'colsample_bytree': 0.8595587080145105, 'reg_alpha': 0.00023376257526278615, 'reg_lambda': 0.3352590224286426, 'min_child_samples': 13}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000596 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000881 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000702 seconds

[I 2025-04-24 16:42:21,074] Trial 29 finished with value: 0.7001673840267814 and parameters: {'n_estimators': 1600, 'learning_rate': 0.012778595626915232, 'num_leaves': 75, 'max_depth': 11, 'subsample': 0.9080044797919734, 'colsample_bytree': 0.9531558126606852, 'reg_alpha': 0.002797824524994356, 'reg_lambda': 0.010391841505667158, 'min_child_samples': 26}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000782 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6735
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000586 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6696
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000580 seconds

[I 2025-04-24 16:42:21,511] Trial 30 finished with value: 0.6933668101386896 and parameters: {'n_estimators': 400, 'learning_rate': 0.10558224849059084, 'num_leaves': 20, 'max_depth': 9, 'subsample': 0.8543659216113875, 'colsample_bytree': 0.9108380258131199, 'reg_alpha': 8.076709669708282e-06, 'reg_lambda': 0.06281826891185274, 'min_child_samples': 29}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000504 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6714
[LightGBM] [Info] Number of data points in the train set: 820, number of used features: 35
[LightGBM] [Info] Start training from score -0.937863
[LightGBM] [Info] Start training from score -1.118317
[LightGBM] [Info] Start training from score -1.266887
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000846 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000569 seconds

[I 2025-04-24 16:42:22,731] Trial 31 finished with value: 0.7177857484457197 and parameters: {'n_estimators': 1000, 'learning_rate': 0.10965106903239828, 'num_leaves': 95, 'max_depth': 12, 'subsample': 0.9928890526615186, 'colsample_bytree': 0.8863690413433325, 'reg_alpha': 1.114219738448586e-08, 'reg_lambda': 4.735445642682242, 'min_child_samples': 5}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000570 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000698 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000660 seconds

[I 2025-04-24 16:42:24,190] Trial 32 finished with value: 0.7226637972262075 and parameters: {'n_estimators': 1100, 'learning_rate': 0.07477945108849725, 'num_leaves': 110, 'max_depth': 13, 'subsample': 0.9291790206865004, 'colsample_bytree': 0.8615629330300604, 'reg_alpha': 8.950261730166306e-08, 'reg_lambda': 4.246903525750214, 'min_child_samples': 5}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000614 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000721 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000759 seconds

[I 2025-04-24 16:42:25,179] Trial 33 finished with value: 0.7129124820659971 and parameters: {'n_estimators': 1200, 'learning_rate': 0.06975789323038499, 'num_leaves': 110, 'max_depth': 13, 'subsample': 0.5047478763742768, 'colsample_bytree': 0.7941191915148076, 'reg_alpha': 1.2194427292924107e-07, 'reg_lambda': 0.19782091288137743, 'min_child_samples': 11}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000491 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000703 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000672 seconds

[I 2025-04-24 16:42:26,904] Trial 34 finished with value: 0.7255810616929699 and parameters: {'n_estimators': 1400, 'learning_rate': 0.04903968608577325, 'num_leaves': 110, 'max_depth': 14, 'subsample': 0.9354901133596453, 'colsample_bytree': 0.685629660443084, 'reg_alpha': 6.116718516840896e-07, 'reg_lambda': 1.6401126858975799, 'min_child_samples': 7}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000668 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000577 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000533 seconds

[I 2025-04-24 16:42:28,352] Trial 35 finished with value: 0.7109516977522716 and parameters: {'n_estimators': 1400, 'learning_rate': 0.05099474672076964, 'num_leaves': 125, 'max_depth': 14, 'subsample': 0.9284719885536158, 'colsample_bytree': 0.6557043267855723, 'reg_alpha': 6.429618184499373e-07, 'reg_lambda': 1.7193856991319039, 'min_child_samples': 8}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000615 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000816 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001002 seconds

[I 2025-04-24 16:42:32,020] Trial 36 finished with value: 0.7187709230033477 and parameters: {'n_estimators': 1500, 'learning_rate': 0.03029745898966977, 'num_leaves': 110, 'max_depth': 15, 'subsample': 0.9644904945398207, 'colsample_bytree': 0.6270645003508519, 'reg_alpha': 4.800293316396515e-08, 'reg_lambda': 1.9112032800713314, 'min_child_samples': 7}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000734 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000803 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000877 seconds

[I 2025-04-24 16:42:36,861] Trial 37 finished with value: 0.7070540411286466 and parameters: {'n_estimators': 1200, 'learning_rate': 0.017783616309495525, 'num_leaves': 135, 'max_depth': 13, 'subsample': 0.8996077316049065, 'colsample_bytree': 0.720819597455516, 'reg_alpha': 1.8266271125947746e-06, 'reg_lambda': 0.029852893342472438, 'min_child_samples': 5}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000682 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000991 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000711 seconds

[I 2025-04-24 16:42:39,161] Trial 38 finished with value: 0.715844093735055 and parameters: {'n_estimators': 1400, 'learning_rate': 0.03459264327268173, 'num_leaves': 95, 'max_depth': 16, 'subsample': 0.9340387148328808, 'colsample_bytree': 0.67855335321822, 'reg_alpha': 2.606350594788523e-07, 'reg_lambda': 0.1837378546623065, 'min_child_samples': 10}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000779 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6735
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000877 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6696
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 35
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000726 seconds

[I 2025-04-24 16:42:40,378] Trial 39 finished with value: 0.6904543280726925 and parameters: {'n_estimators': 1700, 'learning_rate': 0.043667323135365706, 'num_leaves': 130, 'max_depth': 15, 'subsample': 0.7158540748482026, 'colsample_bytree': 0.7393110560342646, 'reg_alpha': 4.678262592137587e-08, 'reg_lambda': 0.8489893072105087, 'min_child_samples': 38}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000323 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000486 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000347 seconds

[I 2025-04-24 16:42:42,136] Trial 40 finished with value: 0.7109564801530368 and parameters: {'n_estimators': 900, 'learning_rate': 0.08017324409720235, 'num_leaves': 80, 'max_depth': 15, 'subsample': 0.8617700682380283, 'colsample_bytree': 0.5964013559270482, 'reg_alpha': 1.5511096742014564e-05, 'reg_lambda': 9.831924559806902, 'min_child_samples': 15}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000799 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000796 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000785 seconds

[I 2025-04-24 16:42:43,001] Trial 41 finished with value: 0.7050693448110952 and parameters: {'n_estimators': 2000, 'learning_rate': 0.24654012872862116, 'num_leaves': 45, 'max_depth': 14, 'subsample': 0.7758245979409396, 'colsample_bytree': 0.7796360524439874, 'reg_alpha': 0.00013120145660105867, 'reg_lambda': 3.503287671137895e-05, 'min_child_samples': 12}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000663 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000659 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001020 seconds

[I 2025-04-24 16:42:44,789] Trial 42 finished with value: 0.7236681013868962 and parameters: {'n_estimators': 1900, 'learning_rate': 0.06616558164669284, 'num_leaves': 115, 'max_depth': 12, 'subsample': 0.8197239419971248, 'colsample_bytree': 0.8336465118200687, 'reg_alpha': 0.20732480681449272, 'reg_lambda': 9.440441159948653e-06, 'min_child_samples': 8}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000707 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000768 seconds

[I 2025-04-24 16:42:46,254] Trial 43 finished with value: 0.7178000956480153 and parameters: {'n_estimators': 1700, 'learning_rate': 0.06605237260057231, 'num_leaves': 150, 'max_depth': 13, 'subsample': 0.7317001336584638, 'colsample_bytree': 0.959154326414993, 'reg_alpha': 0.10934264193201264, 'reg_lambda': 1.1740935540902964e-05, 'min_child_samples': 8}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000689 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000774 seconds

[I 2025-04-24 16:42:47,775] Trial 44 finished with value: 0.7060975609756097 and parameters: {'n_estimators': 1500, 'learning_rate': 0.05168212633284151, 'num_leaves': 120, 'max_depth': 12, 'subsample': 0.9730529695437794, 'colsample_bytree': 0.8273535684671174, 'reg_alpha': 6.857060533855848, 'reg_lambda': 3.7040612440595956e-07, 'min_child_samples': 7}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000796 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000770 seconds

[I 2025-04-24 16:42:55,602] Trial 45 finished with value: 0.7158584409373505 and parameters: {'n_estimators': 1100, 'learning_rate': 0.009302285234871647, 'num_leaves': 110, 'max_depth': 11, 'subsample': 0.8116061689219819, 'colsample_bytree': 0.7670094577390447, 'reg_alpha': 4.0649317883325935, 'reg_lambda': 3.2593433995404264, 'min_child_samples': 7}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000297 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000404 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000549 seconds

[I 2025-04-24 16:42:58,156] Trial 46 finished with value: 0.7070827355332376 and parameters: {'n_estimators': 1300, 'learning_rate': 0.029859597920615818, 'num_leaves': 90, 'max_depth': 13, 'subsample': 0.8666147193324442, 'colsample_bytree': 0.5612940541611084, 'reg_alpha': 2.340809164538081, 'reg_lambda': 0.00023846883417581957, 'min_child_samples': 15}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000757 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000936 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000874 seconds

[I 2025-04-24 16:43:02,200] Trial 47 finished with value: 0.7129076996652319 and parameters: {'n_estimators': 1800, 'learning_rate': 0.016975466221217206, 'num_leaves': 140, 'max_depth': 12, 'subsample': 0.9021569286019933, 'colsample_bytree': 0.7174168501276783, 'reg_alpha': 1.3325931387235372, 'reg_lambda': 1.0118295343540669e-05, 'min_child_samples': 9}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000812 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000960 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000843 seconds

[I 2025-04-24 16:43:10,124] Trial 48 finished with value: 0.7109469153515064 and parameters: {'n_estimators': 600, 'learning_rate': 0.005004639795447415, 'num_leaves': 115, 'max_depth': 11, 'subsample': 0.9443040026162207, 'colsample_bytree': 0.839233188501401, 'reg_alpha': 7.801633028109822e-07, 'reg_lambda': 9.784008446107348e-05, 'min_child_samples': 20}. Best is trial 11 with value: 0.725638450502152.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000703 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6738
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000709 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6699
[LightGBM] [Info] Number of data points in the train set: 819, number of used features: 36
[LightGBM] [Info] Start training from score -0.936643
[LightGBM] [Info] Start training from score -1.117097
[LightGBM] [Info] Start training from score -1.270005
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000949 seconds

[I 2025-04-24 16:43:12,009] Trial 49 finished with value: 0.7178000956480153 and parameters: {'n_estimators': 900, 'learning_rate': 0.08169389465867899, 'num_leaves': 100, 'max_depth': 14, 'subsample': 0.7765584644015124, 'colsample_bytree': 0.9259163851026673, 'reg_alpha': 9.192661608473929e-08, 'reg_lambda': 0.004136828092790918, 'min_child_samples': 5}. Best is trial 11 with value: 0.725638450502152.
2025-04-24 16:43:12,029 - INFO - Optimization complete for lightgbm.
2025-04-24 16:43:12,031 - INFO - Best CV score: 0.72564
2025-04-24 16:43:12,031 - INFO - Best params: {'n_estimators': 900, 'learning_rate': 0.2607119005227267, 'num_leaves': 90, 'max_depth': 12, 'subsample': 0.9693927125364629, 'colsample_bytree': 0.9992855565533644, 'reg_alpha': 1.6164573383498186e-08, 'reg_lambda': 9.946915035826285, 'min_child_samples': 5}
2025-04-24 16:43:12,033 - INFO - Saved Optuna summary: optuna_trials/lightgbm_study_summary_20250424_144245.txt
2025-04-24 16:43:12,034 - INFO - Instantiating fi

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6790
[LightGBM] [Info] Number of data points in the train set: 1024, number of used features: 36
[LightGBM] [Info] Start training from score -0.937510
[LightGBM] [Info] Start training from score -1.117341
[LightGBM] [Info] Start training from score -1.268511


2025-04-24 16:43:17,029 - INFO - Final lightgbm fitted in 4.99s.
2025-04-24 16:43:17,031 - INFO - Saving final lightgbm model...
2025-04-24 16:43:17,231 - INFO - Saved final lightgbm model path: models/lightgbm_20250424_144245.joblib
2025-04-24 16:43:17,232 - INFO - Saving importance lightgbm...
2025-04-24 16:43:17,514 - INFO - Saved importance plot: plots/lightgbm_feature_importance_20250424_144245.png
2025-04-24 16:43:17,514 - INFO - Saved importance csv: results/lightgbm_feature_importance_20250424_144245.csv
2025-04-24 16:43:17,514 - INFO - Qualification Check for lightgbm: final_model is None? False, best_cv_score is None? False, best_cv_score=0.7256385, threshold=0.72, comparison result: True
2025-04-24 16:43:17,514 - INFO - +++ QUALIFIED: lightgbm (CV Score: 0.72564)
2025-04-24 16:43:17,529 - INFO - --- Evaluating lightgbm on HOLD-OUT set ---
2025-04-24 16:43:17,530 - INFO - Evaluating lightgbm_qualified_holdout_eval...
2025-04-24 16:43:17,548 - INFO - lightgbm_qualified_holdout


Pipeline execution succeeded.
Total time: 7309.17 sec (121.82 min).
