In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import optuna
import joblib

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

from catboost import CatBoostRegressor
from crepes.extras import margin, DifficultyEstimator, MondrianCategorizer
from crepes import WrapRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Name of the column the model is trained to predict.
target = 'target'

# Every model input is handled as a categorical column ...
categorical_features = [
    'age',
    'work_type',
    'education',
    'experience_bin',
    'profession',
    'continent',
]

# ... and there are currently no numeric inputs.
numerical_features = []

# Full list of model inputs, built from the two lists above so that the
# column names are only typed once.
features = [*categorical_features, *numerical_features]

In [4]:
# Load the preprocessed dataset; the file is semicolon-delimited.
df = pd.read_csv(
    '../data/preprocessed/preprocessed_data.csv',
    sep=';',
)

# Splitting Dataset

In [8]:
# 70 % of the rows go to training; the remaining 30 % is halved into a
# calibration set and a test set. Both splits use a fixed seed.
df_train, df_ = train_test_split(df, test_size=.3, random_state=42)
df_cal, df_test = train_test_split(df_, test_size=.5, random_state=42)

# Separate model inputs from the target for each split (copies, so later
# edits to the X/y objects do not touch the split dataframes).
X_train, y_train = df_train[features].copy(), df_train[target].copy()
X_cal, y_cal = df_cal[features].copy(), df_cal[target].copy()
X_test, y_test = df_test[features].copy(), df_test[target].copy()

# Row counts: full dataset, train, calibration, test.
len(df), len(df_train), len(df_cal), len(df_test)

(20817, 14571, 3123, 3123)

# Optuna Workflow

In [9]:
def get_pipeline(catboost_params, numeric_features, categorical_features):
    """
    Build an unfitted pipeline: column preprocessing followed by CatBoost.

    Parameters
    ----------
    catboost_params : dict
        Keyword arguments forwarded unchanged to ``CatBoostRegressor``.
    numeric_features : list
        Columns that are median-imputed.
    categorical_features : list
        Columns that are imputed with the constant ``"N/A"`` and then
        ordinal-encoded; categories unseen at fit time are encoded as -1.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ``"preprocessor"`` (a ``ColumnTransformer``) and
        ``"model"`` (a ``CatBoostRegressor``).
    """
    # Categorical branch: fill missing values first so the encoder never sees NaN.
    encode_categories = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="N/A")),
        ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ])

    # One (name, transformer, columns) entry per feature family.
    column_steps = [
        ("num", SimpleImputer(strategy="median"), numeric_features),
        ("cat", encode_categories, categorical_features),
    ]

    return Pipeline(steps=[
        ("preprocessor", ColumnTransformer(transformers=column_steps)),
        ("model", CatBoostRegressor(**catboost_params)),
    ])

In [10]:
def objective(trial, X_train, y_train, numeric_features, categorical_features, n_folds):
    """
    Score one Optuna trial: sample CatBoost hyperparameters and cross-validate them.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters and to store extra metrics.
    X_train, y_train
        Training inputs and target; rows are selected positionally with ``.iloc``.
    numeric_features, categorical_features : list
        Column lists forwarded to ``get_pipeline``.
    n_folds : int
        Number of shuffled K-fold splits.

    Returns
    -------
    float
        Mean absolute error averaged over the folds (the value Optuna minimizes).
        The fold-averaged RMSE and MAPE are stored on the trial as the user
        attributes ``"rmse"`` and ``"mape"``.
    """
    # Search space first (sampled in this fixed order), then fixed runtime settings.
    catboost_params = dict(
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        iterations=trial.suggest_int("iterations", 100, 1000, step=50),
        depth=trial.suggest_int("depth", 3, 10),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.01, 1.0, log=True),
        random_strength=trial.suggest_float("random_strength", 1e-3, 10.0, log=True),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        thread_count=-1,
        random_seed=42,
        verbose=0,
    )

    model = get_pipeline(catboost_params, numeric_features, categorical_features)
    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    # One (mae, rmse, mape) tuple per fold; the same pipeline object is re-fitted each time.
    fold_metrics = []
    for fit_rows, holdout_rows in splitter.split(X_train):
        model.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])

        y_true = y_train.iloc[holdout_rows]
        y_hat = model.predict(X_train.iloc[holdout_rows])

        fold_metrics.append((
            mean_absolute_error(y_true, y_hat),
            np.sqrt(mean_squared_error(y_true, y_hat)),
            # MAPE divides by the true value.
            # NOTE(review): assumes the target is never zero — confirm against the data.
            np.mean(np.abs((y_true - y_hat) / y_true)) * 100,
        ))

    mae_scores, rmse_scores, mape_scores = zip(*fold_metrics)

    # Secondary metrics are logged on the trial; only MAE drives the search.
    trial.set_user_attr("rmse", np.mean(rmse_scores))
    trial.set_user_attr("mape", np.mean(mape_scores))

    return np.mean(mae_scores)

In [11]:
def optimize_model(X_train, y_train, numeric_features, categorical_features, n_folds=5, n_trials=50, seed=42):
    """
    Run an Optuna study that minimizes the cross-validated MAE of the pipeline.

    Parameters
    ----------
    X_train, y_train
        Training inputs and target, forwarded to ``objective``.
    numeric_features, categorical_features : list
        Column lists forwarded to ``objective``.
    n_folds : int, default 5
        Number of cross-validation folds used to score each trial.
    n_trials : int, default 50
        Number of hyperparameter configurations to evaluate.
    seed : int or None, default 42
        Seed for the TPE sampler. The data split, the K-fold shuffle and
        CatBoost are all pinned to a fixed seed, so the sampler is seeded too;
        otherwise each run proposes a different sequence of trials. Pass
        ``None`` to get an unseeded sampler.

    Returns
    -------
    optuna.study.Study
        The finished study; best parameters and best MAE are also printed.
    """
    # TPE is the sampler being used here; the only change from an unconfigured
    # study is that its random state is fixed.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="minimize", sampler=sampler)

    study.optimize(
        lambda trial: objective(trial, X_train, y_train, numeric_features, categorical_features, n_folds),
        n_trials=n_trials,
    )

    print("Best Parameters:", study.best_params)
    print("Best MAE:", study.best_value)
    return study

In [12]:
# Hyperparameter search on the training split only: 10 trials, each scored with 5-fold CV.
study = optimize_model(
    X_train,
    y_train,
    numerical_features,
    categorical_features,
    n_folds=5,
    n_trials=10,
)

[I 2024-12-16 15:56:57,306] A new study created in memory with name: no-name-2a477f02-df1f-4dcb-87e7-4d39b11b7298
[I 2024-12-16 15:57:01,646] Trial 0 finished with value: 63981.89939456359 and parameters: {'learning_rate': 0.045379742417057445, 'iterations': 300, 'depth': 5, 'l2_leaf_reg': 0.15378156813395868, 'bagging_temperature': 0.5149098769949544, 'random_strength': 0.03651727037601661, 'subsample': 0.7611358720203436}. Best is trial 0 with value: 63981.89939456359.
[I 2024-12-16 15:57:07,401] Trial 1 finished with value: 65703.46575643125 and parameters: {'learning_rate': 0.010322524131531601, 'iterations': 550, 'depth': 4, 'l2_leaf_reg': 0.13604965556109078, 'bagging_temperature': 0.8450601831741675, 'random_strength': 2.9947872049124133, 'subsample': 0.5116132391621052}. Best is trial 0 with value: 63981.89939456359.
[I 2024-12-16 15:57:23,381] Trial 2 finished with value: 64598.97072977617 and parameters: {'learning_rate': 0.020125731321338408, 'iterations': 850, 'depth': 8, '

Best Parameters: {'learning_rate': 0.045379742417057445, 'iterations': 300, 'depth': 5, 'l2_leaf_reg': 0.15378156813395868, 'bagging_temperature': 0.5149098769949544, 'random_strength': 0.03651727037601661, 'subsample': 0.7611358720203436}
Best MAE: 63981.89939456359


In [14]:
# Show the hyperparameters of the best trial as the cell output.
study.best_params

{'learning_rate': 0.045379742417057445,
 'iterations': 300,
 'depth': 5,
 'l2_leaf_reg': 0.15378156813395868,
 'bagging_temperature': 0.5149098769949544,
 'random_strength': 0.03651727037601661,
 'subsample': 0.7611358720203436}

In [15]:
# Hard-coded copy of the best hyperparameters from the recorded search run.
# NOTE(review): the training cell rebinds `best_params` from `study.best_params`,
# so this literal only takes effect if that rebinding is skipped.
best_params = dict(
    learning_rate=0.045379742417057445,
    iterations=300,
    depth=5,
    l2_leaf_reg=0.15378156813395868,
    bagging_temperature=0.5149098769949544,
    random_strength=0.03651727037601661,
    subsample=0.7611358720203436,
)

## Training Model

In [16]:
# Tuned hyperparameters plus the same fixed runtime settings used during the search.
best_params = {
    **study.best_params,
    "thread_count": -1,
    "random_seed": 42,
    "verbose": 0,
}

# Refit once on the whole training split with the winning configuration.
final_pipeline = get_pipeline(best_params, numerical_features, categorical_features)
final_pipeline.fit(X_train, y_train)

# Evaluate on the held-out test split, which took no part in the search.
y_test_pred = final_pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
# MAPE divides by the true value.
# NOTE(review): assumes the target is never zero — confirm against the data.
mape = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100

print(
    "\nFinal Model Performance on Test Data",
    f"MAE: {mae:.4f}",
    f"RMSE: {rmse:.4f}",
    f"MAPE: {mape:.2f}%",
    sep="\n",
)


Final Model Performance on Test Data
MAE: 63406.1645
RMSE: 101590.3246
MAPE: 81.73%


In [17]:
# Persist the fitted pipeline (preprocessor + CatBoost model) to disk.
# The call's return value is what the cell displays.
joblib.dump(final_pipeline, '../model/inference_pipeline.pkl')

['../model/inference_pipeline.pkl']

In [18]:
# Reload the artifact that was just written. The result is not assigned to a
# name, so it is only shown as the cell output.
# NOTE(review): joblib.load unpickles the file — only load artifacts from a trusted source.
joblib.load('../model/inference_pipeline.pkl')