In [None]:
import optuna
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

# --- 1. DATA PREPARATION ---

# Let's assume 'df' is your DataFrame and 'loan_paid_back' is the target
# X = df.drop('loan_paid_back', axis=1)
# y = df['loan_paid_back']

# One seed for every source of randomness (splits, CatBoost, Optuna sampler)
# so that a re-run of this cell reproduces the same study.
RANDOM_SEED = 42

# Identify your categorical features by NAME
# Note: combine your Nominal and Ordinal lists for CatBoost
cat_features_names = ORDINAL_COLS + NOMINAL_COLS

# Three-way split (60% train / 20% validation / 20% test).
#   * train      -> fits the trees
#   * validation -> early stopping + the score Optuna maximizes
#   * test       -> touched exactly once, at the very end, for an honest estimate
# Using the test set for early stopping or as the tuning objective leaks it
# into model selection and makes the reported score optimistically biased.
# Both splits are stratified so every subset keeps the target's class balance.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.25,  # 0.25 of the remaining 80% == 20% of the full data
    random_state=RANDOM_SEED,
    stratify=y_trainval,
)

# Create CatBoost Pools (Optimized internal data structure)
# This is much faster for Optuna to reuse than passing raw DataFrames every time
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features_names)
valid_pool = Pool(data=X_valid, label=y_valid, cat_features=cat_features_names)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features_names)

# Parameters that are NOT tuned. Shared by every trial and by the final model
# so the two can never drift apart.
SHARED_PARAMS = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',        # Or 'Accuracy'
    'task_type': 'CPU',          # Change to 'GPU' if available
    'border_count': 254,         # Max splits for numerical features (default 254)
    'random_seed': RANDOM_SEED,  # Deterministic training for a given param set
}

# --- 2. DEFINE THE OBJECTIVE FUNCTION ---

def objective(trial):
    """
    Train one CatBoost model with trial-suggested hyperparameters.

    Optuna calls this function once per trial. The model is fitted on
    ``train_pool`` with early stopping monitored on ``valid_pool``; the
    held-out test set is never used here.

    Args:
        trial: The ``optuna`` trial object used to sample hyperparameters.

    Returns:
        ROC AUC of the best-iteration model on the validation split
        (higher is better; the study is created with direction='maximize').
    """

    # A. Define the search space for hyperparameters
    param = {
        **SHARED_PARAMS,
        'iterations': 1000,                        # Fixed high number, use early stopping
        'early_stopping_rounds': 50,
        'verbose': False,                          # Keep output clean
        # Learning rate spans more than an order of magnitude, so sample it
        # on a log scale instead of uniformly.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 4, 10), # CatBoost prefers 6-10
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 1, 10),
    }

    # B. Train the model with these parameters
    model = CatBoostClassifier(**param)
    model.fit(
        train_pool,
        eval_set=valid_pool,
        use_best_model=True
    )

    # C. Return the metric to optimize
    # Since we set use_best_model=True, prediction uses the best iteration.
    # Predict on the Pool (not the raw DataFrame) to reuse the prepared data.
    preds_proba = model.predict_proba(valid_pool)[:, 1]
    return roc_auc_score(y_valid, preds_proba)

# --- 3. RUN THE OPTIMIZATION ---

print("Starting Optuna Optimization...")
study = optuna.create_study(
    direction='maximize',  # We want to maximize AUC
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),  # Reproducible search
)
study.optimize(objective, n_trials=30) # Run 30 different experiments

print("-" * 50)
print("BEST PARAMETERS FOUND:")
print(study.best_params)
# NOTE: this is the AUC on the validation split, not on the test set.
print(f"BEST AUC: {study.best_value}")

# --- 4. TRAIN FINAL MODEL ---

# Take the best parameters found by Optuna and add the fixed parameters back
# in (they are not part of study.best_params).
best_params = {
    **SHARED_PARAMS,
    **study.best_params,
    'iterations': 2000,            # Increase iterations for final robust model
    'early_stopping_rounds': 100,
}

print("\nTraining Final Model with Best Parameters...")
final_model = CatBoostClassifier(**best_params)
final_model.fit(
    train_pool,
    eval_set=valid_pool,  # Early stopping still uses validation, never test
    use_best_model=True,
    plot=True # Nice graph in Jupyter
)

# --- 5. FINAL, UNBIASED EVALUATION ---

# First and only use of the test set: it influenced neither the
# hyperparameters nor the stopping iteration.
test_auc = roc_auc_score(y_test, final_model.predict_proba(test_pool)[:, 1])
print(f"HELD-OUT TEST AUC: {test_auc}")