# Тюнинг CatBoost

In [1]:
# Тюнинг
import optuna as opt
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.metrics import f1_score

# Данные
import os
import pandas as pd

In [2]:
# Пути
ROOT = os.getcwd()
TRAIN_DATASET = os.path.join(ROOT, '../data/train_AIC.csv')
BALANCED_DATASET = os.path.join(ROOT, '../data/balanced_train.csv')
TEST_DATASET = os.path.join(ROOT, '../data/test_AIC.csv')

# Загрузка
train_df = pd.read_csv(TRAIN_DATASET)
test_df = pd.read_csv(TEST_DATASET)

X, y = train_df.iloc[:, :-1], train_df.iloc[:, -1]
X = X.drop([
    'Категорийный менеджер', 'Изменение позиции заказа на закупку: изменение даты поставки на бумаге',
    'Количество', 'Дней между 0_1', 'Дней между 1_2', 'Дней между 2_3', 'Дней между 3_4', 'Дней между 4_5', 
    'Дней между 5_6', 'Дней между 6_7', 'Дней между 7_8', 'Согласование заказа 1', 'Согласование заказа 2',
    'Согласование заказа 3', 'Изменение даты поставки 7', 'Изменение даты поставки 15', 'Изменение даты поставки 30'
    ], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [10]:
# Функция оптимизации
def objective(trial: opt.Trial):
    # Параметры
    learning_rate = trial.suggest_float('learning_rate', 0.1, 0.7, log=True)
    n_estimators = trial.suggest_int('n_estimators', 300, 1000, 50)
    max_depth = trial.suggest_int('max_depth', 4, 10)
    max_bin = trial.suggest_int('max_bin', 32, 128)
    l2_leaf_reg = trial.suggest_float('l2_reg', 0.1, 0.7, log=True)

    # Модель
    model = CatBoostClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        l2_leaf_reg=l2_leaf_reg,
        cat_features=[
            'Поставщик', 'Материал', 'Операционный менеджер',
            'Завод', 'Закупочная организация', 'Группа закупок', 'Балансовая единица',
            'ЕИ', 'Вариант поставки'
            ]
        )

    model.fit(X_train, y_train, verbose=False)
    accuracy = f1_score(y_test, model.predict(X_test), average='macro')

    # cv_score = cross_val_score(model, X_train, y_train, cv=StratifiedKFold(), scoring='f1_macro', n_jobs=-1)
    # accuracy = cv_score.mean()
    
    return accuracy

In [11]:
study = opt.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[I 2023-08-09 13:29:15,823] A new study created in memory with name: no-name-80220af9-5c57-4564-8a21-681c44adb7fb
[I 2023-08-09 13:29:34,028] Trial 0 finished with value: 0.7263418842597797 and parameters: {'learning_rate': 0.15512026275899193, 'n_estimators': 100, 'max_depth': 5, 'max_bin': 92, 'l2_reg': 0.14302786577478915}. Best is trial 0 with value: 0.7263418842597797.
[I 2023-08-09 13:29:53,359] Trial 1 finished with value: 0.7856322134760165 and parameters: {'learning_rate': 0.35584524256671124, 'n_estimators': 100, 'max_depth': 6, 'max_bin': 105, 'l2_reg': 0.2406634688016393}. Best is trial 1 with value: 0.7856322134760165.


In [39]:
model = CatBoostClassifier(
    learning_rate=0.3,
    n_estimators=600,
    max_depth=8,
    l2_leaf_reg=0.2,
    cat_features=[
        'Поставщик', 'Материал',
        'Завод', 'Закупочная организация', 'Группа закупок', 'Балансовая единица',
        'ЕИ', 'Вариант поставки'
        ]  
    )   

model.fit(X_train, y_train, plot=True)
accuracy = f1_score(y_test, model.predict(X_test), average='macro')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.5016694	total: 642ms	remaining: 6m 24s
1:	learn: 0.4054375	total: 1.26s	remaining: 6m 15s
2:	learn: 0.3666208	total: 1.82s	remaining: 6m 1s
3:	learn: 0.3418767	total: 2.51s	remaining: 6m 14s
4:	learn: 0.3246549	total: 3.13s	remaining: 6m 12s
5:	learn: 0.3121603	total: 3.79s	remaining: 6m 15s
6:	learn: 0.3066716	total: 4.36s	remaining: 6m 9s
7:	learn: 0.2990171	total: 4.98s	remaining: 6m 8s
8:	learn: 0.2941848	total: 5.57s	remaining: 6m 5s
9:	learn: 0.2862520	total: 6.13s	remaining: 6m 1s
10:	learn: 0.2825685	total: 6.79s	remaining: 6m 3s
11:	learn: 0.2774157	total: 7.42s	remaining: 6m 3s
12:	learn: 0.2743473	total: 8.04s	remaining: 6m 3s
13:	learn: 0.2701579	total: 8.7s	remaining: 6m 4s
14:	learn: 0.2681048	total: 9.4s	remaining: 6m 6s
15:	learn: 0.2659529	total: 10s	remaining: 6m 6s
16:	learn: 0.2645141	total: 10.7s	remaining: 6m 5s
17:	learn: 0.2616967	total: 11.2s	remaining: 6m 3s
18:	learn: 0.2579565	total: 11.9s	remaining: 6m 2s
19:	learn: 0.2556461	total: 12.4s	remain

In [40]:
f1_score(y_test, model.predict(X_test))

0.7747141896435776

In [46]:
cv_score = cross_val_score(model, X_test, y_test, cv=StratifiedKFold(), n_jobs=-1)

RuntimeError: Cannot clone object <catboost.core.CatBoostClassifier object at 0x0000022668E34650>, as the constructor either does not set or modifies parameter custom_metric