# Тюнинг CatBoost

In [29]:
# Тюнинг
import optuna as opt
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Данные
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

In [16]:
# Пути
ROOT = os.getcwd()
TRAIN_DATASET = os.path.join(ROOT, '../data/train_AIC.csv')
BALANCED_DATASET = os.path.join(ROOT, '../data/balanced_train.csv')
TEST_DATASET = os.path.join(ROOT, '../data/test_AIC.csv')

# Загрузка
train_df = pd.read_csv(TRAIN_DATASET)
balanced_df = pd.read_csv(BALANCED_DATASET)
test_df = pd.read_csv(TEST_DATASET)

X, y = balanced_df.iloc[:, :-1], balanced_df.iloc[:, -1]
# X = X.drop([
#     'Категорийный менеджер', 'Изменение позиции заказа на закупку: изменение даты поставки на бумаге',
#     'Количество', 'Дней между 0_1', 'Дней между 1_2', 'Дней между 2_3', 'Дней между 3_4', 'Дней между 4_5', 
#     'Дней между 5_6', 'Дней между 6_7', 'Дней между 7_8', 'Согласование заказа 1', 'Согласование заказа 2',
#     'Согласование заказа 3', 'Изменение даты поставки 7', 'Изменение даты поставки 15', 'Изменение даты поставки 30'
#     ], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [31]:
# Препроцессоры
class DataPreprocessor(BaseEstimator, TransformerMixin):
    """ Предобработчик данных """
    def __init__(self, transform_train=True):
        self.transform_train = transform_train

    def fit(self, X, y=None):
        # Создаём копию датасета
        X_ = X.copy()

        # Временные фичи
        X_['day_sin'] = np.sin(np.pi * 2 * X_['День недели 2'] / 6)
        X_['day_cos'] = np.cos(np.pi * 2 * X_['День недели 2'] / 6)

        return self
    
    def transform(self, X):
        # Создаём копию датасета
        X_ = X.copy()

        # Временные фичи
        X_['day_sin'] = np.sin(np.pi * 2 * X_['День недели 2'] / 6)
        X_['day_cos'] = np.cos(np.pi * 2 * X_['День недели 2'] / 6)

        return X_


In [32]:
# Функция оптимизации
def objective(trial: opt.Trial):
    # Параметры
    learning_rate = trial.suggest_float('learning_rate', 0.1, 0.7, log=True)
    n_estimators = trial.suggest_int('n_estimators', 300, 1000, 50)
    max_depth = trial.suggest_int('max_depth', 4, 10)
    max_bin = trial.suggest_int('max_bin', 32, 128)
    l2_leaf_reg = trial.suggest_float('l2_reg', 0.1, 0.7, log=True)

    # Модель
    model = CatBoostClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        l2_leaf_reg=l2_leaf_reg,
        cat_features=[
            'Поставщик', 'Материал', 'Операционный менеджер',
            'Завод', 'Закупочная организация', 'Группа закупок', 'Балансовая единица',
            'ЕИ', 'Вариант поставки'
            ]
        )

    model.fit(X_train, y_train, verbose=False)
    accuracy = f1_score(y_test, model.predict(X_test), average='macro')

    # cv_score = cross_val_score(model, X_train, y_train, cv=StratifiedKFold(), scoring='f1_macro', n_jobs=-1)
    # accuracy = cv_score.mean()
    
    return accuracy

In [11]:
study = opt.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[I 2023-08-09 13:29:15,823] A new study created in memory with name: no-name-80220af9-5c57-4564-8a21-681c44adb7fb
[I 2023-08-09 13:29:34,028] Trial 0 finished with value: 0.7263418842597797 and parameters: {'learning_rate': 0.15512026275899193, 'n_estimators': 100, 'max_depth': 5, 'max_bin': 92, 'l2_reg': 0.14302786577478915}. Best is trial 0 with value: 0.7263418842597797.
[I 2023-08-09 13:29:53,359] Trial 1 finished with value: 0.7856322134760165 and parameters: {'learning_rate': 0.35584524256671124, 'n_estimators': 100, 'max_depth': 6, 'max_bin': 105, 'l2_reg': 0.2406634688016393}. Best is trial 1 with value: 0.7856322134760165.


In [36]:
cat_features=[
    'Поставщик', 'Материал', 'Операционный менеджер',
    'Завод', 'Закупочная организация', 'Группа закупок', 'Балансовая единица',
    'ЕИ', 'Вариант поставки'
]

data_preproc = DataPreprocessor()
model = CatBoostClassifier(
    learning_rate=0.25,
    n_estimators=1000,
    max_depth=6,
    l2_leaf_reg=0.1543,
    cat_features=cat_features,
    )   

pipeline = Pipeline([
    ('data_preproc', data_preproc),
    ('model', model)
])

pipeline.fit(X_train, y_train)
accuracy = f1_score(y_test, pipeline.predict(X_test), average='macro')

0:	learn: 0.4115130	total: 694ms	remaining: 11m 33s
1:	learn: 0.3554211	total: 1.29s	remaining: 10m 44s
2:	learn: 0.3229705	total: 2.02s	remaining: 11m 12s
3:	learn: 0.2934138	total: 2.88s	remaining: 11m 56s
4:	learn: 0.2808247	total: 3.53s	remaining: 11m 42s
5:	learn: 0.2719424	total: 4.1s	remaining: 11m 19s
6:	learn: 0.2630356	total: 4.75s	remaining: 11m 14s
7:	learn: 0.2538397	total: 5.6s	remaining: 11m 34s
8:	learn: 0.2484951	total: 6.38s	remaining: 11m 42s
9:	learn: 0.2404192	total: 7.09s	remaining: 11m 41s
10:	learn: 0.2373927	total: 7.83s	remaining: 11m 43s
11:	learn: 0.2297121	total: 8.54s	remaining: 11m 43s
12:	learn: 0.2251291	total: 9.11s	remaining: 11m 31s
13:	learn: 0.2213071	total: 9.77s	remaining: 11m 28s
14:	learn: 0.2182658	total: 10.3s	remaining: 11m 19s
15:	learn: 0.2156130	total: 11s	remaining: 11m 18s
16:	learn: 0.2133683	total: 11.7s	remaining: 11m 15s
17:	learn: 0.2108374	total: 12.4s	remaining: 11m 17s
18:	learn: 0.2079149	total: 13.1s	remaining: 11m 16s
19:	lea

In [37]:
f1_score(y_test, pipeline.predict(X_test))

0.9863013698630138

In [None]:
cv_score = cross_val_score(model, X_test, y_test, cv=StratifiedKFold(), n_jobs=-1)

In [42]:
test_df[cat_features]

Unnamed: 0,Поставщик,Материал,Операционный менеджер,Завод,Закупочная организация,Группа закупок,Балансовая единица,ЕИ,Вариант поставки
0,273,269,8,1,1,64,1,9,1
1,499,27439,10,18,16,1,14,1,2
2,86,27439,2,1,1,172,1,1,2
3,97,4064,4,7,1,22,6,3,1
4,117,27439,3,1,1,2,1,1,1
...,...,...,...,...,...,...,...,...,...
24995,255,6409,6,3,1,10,3,1,2
24996,1,5827,1,7,1,46,6,1,1
24997,8,6504,5,3,1,17,3,1,2
24998,18,1309,17,1,1,44,1,1,2


In [47]:
test_df[cat_features] = test_df[cat_features]

TypeError: arg must be a list, tuple, 1-d array, or Series

In [43]:
submission_cat = pd.DataFrame({'id': test_df.index, 'value': pipeline.predict(test_df)})
submission_cat.to_csv('cat_submission.csv', index=False)

CatBoostError: Invalid type for cat_feature[non-default value idx=0,feature_idx=11]=1.0 : cat_features must be integer or string, real number values and NaN values should be converted to string.