# Тюнинг CatBoost

In [1]:
# Тюнинг
import optuna as opt
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Данные
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

In [2]:
# Пути
ROOT = os.getcwd()
TRAIN_DATASET = os.path.join(ROOT, '../data/train_AIC.csv')
BALANCED_DATASET = os.path.join(ROOT, '../data/balanced_train.csv')
TEST_DATASET = os.path.join(ROOT, '../data/test_AIC.csv')

# Загрузка
train_df = pd.read_csv(TRAIN_DATASET)
balanced_df = pd.read_csv(BALANCED_DATASET, index_col=0)
test_df = pd.read_csv(TEST_DATASET)

first_negatives = train_df[train_df['y'] == 0][:train_df[train_df['y'] == 1]['y'].count()]
train_df = pd.concat([train_df[train_df['y'] == 1], first_negatives])

X, y = train_df.iloc[:, :-1], train_df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

FEATURES_TO_DROP = [
    'Изменение позиции заказа на закупку: изменение даты поставки на бумаге',
    'Количество', 'Категорийный менеджер', 'Завод', 'Материал',
    'Отмена полного деблокирования заказа на закупку', 'Количество циклов согласования'
    ]

# FEATURES_TO_KEEP = ['Поставщик', 'Длительность', 'Закупочная организация', 'Балансовая единица',
#                     'ЕИ', 'Сумма', 'Вариант поставки', 'НРП', 'Вариант поставки',
#                     'Месяц1', 'Месяц2', 'Месяц3', 'День недели 2']
# FEATURES_TO_DROP = X.columns[~X.columns.isin(FEATURES_TO_KEEP)]

X_train = X_train.drop(FEATURES_TO_DROP, axis=1)
X_test = X_test.drop(FEATURES_TO_DROP, axis=1)
test_df = test_df.drop(FEATURES_TO_DROP, axis=1)

In [3]:
# Препроцессоры
class DataPreprocessor(BaseEstimator, TransformerMixin):
    """ Предобработчик данных """
    def __init__(self, transform_train=True):
        self.transform_train = transform_train

    def fit(self, X, y=None):
        # Создаём копию датасета
        X_ = X.copy()

        # Временные фичи
        # X_['day_sin'] = np.sin(np.pi * 2 * X_['День недели 2'] / 6)
        # X_['day_cos'] = np.cos(np.pi * 2 * X_['День недели 2'] / 6)

        return self
    
    def transform(self, X):
        # Создаём копию датасета
        X_ = X.copy()

        # Временные фичи
        # X_['day_sin'] = np.sin(np.pi * 2 * X_['День недели 2'] / 6)
        # X_['day_cos'] = np.cos(np.pi * 2 * X_['День недели 2'] / 6)

        return X_


In [3]:
# Функция оптимизации
def objective(trial: opt.Trial):
    # Параметры
    learning_rate = trial.suggest_float('learning_rate', 0.1, 0.7, log=True)
    n_estimators = trial.suggest_int('n_estimators', 300, 1000, 50)
    max_depth = trial.suggest_int('max_depth', 4, 10)
    max_bin = trial.suggest_int('max_bin', 32, 128)
    l2_leaf_reg = trial.suggest_float('l2_reg', 0.1, 0.7, log=True)

    # Модель
    model = CatBoostClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        l2_leaf_reg=l2_leaf_reg,
        cat_features=[
            'Поставщик', 'Материал', 'Операционный менеджер',
            'Завод', 'Закупочная организация', 'Группа закупок', 'Балансовая единица',
            'ЕИ', 'Вариант поставки'
            ]
        )

    model.fit(X_train, y_train, verbose=False)
    accuracy = f1_score(y_test, model.predict(X_test), average='macro')

    # cv_score = cross_val_score(model, X_train, y_train, cv=StratifiedKFold(), scoring='f1_macro', n_jobs=-1)
    # accuracy = cv_score.mean()
    
    return accuracy

In [11]:
study = opt.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[I 2023-08-09 13:29:15,823] A new study created in memory with name: no-name-80220af9-5c57-4564-8a21-681c44adb7fb
[I 2023-08-09 13:29:34,028] Trial 0 finished with value: 0.7263418842597797 and parameters: {'learning_rate': 0.15512026275899193, 'n_estimators': 100, 'max_depth': 5, 'max_bin': 92, 'l2_reg': 0.14302786577478915}. Best is trial 0 with value: 0.7263418842597797.
[I 2023-08-09 13:29:53,359] Trial 1 finished with value: 0.7856322134760165 and parameters: {'learning_rate': 0.35584524256671124, 'n_estimators': 100, 'max_depth': 6, 'max_bin': 105, 'l2_reg': 0.2406634688016393}. Best is trial 1 with value: 0.7856322134760165.


In [4]:
cat_features=[
    'Поставщик', 'Операционный менеджер', 'Закупочная организация', 
    'Группа закупок', 'Балансовая единица','ЕИ', 'Вариант поставки'
]

data_preproc = DataPreprocessor()
model = CatBoostClassifier(
    learning_rate=0.3,
    n_estimators=1000,
    max_depth=10,
    l2_leaf_reg=0.4,
    cat_features=cat_features
    )   

pipeline = Pipeline([
    ('data_preproc', data_preproc),
    ('model', model)
])

pipeline.fit(X_train, y_train)
accuracy = f1_score(y_test, pipeline.predict(X_test), average='macro')

NameError: name 'DataPreprocessor' is not defined

In [20]:
f1_score(y_test, pipeline.predict(X_test))

0.981243075982733

In [21]:
preds = pipeline.predict(test_df)
submit_df = pd.DataFrame({'id': test_df.index, 'value': preds})

In [22]:
submit_df.to_csv('submission-cat.csv', index=False)

In [None]:
cv_score = cross_val_score(model, X_test, y_test, cv=StratifiedKFold(), n_jobs=-1)

In [42]:
test_df[cat_features]

Unnamed: 0,Поставщик,Материал,Операционный менеджер,Завод,Закупочная организация,Группа закупок,Балансовая единица,ЕИ,Вариант поставки
0,273,269,8,1,1,64,1,9,1
1,499,27439,10,18,16,1,14,1,2
2,86,27439,2,1,1,172,1,1,2
3,97,4064,4,7,1,22,6,3,1
4,117,27439,3,1,1,2,1,1,1
...,...,...,...,...,...,...,...,...,...
24995,255,6409,6,3,1,10,3,1,2
24996,1,5827,1,7,1,46,6,1,1
24997,8,6504,5,3,1,17,3,1,2
24998,18,1309,17,1,1,44,1,1,2


In [47]:
test_df[cat_features] = test_df[cat_features]

TypeError: arg must be a list, tuple, 1-d array, or Series

In [43]:
submission_cat = pd.DataFrame({'id': test_df.index, 'value': pipeline.predict(test_df)})
submission_cat.to_csv('cat_submission.csv', index=False)

CatBoostError: Invalid type for cat_feature[non-default value idx=0,feature_idx=11]=1.0 : cat_features must be integer or string, real number values and NaN values should be converted to string.