In [44]:
# Тюнинг
import optuna as opt
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.feature_selection import RFE

# Пайплайн
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator

# Данные
import os
import pandas as pd
import numpy as np
from category_encoders import BinaryEncoder
from sklearn.preprocessing import RobustScaler

In [45]:
# Пути
ROOT = os.getcwd()
TRAIN_DATASET = os.path.join(ROOT, '../data/train_AIC.csv')
BALANCED_DATASET = os.path.join(ROOT, '../data/balanced_train.csv')
TEST_DATASET = os.path.join(ROOT, '../data/test_AIC.csv')

# Загрузка
train_df = pd.read_csv(TRAIN_DATASET)
balanced_df = pd.read_csv(BALANCED_DATASET, index_col=0)
test_df = pd.read_csv(TEST_DATASET)

first_negatives = train_df[train_df['y'] == 0][:train_df[train_df['y'] == 1]['y'].count()]
train_df = pd.concat([train_df[train_df['y'] == 1], first_negatives])

X, y = train_df.iloc[:, :-1], train_df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

features_to_drop = [
    'Изменение позиции заказа на закупку: изменение даты поставки на бумаге',
    'Количество', 'Категорийный менеджер', 'Завод', 'Материал',
    'Отмена полного деблокирования заказа на закупку',
    ]

# features_to_keep = ['Поставщик', 'Длительность', 'Закупочная организация', 'Балансовая единица',
#                     'ЕИ', 'Сумма', 'Вариант поставки', 'Количество обработчиков 7', 
#                     'Количество обработчиков 15', 'Количество обработчиков 30']
# features_to_drop = X.columns[~X.columns.isin(features_to_keep)]

X_train = X_train.drop(features_to_drop, axis=1)
X_test = X_test.drop(features_to_drop, axis=1)
test_df = test_df.drop(features_to_drop, axis=1)

In [39]:
cat_features = [
    'Поставщик', 'Закупочная организация', 
    'Балансовая единица', 'ЕИ', 'Вариант поставки'
    ]

In [5]:
# Препроцессоры
class DataPreprocessor(BaseEstimator, TransformerMixin):
    """ Предобработчик данных """
    def __init__(self, cat_features, transform_train=True):
        self.transform_train = transform_train
        self.cat_features = cat_features

        self.bin_encoder = BinaryEncoder(cols=cat_features)
        self.robust_scaler = RobustScaler()

    def fit(self, X, y=None):
        # Создаём копию датасета
        X_ = X.copy()

        # Временные фичи
        # X_['day_sin'] = np.sin(np.pi * 2 * X_['День недели 2'] / 6)
        # X_['day_cos'] = np.cos(np.pi * 2 * X_['День недели 2'] / 6)

        # Категориальные фичи
        X_ = self.bin_encoder.fit_transform(X_)

        # Нумерация фич для LGBM (не принимает JSON символы)
        X_.columns = [num for num in range(0, len(X_.columns))]

        # Масштабирование
        # self.robust_scaler.fit(X_)

        return self
    
    def transform(self, X):
        # Создаём копию датасета
        X_ = X.copy()

        # Временные фичи
        # X_['day_sin'] = np.sin(np.pi * 2 * X_['День недели 2'] / 6)
        # X_['day_cos'] = np.cos(np.pi * 2 * X_['День недели 2'] / 6)

        # Категориальные фичи
        X_ = self.bin_encoder.transform(X_)

        # Нумерация фич для LGBM (не принимает JSON символы)
        X_.columns = [num for num in range(0, len(X_.columns))]

        # Масштабирование
        # X_ = self.robust_scaler.transform(X_)

        return X_


In [19]:
# data_preprocessor.fit(X_train)
# X_train_preproc = data_preprocessor.transform(X_train)
# X_test_preproc = data_preprocessor.transform(X_test)

In [6]:
# Функция оптимизации
def objective(trial: opt.Trial):
    # Параметры
    learning_rate = trial.suggest_float('learning_rate', 0.1, 1, log=True)
    n_estimators = trial.suggest_int('n_estimators', 700, 1500, 50)
    max_depth = trial.suggest_int('max_depth', 6, 16)
    max_bin = trial.suggest_int('max_bin', 64, 352),
    num_leaves = trial.suggest_int('num_leaves', 64, 256)
    reg_lambda = trial.suggest_float('l2_reg', 0.1, 1, log=True)

    # Модель
    data_preprocessor = DataPreprocessor(cat_features)
    model = LGBMClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        reg_lambda=reg_lambda,
        max_bin=max_bin,
        n_jobs=-1,
        force_col_wise=True
    )

    rfe = RFE(estimator=model)

    pipeline = Pipeline([
        ('data_preproc', data_preprocessor),
        ('model', model)
    ])
    
    cv_score = cross_val_score(pipeline, X_train, y_train, cv=StratifiedKFold(n_splits=5), scoring='f1_macro', n_jobs=-1)
    accuracy = cv_score.mean()

    return accuracy

In [None]:
study = opt.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

In [51]:
best_params = {
    'learning_rate': 0.3, 
    'n_estimators': 1000, 
    'max_depth': 8, 
    'max_bin': 128, 
    'num_leaves': 128, 
    'reg_lambda': 0.3
    }

# Модель
data_preprocessor = DataPreprocessor(cat_features)
scaler = RobustScaler()
model = LGBMClassifier(
    **best_params,
    n_jobs=-1,
    force_col_wise=True,
    subsample=0.8
)

pipeline = Pipeline([
    ('data_preproc', data_preprocessor),
    ('scaler', scaler),
    ('model', model)
])

In [52]:
pipeline.fit(X_train, y_train)
f1_score(y_test, pipeline.predict(X_test), average='macro')

[LightGBM] [Info] Number of positive: 29093, number of negative: 29113
[LightGBM] [Info] Total Bins 2050
[LightGBM] [Info] Number of data points in the train set: 58206, number of used features: 61
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499828 -> initscore=-0.000687
[LightGBM] [Info] Start training from score -0.000687


0.9058043273202369

In [42]:
preds = pipeline.predict(test_df)
submit_df = pd.DataFrame({'id': test_df.index, 'value': preds})

In [10]:
submit_df.to_csv('submission.csv', index=False)

In [29]:
# Модель
X_ = X_train.copy()
X_.columns = [num for num in range(0, len(X_train.columns))]

model = LGBMClassifier(
    **best_params,
    n_jobs=-1,
    force_col_wise=True,
)

model.fit(X_train, y_train)

X_

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
81750,1,3,1,16,1,1,13,1,0.0,12,...,1.0,15,3.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0
209701,668,1,13,119,11,4,12,2,0.0,0,...,1.0,9,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
111802,99,14,1,111,1,1,7,1,1.0,113,...,1.0,10,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0
38660,296,8,1,19,1,1,4,1,0.0,109,...,1.0,0,12.0,2.0,3.0,0.0,0.0,1.0,96.0,96.0
167262,24,3,1,16,1,1,66,1,1.0,29,...,1.0,9,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,308,1,17,166,16,1,14,2,1.0,80,...,0.0,6,143.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
12678,617,15,6,27,3,2,35,2,0.0,92,...,1.0,15,0.0,1.0,1.0,0.0,0.0,-1.0,-1.0,-1.0
131932,250,5,1,85,1,1,1,1,1.0,128,...,1.0,16,0.0,0.0,0.0,1.0,0.0,-1.0,-1.0,-1.0
146867,38,2,1,5,1,1,5,2,0.0,49,...,3.0,41,1.0,0.0,5.0,0.0,0.0,-1.0,-1.0,-1.0


In [26]:
pipeline['model'].feature_importances_

array([  349,  1503,  1602,  1617,  1668,  1638,  1518,  1632,  1716,
        1733,  1666,  1553,     9,  1045,  1113,  1336,  1177,  1169,
         399,   867,   876,   994,   749,   374,  1339,  1376,  1387,
        1279,  1335,  1406,  1451,  1309,     8,   980,   900,  1411,
         847,    68,   448,  1116,   914,  1114, 31865,   942,   960,
        1579, 30281, 32986, 11650, 17219, 11487, 15197, 67988, 29789,
       13691,  7344,  7617,  4627,  2542,  2221,  8625,  1463,  1470,
        1740,  4619,  1290, 32079, 27052,  7201,  8610,  5700,  2146,
        3665,  1699,  9989])