In [27]:
# Тюнинг
import optuna as opt
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.feature_selection import RFE

# Пайплайн
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator

# Данные
import os
import pandas as pd
import numpy as np
from category_encoders import BinaryEncoder, OneHotEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.gaussian_process import GaussianProcessClassifier

In [2]:
# Пути
ROOT = os.getcwd()
TRAIN_DATASET = os.path.join(ROOT, '../data/train_AIC.csv')
BALANCED_DATASET = os.path.join(ROOT, '../data/balanced_train.csv')
TEST_DATASET = os.path.join(ROOT, '../data/test_AIC.csv')
SUBMISSION_PATH = os.path.join(ROOT, '../submissions/')

def save_submission(model, subname):
    subname = os.path.join(SUBMISSION_PATH, f'{subname}.csv')
    preds = model.predict(test_df)
    submit_df = pd.DataFrame({'id': test_df.index, 'value': preds})
    submit_df.to_csv(subname, index=False)

# Загрузка
train_df = pd.read_csv(TRAIN_DATASET)
# balanced_df = pd.read_csv(BALANCED_DATASET, index_col=0)
test_df = pd.read_csv(TEST_DATASET)

first_negatives = train_df[train_df['y'] == 0][:train_df[train_df['y'] == 1]['y'].count()]
train_df = pd.concat([train_df[train_df['y'] == 1], first_negatives])

def random_undersample(df):
    neg_count, pos_count = np.bincount(df['y'])
    pos_df = df[df['y'] == 1]
    neg_df = df[df['y'] == 0]
    neg_df = neg_df.sample(n=pos_count, random_state=1708)
    return pd.concat([pos_df, neg_df])

balanced_df = random_undersample(train_df)

X, y = balanced_df.iloc[:, :-1], balanced_df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [24]:
cat_features = [
    'Provider'
    ]

scale_features = [
    'Sum', 'Position Count', 'Duration', 'ETC Delivery',
    'Changes After Approvals', 'Order Approval 1', 'Order Approval 2',
    'Order Approval 3'
]

drop_features = [
    'Change on Paper', 'Amount', 'Category Manager', 'Factory', 
    'Material', 'Cancel Complete Release', 'Approval Cycles',
    'Change Delivery Date 15', 'Change Delivery Date 30', 
    'Company Code', 'EI', 'Delivery Option', 'Days Between 4_5',
    'Days Between 5_6', 'Days Between 6_7', 'Purchasing Organization',
    'Purchasing Group', 'Material Group', 'Days Between 1_2', 
    'Days Between 2_3', 'Days Between 3_4', 'Days Between 7_8',
    'Delivery Date', 'Days Between 0_1', 'NRP', 'Operations Manager'
    ]

In [4]:
# Препроцессоры
class DataPreprocessor(BaseEstimator, TransformerMixin):
    """ Предобработчик данных """
    def __init__(self, cat_features, scale_features,
                 drop_features, transform_train=True):
        self.transform_train = transform_train
        self.cat_features = cat_features

        self.bin_encoder = BinaryEncoder(cols=cat_features)
        self.onehot_encoder = OneHotEncoder(cols=['Поставщик'])
        self.robust_scaler = RobustScaler()
        self.rename_cols = (
            'Provider', 'Material', 'Category Manager', 'Operations Manager',
            'Factory', 'Purchasing Organization', 'Purchasing Group', 
            'Company Code', 'EI', 'Material Group', 'Delivery Option', 'NRP',
            'Duration', 'ETC Delivery', 'Month1', 'Month2', 'Month3', 'Weekday',
            'Sum', 'Position Count', 'Amount', 'Handlers 7', 'Handlers 15', 
            'Handlers 30', 'Order Approval 1', 'Order Approval 2', 'Order Approval 3',
            'Change Delivery Date 7', 'Change Delivery Date 15', 'Change Delivery Date 30',
            'Cancel Complete Release', 'Change on Paper', 'Delivery Date', 
            'Approval Cycles', 'Changes After Approvals', 'Days Between 0_1', 
            'Days Between 1_2', 'Days Between 2_3', 'Days Between 3_4', 'Days Between 4_5', 
            'Days Between 5_6', 'Days Between 6_7', 'Days Between 7_8'
            )

        self.drop_features = drop_features
        self.scale_features = scale_features

    def fit(self, X, y=None):
        # Создаём копию датасета
        X_ = X.copy()
        X_.columns = self.rename_cols
        X_ = X_.drop(self.drop_features, axis=1)

        # Временные фичи
        X_['Weekday'] += 1
        X_['day_sin'] = np.sin(np.pi * 2 * X_['Weekday'] / 7)
        X_['day_cos'] = np.cos(np.pi * 2 * X_['Weekday'] / 7)
        X_['month1_sin'] = np.sin(np.pi * 2 * X_['Month1'] / 12)
        X_['month1_cos'] = np.cos(np.pi * 2 * X_['Month1'] / 12)
        X_['month2_sin'] = np.sin(np.pi * 2 * X_['Month2'] / 12)
        X_['month2_cos'] = np.cos(np.pi * 2 * X_['Month2'] / 12)
        X_['month3_sin'] = np.sin(np.pi * 2 * X_['Month3'] / 12)
        X_['month3_cos'] = np.cos(np.pi * 2 * X_['Month3'] / 12)

        # Категориальные фичи
        X_ = self.bin_encoder.fit_transform(X_)

        features_to_drop = ['Weekday', 'Month1', 'Month2', 'Month3']
        X_ = X_.drop(features_to_drop, axis=1)

        # Масштабирование
        self.robust_scaler.fit(X_[self.scale_features])

        return self
    
    def transform(self, X):
        # Создаём копию датасета
        X_ = X.copy()
        X_.columns = self.rename_cols
        X_ = X_.drop(self.drop_features, axis=1)

        # Временные фичи
        X_['day_sin'] = np.sin(np.pi * 2 * X_['Weekday'] / 7)
        X_['day_cos'] = np.cos(np.pi * 2 * X_['Weekday'] / 7)
        X_['month1_sin'] = np.sin(np.pi * 2 * X_['Month1'] / 12)
        X_['month1_cos'] = np.cos(np.pi * 2 * X_['Month1'] / 12)
        X_['month2_sin'] = np.sin(np.pi * 2 * X_['Month2'] / 12)
        X_['month2_cos'] = np.cos(np.pi * 2 * X_['Month2'] / 12)
        X_['month3_sin'] = np.sin(np.pi * 2 * X_['Month3'] / 12)
        X_['month3_cos'] = np.cos(np.pi * 2 * X_['Month3'] / 12)

        # Категориальные фичи
        X_ = self.bin_encoder.transform(X_)

        features_to_drop = ['Weekday', 'Month1', 'Month2', 'Month3']

        X_ = X_.drop(features_to_drop, axis=1)

        # Масштабирование
        X_[self.scale_features] = self.robust_scaler.transform(X_[self.scale_features])

        return X_


In [None]:
# data_preprocessor.fit(X_train)
# X_train_preproc = data_preprocessor.transform(X_train)
# X_test_preproc = data_preprocessor.transform(X_test)

In [10]:
# Функция оптимизации
def objective(trial: opt.Trial):
    # Параметры
    learning_rate = trial.suggest_float('learning_rate', 0.1, 1, log=True)
    n_estimators = trial.suggest_int('n_estimators', 700, 1500, 50)
    max_depth = trial.suggest_int('max_depth', 6, 16)
    max_bin = trial.suggest_int('max_bin', 64, 352),
    num_leaves = trial.suggest_int('num_leaves', 64, 256)
    reg_lambda = trial.suggest_float('l2_reg', 0.1, 1, log=True)

    # Модель
    data_preprocessor = DataPreprocessor(cat_features)
    model = LGBMClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        reg_lambda=reg_lambda,
        max_bin=max_bin,
        n_jobs=-1,
        force_col_wise=True
    )

    rfe = RFE(estimator=model)

    pipeline = Pipeline([
        ('data_preproc', data_preprocessor),
        ('model', model)
    ])
    
    cv_score = cross_val_score(pipeline, X_train, y_train, cv=StratifiedKFold(n_splits=5), scoring='f1_macro', n_jobs=-1)
    accuracy = cv_score.mean()

    return accuracy

In [None]:
study = opt.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

In [28]:
best_params = {
    'learning_rate': 0.3, 
    'n_estimators': 1000, 
    'max_depth': 12, 
    'max_bin': 128, 
    'num_leaves': 128, 
    'reg_lambda': 0.2,
    }

# Модель
data_preprocessor = DataPreprocessor(cat_features, scale_features, drop_features)
# model = LGBMClassifier(
#     **best_params,
#     n_jobs=-1,
#     force_col_wise=True,
#     is_unbalance=True
# )

model = GaussianProcessClassifier()

pipeline = Pipeline([
    ('data_preproc', data_preprocessor),
    ('model', model)
])

In [29]:
pipeline.fit(X_train, y_train)
f1_score(y_test, pipeline.predict(X_test), average='macro')

MemoryError: Unable to allocate 12.6 GiB for an array with shape (1693940115,) and data type float64

In [17]:
feature_importances = pd.DataFrame({'Feature': pipeline['model'].feature_name_, 'Importance': pipeline['model'].feature_importances_})
feature_importances.to_csv('features.csv', index=False)

In [50]:
save_submission(pipeline, 'submission_LGBM_ENS')