In [1]:
# Тюнинг
import optuna as opt
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.feature_selection import RFE

# Пайплайн
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator

# Данные
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from category_encoders import BinaryEncoder, OneHotEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler, SplineTransformer, PowerTransformer, PolynomialFeatures, QuantileTransformer
from sklearn.gaussian_process import GaussianProcessClassifier

%matplotlib inline

In [2]:
# Пути
ROOT = os.getcwd()
TRAIN_DATASET = os.path.join(ROOT, '../data/train_AIC.csv')
BALANCED_DATASET = os.path.join(ROOT, '../data/balanced_train.csv')
TEST_DATASET = os.path.join(ROOT, '../data/test_AIC.csv')
SUBMISSION_PATH = os.path.join(ROOT, '../submissions/')

def save_submission(model, subname):
    subname = os.path.join(SUBMISSION_PATH, f'{subname}.csv')
    preds = model.predict(test_df)
    submit_df = pd.DataFrame({'id': test_df.index, 'value': preds})
    submit_df.to_csv(subname, index=False)

# Загрузка
train_df = pd.read_csv(TRAIN_DATASET)
train_df = train_df.drop_duplicates()
# balanced_df = pd.read_csv(BALANCED_DATASET, index_col=0)
# train_df = balanced_df
test_df = pd.read_csv(TEST_DATASET)

# first_negatives = train_df[train_df['y'] == 0][:train_df[train_df['y'] == 1]['y'].count()]
# train_df = pd.concat([train_df[train_df['y'] == 1], first_negatives])

# def random_undersample(df):
#     neg_count, pos_count = np.bincount(df['y'])
#     pos_df = df[df['y'] == 1]
#     neg_df = df[df['y'] == 0]
#     neg_df = neg_df.sample(n=pos_count, random_state=1708)
#     return pd.concat([pos_df, neg_df])

# balanced_df = random_undersample(train_df)

# Нормализация
# train_df = train_df[train_df['Длительность'] < 500]
# train_df = train_df[(train_df['Сумма'] > 2) & (train_df['Сумма'] < 10)]
# train_df = train_df[train_df['До поставки'] < 300]
# train_df = train_df[train_df['Дней между 0_1'] < 400]

X, y = train_df.iloc[:, :-1], train_df.iloc[:, -1]   
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

X_train = pd.concat([X_train, y_train], axis=1)

# X_train = X_train[(X_train['Сумма'] > 2) & (X_train['Сумма'] < 10)]
# X_train = X_train[X_train['Длительность'] < 400]

y_train = X_train['y']
X_train = X_train.drop('y', axis=1)

In [3]:
# Категориальные фичи
cat_features = [
    'Purchasing Organization', 'Delivery Option', 
    'Company Code', 'Provider', 'Provider Delivery option', 
    'Provider Purchaser', 'Operations Manager', 'Sum Fold',
    'Month1', 'Month2', 'Month3', 'Weekday', 'EI'
    ]

# Фичи на масштабирование
scale_features = [
    'Position Count', 'Duration', 'ETC Delivery',
    'Changes After Approvals', 'Order Approval 1', 'Order Approval 2',
    'Order Approval 3', 'Sum', 'Change Delivery Date 7', 'Change Delivery Date 15', 
    'Change Delivery Date 30', 'Approval Cycles', 'Handlers 7', 'Handlers 15', 
    'Handlers 30',
    'Days Between 0_1', 'Days Between 1_2', 'Days Between 2_3', 
    'Days Between 3_4', 'Days Between 4_5', 
    'Days Between 5_6', 'Days Between 6_7', 'Days Between 7_8'
]

# Фичи на дроп
drop_features = [
    'Material', 'Cancel Complete Release', 'Amount',
    'Material Group', 'Delivery Date', 'Change on Paper', 
    'Category Manager', 'NRP'
    ]

# Новые имена фич
rename_cols = [
    'Provider', 'Material', 'Category Manager', 'Operations Manager',
    'Factory', 'Purchasing Organization', 'Purchasing Group', 
    'Company Code', 'EI', 'Material Group', 'Delivery Option', 'NRP',
    'Duration', 'ETC Delivery', 'Month1', 'Month2', 'Month3', 'Weekday',
    'Sum', 'Position Count', 'Amount', 'Handlers 7', 'Handlers 15', 
    'Handlers 30', 'Order Approval 1', 'Order Approval 2', 'Order Approval 3',
    'Change Delivery Date 7', 'Change Delivery Date 15', 'Change Delivery Date 30',
    'Cancel Complete Release', 'Change on Paper', 'Delivery Date', 
    'Approval Cycles', 'Changes After Approvals', 'Days Between 0_1', 
    'Days Between 1_2', 'Days Between 2_3', 'Days Between 3_4', 'Days Between 4_5', 
    'Days Between 5_6', 'Days Between 6_7', 'Days Between 7_8'
    ]

In [4]:
# Препроцессоры
class DataPreprocessor(BaseEstimator, TransformerMixin):
    """ Предобработчик данных """
    def __init__(self, cat_features, scale_features,
                 drop_features, rename_cols, transform_train=True):
        self.transform_train = transform_train
        self.cat_features = cat_features

        self.bin_encoder = BinaryEncoder(cols=cat_features)
        self.robust_scaler = RobustScaler()
        self.spline = SplineTransformer(n_knots=3, degree=8, extrapolation='linear')

        self.rename_cols = rename_cols

        self.drop_features = drop_features
        self.scale_features = scale_features

    def fit(self, X, y=None):
        # Создаём копию датасета
        X_ = X.copy()
        X_.columns = self.rename_cols

        # Экстракция фич
        X_['Provider Purchaser'] = [f'{x}_{y}' for x, y in zip(X_['Provider'].values, X_['Purchasing Organization'].values)]
        X_['Provider Delivery option'] = [f'{x}_{y}' for x, y in zip(X_['Provider'].values, X_['Delivery Option'].values)]
        X_['Sum Fold'] = X_['Sum'].apply(lambda x: int(x % 10))
        
        # Нормализация
        self.robust_scaler.fit(X_[self.scale_features])

        # Категориальные фичи        
        # X_ = self.bin_encoder.fit_transform(X_)

        X_ = X_.drop(self.drop_features, axis=1)
        
        return self
    
    def transform(self, X):
        # Создаём копию датасета
        X_ = X.copy()
        X_.columns = self.rename_cols

        # Экстракция фич
        X_['Provider Purchaser'] = [f'{x}_{y}' for x, y in zip(X_['Provider'].values, X_['Purchasing Organization'].values)]
        X_['Provider Delivery option'] = [f'{x}_{y}' for x, y in zip(X_['Provider'].values, X_['Delivery Option'].values)]
        X_['Sum Fold'] = X_['Sum'].apply(lambda x: int(x % 10))

        # Нормализация
        X_[self.scale_features] = self.robust_scaler.transform(X_[self.scale_features])

        # Категориальные фичи
        # X_ = self.bin_encoder.transform(X_)

        X_ = X_.drop(self.drop_features, axis=1)

        return X_


In [7]:
# Функция оптимизации
def objective(trial: opt.Trial):
    # Параметры
    learning_rate = trial.suggest_float('learning_rate', 0.1, 1)
    iterations = trial.suggest_int('iterations', 300, 1000)
    max_depth = trial.suggest_int('max_depth', 4, 12)
    reg_lambda = trial.suggest_float('l2_leaf_reg', 0.1, 1)

    # Модель
    data_preprocessor = DataPreprocessor(cat_features, scale_features, drop_features, rename_cols)
    model = CatBoostClassifier(
        cat_features=cat_features,
        learning_rate=learning_rate,
        iterations=iterations,
        max_depth=max_depth,
        reg_lambda=reg_lambda,
        auto_class_weights='Balanced'
    )

    pipeline = Pipeline([
        ('data_preproc', data_preprocessor),
        ('model', model)
    ])
    
    cv_score = cross_val_score(pipeline, X_train, y_train, cv=StratifiedKFold(n_splits=5), scoring='f1_macro', n_jobs=-1)
    accuracy = cv_score.mean()

    return accuracy

In [None]:
study = opt.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

In [12]:
best_params = {
    'cat_features': cat_features,
    'learning_rate': 0.3073614381077149, 
    'iterations': 500, 
    'max_depth': 12, 
    'l2_leaf_reg': 0.2,
    'auto_class_weights': 'Balanced',
    'task_type': 'GPU'
}

In [13]:
data_preprocessor = DataPreprocessor(cat_features, scale_features, drop_features, rename_cols)
model = CatBoostClassifier(**best_params, eval_metric='F1')
pipeline = Pipeline([
    ('data_preproc', data_preprocessor),
    ('model', model)
])

pipeline.fit(X_train, y_train, model__plot=True)
preds = pipeline.predict(X_test)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.7508314	total: 306ms	remaining: 2m 32s
1:	learn: 0.7978296	total: 587ms	remaining: 2m 26s
2:	learn: 0.8088934	total: 867ms	remaining: 2m 23s
3:	learn: 0.8192026	total: 1.16s	remaining: 2m 23s
4:	learn: 0.8288556	total: 1.43s	remaining: 2m 21s
5:	learn: 0.8380385	total: 1.68s	remaining: 2m 18s
6:	learn: 0.8438967	total: 1.97s	remaining: 2m 18s
7:	learn: 0.8511165	total: 2.24s	remaining: 2m 18s
8:	learn: 0.8560469	total: 2.54s	remaining: 2m 18s
9:	learn: 0.8621539	total: 2.81s	remaining: 2m 17s
10:	learn: 0.8684759	total: 3.1s	remaining: 2m 17s
11:	learn: 0.8713588	total: 3.37s	remaining: 2m 16s
12:	learn: 0.8745118	total: 3.64s	remaining: 2m 16s
13:	learn: 0.8786257	total: 3.91s	remaining: 2m 15s
14:	learn: 0.8825928	total: 4.2s	remaining: 2m 15s
15:	learn: 0.8861330	total: 4.49s	remaining: 2m 15s
16:	learn: 0.8882326	total: 4.78s	remaining: 2m 15s
17:	learn: 0.8907863	total: 5.05s	remaining: 2m 15s
18:	learn: 0.8930126	total: 5.32s	remaining: 2m 14s
19:	learn: 0.8954193	tot

In [15]:
print(f1_score(y_test, preds, average='macro'))
print(roc_auc_score(y_test, preds))

0.8820179973083964
0.8688453418502681


In [10]:
print(f1_score(y_test, preds, average='macro'))
print(roc_auc_score(y_test, preds))

0.8854213108420703
0.8815021923888918


In [79]:
save_submission(pipeline, 'submission_cat')

In [100]:
preds_cat = pd.DataFrame(preds)

In [101]:
%store -r preds_lgbm
preds_lgbm = pd.DataFrame(preds_lgbm)

In [102]:
res_df = pd.concat([preds_cat, preds_lgbm], axis=1)

In [103]:
res_df

Unnamed: 0,0,0.1
0,0,0
1,0,0
2,0,0
3,0,0
4,1,0
...,...,...
22495,0,0
22496,0,0
22497,0,0
22498,0,0


In [104]:
res_df['equal'] = [1 if x == y else 0 for x, y in zip(res_df.iloc[:, 0].values, res_df.iloc[:, 1].values)]

In [105]:
res_df['equal'].value_counts()

equal
1    21909
0      591
Name: count, dtype: int64

In [9]:
preds = pipeline.predict(X_test)
print(f1_score(y_test, preds, average='macro'))
print(roc_auc_score(y_test, preds))

0.8923816188804096
0.8822463824705095
