In [29]:
# Модели
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# Пайплайн
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score

# Данные
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from category_encoders import BinaryEncoder, OneHotEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler, PowerTransformer, PolynomialFeatures, QuantileTransformer
from sklearn.utils.class_weight import compute_sample_weight

In [32]:
# Пути
ROOT = os.getcwd()
TRAIN_DATASET = os.path.join(ROOT, '../data/train_AIC.csv')
BALANCED_DATASET = os.path.join(ROOT, '../data/balanced_train.csv')
TEST_DATASET = os.path.join(ROOT, '../data/test_AIC.csv')
SUBMISSION_PATH = os.path.join(ROOT, '../submissions/')

def save_submission(model, subname):
    subname = os.path.join(SUBMISSION_PATH, f'{subname}.csv')
    preds = model.predict(test_df)
    submit_df = pd.DataFrame({'id': test_df.index, 'value': preds})
    submit_df.to_csv(subname, index=False)

# Загрузка
train_df = pd.read_csv(TRAIN_DATASET)
balanced_df = pd.read_csv(BALANCED_DATASET, index_col=0)
train_df = balanced_df
test_df = pd.read_csv(TEST_DATASET)

# first_negatives = train_df[train_df['y'] == 0][:train_df[train_df['y'] == 1]['y'].count()]
# train_df = pd.concat([train_df[train_df['y'] == 1], first_negatives])

# def random_undersample(df):
#     neg_count, pos_count = np.bincount(df['y'])
#     pos_df = df[df['y'] == 1]
#     neg_df = df[df['y'] == 0]
#     neg_df = neg_df.sample(n=pos_count, random_state=1708)
#     return pd.concat([pos_df, neg_df])

# balanced_df = random_undersample(train_df)

# Нормализация
# train_df = train_df[train_df['Длительность'] < 500]
train_df = train_df[(train_df['Сумма'] > 2) & (train_df['Сумма'] < 10)]
# train_df = train_df[train_df['До поставки'] < 300]

X, y = train_df.iloc[:, :-1], train_df.iloc[:, -1]   
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [33]:
# Категориальные фичи
cat_features = [
    'Purchasing Organization', 'Delivery Option', 
    'Company Code', 'Provider', 'Provider Delivery option', 
    'Provider Purchaser', 'Operations Manager', 'Sum Fold'
    ]

# Фичи на масштабирование
scale_features = [
    'Position Count', 'Duration', 'ETC Delivery',
    'Changes After Approvals', 'Order Approval 1', 'Order Approval 2',
    'Order Approval 3', 'Sum', 'Change Delivery Date 7', 'Change Delivery Date 15', 
    'Change Delivery Date 30', 'Approval Cycles', 'Handlers 7', 'Handlers 15', 
    'Handlers 30', 'Days Between 0_1', 'Days Between 1_2', 'Days Between 2_3', 
    'Days Between 3_4', 'Days Between 4_5', 
    'Days Between 5_6', 'Days Between 6_7', 'Days Between 7_8'
]

# Фичи на дроп
drop_features = [
    'Material', 'Cancel Complete Release', 'Amount',
    'Material Group', 'Month1', 'Month2', 'Month3', 
    'Weekday', 'Delivery Date', 'Change on Paper', 
    'Category Manager', 'NRP', 'EI'
    ]

# Новые имена фич
rename_cols = [
    'Provider', 'Material', 'Category Manager', 'Operations Manager',
    'Factory', 'Purchasing Organization', 'Purchasing Group', 
    'Company Code', 'EI', 'Material Group', 'Delivery Option', 'NRP',
    'Duration', 'ETC Delivery', 'Month1', 'Month2', 'Month3', 'Weekday',
    'Sum', 'Position Count', 'Amount', 'Handlers 7', 'Handlers 15', 
    'Handlers 30', 'Order Approval 1', 'Order Approval 2', 'Order Approval 3',
    'Change Delivery Date 7', 'Change Delivery Date 15', 'Change Delivery Date 30',
    'Cancel Complete Release', 'Change on Paper', 'Delivery Date', 
    'Approval Cycles', 'Changes After Approvals', 'Days Between 0_1', 
    'Days Between 1_2', 'Days Between 2_3', 'Days Between 3_4', 'Days Between 4_5', 
    'Days Between 5_6', 'Days Between 6_7', 'Days Between 7_8'
    ]

In [34]:
# Препроцессоры
class DataPreprocessor(BaseEstimator, TransformerMixin):
    """ Предобработчик данных """
    def __init__(self, cat_features, scale_features,
                 drop_features, rename_cols, transform_train=True):
        self.transform_train = transform_train
        self.cat_features = cat_features

        self.bin_encoder = BinaryEncoder(cols=cat_features)
        self.robust_scaler = RobustScaler()

        self.rename_cols = rename_cols

        self.drop_features = drop_features
        self.scale_features = scale_features

    def fit(self, X, y=None):
        # Создаём копию датасета
        X_ = X.copy()
        X_.columns = self.rename_cols

        # Временные фичи
        X_['Weekday'] += 1
        X_['day_sin'] = np.sin(np.pi * 2 * X_['Weekday'] / 7)
        X_['day_cos'] = np.cos(np.pi * 2 * X_['Weekday'] / 7)
        X_['month1_sin'] = np.sin(np.pi * 2 * X_['Month1'] / 12)
        X_['month1_cos'] = np.cos(np.pi * 2 * X_['Month1'] / 12)
        X_['month2_sin'] = np.sin(np.pi * 2 * X_['Month2'] / 12)
        X_['month2_cos'] = np.cos(np.pi * 2 * X_['Month2'] / 12)
        X_['month3_sin'] = np.sin(np.pi * 2 * X_['Month3'] / 12)
        X_['month3_cos'] = np.cos(np.pi * 2 * X_['Month3'] / 12)

        # Экстракция фич
        X_['Provider Purchaser'] = [f'{x}_{y}' for x, y in zip(X_['Provider'].values, X_['Purchasing Organization'].values)]
        X_['Provider Delivery option'] = [f'{x}_{y}' for x, y in zip(X_['Provider'].values, X_['Delivery Option'].values)]
        X_['Sum Fold'] = X_['Sum'].apply(lambda x: int(x % 10))
        
        # Нормализация
        self.robust_scaler.fit(X_[self.scale_features])

        # Категориальные фичи        
        X_ = self.bin_encoder.fit_transform(X_)

        X_ = X_.drop(self.drop_features, axis=1)
        
        return self
    
    def transform(self, X):
        # Создаём копию датасета
        X_ = X.copy()
        X_.columns = self.rename_cols

        # Временные фичи
        X_['Weekday'] += 1
        X_['day_sin'] = np.sin(np.pi * 2 * X_['Weekday'] / 7)
        X_['day_cos'] = np.cos(np.pi * 2 * X_['Weekday'] / 7)
        X_['month1_sin'] = np.sin(np.pi * 2 * X_['Month1'] / 12)
        X_['month1_cos'] = np.cos(np.pi * 2 * X_['Month1'] / 12)
        X_['month2_sin'] = np.sin(np.pi * 2 * X_['Month2'] / 12)
        X_['month2_cos'] = np.cos(np.pi * 2 * X_['Month2'] / 12)
        X_['month3_sin'] = np.sin(np.pi * 2 * X_['Month3'] / 12)
        X_['month3_cos'] = np.cos(np.pi * 2 * X_['Month3'] / 12)

        # Экстракция фич
        X_['Provider Purchaser'] = [f'{x}_{y}' for x, y in zip(X_['Provider'].values, X_['Purchasing Organization'].values)]
        X_['Provider Delivery option'] = [f'{x}_{y}' for x, y in zip(X_['Provider'].values, X_['Delivery Option'].values)]
        X_['Sum Fold'] = X_['Sum'].apply(lambda x: x % 10)

        # Нормализация
        X_[self.scale_features] = self.robust_scaler.transform(X_[self.scale_features])

        # Категориальные фичи
        X_ = self.bin_encoder.transform(X_)

        X_ = X_.drop(self.drop_features, axis=1)

        return X_


In [35]:
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

In [36]:
cat_params = [
    {
    'learning_rate': 0.3073614381077149, 
    'iterations': 1000, 
    'max_depth': 10, 
    'l2_leaf_reg': 0.9661379548019147,
    'auto_class_weights': 'Balanced',
    'task_type': 'GPU'
    },
    {
    'learning_rate': 0.3073614381077149, 
    'iterations': 1000, 
    'max_depth': 12, 
    'l2_leaf_reg': 0.9661379548019147,
    'auto_class_weights': 'Balanced',
    'task_type': 'GPU'
    },
    {
    'learning_rate': 0.1073614381077149, 
    'iterations': 1000, 
    'max_depth': 12, 
    'l2_leaf_reg': 0.9661379548019147,
    'auto_class_weights': 'Balanced',
    'task_type': 'GPU'
    }
]

lgbm_params = [
    {
    'learning_rate': 0.3, 
    'n_estimators': 1500, 
    'max_depth': 16, 
    'max_bin': 160, 
    'num_leaves': 128,
    'reg_lambda': 0.2,
    'is_unbalance': True,
    'n_jobs': -1
    },
    {
    'learning_rate': 0.15, 
    'n_estimators': 1500, 
    'max_depth': 24, 
    'max_bin': 160, 
    'num_leaves': 128,
    'reg_lambda': 0.3,
    'is_unbalance': True,
    'n_jobs': -1
    },
    {
    'learning_rate': 0.3, 
    'n_estimators': 1500, 
    'max_depth': 12, 
    'max_bin': 160, 
    'num_leaves': 128,
    'reg_lambda': 0.2,
    'is_unbalance': True,
    'n_jobs': -1
    }
]

xgb_params = [
    {
    'learning_rate': 0.3, 
    'n_estimators': 1000, 
    'max_depth': 16, 
    'max_bin': 160, 
    'max_leaves': 128,
    'reg_lambda': 0.2,
    'n_jobs': -1,
    'tree_method': 'gpu_hist',
    'random_state': 42
    },
    {
    'learning_rate': 0.3, 
    'n_estimators': 1500, 
    'max_depth': 12, 
    'max_bin': 160, 
    'max_leaves': 128,
    'reg_lambda': 0.2,
    'n_jobs': -1,
    'tree_method': 'gpu_hist',
    'random_state': 1001
    },
    {
    'learning_rate': 0.1, 
    'n_estimators': 1500, 
    'max_depth': 9, 
    'max_bin': 160, 
    'max_leaves': 128,
    'reg_lambda': 0.2,
    'n_jobs': -1,
    'tree_method': 'gpu_hist',
    'random_state': 1708
    }
]

models = []

for index, params in enumerate(cat_params):
    cat_model = CatBoostClassifier(**params)
    models.append((f'cat_{index}', cat_model))
    
for index, params in enumerate(lgbm_params):
    lgbm_model = LGBMClassifier(**params)
    models.append((f'lgbm_{index}', lgbm_model))

for index, params in enumerate(xgb_params):
    xgb_model = XGBClassifier(**params)
    models.append((f'xgb_{index}', xgb_model))


In [44]:
# final_estimator_params = {
#     'learning_rate': 0.3, 
#     'n_estimators': 1500, 
#     'max_depth': 12, 
#     'max_bin': 160, 
#     'max_leaves': 128,
#     'reg_lambda': 0.2,
#     'n_jobs': -1,
#     'tree_method': 'gpu_hist',
#     'random_state': 1001
#     }

# final_estimator = XGBClassifier(**final_estimator_params)
data_preprocessor = DataPreprocessor(cat_features, scale_features, drop_features, rename_cols)
stack = StackingClassifier(estimators=models, verbose=1)
pipeline = Pipeline([
    ('data', data_preprocessor),
    ('model', stack)
])

In [45]:
pipeline.fit(X_train, y_train)

0:	learn: 0.6188270	total: 27.4ms	remaining: 27.3s
1:	learn: 0.5651397	total: 53.5ms	remaining: 26.7s
2:	learn: 0.5357499	total: 80ms	remaining: 26.6s
3:	learn: 0.5120286	total: 110ms	remaining: 27.3s
4:	learn: 0.4909003	total: 136ms	remaining: 27.1s
5:	learn: 0.4757440	total: 165ms	remaining: 27.4s
6:	learn: 0.4588944	total: 192ms	remaining: 27.3s
7:	learn: 0.4497129	total: 219ms	remaining: 27.2s
8:	learn: 0.4389711	total: 245ms	remaining: 27s
9:	learn: 0.4290828	total: 273ms	remaining: 27s
10:	learn: 0.4218820	total: 299ms	remaining: 26.9s
11:	learn: 0.4122279	total: 324ms	remaining: 26.7s
12:	learn: 0.4069973	total: 349ms	remaining: 26.5s
13:	learn: 0.4020961	total: 376ms	remaining: 26.5s
14:	learn: 0.3940276	total: 403ms	remaining: 26.5s
15:	learn: 0.3868589	total: 428ms	remaining: 26.3s
16:	learn: 0.3813463	total: 456ms	remaining: 26.3s
17:	learn: 0.3713561	total: 485ms	remaining: 26.4s
18:	learn: 0.3655057	total: 513ms	remaining: 26.5s
19:	learn: 0.3591961	total: 540ms	remaining:

In [39]:
preds = pipeline.predict(X_test)

In [46]:
preds = pipeline.predict(X_test)
print(f1_score(y_test, preds, average='macro'))
print(roc_auc_score(y_test, preds))

0.9956197033587497
0.9956375498704727


In [42]:
preds = pipeline.predict(test_df)

In [43]:
save_submission(pipeline, 'submission_stack')