In [2]:
# Тюнинг
import optuna as opt
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.feature_selection import RFE

from sklearn.linear_model import LogisticRegression

# Пайплайн
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator

# Данные
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from category_encoders import BinaryEncoder, OneHotEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler, SplineTransformer, PowerTransformer, PolynomialFeatures, QuantileTransformer
from sklearn.gaussian_process import GaussianProcessClassifier


%matplotlib inline

In [3]:
# Категориальные фичи
cat_features = [
    'Provider', 'EI', 'Company Code', 'Factory', 'Purchasing Organization', 
    'Purchasing Group', 'Material', 'Material Group', 'NRP'
    ]

# Фичи на масштабирование
scale_features = [
    'Position Count', 'Duration', 'ETC Delivery', 'Changes After Approvals', 
    'Sum', 'Handlers 7', 'Handlers 15', 'Handlers 30', 'Order Approval 1', 
    'Order Approval 2', 'Order Approval 3', 'Change Delivery Date 15', 'Change Delivery Date 30',
    'Change Delivery Date 7', 'Days Between 1_2', 'Days Between 2_3', 'Days Between 3_4', 'Days Between 4_5', 
    'Days Between 5_6', 'Days Between 6_7', 'Days Between 7_8', 'Change on Paper', 'Delivery Date',
    'Approval Cycles', 'Changes After Approvals'
]

drop_features = []

# Новые имена фич
rename_cols = [
    'Provider', 'Material', 'Category Manager', 'Operations Manager',
    'Factory', 'Purchasing Organization', 'Purchasing Group', 
    'Company Code', 'EI', 'Material Group', 'Delivery Option', 'NRP',
    'Duration', 'ETC Delivery', 'Month1', 'Month2', 'Month3', 'Weekday',
    'Sum', 'Position Count', 'Amount', 'Handlers 7', 'Handlers 15', 
    'Handlers 30', 'Order Approval 1', 'Order Approval 2', 'Order Approval 3',
    'Change Delivery Date 7', 'Change Delivery Date 15', 'Change Delivery Date 30',
    'Cancel Complete Release', 'Change on Paper', 'Delivery Date', 
    'Approval Cycles', 'Changes After Approvals', 'Days Between 0_1', 
    'Days Between 1_2', 'Days Between 2_3', 'Days Between 3_4', 'Days Between 4_5', 
    'Days Between 5_6', 'Days Between 6_7', 'Days Between 7_8', 'y'
    ]

In [4]:
# Пути
ROOT = os.getcwd()
TRAIN_DATASET = os.path.join(ROOT, '../data/train_AIC.csv')
BALANCED_DATASET = os.path.join(ROOT, '../data/balanced_train.csv')
TEST_DATASET = os.path.join(ROOT, '../data/test_AIC.csv')
SUBMISSION_PATH = os.path.join(ROOT, '../submissions/')

# Загрузка
train_df = pd.read_csv(TRAIN_DATASET)
test_df = pd.read_csv(TEST_DATASET)

# first_negatives = train_df[train_df['y'] == 0][:train_df[train_df['y'] == 1]['y'].count()]
# train_df = pd.concat([train_df[train_df['y'] == 1], first_negatives])

train_df.columns = rename_cols
test_df.columns = rename_cols[:-1]
X, y = train_df.iloc[:, :-1], train_df.iloc[:, -1]   

# Временные фичи
X['Weekday'] += 1
X['day_sin'] = np.sin(np.pi * 2 * X['Weekday'] / 7)
X['day_cos'] = np.cos(np.pi * 2 * X['Weekday'] / 7)
X['month1_sin'] = np.sin(np.pi * 2 * X['Month1'] / 12)
X['month1_cos'] = np.cos(np.pi * 2 * X['Month1'] / 12)
X['month2_sin'] = np.sin(np.pi * 2 * X['Month2'] / 12)
X['month2_cos'] = np.cos(np.pi * 2 * X['Month2'] / 12)
X['month3_sin'] = np.sin(np.pi * 2 * X['Month3'] / 12)
X['month3_cos'] = np.cos(np.pi * 2 * X['Month3'] / 12)

bin_encoder = BinaryEncoder(cols=cat_features)
rob_scaler = RobustScaler()

X = bin_encoder.fit_transform(X)
X[scale_features] = rob_scaler.fit_transform(X[scale_features])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

test_df['Weekday'] += 1
test_df['day_sin'] = np.sin(np.pi * 2 * test_df['Weekday'] / 7)
test_df['day_cos'] = np.cos(np.pi * 2 * test_df['Weekday'] / 7)
test_df['month1_sin'] = np.sin(np.pi * 2 * test_df['Month1'] / 12)
test_df['month1_cos'] = np.cos(np.pi * 2 * test_df['Month1'] / 12)
test_df['month2_sin'] = np.sin(np.pi * 2 * test_df['Month2'] / 12)
test_df['month2_cos'] = np.cos(np.pi * 2 * test_df['Month2'] / 12)
test_df['month3_sin'] = np.sin(np.pi * 2 * test_df['Month3'] / 12)
test_df['month3_cos'] = np.cos(np.pi * 2 * test_df['Month3'] / 12)
test_df = bin_encoder.transform(test_df)
test_df[scale_features] = rob_scaler.fit_transform(test_df[scale_features])

In [5]:
scores = {}
models = {}
best_params = {
    'learning_rate': 0.1741038067844043,
    'n_estimators': 1000,
    'max_depth': 8,
    'max_bin': 128,
    'num_leaves': 64,
    'reg_lambda': 0.00012895423530234794
    }

for index, base_feature in enumerate(train_df.columns[:-1]):
    base_feature_copy = base_feature
    if base_feature in cat_features:
        base_feature = [column for column in X.columns if column.startswith(base_feature)]
        train_data = X_train[base_feature]
    else:
        train_data = X_train[base_feature].values.reshape(-1, 1)

    # Модель
    model = LGBMClassifier(
        **best_params,
        n_jobs=-1,
        force_col_wise=True,
        is_unbalance=True,
        verbose=-1
    )

    # Оценка
    print(f'Оценивается: {base_feature_copy}\nПрогресс: {index + 1}/{len(train_df.columns)}')
    
    model.fit(train_data, y_train)
    cv_score = cross_val_score(model, train_data, y_train, cv=StratifiedKFold())

    scores.update({base_feature_copy: cv_score.mean()})
    models.update({f'model_{base_feature_copy}': [model, base_feature, base_feature_copy]})


Оценивается: Provider
Прогресс: 1/44
Оценивается: Material
Прогресс: 2/44
Оценивается: Category Manager
Прогресс: 3/44
Оценивается: Operations Manager
Прогресс: 4/44
Оценивается: Factory
Прогресс: 5/44
Оценивается: Purchasing Organization
Прогресс: 6/44
Оценивается: Purchasing Group
Прогресс: 7/44
Оценивается: Company Code
Прогресс: 8/44
Оценивается: EI
Прогресс: 9/44
Оценивается: Material Group
Прогресс: 10/44
Оценивается: Delivery Option
Прогресс: 11/44
Оценивается: NRP
Прогресс: 12/44
Оценивается: Duration
Прогресс: 13/44
Оценивается: ETC Delivery
Прогресс: 14/44
Оценивается: Month1
Прогресс: 15/44
Оценивается: Month2
Прогресс: 16/44
Оценивается: Month3
Прогресс: 17/44
Оценивается: Weekday
Прогресс: 18/44
Оценивается: Sum
Прогресс: 19/44
Оценивается: Position Count
Прогресс: 20/44
Оценивается: Amount
Прогресс: 21/44
Оценивается: Handlers 7
Прогресс: 22/44
Оценивается: Handlers 15
Прогресс: 23/44
Оценивается: Handlers 30
Прогресс: 24/44
Оценивается: Order Approval 1
Прогресс: 25/44
О

In [7]:
# Тестовая выборка (преобразованный датасет)
test_prepare = pd.DataFrame(index=X_test.index)
for model, features, feature_name in models.values():
    if isinstance(features, list):
        preds = model.predict(X_test[features])
    else:
        preds = model.predict(X_test[features].values.reshape(-1, 1))
        
    preds = pd.DataFrame({feature_name: preds}, index=X_test.index)
    test_prepare = pd.concat([test_prepare, preds], axis=1)

In [8]:
test_prepare['preds'] = test_prepare.mean(axis=1)
test_prepare['preds'] = test_prepare['preds'].apply(lambda x: np.int8(x))

In [10]:
f1_score(y_test, test_prepare['preds'], average='macro')

0.4611939941090543

In [103]:
# Тест AIIJC (преобразованный датасет)
# submit_prepare = pd.DataFrame(index=test_df.index)
# for model, features, feature_name in models.values():
#     preds = model.predict(test_df[features])
#     preds = pd.DataFrame({feature_name: preds}, index=test_df.index)
#     submit_prepare = pd.concat([submit_prepare, preds], axis=1)