## Загрузка данных и подключение библиотек

In [1]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
!kaggle competitions download -c tabular-playground-series-may-2021
!pip install catboost
!pip install eli5
!pip install optuna
!pip install shap
!pip install scikit-learn-extra
!unzip /content/test.csv.zip
!unzip /content/train.csv.zip
!unzip /content/sample_submission.csv.zip

kaggle.json
Downloading sample_submission.csv.zip to /content
  0% 0.00/128k [00:00<?, ?B/s]
100% 128k/128k [00:00<00:00, 47.6MB/s]
Downloading train.csv.zip to /content
  0% 0.00/1.72M [00:00<?, ?B/s]
100% 1.72M/1.72M [00:00<00:00, 57.1MB/s]
Downloading test.csv.zip to /content
  0% 0.00/851k [00:00<?, ?B/s]
100% 851k/851k [00:00<00:00, 115MB/s]
Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/47/80/8e9c57ec32dfed6ba2922bc5c96462cbf8596ce1a6f5de532ad1e43e53fe/catboost-0.25.1-cp37-none-manylinux1_x86_64.whl (67.3MB)
[K     |████████████████████████████████| 67.3MB 61kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.25.1
Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/d1/54/04cab6e1c0ae535bec93f795d8403fdf6caf66fa5a6512263202dbb14ea6/eli5-0.11.0-py2.py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 5.5MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.11

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import catboost as cb
import sklearn
import xgboost as xgb
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.decomposition import PCA, FactorAnalysis as FA
from typing import List, Optional
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold, train_test_split
from typing import List, Tuple
import scipy.stats as ss
from sklearn_extra.cluster import KMedoids
import math
from sklearn.utils.validation import check_is_fitted
import eli5
from sklearn.base import BaseEstimator, TransformerMixin
import time
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import shap
import missingno as msno
from sklearn.inspection import permutation_importance
from eli5.sklearn import PermutationImportance
import optuna
from sklearn.metrics import log_loss
pd.plotting.register_matplotlib_converters()
%matplotlib inline
sns.set(color_codes=True)
pal = sns.color_palette("viridis", 10)
sns.set_palette(pal)

## Используемые функции

In [3]:
def get_input(data_path: str, base_path: str = "/content") -> pd.DataFrame:
  """
  Read a CSV file into a DataFrame and print its shape.

  Parameters
  ----------
  data_path: str
      File name (or path relative to ``base_path``) of the CSV file.

  base_path: str, optional, default = "/content"
      Directory that contains the file. The default keeps the original
      behaviour; pass another directory to load the data from elsewhere.

  Returns
  -------
  data: pandas.core.frame.DataFrame
      Loaded dataset with all column names converted to lower case.
  """
  data = pd.read_csv(f"{base_path}/{data_path}")
  # Lower-case the column names so later cells can refer to them uniformly.
  data.columns = [col.lower() for col in data.columns]
  print(f"{data_path}: shape = {data.shape[0]} rows, {data.shape[1]} cols")
  return data

In [4]:
def plot_feature_importance(importance, names, model_type, figsize=(10,8)):
  """
  Draw a horizontal bar chart of feature importances, largest first.

  Parameters
  ----------
  importance: array-like
      Importance value of every feature.

  names: array-like
      Feature names, in the same order as ``importance``.

  model_type: str
      Label that is prepended to the chart title.

  figsize: tuple, optional, default = (10, 8)
      Size of the matplotlib figure.

  Returns
  -------
  list
      Feature names ordered by decreasing importance.
  """
  # Pair every name with its importance and order the rows for plotting.
  ranked = pd.DataFrame({
      'feature_names': np.array(names),
      'feature_importance': np.array(importance),
  })
  ranked.sort_values(by=['feature_importance'], ascending=False, inplace=True)

  plt.figure(figsize=figsize)
  sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])

  plt.title(model_type + ' FEATURE IMPORTANCE')
  plt.xlabel('FEATURE IMPORTANCE')
  plt.ylabel('FEATURE NAMES')

  ordered_names = ranked['feature_names'].values
  return list(ordered_names)

In [5]:
def multi_estimators_predict(estimators: List,
                             x_valid: pd.DataFrame,
                             y_valid = None,
                             metric: callable = None,
                             scalers: List = None):
  """
  Average the predictions of several fitted estimators.

  Parameters
  ----------
  estimators: List
      Fitted models. ``xgb.core.Booster`` objects are queried with
      ``predict``; every other model with ``predict_proba``.

  x_valid: pandas.core.frame.DataFrame
      Samples to predict. It is passed to the models unchanged (apart from
      the optional scaling), so a Booster must be able to consume it as is.

  y_valid: array-like, optional, default = None
      True labels. Used only for reporting, together with ``metric``.

  metric: callable, optional, default = None
      Function ``metric(y_true, y_pred)``. When both ``y_valid`` and
      ``metric`` are given, the score of every model and of the averaged
      prediction is printed.

  scalers: List, optional, default = None
      One fitted scaler per estimator; ``scalers[i].transform`` is applied
      to ``x_valid`` before ``estimators[i].predict_proba``.

  Returns
  -------
  result: np.array
      Element-wise mean of the individual predictions.

  Raises
  ------
  ValueError
      If ``estimators`` is empty.
  """
  if len(estimators) == 0:
    raise ValueError("estimators must contain at least one fitted model")
  if scalers:
    assert len(estimators) == len(scalers)

  report = (y_valid is not None) and (metric is not None)
  preds, scores = [], []

  for i, estimator in enumerate(estimators):
    if scalers:
      pred = estimator.predict_proba(scalers[i].transform(x_valid))
    elif isinstance(estimator, xgb.core.Booster):
      # Decide per estimator (not from estimators[0]) so mixed lists work.
      pred = estimator.predict(x_valid)
    else:
      pred = estimator.predict_proba(x_valid)

    preds.append(pred)
    if report:
      scores.append(metric(y_valid, pred))

  # Stack along a new leading "model" axis and average it away.
  result = np.mean(np.stack(preds, axis=0), axis=0)

  if report:
    for i, score in enumerate(scores):
      print(f"Model {i} metric: {score:.7}")
    print(f"Result model metric: {metric(y_valid, result):.7}")

  return result

In [6]:
def logistic_cv_fit(params, X, y, cv, scale=False):
    """
    Cross-validate a LogisticRegression model and collect out-of-fold predictions.

    Parameters
    ----------
    params: dict
        Hyperparameters forwarded to ``LogisticRegression``.

    X: pandas.core.frame.DataFrame
        Feature matrix.

    y: pandas.core.frame.Series
        Target vector.

    cv: KFold or StratifiedKFold generator.
        Splitter whose ``split(X, y)`` yields positional train / validation
        indices.

    scale: bool, optional, default = False
        If True, a ``StandardScaler`` is fitted on the training part of every
        fold and applied to both parts before fitting.

    Returns
    -------
    estimators: list
        One fitted model per fold.

    oof_preds: np.array
        Out-of-fold class probabilities, shape (n_rows, n_classes).

    scalers: list
        Fitted scalers, one per fold (empty when ``scale`` is False).

    """
    # The splitter returns positions, so index positionally; this also works
    # when X / y do not carry a default 0..n-1 index.
    y_values = np.asarray(y)
    # One probability column per class present in y (4 for this dataset).
    n_classes = np.unique(y_values).size

    estimators, folds_scores, scalers = [], [], []
    oof_preds = np.zeros((X.shape[0], n_classes))
    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]
        if scale:
            # Fit the scaler on the training part only to avoid leakage.
            scaler = StandardScaler()
            x_train = scaler.fit_transform(x_train)
            x_valid = scaler.transform(x_valid)
        model = LogisticRegression(**params)
        model.fit(x_train, y_train)
        oof_preds[valid_idx] = model.predict_proba(x_valid)
        score = metrics.log_loss(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)
        if scale:
            scalers.append(scaler)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds, scalers

In [7]:
def catboost_cv_fit(params, X, y, cv, categorical = None):
    """
    Cross-validate a CatBoost classifier and collect out-of-fold predictions.

    Parameters
    ----------
    params: dict
        Hyperparameters forwarded to ``cb.CatBoostClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix.

    y: pandas.core.frame.Series
        Target vector.

    cv: KFold or StratifiedKFold generator.
        Splitter whose ``split(X, y)`` yields positional train / validation
        indices.

    categorical: str, optional, default = None
        List of categorical features. Accepted for interface compatibility;
        it is currently not forwarded to the model.

    Returns
    -------
    estimators: list
        One fitted model per fold.

    oof_preds: np.array
        Out-of-fold class probabilities, shape (n_rows, n_classes).

    """
    # The splitter returns positions, so index positionally; this also works
    # when X / y do not carry a default 0..n-1 index.
    y_values = np.asarray(y)
    # One probability column per class present in y (4 for this dataset).
    n_classes = np.unique(y_values).size

    estimators, folds_scores = [], []
    oof_preds = np.zeros((X.shape[0], n_classes))
    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]

        model = cb.CatBoostClassifier(**params)
        # The validation part of the fold doubles as the early-stopping set.
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            verbose=10,
            early_stopping_rounds=100,
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)
        score = metrics.log_loss(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

In [8]:
def lightgbm_cv_fit(params, X, y, cv, categorical = None):
    """
    Cross-validate a LightGBM classifier and collect out-of-fold predictions.

    Parameters
    ----------
    params: dict
        Hyperparameters forwarded to ``lgb.LGBMClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix.

    y: pandas.core.frame.Series
        Target vector.

    cv: KFold or StratifiedKFold generator.
        Splitter whose ``split(X, y)`` yields positional train / validation
        indices.

    categorical: str, optional, default = None
        List of categorical features. Accepted for interface compatibility;
        it is currently not forwarded to the model.

    Returns
    -------
    estimators: list
        One fitted model per fold.

    oof_preds: np.array
        Out-of-fold class probabilities, shape (n_rows, n_classes).

    """
    # The splitter returns positions, so index positionally; this also works
    # when X / y do not carry a default 0..n-1 index.
    y_values = np.asarray(y)
    # One probability column per class present in y (4 for this dataset).
    n_classes = np.unique(y_values).size

    estimators, folds_scores = [], []
    oof_preds = np.zeros((X.shape[0], n_classes))
    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]

        model = lgb.LGBMClassifier(**params)
        # The validation part of the fold doubles as the early-stopping set.
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            eval_metric="multi_logloss",
            verbose=10,
            early_stopping_rounds=50,
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)
        score = metrics.log_loss(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

In [9]:
def xgboost_cv_fit(params, X, y, cv, categorical = None):
    """
    Cross-validate an XGBoost classifier and collect out-of-fold predictions.

    Parameters
    ----------
    params: dict
        Hyperparameters forwarded to ``xgb.XGBClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix. It is not modified; label encoding is done on a copy.

    y: pandas.core.frame.Series
        Target vector.

    cv: KFold or StratifiedKFold generator.
        Splitter whose ``split(X, y)`` yields positional train / validation
        indices.

    categorical: str, optional, default = None
        List of categorical features. Each listed column is cast to string
        and label-encoded before training. Not used by default.

    Returns
    -------
    estimators: list
        One fitted model per fold.

    encoders: dict
        Fitted LabelEncoder per categorical feature (empty when
        ``categorical`` is not given).

    oof_preds: np.array
        Out-of-fold class probabilities, shape (n_rows, n_classes).

    """
    # The splitter returns positions, so index positionally; this also works
    # when X / y do not carry a default 0..n-1 index.
    y_values = np.asarray(y)
    # One probability column per class present in y (4 for this dataset).
    n_classes = np.unique(y_values).size

    estimators, encoders, folds_scores = [], {}, []
    oof_preds = np.zeros((X.shape[0], n_classes))

    if categorical:
        # LabelEncoder is not imported at file level, hence the local import.
        from sklearn.preprocessing import LabelEncoder

        # Encode on a copy so the caller's frame is left untouched.
        X = X.copy()
        for feature in categorical:
            encoder = LabelEncoder()
            X[feature] = encoder.fit_transform(X[feature].astype("str").fillna("NA"))
            encoders[feature] = encoder

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]

        model = xgb.XGBClassifier(**params)
        # The validation part of the fold doubles as the early-stopping set.
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            eval_metric="mlogloss",
            verbose=10,
            early_stopping_rounds=50,
        )

        oof_preds[valid_idx] = model.predict_proba(x_valid)
        score = metrics.log_loss(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    # Same summary as the other *_cv_fit helpers print.
    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, encoders, oof_preds

In [10]:
def lgb_cb_xgb_cv_fit(X, y, cv_high, cv_low, cb_params, xgb_params, lgb_params, categorical = None):
    """
    Nested cross-validation that produces level-1 predictions for stacking.

    The dataset (X, y) is split according to ``cv_high``. On the training
    part of every outer fold, LightGBM, XGBoost and CatBoost ensembles are
    trained with ``cv_low``. Each ensemble then predicts the outer validation
    part; the predictions of every algorithm are averaged over its models and
    collected in a pandas DataFrame.

    Parameters
    ----------

    X: pandas.core.frame.DataFrame
        Source feature matrix.

    y: pandas.core.frame.Series
        Target vector.

    cv_high: KFold or StratifiedKFold generator.
        Splitter applied to the whole dataset (outer folds).

    cv_low: KFold or StratifiedKFold generator.
        Splitter applied to the training part of every outer fold.

    cb_params: dict
        CatBoost hyperparameters.

    xgb_params: dict
        XGBoost hyperparameters.

    lgb_params: dict
        LightGBM hyperparameters.

    categorical: str, optional, default = None
        List of categorical features. Accepted for interface compatibility;
        it is currently not forwarded to the fitting helpers.

    Returns
    -------
    valid_preds: Dict
        One DataFrame per outer fold (key ``'Iteration <fold>'``) holding the
        averaged class probabilities of every algorithm (lgb, xgb, cb) and
        the ``target`` column.

    estimators: Dict
        For every outer fold, a dict with the lists of fitted models under
        the keys 'CatBoost', 'XGBoost' and 'LGBoost'.
    """
    valid_preds = {}
    estimators = {}

    for fold, (train_idx, valid_idx) in enumerate(cv_high.split(X, y)):
        # The splitter yields positions, so select positionally and give each
        # part a fresh 0..n-1 index for the nested *_cv_fit helpers.
        x_train = X.iloc[train_idx].reset_index(drop=True)
        x_valid = X.iloc[valid_idx].reset_index(drop=True)
        y_train = y.iloc[train_idx].reset_index(drop=True)
        y_valid = y.iloc[valid_idx].reset_index(drop=True)

        lgb_estimators, lgb_oof = lightgbm_cv_fit(lgb_params, x_train, y_train, cv_low)
        xgb_estimators, xgb_encoders, xgb_oof = xgboost_cv_fit(xgb_params, x_train, y_train, cv_low)
        cb_estimators, cb_oof = catboost_cv_fit(cb_params, x_train, y_train, cv_low)

        # Averaged prediction of every ensemble on the outer validation part,
        # with columns named '<prefix>_class_<k>' (k starts at 1).
        fold_frames = []
        for prefix, fitted in (('lgb', lgb_estimators),
                               ('xgb', xgb_estimators),
                               ('cb', cb_estimators)):
            averaged = pd.DataFrame(
                multi_estimators_predict(fitted, x_valid, y_valid, metrics.log_loss)
            )
            averaged.columns = [f'{prefix}_class_{k + 1}' for k in range(averaged.shape[1])]
            fold_frames.append(averaged)

        result_valid = pd.concat(fold_frames, axis=1)
        result_valid['target'] = y_valid
        valid_preds[f'Iteration {fold}'] = result_valid
        estimators[f'Iteration {fold}'] = {'CatBoost': cb_estimators,
                                           'XGBoost': xgb_estimators,
                                           'LGBoost': lgb_estimators}
    return valid_preds, estimators

## Загрузка данных и построение модели первого уровня - ансамблей бустингов с усреднением

In [11]:
import lightgbm as lgb

# Load the competition files (get_input lower-cases the column names).
data = get_input("train.csv")
test = get_input("test.csv")

# Encode the string class labels as integers 0..3.
mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
target = data['target'].map(mapper)

# Keep only the feature columns in `data`.
data = data.drop(columns=['id', 'target'])

train.csv: shape = 100000 rows, 52 cols
test.csv: shape = 50000 rows, 51 cols


In [12]:
# Hyperparameters were tuned with cross-validation in a separate notebook.

lgb_params = {
    # Key fixed: it used to carry a trailing space ("boosting_type "),
    # so it was not the real parameter name.
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    'n_estimators': 2000,
    'num_class': 4,
    'lambda_l1': 0.02259438646302076,
    'lambda_l2': 3.3834082296901886e-05,
    'num_leaves': 256,
    'feature_fraction': 0.42939073275678896,
    'bagging_fraction': 0.8760623722003144,
    'bagging_freq': 4,
    'min_child_samples': 71,
    'max_depth': 3,
    'learning_rate': 0.07658957460133804,
    'random_state': 42,
}

cb_params = {
    "n_estimators": 2000,
    "loss_function": "MultiClass",
    "eval_metric": "MultiClass",
    #"task_type": "GPU",
    'min_data_in_leaf': 25,
    'depth': 3,
    'learning_rate': 0.09,
    'random_strength' : 10,
    "l2_leaf_reg": 100,
    'grow_policy': "SymmetricTree",
    "random_seed": 27,
}

xgb_params = {
    "booster": "gbtree",
    #"eta": 0.1,
    # NOTE(review): both "random_seed" and 'random_state' are set, with
    # different values - confirm which one is meant to take effect.
    "random_seed": 42,
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'random_state': 13,
    'learning_rate': 0.08356451010151393,
    'gamma': 0.02911685058980812,
    'max_depth': 3,
    'min_child_weight': 10.748514454096288,
    'max_delta_step': 2.4474818433727927,
    'subsample': 0.6445037550866027,
    'colsample_bytree': 0.07634753656242108,
    'lambda': 13.663280761461781,
    'alpha': 21.521205761694137,
    'num_class': 4,
    'n_estimators': 2000,  # still needs tuning
    #'tree_method': 'gpu_hist',
}

In [13]:
# Outer split: each third of the data becomes the hold-out part once.
cv_high = StratifiedKFold(n_splits=3, shuffle=True, random_state=435)
# Inner split: 7 models per algorithm are trained on every outer training part.
cv_low = StratifiedKFold(n_splits=7, shuffle=True, random_state=435)

valid_preds, estimators = lgb_cb_xgb_cv_fit(
    X=data,
    y=target,
    cv_high=cv_high,
    cv_low=cv_low,
    cb_params=cb_params,
    xgb_params=xgb_params,
    lgb_params=lgb_params,
    categorical=None,
)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
[270]	validation_0-mlogloss:1.09444
[280]	validation_0-mlogloss:1.0942
[290]	validation_0-mlogloss:1.09404
[300]	validation_0-mlogloss:1.09382
[310]	validation_0-mlogloss:1.09359
[320]	validation_0-mlogloss:1.0934
[330]	validation_0-mlogloss:1.0932
[340]	validation_0-mlogloss:1.09297
[350]	validation_0-mlogloss:1.09289
[360]	validation_0-mlogloss:1.09261
[370]	validation_0-mlogloss:1.0924
[380]	validation_0-mlogloss:1.09232
[390]	validation_0-mlogloss:1.09222
[400]	validation_0-mlogloss:1.09204
[410]	validation_0-mlogloss:1.0919
[420]	validation_0-mlogloss:1.09183
[430]	validation_0-mlogloss:1.09166
[440]	validation_0-mlogloss:1.09166
[450]	validation_0-mlogloss:1.09154
[460]	validation_0-mlogloss:1.09144
[470]	validation_0-mlogloss:1.09136
[480]	validation_0-mlogloss:1.09141
[490]	validation_0-mlogloss:1.09136
[500]	validation_0-mlogloss:1.09133
[510]	validation_0-mlogloss:1.09139
[520]	validation_0-mlog

## Построение модели второго уровня

In [31]:
# Settings of the level-2 (stacking) logistic regression.
logreg_params = dict(
    random_state=0,
    max_iter=500,
    multi_class='multinomial',
    solver='lbfgs',
    C=1,
)

# Cross-validate the level-2 model on the level-1 predictions of outer fold 0.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=435)
estimators_log_reg, oof_log_reg, scalers = logistic_cv_fit(
    logreg_params,
    valid_preds['Iteration 0'].drop(columns='target'),
    valid_preds['Iteration 0']['target'],
    cv,
)

Wed May 26 21:09:54 2021, Cross-Validation, 33334 rows, 12 cols
Fold 1, Valid score = 1.09685
Fold 2, Valid score = 1.09366
Fold 3, Valid score = 1.09074
Fold 4, Valid score = 1.08785
Fold 5, Valid score = 1.08958
Fold 6, Valid score = 1.08872
Fold 7, Valid score = 1.0892
Score by each fold: [1.09685, 1.09366, 1.09074, 1.08785, 1.08958, 1.08872, 1.0892]


In [30]:
# Overall out-of-fold log loss of the level-2 model on outer fold 0
# (uses the oof_log_reg produced by the most recent logistic_cv_fit call).
print("Out of fold log loss {}".format(
    metrics.log_loss(valid_preds['Iteration 0']['target'], oof_log_reg)
))

Out of fold log loss 1.0917875441120584


In [23]:
# Cross-validate the level-2 model on the level-1 predictions of outer fold 1.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=435)
estimators_log_reg, oof_log_reg, scalers = logistic_cv_fit(
    logreg_params,
    valid_preds['Iteration 1'].drop(columns='target'),
    valid_preds['Iteration 1']['target'],
    cv,
)

Wed May 26 21:06:08 2021, Cross-Validation, 33333 rows, 12 cols
Fold 1, Valid score = 1.09378
Fold 2, Valid score = 1.09372
Fold 3, Valid score = 1.09752
Fold 4, Valid score = 1.08897
Fold 5, Valid score = 1.08844
Fold 6, Valid score = 1.09009
Fold 7, Valid score = 1.08391
Score by each fold: [1.09378, 1.09372, 1.09752, 1.08897, 1.08844, 1.09009, 1.08391]


In [24]:
# Overall out-of-fold log loss of the level-2 model on outer fold 1
# (uses the oof_log_reg produced by the most recent logistic_cv_fit call).
print("Out of fold log loss {}".format(
    metrics.log_loss(valid_preds['Iteration 1']['target'], oof_log_reg)
))

Out of fold log loss 1.0909189780312147


In [25]:
# Cross-validate the level-2 model on the level-1 predictions of outer fold 2.
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=435)
estimators_log_reg, oof_log_reg, scalers = logistic_cv_fit(
    logreg_params,
    valid_preds['Iteration 2'].drop(columns='target'),
    valid_preds['Iteration 2']['target'],
    cv,
)

Wed May 26 21:07:26 2021, Cross-Validation, 33333 rows, 12 cols
Fold 1, Valid score = 1.0929
Fold 2, Valid score = 1.08899
Fold 3, Valid score = 1.08684
Fold 4, Valid score = 1.08323
Fold 5, Valid score = 1.09424
Fold 6, Valid score = 1.09361
Fold 7, Valid score = 1.08466
Score by each fold: [1.0929, 1.08899, 1.08684, 1.08323, 1.09424, 1.09361, 1.08466]


In [26]:
# Overall out-of-fold log loss of the level-2 model on outer fold 2
# (uses the oof_log_reg produced by the most recent logistic_cv_fit call).
print("Out of fold log loss {}".format(
    metrics.log_loss(valid_preds['Iteration 2']['target'], oof_log_reg)
))

Out of fold log loss 1.0892074730576007


In [32]:
# Fit one level-2 logistic regression per outer fold, each on the complete
# level-1 prediction frame of that fold.
logreg_0, logreg_1, logreg_2 = (
    LogisticRegression(**logreg_params).fit(
        valid_preds[f'Iteration {fold}'].drop(columns='target'),
        valid_preds[f'Iteration {fold}']['target'],
    )
    for fold in range(3)
)
# Show the last fitted model as the cell output.
logreg_2

LogisticRegression(C=1, max_iter=500, multi_class='multinomial', random_state=0)

## Получение результата

In [41]:
# Averaged level-1 predictions for the test set from the models of outer fold 0.
result_xgb_test = multi_estimators_predict(estimators['Iteration 0']['XGBoost'], test.drop(columns='id'))
result_lgb_test = multi_estimators_predict(estimators['Iteration 0']['LGBoost'], test.drop(columns='id'))
result_cb_test = multi_estimators_predict(estimators['Iteration 0']['CatBoost'], test.drop(columns='id'))

# Name the probability columns exactly as in the level-2 training frames.
result_lgb_test_df = pd.DataFrame(result_lgb_test).rename(
    columns={k: f'lgb_class_{k + 1}' for k in range(4)})
result_xgb_test_df = pd.DataFrame(result_xgb_test).rename(
    columns={k: f'xgb_class_{k + 1}' for k in range(4)})
result_cb_test_df = pd.DataFrame(result_cb_test).rename(
    columns={k: f'cb_class_{k + 1}' for k in range(4)})

# Column order (lgb, xgb, cb) matches the frames the level-2 model was fitted on.
result_for_logreg_0 = pd.concat(
    [result_lgb_test_df, result_xgb_test_df, result_cb_test_df],
    axis=1,
)

In [42]:
# Averaged level-1 predictions for the test set from the models of outer fold 1.
result_xgb_test = multi_estimators_predict(estimators['Iteration 1']['XGBoost'], test.drop(columns='id'))
result_lgb_test = multi_estimators_predict(estimators['Iteration 1']['LGBoost'], test.drop(columns='id'))
result_cb_test = multi_estimators_predict(estimators['Iteration 1']['CatBoost'], test.drop(columns='id'))

# Name the probability columns exactly as in the level-2 training frames.
result_lgb_test_df = pd.DataFrame(result_lgb_test).rename(
    columns={k: f'lgb_class_{k + 1}' for k in range(4)})
result_xgb_test_df = pd.DataFrame(result_xgb_test).rename(
    columns={k: f'xgb_class_{k + 1}' for k in range(4)})
result_cb_test_df = pd.DataFrame(result_cb_test).rename(
    columns={k: f'cb_class_{k + 1}' for k in range(4)})

# Column order (lgb, xgb, cb) matches the frames the level-2 model was fitted on.
result_for_logreg_1 = pd.concat(
    [result_lgb_test_df, result_xgb_test_df, result_cb_test_df],
    axis=1,
)

In [43]:
# Averaged level-1 predictions for the test set from the models of outer fold 2.
result_xgb_test = multi_estimators_predict(estimators['Iteration 2']['XGBoost'], test.drop(columns='id'))
result_lgb_test = multi_estimators_predict(estimators['Iteration 2']['LGBoost'], test.drop(columns='id'))
result_cb_test = multi_estimators_predict(estimators['Iteration 2']['CatBoost'], test.drop(columns='id'))

# Name the probability columns exactly as in the level-2 training frames.
result_lgb_test_df = pd.DataFrame(result_lgb_test).rename(
    columns={k: f'lgb_class_{k + 1}' for k in range(4)})
result_xgb_test_df = pd.DataFrame(result_xgb_test).rename(
    columns={k: f'xgb_class_{k + 1}' for k in range(4)})
result_cb_test_df = pd.DataFrame(result_cb_test).rename(
    columns={k: f'cb_class_{k + 1}' for k in range(4)})

# Column order (lgb, xgb, cb) matches the frames the level-2 model was fitted on.
result_for_logreg_2 = pd.concat(
    [result_lgb_test_df, result_xgb_test_df, result_cb_test_df],
    axis=1,
)

In [44]:
# Level-2 class probabilities for the test set, one array per outer fold.
y_pred_logreg_0, y_pred_logreg_1, y_pred_logreg_2 = (
    meta_model.predict_proba(level1_features)
    for meta_model, level1_features in (
        (logreg_0, result_for_logreg_0),
        (logreg_1, result_for_logreg_1),
        (logreg_2, result_for_logreg_2),
    )
)

In [67]:
# Blend the three level-2 predictions by averaging over the model axis.
result = np.stack(
    [y_pred_logreg_0, y_pred_logreg_1, y_pred_logreg_2],
    axis=0,
).mean(axis=0)

In [68]:
# Blended class probabilities as a frame with one named column per class.
test_pred_df = pd.DataFrame(result).rename(
    columns={k: f'class_{k + 1}' for k in range(4)})

# Keep only the first column of the test frame and append the probabilities.
result = pd.concat(
    [test.drop(columns=test.columns[1:]), test_pred_df],
    axis=1,
)
result.to_csv('result.csv', index=False)

Результат — 1.08567, что хуже, чем у двухуровневой модели, обученной на OOF-прогнозах. Видимо, это связано с тем, что в случае OOF каждый бустинговый алгоритм и логистическая регрессия обучаются на большем количестве сэмплов.