## Загрузка данных и подключение библиотек

In [1]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
!kaggle competitions download -c tabular-playground-series-may-2021
!pip install catboost
!pip install eli5
!pip install optuna
!pip install shap
!pip install scikit-learn-extra
!unzip /content/test.csv.zip
!unzip /content/train.csv.zip
!unzip /content/sample_submission.csv.zip

kaggle.json
Downloading train.csv.zip to /content
  0% 0.00/1.72M [00:00<?, ?B/s]
100% 1.72M/1.72M [00:00<00:00, 47.7MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/128k [00:00<?, ?B/s]
100% 128k/128k [00:00<00:00, 132MB/s]
Downloading test.csv.zip to /content
  0% 0.00/851k [00:00<?, ?B/s]
100% 851k/851k [00:00<00:00, 115MB/s]
Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/47/80/8e9c57ec32dfed6ba2922bc5c96462cbf8596ce1a6f5de532ad1e43e53fe/catboost-0.25.1-cp37-none-manylinux1_x86_64.whl (67.3MB)
[K     |████████████████████████████████| 67.3MB 43kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.25.1
Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/d1/54/04cab6e1c0ae535bec93f795d8403fdf6caf66fa5a6512263202dbb14ea6/eli5-0.11.0-py2.py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 9.6MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.11.

In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import catboost as cb
import sklearn
import xgboost as xgb
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.decomposition import PCA, FactorAnalysis as FA
from typing import List, Optional
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold, train_test_split
from typing import List, Tuple
import scipy.stats as ss
from sklearn_extra.cluster import KMedoids
import math
from sklearn.utils.validation import check_is_fitted
import eli5
from sklearn.base import BaseEstimator, TransformerMixin
import time
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import shap
import missingno as msno
from sklearn.inspection import permutation_importance
from eli5.sklearn import PermutationImportance
import optuna
from sklearn.metrics import log_loss
# Global plotting configuration shared by every cell below.
# Register pandas' converters with matplotlib.
pd.plotting.register_matplotlib_converters()
# Render figures inline in the notebook output.
%matplotlib inline
sns.set(color_codes=True)
# Use a 10-colour "viridis" palette as the seaborn default.
pal = sns.color_palette("viridis", 10)
sns.set_palette(pal)

## Используемые функции

In [5]:
def get_input(data_path: str, base_path: str = "/content") -> pd.DataFrame:
  """
  Read a CSV file into a DataFrame and print a one-line shape summary.

  Column names are converted to lower case before the frame is returned.

  Parameters
  ----------
  data_path: str
      File name, resolved relative to ``base_path``.

  base_path: str, optional, default = "/content"
      Directory that contains the file. The default keeps the previous
      hard-coded location, so existing calls behave exactly as before.

  Returns
  -------
  data: pandas.core.frame.DataFrame
      The loaded data set with lower-cased column names.
  """
  data = pd.read_csv(f"{base_path}/{data_path}")
  data.columns = [col.lower() for col in data.columns]
  print(f"{data_path}: shape = {data.shape[0]} rows, {data.shape[1]} cols")
  return data

In [6]:
def plot_feature_importance(importance, names, model_type, figsize=(10,8)):
  """
  Draw a horizontal bar chart of feature importances, most important first.

  Parameters
  ----------
  importance: array-like
      Importance value of every feature.

  names: array-like
      Feature names, in the same order as ``importance``.

  model_type: str
      Model label that is prepended to the chart title.

  figsize: tuple, optional, default = (10, 8)
      Size of the matplotlib figure.

  Returns
  -------
  list
      Feature names ordered by decreasing importance.
  """
  # Pair each name with its importance and rank the pairs, largest importance first.
  ranked = pd.DataFrame({
      'feature_names': np.array(names),
      'feature_importance': np.array(importance),
  }).sort_values(by=['feature_importance'], ascending=False)

  plt.figure(figsize=figsize)
  sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])

  plt.title(model_type + ' FEATURE IMPORTANCE')
  plt.xlabel('FEATURE IMPORTANCE')
  plt.ylabel('FEATURE NAMES')

  return list(ranked['feature_names'].values)

In [7]:
def multi_estimators_predict(estimators: List,
                             x_valid: pd.DataFrame,
                             y_valid = None,
                             metric: callable = None,
                             scalers: List = None):
  """
  Average the predictions of several fitted estimators.

  Parameters
  ----------
  estimators: list
      Fitted models. Each model is queried with ``predict_proba``; a raw
      ``xgb.core.Booster`` (which has no ``predict_proba``) is queried with
      ``predict`` instead.

  x_valid: pandas.core.frame.DataFrame
      Feature matrix to predict on.
      NOTE(review): it is passed to ``Booster.predict`` unchanged - confirm
      that callers supply a format the booster accepts.

  y_valid: array-like, optional, default = None
      True labels. Used only for reporting, together with ``metric``.

  metric: callable, optional, default = None
      ``metric(y_true, y_pred)``. When both ``y_valid`` and ``metric`` are
      given, the score of every model and of the averaged prediction is printed.

  scalers: list, optional, default = None
      One fitted transformer per estimator; ``scalers[i].transform(x_valid)``
      is applied before estimator ``i`` predicts.

  Returns
  -------
  result: np.array
      Element-wise mean of the predictions of all estimators.
  """
  if scalers:
    assert len(estimators) == len(scalers)

  report = (y_valid is not None) and (metric is not None)
  preds, scores = [], []

  for i, estimator in enumerate(estimators):
    if scalers:
      pred = estimator.predict_proba(scalers[i].transform(x_valid))
    elif isinstance(estimator, xgb.core.Booster):
      # Decide per estimator: previously only estimators[0] was inspected, so a
      # mixed list picked the wrong prediction method.
      pred = estimator.predict(x_valid)
    else:
      pred = estimator.predict_proba(x_valid)

    preds.append(pred)
    if report:
      scores.append(metric(y_valid, pred))

  # Mean over the estimator axis.
  result = np.mean(np.asarray(preds), axis=0)

  if report:
    for i, score in enumerate(scores):
      print(f"Model {i} metric: {score:.7}")
    print(f"Result model metric: {metric(y_valid, result):.7}")

  return result

In [8]:
def catboost_cv_fit(params, X, y, cv, categorical = None):
    """
    Cross-validate a CatBoost classifier and collect out-of-fold predictions.

    Parameters
    ----------
    params: dict
        Model hyperparameters, forwarded to ``cb.CatBoostClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix used for training.

    y: pandas.core.frame.Series
        Target vector used for training.

    cv: KFold or StratifiedKFold generator.
        Splitter object that defines the cross-validation strategy.

    categorical: list, optional, default = None
        Names of the categorical features. When given, they are passed to
        ``fit`` as ``cat_features``; by default nothing is passed.

    Returns
    -------
    estimators: list
        One fitted model per fold.

    oof_preds: np.array
        Out-of-fold class probabilities, shape (n_rows, n_classes).

    """
    # Wrap the target so it can be sliced by position whatever its index is.
    y = pd.Series(y)

    estimators, folds_scores = [], []
    # One column per distinct label instead of a hard-coded 4.
    oof_preds = np.zeros((X.shape[0], y.nunique()))
    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        # cv.split yields positional indices, so slice with .iloc; label-based
        # .loc only worked when the index happened to be 0..n-1.
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)], verbose=10, early_stopping_rounds=100,
            cat_features=categorical or None,
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)
        score = metrics.log_loss(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

In [9]:
def lightgbm_cv_fit(params, X, y, cv, categorical = None):
    """
    Cross-validate a LightGBM classifier and collect out-of-fold predictions.

    Parameters
    ----------
    params: dict
        Model hyperparameters, forwarded to ``lgb.LGBMClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix used for training.

    y: pandas.core.frame.Series
        Target vector used for training.

    cv: KFold or StratifiedKFold generator.
        Splitter object that defines the cross-validation strategy.

    categorical: list, optional, default = None
        Names of the categorical features, passed to ``fit`` as
        ``categorical_feature``. When omitted, ``"auto"`` is passed.

    Returns
    -------
    estimators: list
        One fitted model per fold.

    oof_preds: np.array
        Out-of-fold class probabilities, shape (n_rows, n_classes).

    """
    if not categorical:
        categorical = "auto"

    # Wrap the target so it can be sliced by position whatever its index is.
    y = pd.Series(y)

    estimators, folds_scores = [], []
    # One column per distinct label instead of a hard-coded 4.
    oof_preds = np.zeros((X.shape[0], y.nunique()))
    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        # cv.split yields positional indices, so slice with .iloc; label-based
        # .loc only worked when the index happened to be 0..n-1.
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = lgb.LGBMClassifier(**params)
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            eval_metric="multi_logloss", verbose=10, early_stopping_rounds=50,
            categorical_feature=categorical
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)
        score = metrics.log_loss(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

In [10]:
def xgboost_cv_fit(params, X, y, cv, categorical = None):
    """
    Cross-validate an XGBoost classifier and collect out-of-fold predictions.

    Parameters
    ----------
    params: dict
        Model hyperparameters, forwarded to ``xgb.XGBClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix used for training. It is not modified: categorical
        columns are encoded on an internal copy.

    y: pandas.core.frame.Series
        Target vector used for training.

    cv: KFold or StratifiedKFold generator.
        Splitter object that defines the cross-validation strategy.

    categorical: list, optional, default = None
        Names of the categorical features to label-encode before training.
        By default no column is encoded.

    Returns
    -------
    estimators: list
        One fitted model per fold.

    encoders: dict
        Fitted ``LabelEncoder`` per categorical feature, keyed by feature name.

    oof_preds: np.array
        Out-of-fold class probabilities, shape (n_rows, n_classes).

    """
    estimators, encoders = [], {}

    # Wrap the target so it can be sliced by position whatever its index is.
    y = pd.Series(y)
    # One column per distinct label instead of a hard-coded 4.
    oof_preds = np.zeros((X.shape[0], y.nunique()))

    if categorical:
        # Imported here because the notebook's import cell does not provide
        # LabelEncoder; without it this branch raised NameError.
        from sklearn.preprocessing import LabelEncoder

        # Encode on a copy so the caller's frame is left untouched.
        X = X.copy()
        for feature in categorical:
            encoder = LabelEncoder()
            # NOTE(review): astype("str") runs first, so missing values become the
            # label "nan" and fillna("NA") never fires. Kept as is because callers
            # may encode new data with the same expression - confirm before changing.
            X[feature] = encoder.fit_transform(X[feature].astype("str").fillna("NA"))
            encoders[feature] = encoder

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        # cv.split yields positional indices, so slice with .iloc; label-based
        # .loc only worked when the index happened to be 0..n-1.
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = xgb.XGBClassifier(**params)
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            eval_metric="mlogloss",
            verbose=10,
            early_stopping_rounds=50,
        )

        oof_preds[valid_idx] = model.predict_proba(x_valid)
        score = metrics.log_loss(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        estimators.append(model)

    return estimators, encoders, oof_preds

## Загрузка данных и построение моделей

### lightgbm + optuna

In [18]:
# Baseline: 2-fold LightGBM cross-validation on a 70 % training split.
data = get_input("train.csv")
data = data.drop(columns='id')

# Encode the class labels as integers 0..3.
mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
data['target'] = data['target'].map(mapper)

lgbm_params = {
    # The key used to be "boosting_type " (trailing space), so LightGBM never
    # received the option under its real name.
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    'max_depth': 8,
    'num_class': 4,
    'num_leaves': 6,
}

train, valid = train_test_split(
    data, train_size=0.7, shuffle=True, random_state=1,
)

# Fresh 0..n-1 index on both parts.
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

y_train = train['target']
x_train = train.drop(columns=['target'])
y_valid = valid['target']
x_valid = valid.drop(columns=['target'])

dtrain = lgb.Dataset(data=x_train, label=y_train)
dvalid = lgb.Dataset(data=x_valid, label=y_valid)

# `result` maps "multi_logloss-mean" / "multi_logloss-stdv" to per-round values.
result = lgb.cv(
    params=lgbm_params,
    train_set=dtrain,
    num_boost_round=1000,
    early_stopping_rounds=100,
    verbose_eval=10,
    stratified=True,
    seed=42,
    metrics="multi_logloss",
    shuffle=True,
    nfold=2
)

train.csv: shape = 100000 rows, 52 cols


In [26]:
# Pessimistic CV score: mean multi-logloss at the last recorded round plus its standard deviation across folds.
result['multi_logloss-mean'][-1] + result['multi_logloss-stdv'][-1]

1.098969788749625

In [58]:
def objective(trial):
    """
    Optuna objective: hold-out multi-class log loss of a LightGBM booster.

    The training file is re-read on every call, split 65 % / 35 % with a fixed
    seed (so all trials are scored on the same validation rows), and a booster
    is trained with hyperparameters sampled from ``trial``.

    Parameters
    ----------
    trial: optuna trial
        Supplies the sampled hyperparameters and receives pruning reports.

    Returns
    -------
    float
        Log loss of the predicted class probabilities on the validation split.
    """
    data = get_input("train.csv")
    data.drop(columns='id', inplace=True)
    mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
    data['target'] = data['target'].map(mapper)
    target = data['target']
    data = data.drop(columns=['target'])

    # Fixed seed: without it every trial was evaluated on a different split.
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.35, random_state=42
    )
    dtrain = lgb.Dataset(train_x, label=train_y)
    dvalid = lgb.Dataset(valid_x, label=valid_y)

    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        'num_class': 4,

        'max_depth': trial.suggest_int('max_depth', 3,7),
        # NOTE(review): the lower bound 0 admits a zero learning rate - consider a small positive bound.
        'learning_rate': trial.suggest_uniform('learning_rate', 0, 0.13 ),

    }

    # The callback reports the validation metric to the trial so it can be pruned.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "multi_logloss")
    gbm = lgb.train(
        param, dtrain, valid_sets=[dvalid], verbose_eval=False, callbacks=[pruning_callback]
    )

    preds = gbm.predict(valid_x)

    # Returned directly: a local called `log_loss` shadowed the imported function.
    return sklearn.metrics.log_loss(valid_y, preds)

In [75]:
def objective(trial):
    """
    Optuna objective: pessimistic 5-fold CV log loss of LightGBM.

    The training file is re-read on every call and split 65 % / 35 % with a
    fixed seed; ``lgb.cv`` is then run on the 65 % part with hyperparameters
    sampled from ``trial``.

    This cell used to rebind the notebook-wide ``lgb`` alias to
    ``optuna.integration.lightgbm``, which silently changed what every later
    ``lgb.*`` call meant. The function now uses the ``lgb`` imported at the top
    of the notebook. It also no longer builds a pruning callback that was never
    passed to ``lgb.cv``; no intermediate values are reported to the trial.

    Parameters
    ----------
    trial: optuna trial
        Supplies the sampled hyperparameters.

    Returns
    -------
    float
        Mean multi-logloss at the last recorded boosting round plus its
        standard deviation across folds.
    """
    data = get_input("train.csv")
    data.drop(columns='id', inplace=True)
    mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
    data['target'] = data['target'].map(mapper)
    target = data['target']
    data = data.drop(columns=['target'])

    # Fixed seed: without it every trial cross-validated a different subset.
    # The validation part is not used by lgb.cv below.
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.35, random_state=42
    )
    dtrain = lgb.Dataset(train_x, label=train_y)

    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        'num_class': 4,

        'max_depth': trial.suggest_int('max_depth', 3,7),
        # NOTE(review): the lower bound 0 admits a zero learning rate - consider a small positive bound.
        'learning_rate': trial.suggest_uniform('learning_rate', 0, 0.13 ),
    }

    result = lgb.cv(
      params=param,
      train_set=dtrain,
      num_boost_round=10000,
      early_stopping_rounds=100,
      verbose_eval=10,
      stratified=True,
      seed=42,
      metrics="multi_logloss",
      shuffle=True,
      nfold=5
    )

    # Penalise unstable configurations: mean plus one standard deviation.
    return result['multi_logloss-mean'][-1] + result['multi_logloss-stdv'][-1]

In [None]:
# Minimise `objective` over 200 trials; the median pruner starts after 20 warm-up steps.
study = optuna.create_study(
    direction="minimize",
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=20),
)
study.optimize(objective, n_trials=200)

# Report the hyperparameters of the best trial.
trial = study.best_trial
print("Best trial:")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [53]:
# Show Optuna's optimization-history plot for the finished study.
optuna.visualization.plot_optimization_history(study)

In [44]:
# Show Optuna's slice plot for the finished study.
optuna.visualization.plot_slice(study)

In [67]:
# Show Optuna's hyperparameter-importance plot for the finished study.
optuna.visualization.plot_param_importances(study)

In [46]:
# Print the hyperparameters of the best trial.
# NOTE(review): the saved output lacks max_depth / learning_rate, so it presumably
# comes from an earlier version of `objective` - re-run to confirm.
print(study.best_params)

{'lambda_l1': 1.45120566383297e-06, 'lambda_l2': 0.003368552965821498, 'num_leaves': 32, 'feature_fraction': 0.4471131328810426, 'bagging_fraction': 0.5888198372400193, 'bagging_freq': 4, 'min_child_samples': 32}


In [47]:
# Best objective value found by the study (the study direction is "minimize").
study.best_value

1.0833801288748757

In [35]:
# Make sure `lgb` refers to plain LightGBM before the models below are built.
import lightgbm as lgb

# Load the three competition files; the id column of the training set is not a feature.
data = get_input("train.csv").drop(columns='id')
test = get_input("test.csv")
sample_submission = get_input("sample_submission.csv")

# Encode the class labels as integers 0..3.
mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
data['target'] = data['target'].map(mapper)

# 70 % / 30 % hold-out split, each part re-indexed from 0.
train, valid = (
    part.reset_index(drop=True)
    for part in train_test_split(data, train_size=0.7, shuffle=True, random_state=1)
)

x_train, y_train = train.drop(columns=['target']), train['target']
x_valid, y_valid = valid.drop(columns=['target']), valid['target']

train.csv: shape = 100000 rows, 52 cols
test.csv: shape = 50000 rows, 51 cols
sample_submission.csv: shape = 50000 rows, 5 cols


In [36]:
# Hyperparameters for the 7-fold LightGBM run.
# An earlier parameter set that was assigned here and overwritten on the very
# next statement has been removed: it never reached the model.
lgbm_params = {
    # The key used to be "boosting_type " (trailing space), so the option was
    # passed to the model under an unknown name.
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    'n_estimators': 5000,
    'reg_alpha': 11.159097442782404,
    'reg_lambda': 6.133258909357973e-05,
    'colsample_bytree': 0.3,
    'subsample': 0.40110232869776463,
    'learning_rate': 0.07591597509135133,
    # Alternative values kept for reference: max_depth 92, num_leaves 867.
    'max_depth': 8,
    'num_leaves': 6,
    'min_child_samples': 165,
    'min_child_weight': 0.0029014788191160327,
    'cat_smooth': 79,
    'cat_l2': 3,
    'random_state': 42,
}

cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

# One fitted model per fold plus the out-of-fold probabilities.
lgb_estimators, lgb_oof = lightgbm_cv_fit(
    lgbm_params, x_train, y_train, cv, #categorical=categorical_feature_names
)

Wed May 19 18:56:13 2021, Cross-Validation, 70000 rows, 50 cols
Training until validation scores don't improve for 50 rounds.
[10]	valid_0's multi_logloss: 1.115
[20]	valid_0's multi_logloss: 1.11173
[30]	valid_0's multi_logloss: 1.10944
[40]	valid_0's multi_logloss: 1.10752
[50]	valid_0's multi_logloss: 1.10572
[60]	valid_0's multi_logloss: 1.10412
[70]	valid_0's multi_logloss: 1.1028
[80]	valid_0's multi_logloss: 1.10162
[90]	valid_0's multi_logloss: 1.10055
[100]	valid_0's multi_logloss: 1.09961
[110]	valid_0's multi_logloss: 1.09884
[120]	valid_0's multi_logloss: 1.09813
[130]	valid_0's multi_logloss: 1.09751
[140]	valid_0's multi_logloss: 1.09696
[150]	valid_0's multi_logloss: 1.09646
[160]	valid_0's multi_logloss: 1.09598
[170]	valid_0's multi_logloss: 1.09556
[180]	valid_0's multi_logloss: 1.09526
[190]	valid_0's multi_logloss: 1.09492
[200]	valid_0's multi_logloss: 1.09458
[210]	valid_0's multi_logloss: 1.0942
[220]	valid_0's multi_logloss: 1.09394
[230]	valid_0's multi_logloss

In [37]:
# Log loss of the LightGBM out-of-fold predictions against the training targets.
print(f"Out of fold log loss {metrics.log_loss(y_train, lgb_oof)}")

Out of fold log loss 1.0927355848269298


In [38]:
# Average the fold models on the hold-out split; prints the log loss per model and for the average.
result_lgb_valid = multi_estimators_predict(lgb_estimators, x_valid, y_valid, metrics.log_loss)

Model 0 metric: 1.090214
Model 1 metric: 1.089329
Model 2 metric: 1.089982
Model 3 metric: 1.090136
Model 4 metric: 1.090564
Model 5 metric: 1.089468
Model 6 metric: 1.090378
Result model metric: 1.089295


### Catboost

In [27]:
def objective(trial):
    """
    Optuna objective: hold-out multi-class log loss of a CatBoost classifier.

    The training file is re-read on every call, split 65 % / 35 % with a fixed
    seed (so all trials are scored on the same validation rows), and a
    classifier is fitted with hyperparameters sampled from ``trial``.

    No pruning callback is used: the previous code built a LightGBM callback
    that was never attached to the CatBoost model.

    Parameters
    ----------
    trial: optuna trial
        Supplies the sampled hyperparameters.

    Returns
    -------
    float
        Log loss of the predicted class probabilities on the validation split.
    """
    data = get_input("train.csv")
    data.drop(columns='id', inplace=True)
    mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
    data['target'] = data['target'].map(mapper)
    target = data['target']
    data = data.drop(columns=['target'])

    # Fixed seed: without it every trial was evaluated on a different split.
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.35, random_state=42
    )

    param = {
        "loss_function": "MultiClass",
        "eval_metric": "MultiClass",
        "task_type": "GPU",

        #"colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        #"subsample": trial.suggest_float("subsample", 0.1, 1),

        'min_data_in_leaf': 25,
        'depth': trial.suggest_int('depth', 3,7),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.13 ),
        'random_strength' :trial.suggest_int('random_strength', 0, 100),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-8, 100),
        #'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00), # for bayesian bootstrap only

        #'bootstrap_type': 'Bernoulli',
        #'leaf_estimation_method': 'Newton',

        'grow_policy': "SymmetricTree",
    }

    gbm = cb.CatBoostClassifier(**param)

    gbm.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0, early_stopping_rounds=100)
    preds = gbm.predict_proba(valid_x)

    # Returned directly: a local called `log_loss` shadowed the imported function.
    return sklearn.metrics.log_loss(valid_y, preds)

In [None]:
# Minimise `objective` over 200 trials; the median pruner starts after 20 warm-up steps.
study = optuna.create_study(
    direction="minimize",
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=20),
)
study.optimize(objective, n_trials=200)

# Report the hyperparameters of the best trial.
trial = study.best_trial
print("Best trial:")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [34]:
# Show Optuna's hyperparameter-importance plot for the finished study.
optuna.visualization.plot_param_importances(study)

In [31]:
# Load the three competition files; the id column of the training set is not a feature.
data = get_input("train.csv").drop(columns='id')
test = get_input("test.csv")
sample_submission = get_input("sample_submission.csv")

# Encode the class labels as integers 0..3.
mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
data['target'] = data['target'].map(mapper)

# 70 % / 30 % hold-out split, each part re-indexed from 0.
train, valid = (
    part.reset_index(drop=True)
    for part in train_test_split(data, train_size=0.7, shuffle=True, random_state=1)
)

x_train, y_train = train.drop(columns=['target']), train['target']
x_valid, y_valid = valid.drop(columns=['target']), valid['target']

train.csv: shape = 100000 rows, 52 cols
test.csv: shape = 50000 rows, 51 cols
sample_submission.csv: shape = 50000 rows, 5 cols


In [32]:
# Hyperparameters for the 7-fold CatBoost run.
cb_params = dict(
    n_estimators=5000,
    loss_function="MultiClass",
    eval_metric="MultiClass",
    task_type="GPU",
    learning_rate=0.04992656195498482,
    reg_lambda=37.77866189042851,
    subsample=0.6044483085727145,
    random_strength=0.6855495663972144,
    depth=3,
    min_data_in_leaf=25,
    leaf_estimation_iterations=1,
    bootstrap_type='Bernoulli',
    leaf_estimation_method='Newton',
    random_state=42,
)

cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=435)

# One fitted model per fold plus the out-of-fold probabilities.
cb_estimators, cb_oof = catboost_cv_fit(cb_params, x_train, y_train, cv)

Wed May 19 18:54:33 2021, Cross-Validation, 70000 rows, 50 cols




0:	learn: 1.3572517	test: 1.3571211	best: 1.3571211 (0)	total: 13.6ms	remaining: 1m 7s
10:	learn: 1.2011189	test: 1.2005878	best: 1.2005878 (10)	total: 88ms	remaining: 39.9s
20:	learn: 1.1466094	test: 1.1460703	best: 1.1460703 (20)	total: 180ms	remaining: 42.7s
30:	learn: 1.1251125	test: 1.1245359	best: 1.1245359 (30)	total: 257ms	remaining: 41.2s
40:	learn: 1.1161059	test: 1.1156166	best: 1.1156166 (40)	total: 329ms	remaining: 39.8s
50:	learn: 1.1119770	test: 1.1115263	best: 1.1115263 (50)	total: 406ms	remaining: 39.4s
60:	learn: 1.1096509	test: 1.1093131	best: 1.1093131 (60)	total: 492ms	remaining: 39.8s
70:	learn: 1.1082383	test: 1.1079554	best: 1.1079554 (70)	total: 565ms	remaining: 39.2s
80:	learn: 1.1071259	test: 1.1069030	best: 1.1069030 (80)	total: 638ms	remaining: 38.8s
90:	learn: 1.1062570	test: 1.1060874	best: 1.1060874 (90)	total: 716ms	remaining: 38.6s
100:	learn: 1.1053605	test: 1.1052913	best: 1.1052913 (100)	total: 795ms	remaining: 38.6s
110:	learn: 1.1045954	test: 1.10



0:	learn: 1.3571219	test: 1.3571545	best: 1.3571545 (0)	total: 8.54ms	remaining: 42.7s
10:	learn: 1.2007522	test: 1.2010643	best: 1.2010643 (10)	total: 81.6ms	remaining: 37s
20:	learn: 1.1461645	test: 1.1467449	best: 1.1467449 (20)	total: 168ms	remaining: 39.8s
30:	learn: 1.1246133	test: 1.1254315	best: 1.1254315 (30)	total: 241ms	remaining: 38.6s
40:	learn: 1.1155451	test: 1.1166310	best: 1.1166310 (40)	total: 312ms	remaining: 37.7s
50:	learn: 1.1113720	test: 1.1128158	best: 1.1128158 (50)	total: 379ms	remaining: 36.7s
60:	learn: 1.1091908	test: 1.1109032	best: 1.1109032 (60)	total: 451ms	remaining: 36.5s
70:	learn: 1.1078227	test: 1.1097061	best: 1.1097061 (70)	total: 518ms	remaining: 35.9s
80:	learn: 1.1066419	test: 1.1088951	best: 1.1088951 (80)	total: 586ms	remaining: 35.6s
90:	learn: 1.1056977	test: 1.1081584	best: 1.1081584 (90)	total: 666ms	remaining: 35.9s
100:	learn: 1.1048339	test: 1.1074312	best: 1.1074312 (100)	total: 734ms	remaining: 35.6s
110:	learn: 1.1040384	test: 1.10



0:	learn: 1.3573331	test: 1.3573251	best: 1.3573251 (0)	total: 8.37ms	remaining: 41.9s
10:	learn: 1.2009488	test: 1.2013142	best: 1.2013142 (10)	total: 76.3ms	remaining: 34.6s
20:	learn: 1.1462733	test: 1.1468265	best: 1.1468265 (20)	total: 177ms	remaining: 41.9s
30:	learn: 1.1247577	test: 1.1254875	best: 1.1254875 (30)	total: 268ms	remaining: 43s
40:	learn: 1.1156738	test: 1.1165044	best: 1.1165044 (40)	total: 336ms	remaining: 40.6s
50:	learn: 1.1114922	test: 1.1126024	best: 1.1126024 (50)	total: 404ms	remaining: 39.2s
60:	learn: 1.1091977	test: 1.1105763	best: 1.1105763 (60)	total: 478ms	remaining: 38.7s
70:	learn: 1.1077879	test: 1.1093271	best: 1.1093271 (70)	total: 550ms	remaining: 38.2s
80:	learn: 1.1066971	test: 1.1083020	best: 1.1083020 (80)	total: 617ms	remaining: 37.5s
90:	learn: 1.1058059	test: 1.1075443	best: 1.1075443 (90)	total: 688ms	remaining: 37.1s
100:	learn: 1.1050992	test: 1.1068318	best: 1.1068318 (100)	total: 758ms	remaining: 36.8s
110:	learn: 1.1042629	test: 1.10



0:	learn: 1.3571322	test: 1.3570889	best: 1.3570889 (0)	total: 7.36ms	remaining: 36.8s
10:	learn: 1.2008469	test: 1.2008311	best: 1.2008311 (10)	total: 70.1ms	remaining: 31.8s
20:	learn: 1.1463359	test: 1.1465278	best: 1.1465278 (20)	total: 141ms	remaining: 33.4s
30:	learn: 1.1248117	test: 1.1252808	best: 1.1252808 (30)	total: 229ms	remaining: 36.8s
40:	learn: 1.1158182	test: 1.1163665	best: 1.1163665 (40)	total: 292ms	remaining: 35.3s
50:	learn: 1.1116444	test: 1.1122930	best: 1.1122930 (50)	total: 355ms	remaining: 34.4s
60:	learn: 1.1094083	test: 1.1100981	best: 1.1100981 (60)	total: 420ms	remaining: 34s
70:	learn: 1.1079533	test: 1.1086542	best: 1.1086542 (70)	total: 495ms	remaining: 34.3s
80:	learn: 1.1068509	test: 1.1076933	best: 1.1076933 (80)	total: 556ms	remaining: 33.8s
90:	learn: 1.1059729	test: 1.1069651	best: 1.1069651 (90)	total: 617ms	remaining: 33.3s
100:	learn: 1.1052335	test: 1.1062563	best: 1.1062563 (100)	total: 679ms	remaining: 32.9s
110:	learn: 1.1043919	test: 1.10



0:	learn: 1.3571689	test: 1.3572625	best: 1.3572625 (0)	total: 7.51ms	remaining: 37.5s
10:	learn: 1.2007922	test: 1.2012711	best: 1.2012711 (10)	total: 75.4ms	remaining: 34.2s
20:	learn: 1.1462438	test: 1.1469089	best: 1.1469089 (20)	total: 144ms	remaining: 34.2s
30:	learn: 1.1248206	test: 1.1256273	best: 1.1256273 (30)	total: 212ms	remaining: 33.9s
40:	learn: 1.1157879	test: 1.1166930	best: 1.1166930 (40)	total: 273ms	remaining: 33s
50:	learn: 1.1116802	test: 1.1126684	best: 1.1126684 (50)	total: 334ms	remaining: 32.5s
60:	learn: 1.1094699	test: 1.1105489	best: 1.1105489 (60)	total: 401ms	remaining: 32.4s
70:	learn: 1.1079650	test: 1.1092886	best: 1.1092886 (70)	total: 467ms	remaining: 32.4s
80:	learn: 1.1068784	test: 1.1083830	best: 1.1083830 (80)	total: 527ms	remaining: 32s
90:	learn: 1.1059768	test: 1.1076502	best: 1.1076502 (90)	total: 588ms	remaining: 31.7s
100:	learn: 1.1051878	test: 1.1070056	best: 1.1070056 (100)	total: 649ms	remaining: 31.5s
110:	learn: 1.1043747	test: 1.1063



0:	learn: 1.3571547	test: 1.3573180	best: 1.3573180 (0)	total: 8.33ms	remaining: 41.6s
10:	learn: 1.2007440	test: 1.2013859	best: 1.2013859 (10)	total: 75.4ms	remaining: 34.2s
20:	learn: 1.1461392	test: 1.1473464	best: 1.1473464 (20)	total: 150ms	remaining: 35.5s
30:	learn: 1.1245980	test: 1.1261084	best: 1.1261084 (30)	total: 227ms	remaining: 36.4s
40:	learn: 1.1155316	test: 1.1173799	best: 1.1173799 (40)	total: 302ms	remaining: 36.5s
50:	learn: 1.1113521	test: 1.1135167	best: 1.1135167 (50)	total: 369ms	remaining: 35.8s
60:	learn: 1.1090594	test: 1.1114246	best: 1.1114246 (60)	total: 445ms	remaining: 36s
70:	learn: 1.1077279	test: 1.1102334	best: 1.1102334 (70)	total: 510ms	remaining: 35.4s
80:	learn: 1.1066318	test: 1.1093990	best: 1.1093990 (80)	total: 579ms	remaining: 35.2s
90:	learn: 1.1056484	test: 1.1086289	best: 1.1086289 (90)	total: 653ms	remaining: 35.2s
100:	learn: 1.1048660	test: 1.1081355	best: 1.1081355 (100)	total: 726ms	remaining: 35.2s
110:	learn: 1.1040763	test: 1.10



0:	learn: 1.3571240	test: 1.3571084	best: 1.3571084 (0)	total: 8.38ms	remaining: 41.9s
10:	learn: 1.2008124	test: 1.2009263	best: 1.2009263 (10)	total: 88.3ms	remaining: 40.1s
20:	learn: 1.1462413	test: 1.1467803	best: 1.1467803 (20)	total: 167ms	remaining: 39.7s
30:	learn: 1.1248010	test: 1.1254342	best: 1.1254342 (30)	total: 260ms	remaining: 41.7s
40:	learn: 1.1157469	test: 1.1165163	best: 1.1165163 (40)	total: 329ms	remaining: 39.8s
50:	learn: 1.1115549	test: 1.1125670	best: 1.1125670 (50)	total: 403ms	remaining: 39.1s
60:	learn: 1.1092816	test: 1.1105470	best: 1.1105470 (60)	total: 479ms	remaining: 38.8s
70:	learn: 1.1079509	test: 1.1093200	best: 1.1093200 (70)	total: 547ms	remaining: 37.9s
80:	learn: 1.1067630	test: 1.1081793	best: 1.1081793 (80)	total: 615ms	remaining: 37.4s
90:	learn: 1.1058638	test: 1.1074137	best: 1.1074137 (90)	total: 694ms	remaining: 37.4s
100:	learn: 1.1050174	test: 1.1067195	best: 1.1067195 (100)	total: 761ms	remaining: 36.9s
110:	learn: 1.1042057	test: 1.

In [39]:
# Log loss of the CatBoost out-of-fold predictions against the training targets.
print(f"Out of fold log loss {metrics.log_loss(y_train, cb_oof)}")

Out of fold log loss 1.0928103271360072


In [40]:
# Average the fold models on the hold-out split; prints the log loss per model and for the average.
result_cb_valid = multi_estimators_predict(cb_estimators, x_valid, y_valid, metrics.log_loss)

Model 0 metric: 1.090869
Model 1 metric: 1.090986
Model 2 metric: 1.090854
Model 3 metric: 1.090863
Model 4 metric: 1.091755
Model 5 metric: 1.090482
Model 6 metric: 1.09084
Result model metric: 1.090395


### XGBoost

In [None]:
def objective(trial):
    """Optuna objective: hold-out multiclass log loss of an XGBoost booster.

    Loads ``train.csv`` through ``get_input``, maps the four class labels to
    0..3, makes a random 65/35 train/validation split and trains a native
    XGBoost booster with the ``max_depth`` and ``learning_rate`` suggested by
    ``trial``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyper-parameters and to report intermediate
        validation scores for pruning.

    Returns
    -------
    float
        ``sklearn.metrics.log_loss`` of the booster on the validation part
        (lower is better).
    """
    data = get_input("train.csv").drop(columns='id')
    mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
    target = data['target'].map(mapper)
    features = data.drop(columns=['target'])

    train_x, valid_x, train_y, valid_y = train_test_split(features, target, test_size=0.35)
    # XGBoost's native container is DMatrix (xgboost has no `Dataset` class).
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(valid_x, label=valid_y)

    param = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        # XGBoost accepts verbosity 0..3; 0 is silent (-1 is a LightGBM value).
        'verbosity': 0,
        'num_class': 4,

        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'learning_rate': trial.suggest_uniform('learning_rate', 0, 0.13),
    }

    # The observation key must be "<eval name>-<metric>" as reported by
    # xgb.train for the `evals` entry below.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-mlogloss")
    gbm = xgb.train(
        param,
        dtrain,
        # 100 rounds: xgb.train defaults to only 10, which would end before
        # the 20-step pruner warm-up configured for the study.
        num_boost_round=100,
        evals=[(dvalid, "validation")],
        verbose_eval=False,
        callbacks=[pruning_callback],
    )

    # With multi:softprob the booster returns one probability column per class.
    preds = gbm.predict(dvalid)
    log_loss = sklearn.metrics.log_loss(valid_y, preds)
    return log_loss

In [None]:
# Run the hyper-parameter search: 200 trials, minimising the objective, with
# median pruning that starts after 20 warm-up steps.
pruner = optuna.pruners.MedianPruner(n_warmup_steps=20)
study = optuna.create_study(pruner=pruner, direction="minimize")
study.optimize(objective, n_trials=200)

print("Best trial:")
trial = study.best_trial

# Report the winning hyper-parameters one per line.
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

In [41]:
# Hold-out experiment for XGBoost: train.csv is split 70/30, 7-fold CV is run
# on the 70% part and the 30% part is kept aside as x_valid / y_valid.
data = get_input("train.csv")
test = get_input("test.csv")
sample_submission = get_input("sample_submission.csv")

data.drop(columns='id', inplace=True)

# Encode the class labels as integers 0..3.
mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
data['target'] = data['target'].map(mapper)

train, valid = train_test_split(
    data, train_size=0.7, shuffle=True, random_state=1,
)

train.reset_index(inplace=True, drop=True)
valid.reset_index(inplace=True, drop=True)

y_train = train['target']
x_train = train.drop(columns=['target'])
y_valid = valid['target']
x_valid = valid.drop(columns=['target'])

# A baseline parameter dict used to be assigned here and was overwritten by
# the dict below before ever being used; that dead assignment is removed.
xgb_params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'random_state': 13,
    'learning_rate': 0.08356451010151393,
    'gamma': 0.02911685058980812,
    'max_depth': 3,
    'min_child_weight': 10.748514454096288,
    'max_delta_step': 2.4474818433727927,
    'subsample': 0.6445037550866027,
    'colsample_bytree': 0.07634753656242108,
    'lambda': 13.663280761461781,
    'alpha': 21.521205761694137,
    'max_leaves': 48,
    'n_estimators': 2000,
    'num_class': 4,
    'tree_method': 'gpu_hist'  # requires a GPU runtime
}

cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

xgb_estimators, xgb_encoders, xgb_oof = xgboost_cv_fit(
    xgb_params, x_train, y_train, cv,
)

train.csv: shape = 100000 rows, 52 cols
test.csv: shape = 50000 rows, 51 cols
sample_submission.csv: shape = 50000 rows, 5 cols
Wed May 19 18:59:54 2021, Cross-Validation, 70000 rows, 50 cols
[0]	validation_0-mlogloss:1.35369
Will train until validation_0-mlogloss hasn't improved in 50 rounds.
[10]	validation_0-mlogloss:1.18545
[20]	validation_0-mlogloss:1.13539
[30]	validation_0-mlogloss:1.11952
[40]	validation_0-mlogloss:1.11341
[50]	validation_0-mlogloss:1.11038
[60]	validation_0-mlogloss:1.1083
[70]	validation_0-mlogloss:1.10684
[80]	validation_0-mlogloss:1.1058
[90]	validation_0-mlogloss:1.1047
[100]	validation_0-mlogloss:1.1034
[110]	validation_0-mlogloss:1.10248
[120]	validation_0-mlogloss:1.10168
[130]	validation_0-mlogloss:1.10102
[140]	validation_0-mlogloss:1.10025
[150]	validation_0-mlogloss:1.09976
[160]	validation_0-mlogloss:1.09919
[170]	validation_0-mlogloss:1.09868
[180]	validation_0-mlogloss:1.09815
[190]	validation_0-mlogloss:1.09748
[200]	validation_0-mlogloss:1.0969

In [42]:
# Out-of-fold multiclass log loss of the XGBoost CV predictions on the training split.
xgb_oof_log_loss = metrics.log_loss(y_train, xgb_oof)
print(f"Out of fold log loss {xgb_oof_log_loss}")

Out of fold log loss 1.0931592292917518


In [43]:
# Score the XGBoost fold models on the hold-out split with log loss and keep
# the combined prediction returned by the helper.
result_xgb_valid = multi_estimators_predict(
    xgb_estimators, x_valid, y_valid, metrics.log_loss,
)

Model 0 metric: 1.090678
Model 1 metric: 1.09052
Model 2 metric: 1.091207
Model 3 metric: 1.091059
Model 4 metric: 1.091386
Model 5 metric: 1.090657
Model 6 metric: 1.091556
Result model metric: 1.090071


## Валидация ансамблей

In [182]:
# Hold-out log loss of the CatBoost ensemble prediction.
metrics.log_loss(y_true=y_valid, y_pred=result_cb_valid)

1.0900329580008001

In [183]:
# Hold-out log loss of the XGBoost ensemble prediction.
metrics.log_loss(y_true=y_valid, y_pred=result_xgb_valid)

1.090070791935424

In [185]:
# Hold-out log loss of the LightGBM ensemble prediction.
metrics.log_loss(y_true=y_valid, y_pred=result_lgb_valid)

1.089295294594471

In [187]:
# Unweighted average of the LightGBM and XGBoost hold-out probabilities:
# the two prediction matrices are stacked along a third axis and averaged over it.
result_lgb_xgb_valid = np.dstack((result_lgb_valid, result_xgb_valid))
result_lgb_xgb_valid_mean = result_lgb_xgb_valid.mean(axis=2)
lgb_xgb_blend_loss = metrics.log_loss(y_valid, result_lgb_xgb_valid_mean)
print(f"result_lgb_xgb: {lgb_xgb_blend_loss}")

result_lgb_xgb: 1.0891426892327547


In [188]:
# Unweighted average of the LightGBM and CatBoost hold-out probabilities.
result_lgb_cb_valid = np.dstack((result_lgb_valid, result_cb_valid))
result_lgb_cb_valid_mean = result_lgb_cb_valid.mean(axis=2)
lgb_cb_blend_loss = metrics.log_loss(y_valid, result_lgb_cb_valid_mean)
print(f"result_lgb_cb: {lgb_cb_blend_loss}")

result_lgb_cb: 1.0892798863568327


In [189]:
# Unweighted average of the XGBoost and CatBoost hold-out probabilities.
result_xgb_cb_valid = np.dstack((result_xgb_valid, result_cb_valid))
result_xgb_cb_valid_mean = result_xgb_cb_valid.mean(axis=2)
xgb_cb_blend_loss = metrics.log_loss(y_valid, result_xgb_cb_valid_mean)
print(f"result_xgb_cb: {xgb_cb_blend_loss}")

result_xgb_cb: 1.0896947421802183


In [190]:
# Unweighted average of all three ensembles' hold-out probabilities.
result_lgb_xgb_cb_valid = np.dstack(
    (result_lgb_valid, result_xgb_valid, result_cb_valid)
)
result_lgb_xgb_cb_valid_mean = result_lgb_xgb_cb_valid.mean(axis=2)
lgb_xgb_cb_blend_loss = metrics.log_loss(y_valid, result_lgb_xgb_cb_valid_mean)
print(f"result_lgb_xgb_cb: {lgb_xgb_cb_blend_loss}")

result_lgb_xgb_cb: 1.0892300999555697


Вывод: лучшее качество у ансамбля lightgbm и xgboost по среднему

## Модель второго уровня (логистическая регрессия на ансамблях)

In [44]:
# Second-level feature table: the hold-out class probabilities of the three
# ensembles side by side, with columns prefixed by the producing library.
result_lgbm_df = pd.DataFrame(result_lgb_valid).rename(
    columns={idx: f'lgb_class_{idx + 1}' for idx in range(4)}
)
result_xgb_df = pd.DataFrame(result_xgb_valid).rename(
    columns={idx: f'xgb_class_{idx + 1}' for idx in range(4)}
)
result_cb_df = pd.DataFrame(result_cb_valid).rename(
    columns={idx: f'cb_class_{idx + 1}' for idx in range(4)}
)
result_valid = pd.concat([result_lgbm_df, result_xgb_df, result_cb_df], axis=1)

In [45]:
# Split the hold-out ensemble predictions 70/30 so the second-level model can
# be fitted on one part and evaluated on the other.
x_train_logreg, x_valid_logreg, y_train_logreg, y_valid_logreg = train_test_split(
    result_valid,
    y_valid,
    train_size=0.7,
    shuffle=True,
    random_state=1,
)

# Multinomial logistic regression settings shared by the stacking cells.
params = dict(
    random_state=0,
    max_iter=300,
    multi_class='multinomial',
    solver='lbfgs',
)

logreg = LogisticRegression(**params)
logreg.fit(x_train_logreg, y_train_logreg)

LogisticRegression(max_iter=300, multi_class='multinomial', random_state=0)

In [46]:
# Log loss of the second-level model on the part of the hold-out it was not fitted on.
y_pred_logreg = logreg.predict_proba(x_valid_logreg)
metrics.log_loss(y_true=y_valid_logreg, y_pred=y_pred_logreg)

1.0863465268415893

Вывод: наилучший результат при построении модели второго уровня (логистической регрессии) на предсказаниях ансамблей бустинговых алгоритмов

In [47]:
# Fit on the ensembles' predictions for the whole validation dataset; this
# model will be used to obtain the final result on the test set.
logreg_ensamble_for_test = LogisticRegression(**params)
logreg_ensamble_for_test.fit(X=result_valid, y=y_valid)

LogisticRegression(max_iter=300, multi_class='multinomial', random_state=0)

## Модель второго уровня (логистическая регрессия на предсказаниях каждого отдельного алгоритма бустинга)

In [211]:
# Hold-out class probabilities of each of the seven LightGBM fold models, kept
# per model (not averaged) and wrapped in frames with fold-specific column names.
lgb_fold_probas = [lgb_estimators[fold].predict_proba(x_valid) for fold in range(7)]
(
    lgb_res_0, lgb_res_1, lgb_res_2, lgb_res_3,
    lgb_res_4, lgb_res_5, lgb_res_6,
) = lgb_fold_probas

lgb_fold_frames = [
    pd.DataFrame(proba).rename(
        columns={cls: f'lgb_{fold}_class_{cls + 1}' for cls in range(4)}
    )
    for fold, proba in enumerate(lgb_fold_probas)
]
(
    lgb_res_0_df, lgb_res_1_df, lgb_res_2_df, lgb_res_3_df,
    lgb_res_4_df, lgb_res_5_df, lgb_res_6_df,
) = lgb_fold_frames

In [212]:
# Hold-out class probabilities of each of the seven XGBoost fold models, kept
# per model (not averaged) and wrapped in frames with fold-specific column names.
xgb_fold_probas = [xgb_estimators[fold].predict_proba(x_valid) for fold in range(7)]
(
    xgb_res_0, xgb_res_1, xgb_res_2, xgb_res_3,
    xgb_res_4, xgb_res_5, xgb_res_6,
) = xgb_fold_probas

xgb_fold_frames = [
    pd.DataFrame(proba).rename(
        columns={cls: f'xgb_{fold}_class_{cls + 1}' for cls in range(4)}
    )
    for fold, proba in enumerate(xgb_fold_probas)
]
(
    xgb_res_0_df, xgb_res_1_df, xgb_res_2_df, xgb_res_3_df,
    xgb_res_4_df, xgb_res_5_df, xgb_res_6_df,
) = xgb_fold_frames

In [213]:
# Hold-out class probabilities of each of the seven CatBoost fold models, kept
# per model (not averaged) and wrapped in frames with fold-specific column names.
cb_fold_probas = [cb_estimators[fold].predict_proba(x_valid) for fold in range(7)]
(
    cb_res_0, cb_res_1, cb_res_2, cb_res_3,
    cb_res_4, cb_res_5, cb_res_6,
) = cb_fold_probas

cb_fold_frames = [
    pd.DataFrame(proba).rename(
        columns={cls: f'cb_{fold}_class_{cls + 1}' for cls in range(4)}
    )
    for fold, proba in enumerate(cb_fold_probas)
]
(
    cb_res_0_df, cb_res_1_df, cb_res_2_df, cb_res_3_df,
    cb_res_4_df, cb_res_5_df, cb_res_6_df,
) = cb_fold_frames

In [220]:
# Second-level feature table built from every individual fold model:
# CatBoost columns first, then LightGBM, then XGBoost.
cb_frames = [
    cb_res_0_df, cb_res_1_df, cb_res_2_df, cb_res_3_df,
    cb_res_4_df, cb_res_5_df, cb_res_6_df,
]
lgb_frames = [
    lgb_res_0_df, lgb_res_1_df, lgb_res_2_df, lgb_res_3_df,
    lgb_res_4_df, lgb_res_5_df, lgb_res_6_df,
]
xgb_frames = [
    xgb_res_0_df, xgb_res_1_df, xgb_res_2_df, xgb_res_3_df,
    xgb_res_4_df, xgb_res_5_df, xgb_res_6_df,
]
result_valid_each = pd.concat(cb_frames + lgb_frames + xgb_frames, axis=1)

In [221]:
# Same stacking experiment as for the ensemble table, but on the per-fold-model
# predictions: 70/30 split of the hold-out, then a multinomial logistic regression.
x_train_logreg, x_valid_logreg, y_train_logreg, y_valid_logreg = train_test_split(
    result_valid_each,
    y_valid,
    train_size=0.7,
    shuffle=True,
    random_state=1,
)

params = dict(
    random_state=0,
    max_iter=300,
    multi_class='multinomial',
    solver='lbfgs',
)

logreg = LogisticRegression(**params)
logreg.fit(x_train_logreg, y_train_logreg)

LogisticRegression(max_iter=300, multi_class='multinomial', random_state=0)

In [222]:
# Log loss of the per-model stacking regression on its held-back 30%.
y_pred_logreg = logreg.predict_proba(x_valid_logreg)
metrics.log_loss(y_true=y_valid_logreg, y_pred=y_pred_logreg)

1.0865620338640811

Вывод: логистическая регрессия над результатами каждого алгоритма бустинга показала немного худший результат, чем над результатами ансамбля

## Получение результата с усреднением выходов ансамблей

In [223]:
# Inputs for the final fit: the whole of train.csv is used for training here
# (no hold-out split is made in this cell).
data = get_input("train.csv")
test = get_input("test.csv")
data = data.drop(columns='id')
sample_submission = get_input("sample_submission.csv")

# Feature-engineering experiments left disabled by the author
# ("ok" is the author's own annotation on two of them).
#data['sum'] = data[data.columns.to_list()[:50]].sum(axis=1) # ok
#data['max'] = data[data.columns.to_list()[:50]].max(axis=1) # ok
#data['not_nul_features'] = (data!=0)[data.columns.to_list()[:50]].sum(axis=1)
#data['2+13'] = data['feature_2'] + data['feature_13']
#data['6+15'] = data['feature_6'] + data['feature_15']

#test['sum'] = test[test.columns.to_list()[:50]].sum(axis=1) # ok
#test['max'] = test[test.columns.to_list()[:50]].max(axis=1) # ok
#test['not_nul_features'] = (test!=0)[test.columns.to_list()[:50]].sum(axis=1)
#test['2+13'] = test['feature_2'] + test['feature_13']
#test['6+15'] = test['feature_6'] + test['feature_15']

# NOTE(review): labels are encoded 1..4 here, while the hold-out cells of this
# notebook encode them 0..3 — confirm the *_cv_fit helpers handle both.
mapper = dict(Class_1=1, Class_2=2, Class_3=3, Class_4=4)
data = data.assign(target=data['target'].map(mapper))

x_train = data.drop(columns=['target'])
y_train = data['target']

train.csv: shape = 100000 rows, 52 cols
test.csv: shape = 50000 rows, 51 cols
sample_submission.csv: shape = 50000 rows, 5 cols


In [224]:
# LightGBM final fit: 7-fold stratified CV on the full training set, then
# prediction for the test set with the resulting fold models.
lgbm_params = {
    # Key fixed: it was spelled "boosting_type " with a trailing space, which
    # is not the name of the LightGBM parameter.
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    'n_estimators': 5000,
    'reg_alpha': 11.159097442782404,
    'reg_lambda': 6.133258909357973e-05,
    'colsample_bytree': 0.3,
    # NOTE(review): LightGBM applies row subsampling only when subsample_freq
    # (bagging_freq) is > 0 — confirm whether lightgbm_cv_fit sets it.
    'subsample': 0.40110232869776463,
    'learning_rate': 0.07591597509135133,
    # Alternative values left disabled by the author:
    #'max_depth': 92,
    #'num_leaves': 867,
    'max_depth': 8,
    'num_leaves': 6,
    'min_child_samples': 165,
    'min_child_weight': 0.0029014788191160327,
    'cat_smooth': 79,
    'cat_l2': 3,
    'random_state': 42,
}

cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

lgb_estimators, lgb_oof = lightgbm_cv_fit(
    lgbm_params, x_train, y_train, cv,
)

print(f"Out of fold log loss {metrics.log_loss(y_train, lgb_oof)}")
# No target/metric is passed here: the helper is only asked for test predictions.
result_lgb = multi_estimators_predict(lgb_estimators, test.drop(columns='id'))

Wed May 19 17:39:14 2021, Cross-Validation, 100000 rows, 50 cols
Training until validation scores don't improve for 50 rounds.
[10]	valid_0's multi_logloss: 1.11463
[20]	valid_0's multi_logloss: 1.11174
[30]	valid_0's multi_logloss: 1.10928
[40]	valid_0's multi_logloss: 1.10737
[50]	valid_0's multi_logloss: 1.10574
[60]	valid_0's multi_logloss: 1.10432
[70]	valid_0's multi_logloss: 1.10316
[80]	valid_0's multi_logloss: 1.10201
[90]	valid_0's multi_logloss: 1.10105
[100]	valid_0's multi_logloss: 1.10029
[110]	valid_0's multi_logloss: 1.09952
[120]	valid_0's multi_logloss: 1.0989
[130]	valid_0's multi_logloss: 1.09833
[140]	valid_0's multi_logloss: 1.09786
[150]	valid_0's multi_logloss: 1.09742
[160]	valid_0's multi_logloss: 1.09698
[170]	valid_0's multi_logloss: 1.09653
[180]	valid_0's multi_logloss: 1.09612
[190]	valid_0's multi_logloss: 1.09581
[200]	valid_0's multi_logloss: 1.09555
[210]	valid_0's multi_logloss: 1.09527
[220]	valid_0's multi_logloss: 1.09497
[230]	valid_0's multi_log

In [225]:
# CatBoost final fit: 7-fold stratified CV on the full training set, then
# prediction for the test set with the resulting fold models.
cb_params = {
    "n_estimators": 5000,
    "loss_function": "MultiClass",
    "eval_metric": "MultiClass",
    "task_type": "GPU",  # requires a GPU runtime
    'learning_rate': 0.04992656195498482,
    'reg_lambda': 37.77866189042851,
    'subsample': 0.6044483085727145,
    'random_strength': 0.6855495663972144,
    'depth': 3,
    'min_data_in_leaf': 25,
    'leaf_estimation_iterations': 1,
    'bootstrap_type': 'Bernoulli',
    'leaf_estimation_method': 'Newton',
    'random_state': 42,
}

cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

cb_estimators, cb_oof = catboost_cv_fit(
    cb_params, x_train, y_train, cv,
)

# Fixed: this used to report the LightGBM out-of-fold loss (lgb_oof) instead
# of the CatBoost one computed just above.
print(f"Out of fold log loss {metrics.log_loss(y_train, cb_oof)}")
result_cb = multi_estimators_predict(cb_estimators, test.drop(columns='id'))

Wed May 19 17:44:35 2021, Cross-Validation, 100000 rows, 50 cols




0:	learn: 1.3569208	test: 1.3570112	best: 1.3570112 (0)	total: 9.81ms	remaining: 49s
10:	learn: 1.1999903	test: 1.2004217	best: 1.2004217 (10)	total: 88.6ms	remaining: 40.2s
20:	learn: 1.1454962	test: 1.1461476	best: 1.1461476 (20)	total: 172ms	remaining: 40.8s
30:	learn: 1.1241088	test: 1.1250662	best: 1.1250662 (30)	total: 257ms	remaining: 41.1s
40:	learn: 1.1151137	test: 1.1162494	best: 1.1162494 (40)	total: 335ms	remaining: 40.5s
50:	learn: 1.1109877	test: 1.1121634	best: 1.1121634 (50)	total: 413ms	remaining: 40.1s
60:	learn: 1.1087906	test: 1.1101227	best: 1.1101227 (60)	total: 499ms	remaining: 40.4s
70:	learn: 1.1073104	test: 1.1087635	best: 1.1087635 (70)	total: 585ms	remaining: 40.6s
80:	learn: 1.1062240	test: 1.1078849	best: 1.1078849 (80)	total: 663ms	remaining: 40.3s
90:	learn: 1.1052214	test: 1.1069812	best: 1.1069812 (90)	total: 746ms	remaining: 40.3s
100:	learn: 1.1044037	test: 1.1062325	best: 1.1062325 (100)	total: 823ms	remaining: 39.9s
110:	learn: 1.1036808	test: 1.10



0:	learn: 1.3570102	test: 1.3570398	best: 1.3570398 (0)	total: 8.94ms	remaining: 44.7s
10:	learn: 1.2002096	test: 1.2003323	best: 1.2003323 (10)	total: 82.2ms	remaining: 37.3s
20:	learn: 1.1457335	test: 1.1459537	best: 1.1459537 (20)	total: 160ms	remaining: 38s
30:	learn: 1.1243465	test: 1.1244288	best: 1.1244288 (30)	total: 245ms	remaining: 39.2s
40:	learn: 1.1153421	test: 1.1155581	best: 1.1155581 (40)	total: 319ms	remaining: 38.6s
50:	learn: 1.1111900	test: 1.1115351	best: 1.1115351 (50)	total: 391ms	remaining: 38s
60:	learn: 1.1089609	test: 1.1093891	best: 1.1093891 (60)	total: 469ms	remaining: 37.9s
70:	learn: 1.1075517	test: 1.1080455	best: 1.1080455 (70)	total: 539ms	remaining: 37.4s
80:	learn: 1.1065999	test: 1.1071061	best: 1.1071061 (80)	total: 615ms	remaining: 37.4s
90:	learn: 1.1055562	test: 1.1061780	best: 1.1061780 (90)	total: 693ms	remaining: 37.4s
100:	learn: 1.1047160	test: 1.1052911	best: 1.1052911 (100)	total: 768ms	remaining: 37.3s
110:	learn: 1.1038849	test: 1.1045



0:	learn: 1.3570089	test: 1.3570284	best: 1.3570284 (0)	total: 9.09ms	remaining: 45.5s
10:	learn: 1.2000543	test: 1.2003992	best: 1.2003992 (10)	total: 82.3ms	remaining: 37.3s
20:	learn: 1.1454413	test: 1.1462312	best: 1.1462312 (20)	total: 161ms	remaining: 38.2s
30:	learn: 1.1240990	test: 1.1253116	best: 1.1253116 (30)	total: 253ms	remaining: 40.6s
40:	learn: 1.1150410	test: 1.1165267	best: 1.1165267 (40)	total: 329ms	remaining: 39.8s
50:	learn: 1.1109029	test: 1.1125472	best: 1.1125472 (50)	total: 401ms	remaining: 38.9s
60:	learn: 1.1087165	test: 1.1105858	best: 1.1105858 (60)	total: 479ms	remaining: 38.8s
70:	learn: 1.1072984	test: 1.1094224	best: 1.1094224 (70)	total: 552ms	remaining: 38.3s
80:	learn: 1.1062192	test: 1.1085464	best: 1.1085464 (80)	total: 630ms	remaining: 38.2s
90:	learn: 1.1051664	test: 1.1075909	best: 1.1075909 (90)	total: 710ms	remaining: 38.3s
100:	learn: 1.1042531	test: 1.1068103	best: 1.1068103 (100)	total: 783ms	remaining: 38s
110:	learn: 1.1034807	test: 1.10



0:	learn: 1.3569908	test: 1.3570099	best: 1.3570099 (0)	total: 8.11ms	remaining: 40.5s
10:	learn: 1.2001747	test: 1.2002862	best: 1.2002862 (10)	total: 90.6ms	remaining: 41.1s
20:	learn: 1.1456662	test: 1.1459829	best: 1.1459829 (20)	total: 165ms	remaining: 39.2s
30:	learn: 1.1242656	test: 1.1246157	best: 1.1246157 (30)	total: 248ms	remaining: 39.8s
40:	learn: 1.1152577	test: 1.1156518	best: 1.1156518 (40)	total: 322ms	remaining: 38.9s
50:	learn: 1.1111032	test: 1.1116097	best: 1.1116097 (50)	total: 391ms	remaining: 38s
60:	learn: 1.1088524	test: 1.1094821	best: 1.1094821 (60)	total: 465ms	remaining: 37.7s
70:	learn: 1.1074366	test: 1.1081726	best: 1.1081726 (70)	total: 534ms	remaining: 37.1s
80:	learn: 1.1063346	test: 1.1071923	best: 1.1071923 (80)	total: 601ms	remaining: 36.5s
90:	learn: 1.1053466	test: 1.1062540	best: 1.1062540 (90)	total: 675ms	remaining: 36.4s
100:	learn: 1.1044869	test: 1.1055540	best: 1.1055540 (100)	total: 743ms	remaining: 36s
110:	learn: 1.1037442	test: 1.1049



0:	learn: 1.3569875	test: 1.3569956	best: 1.3569956 (0)	total: 7.76ms	remaining: 38.8s
10:	learn: 1.2002222	test: 1.2003771	best: 1.2003771 (10)	total: 77.8ms	remaining: 35.3s
20:	learn: 1.1456908	test: 1.1459861	best: 1.1459861 (20)	total: 150ms	remaining: 35.6s
30:	learn: 1.1241815	test: 1.1247121	best: 1.1247121 (30)	total: 224ms	remaining: 35.9s
40:	learn: 1.1151979	test: 1.1158953	best: 1.1158953 (40)	total: 289ms	remaining: 35s
50:	learn: 1.1110443	test: 1.1118836	best: 1.1118836 (50)	total: 355ms	remaining: 34.5s
60:	learn: 1.1087215	test: 1.1098233	best: 1.1098233 (60)	total: 421ms	remaining: 34.1s
70:	learn: 1.1072960	test: 1.1085265	best: 1.1085265 (70)	total: 487ms	remaining: 33.8s
80:	learn: 1.1061152	test: 1.1075917	best: 1.1075917 (80)	total: 556ms	remaining: 33.7s
90:	learn: 1.1051178	test: 1.1067808	best: 1.1067808 (90)	total: 631ms	remaining: 34s
100:	learn: 1.1042019	test: 1.1060670	best: 1.1060670 (100)	total: 709ms	remaining: 34.4s
110:	learn: 1.1034475	test: 1.1054



0:	learn: 1.3570821	test: 1.3571302	best: 1.3571302 (0)	total: 9.25ms	remaining: 46.3s
10:	learn: 1.2002321	test: 1.2003625	best: 1.2003625 (10)	total: 82.4ms	remaining: 37.4s
20:	learn: 1.1456481	test: 1.1458113	best: 1.1458113 (20)	total: 174ms	remaining: 41.2s
30:	learn: 1.1242939	test: 1.1245906	best: 1.1245906 (30)	total: 254ms	remaining: 40.7s
40:	learn: 1.1152803	test: 1.1157208	best: 1.1157208 (40)	total: 325ms	remaining: 39.3s
50:	learn: 1.1111075	test: 1.1116048	best: 1.1116048 (50)	total: 401ms	remaining: 38.9s
60:	learn: 1.1088592	test: 1.1095260	best: 1.1095260 (60)	total: 483ms	remaining: 39.1s
70:	learn: 1.1074711	test: 1.1082637	best: 1.1082637 (70)	total: 556ms	remaining: 38.6s
80:	learn: 1.1063964	test: 1.1073407	best: 1.1073407 (80)	total: 628ms	remaining: 38.1s
90:	learn: 1.1053967	test: 1.1064539	best: 1.1064539 (90)	total: 711ms	remaining: 38.4s
100:	learn: 1.1044791	test: 1.1057881	best: 1.1057881 (100)	total: 784ms	remaining: 38s
110:	learn: 1.1036625	test: 1.10



0:	learn: 1.3570015	test: 1.3569846	best: 1.3569846 (0)	total: 8.74ms	remaining: 43.7s
10:	learn: 1.2002481	test: 1.2000149	best: 1.2000149 (10)	total: 92.5ms	remaining: 41.9s
20:	learn: 1.1457516	test: 1.1453474	best: 1.1453474 (20)	total: 173ms	remaining: 40.9s
30:	learn: 1.1243981	test: 1.1240389	best: 1.1240389 (30)	total: 250ms	remaining: 40s
40:	learn: 1.1154136	test: 1.1150437	best: 1.1150437 (40)	total: 324ms	remaining: 39.1s
50:	learn: 1.1112984	test: 1.1109366	best: 1.1109366 (50)	total: 401ms	remaining: 38.9s
60:	learn: 1.1090224	test: 1.1087442	best: 1.1087442 (60)	total: 485ms	remaining: 39.2s
70:	learn: 1.1076110	test: 1.1073951	best: 1.1073951 (70)	total: 556ms	remaining: 38.6s
80:	learn: 1.1065649	test: 1.1063147	best: 1.1063147 (80)	total: 626ms	remaining: 38s
90:	learn: 1.1056041	test: 1.1053692	best: 1.1053692 (90)	total: 703ms	remaining: 37.9s
100:	learn: 1.1047949	test: 1.1046326	best: 1.1046326 (100)	total: 772ms	remaining: 37.4s
110:	learn: 1.1040626	test: 1.1040

In [226]:
# XGBoost final fit: 7-fold stratified CV on the full training set, then
# prediction for the test set with the resulting fold models.
xgb_params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'random_state': 13,
    'learning_rate': 0.08356451010151393,
    'gamma': 0.02911685058980812,
    'max_depth': 3,
    'min_child_weight': 10.748514454096288,
    'max_delta_step': 2.4474818433727927,
    'subsample': 0.6445037550866027,
    'colsample_bytree': 0.07634753656242108,
    'lambda': 13.663280761461781,
    'alpha': 21.521205761694137,
    'max_leaves': 48,
    'n_estimators': 2000,
    'num_class': 4,
    'tree_method': 'gpu_hist'  # requires a GPU runtime
}

cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

xgb_estimators, xgb_encoders, xgb_oof = xgboost_cv_fit(
    xgb_params, x_train, y_train, cv,
)

# Fixed: this used to report the LightGBM out-of-fold loss (lgb_oof) instead
# of the XGBoost one computed just above.
print(f"Out of fold log loss {metrics.log_loss(y_train, xgb_oof)}")
result_xgb = multi_estimators_predict(xgb_estimators, test.drop(columns='id'))

Wed May 19 17:46:37 2021, Cross-Validation, 100000 rows, 50 cols
[0]	validation_0-mlogloss:1.35378
Will train until validation_0-mlogloss hasn't improved in 50 rounds.
[10]	validation_0-mlogloss:1.18522
[20]	validation_0-mlogloss:1.13534
[30]	validation_0-mlogloss:1.11928
[40]	validation_0-mlogloss:1.1132
[50]	validation_0-mlogloss:1.1101
[60]	validation_0-mlogloss:1.10823
[70]	validation_0-mlogloss:1.10675
[80]	validation_0-mlogloss:1.10563
[90]	validation_0-mlogloss:1.10452
[100]	validation_0-mlogloss:1.10353
[110]	validation_0-mlogloss:1.10262
[120]	validation_0-mlogloss:1.10194
[130]	validation_0-mlogloss:1.10134
[140]	validation_0-mlogloss:1.10067
[150]	validation_0-mlogloss:1.10012
[160]	validation_0-mlogloss:1.09961
[170]	validation_0-mlogloss:1.09898
[180]	validation_0-mlogloss:1.09844
[190]	validation_0-mlogloss:1.09803
[200]	validation_0-mlogloss:1.09759
[210]	validation_0-mlogloss:1.09704
[220]	validation_0-mlogloss:1.09664
[230]	validation_0-mlogloss:1.09624
[240]	validatio

In [22]:
# Final blend for the test set: unweighted mean of the three ensembles'
# probability matrices, stacked along a third axis.
result_lgb_xgb_cb = np.dstack((result_lgb, result_xgb, result_cb))
result_lgb_xgb_cb_mean = result_lgb_xgb_cb.mean(axis=2)

In [23]:
# Assemble the submission: the first column of `test` followed by the four
# blended class-probability columns, written to result.csv without the index.
# NOTE(review): confirm these column names match the header of sample_submission.csv.
test_pred_df = pd.DataFrame(result_lgb_xgb_cb_mean).rename(
    columns={idx: f'class_{idx + 1}' for idx in range(4)}
)
first_column_only = test.drop(columns=list(test.columns[1:]))
result = pd.concat([first_column_only, test_pred_df], axis=1)
result.to_csv('result.csv', index=False)

In [24]:
# Display the submission frame that was just written to result.csv.
result

Unnamed: 0,id,class_1,class_2,class_3,class_4
0,100000,0.093071,0.619720,0.165824,0.121385
1,100001,0.077836,0.690752,0.141058,0.090354
2,100002,0.084690,0.643264,0.175257,0.096789
3,100003,0.085663,0.535393,0.292535,0.086409
4,100004,0.074546,0.623293,0.186189,0.115972
...,...,...,...,...,...
49995,149995,0.088292,0.696105,0.156804,0.058799
49996,149996,0.077911,0.655623,0.131053,0.135413
49997,149997,0.081790,0.523733,0.229251,0.165226
49998,149998,0.081625,0.594241,0.165784,0.158350


Вывод: результат 0.8643. Надо использовать двухуровневую модель

## Получение результата двухуровневой модели

In [11]:
# Inputs for the two-level model: train.csv is split 70/30; the 70% part trains
# the boosting models and the 30% part is kept as x_valid / y_valid.
data = get_input("train.csv")
test = get_input("test.csv")
data = data.drop(columns='id')

# Encode the class labels as integers 0..3.
mapper = dict(Class_1=0, Class_2=1, Class_3=2, Class_4=3)
data = data.assign(target=data['target'].map(mapper))

train, valid = train_test_split(
    data,
    train_size=0.7,
    shuffle=True,
    random_state=1,
)

# Give both parts a fresh 0..n-1 index.
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

x_train, y_train = train.drop(columns=['target']), train['target']
x_valid, y_valid = valid.drop(columns=['target']), valid['target']

train.csv: shape = 100000 rows, 52 cols
test.csv: shape = 50000 rows, 51 cols


In [None]:
# LightGBM first-level models: 7-fold stratified CV on the 70% training part.
lgbm_params = {
    # Key fixed: it was spelled "boosting_type " with a trailing space, which
    # is not the name of the LightGBM parameter.
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    'n_estimators': 5000,
    'reg_alpha': 11.159097442782404,
    'reg_lambda': 6.133258909357973e-05,
    'colsample_bytree': 0.3,
    # NOTE(review): LightGBM applies row subsampling only when subsample_freq
    # (bagging_freq) is > 0 — confirm whether lightgbm_cv_fit sets it.
    'subsample': 0.40110232869776463,
    'learning_rate': 0.07591597509135133,
    # Alternative values left disabled by the author:
    #'max_depth': 92,
    #'num_leaves': 867,
    'max_depth': 8,
    'num_leaves': 6,
    'min_child_samples': 165,
    'min_child_weight': 0.0029014788191160327,
    'cat_smooth': 79,
    'cat_l2': 3,
    'random_state': 42,
}

cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

lgb_estimators, lgb_oof = lightgbm_cv_fit(
    lgbm_params, x_train, y_train, cv,
)

In [13]:
# Out-of-fold multiclass log loss of the LightGBM CV predictions on the training split.
lgb_oof_log_loss = metrics.log_loss(y_train, lgb_oof)
print(f"Out of fold log loss {lgb_oof_log_loss}")

Out of fold log loss 1.0927355848269298


In [14]:
# Score the LightGBM fold models on the hold-out split with log loss and keep
# the combined prediction returned by the helper.
result_lgb_valid = multi_estimators_predict(
    lgb_estimators, x_valid, y_valid, metrics.log_loss,
)

Model 0 metric: 1.090214
Model 1 metric: 1.089329
Model 2 metric: 1.089982
Model 3 metric: 1.090136
Model 4 metric: 1.090564
Model 5 metric: 1.089468
Model 6 metric: 1.090378
Result model metric: 1.089295


In [None]:
# CatBoost first-level models: 7-fold stratified CV on the 70% training part.
cb_params = dict(
    n_estimators=5000,
    loss_function="MultiClass",
    eval_metric="MultiClass",
    task_type="GPU",
    learning_rate=0.04992656195498482,
    reg_lambda=37.77866189042851,
    subsample=0.6044483085727145,
    random_strength=0.6855495663972144,
    depth=3,
    min_data_in_leaf=25,
    leaf_estimation_iterations=1,
    bootstrap_type='Bernoulli',
    leaf_estimation_method='Newton',
    random_state=42,
)

cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=435)

cb_estimators, cb_oof = catboost_cv_fit(cb_params, x_train, y_train, cv)

In [16]:
# Out-of-fold multiclass log loss of the CatBoost CV predictions on the training split.
cb_oof_log_loss = metrics.log_loss(y_train, cb_oof)
print(f"Out of fold log loss {cb_oof_log_loss}")

Out of fold log loss 1.0928152109200098


In [17]:
# Score the CatBoost fold models on the hold-out split with log loss and keep
# the combined prediction returned by the helper.
result_cb_valid = multi_estimators_predict(
    cb_estimators, x_valid, y_valid, metrics.log_loss,
)

Model 0 metric: 1.090639
Model 1 metric: 1.090936
Model 2 metric: 1.09082
Model 3 metric: 1.09097
Model 4 metric: 1.09134
Model 5 metric: 1.090482
Model 6 metric: 1.090903
Result model metric: 1.090346


In [None]:
# XGBoost first-level models: 7-fold stratified CV on the 70% training part.
xgb_params = {
    # task definition
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "random_state": 13,
    # tree growth and regularisation
    "learning_rate": 0.08356451010151393,
    "gamma": 0.02911685058980812,
    "max_depth": 3,
    "min_child_weight": 10.748514454096288,
    "max_delta_step": 2.4474818433727927,
    "subsample": 0.6445037550866027,
    "colsample_bytree": 0.07634753656242108,
    "lambda": 13.663280761461781,
    "alpha": 21.521205761694137,
    "max_leaves": 48,
    # ensemble size, number of classes, training backend
    "n_estimators": 2000,
    "num_class": 4,
    "tree_method": "gpu_hist",
}

cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=435)

xgb_cv_outputs = xgboost_cv_fit(xgb_params, x_train, y_train, cv)
xgb_estimators, xgb_encoders, xgb_oof = xgb_cv_outputs

In [19]:
# Out-of-fold multiclass log loss of the XGBoost CV predictions on the training split.
xgb_oof_log_loss = metrics.log_loss(y_train, xgb_oof)
print(f"Out of fold log loss {xgb_oof_log_loss}")

Out of fold log loss 1.0931592292917518


In [20]:
# Score the XGBoost fold models on the hold-out split with log loss and keep
# the combined prediction returned by the helper.
result_xgb_valid = multi_estimators_predict(
    xgb_estimators, x_valid, y_valid, metrics.log_loss,
)

Model 0 metric: 1.090678
Model 1 metric: 1.09052
Model 2 metric: 1.091207
Model 3 metric: 1.091059
Model 4 metric: 1.091386
Model 5 metric: 1.090657
Model 6 metric: 1.091556
Result model metric: 1.090071


In [21]:
# Training table for the second-level model: hold-out class probabilities of
# the three ensembles side by side, columns prefixed by the producing library.
result_lgb_valid_df = pd.DataFrame(result_lgb_valid).rename(
    columns={idx: f'lgb_class_{idx + 1}' for idx in range(4)}
)
result_xgb_valid_df = pd.DataFrame(result_xgb_valid).rename(
    columns={idx: f'xgb_class_{idx + 1}' for idx in range(4)}
)
result_cb_valid_df = pd.DataFrame(result_cb_valid).rename(
    columns={idx: f'cb_class_{idx + 1}' for idx in range(4)}
)
result = pd.concat(
    [result_lgb_valid_df, result_xgb_valid_df, result_cb_valid_df],
    axis=1,
)

In [23]:
# Second-level model: multinomial logistic regression fitted on the whole
# hold-out prediction table.
params = dict(
    random_state=0,
    max_iter=300,
    multi_class='multinomial',
    solver='lbfgs',
)

logreg = LogisticRegression(**params)
logreg.fit(result, y_valid)

LogisticRegression(max_iter=300, multi_class='multinomial', random_state=0)

In [25]:
# First-level predictions for the test set (the `id` column is not a feature).
result_xgb_test = multi_estimators_predict(xgb_estimators, test.drop(columns="id"))
result_lgb_test = multi_estimators_predict(lgb_estimators, test.drop(columns="id"))
result_cb_test = multi_estimators_predict(cb_estimators, test.drop(columns="id"))

# Same column layout as the frame the logistic regression was fitted on:
# LightGBM block, then XGBoost, then CatBoost.
result_lgb_test_df = pd.DataFrame(result_lgb_test).rename(
    columns={0: 'lgb_class_1', 1: 'lgb_class_2', 2: 'lgb_class_3', 3: 'lgb_class_4'}
)
result_xgb_test_df = pd.DataFrame(result_xgb_test).rename(
    columns={0: 'xgb_class_1', 1: 'xgb_class_2', 2: 'xgb_class_3', 3: 'xgb_class_4'}
)
result_cb_test_df = pd.DataFrame(result_cb_test).rename(
    columns={0: 'cb_class_1', 1: 'cb_class_2', 2: 'cb_class_3', 3: 'cb_class_4'}
)
result_for_logreg = pd.concat(
    [result_lgb_test_df, result_xgb_test_df, result_cb_test_df],
    axis=1,
)

y_pred_logreg = logreg.predict_proba(result_for_logreg)

test_pred_df = pd.DataFrame(y_pred_logreg).rename(
    columns={0: 'class_1', 1: 'class_2', 2: 'class_3', 3: 'class_4'}
)

# Keep only the first column of `test` (presumably `id` - confirm) and attach
# the class probabilities next to it.
# NOTE(review): `result` is rebound here; before this cell it held the stacked
# validation features used to fit `logreg`, so that fit cell cannot be re-run
# afterwards without rebuilding the frame.
result = pd.concat(
    [test.drop(columns=test.columns.to_list()[1:]), test_pred_df],
    axis=1,
)
result.to_csv('result.csv', index=False)

Вывод: результат 0.8653. Недостаток: приходится разбивать тренировочный датасет на две части для обучения двухуровневой модели, поэтому каждый из уровней обучается на усеченном наборе данных. Выход: использовать читерство и обучать модель второго уровня на предсказаниях не самой модели первого уровня, а модели, обученной на усеченном датасете. Тогда модель первого уровня можно обучить на всем тренировочном датасете.

## Получение результата двухуровневой модели с читерством

In [48]:
# Reload the raw competition files so the first level can be trained on the
# full training set (no hold-out split in this section).
data = get_input("train.csv")
test = get_input("test.csv")
data = data.drop(columns="id")
sample_submission = get_input("sample_submission.csv")

# Replace the string class labels with integers 1..4.
mapper = {
    'Class_1': 1,
    'Class_2': 2,
    'Class_3': 3,
    'Class_4': 4,
}
data["target"] = data["target"].map(mapper)

x_train = data.drop(columns="target")
y_train = data["target"]

train.csv: shape = 100000 rows, 52 cols
test.csv: shape = 50000 rows, 51 cols
sample_submission.csv: shape = 50000 rows, 5 cols


In [49]:
# LightGBM settings for the multiclass problem, followed by a 7-fold
# stratified cross-validated fit on the full training set.
lgbm_params = {
    # Fixed: the key used to be "boosting_type " (trailing space), so the
    # parameter was never passed under its real name.
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "n_estimators": 5000,
    "reg_alpha": 11.159097442782404,
    "reg_lambda": 6.133258909357973e-05,
    "colsample_bytree": 0.3,
    # NOTE(review): LightGBM applies row subsampling only when subsample_freq
    # (bagging_freq) is > 0; it is not set in this dict, so this value may be
    # inactive - confirm against lightgbm_cv_fit.
    "subsample": 0.40110232869776463,
    "learning_rate": 0.07591597509135133,
    "max_depth": 8,
    "num_leaves": 6,
    "min_child_samples": 165,
    "min_child_weight": 0.0029014788191160327,
    # Categorical-feature settings; no categorical list is passed below.
    "cat_smooth": 79,
    "cat_l2": 3,
    "random_state": 42,
}

cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

# lightgbm_cv_fit is defined earlier in the notebook; it returns the fitted
# per-fold estimators and the out-of-fold predictions.
lgb_estimators, lgb_oof = lightgbm_cv_fit(lgbm_params, x_train, y_train, cv)

print(f"Out of fold log loss {metrics.log_loss(y_train, lgb_oof)}")

Wed May 19 19:04:49 2021, Cross-Validation, 100000 rows, 50 cols
Training until validation scores don't improve for 50 rounds.
[10]	valid_0's multi_logloss: 1.11492
[20]	valid_0's multi_logloss: 1.11183
[30]	valid_0's multi_logloss: 1.10949
[40]	valid_0's multi_logloss: 1.10757
[50]	valid_0's multi_logloss: 1.10592
[60]	valid_0's multi_logloss: 1.10454
[70]	valid_0's multi_logloss: 1.1033
[80]	valid_0's multi_logloss: 1.10229
[90]	valid_0's multi_logloss: 1.10132
[100]	valid_0's multi_logloss: 1.10035
[110]	valid_0's multi_logloss: 1.09963
[120]	valid_0's multi_logloss: 1.09893
[130]	valid_0's multi_logloss: 1.09829
[140]	valid_0's multi_logloss: 1.09778
[150]	valid_0's multi_logloss: 1.09736
[160]	valid_0's multi_logloss: 1.09698
[170]	valid_0's multi_logloss: 1.09649
[180]	valid_0's multi_logloss: 1.09613
[190]	valid_0's multi_logloss: 1.09579
[200]	valid_0's multi_logloss: 1.09551
[210]	valid_0's multi_logloss: 1.09525
[220]	valid_0's multi_logloss: 1.09494
[230]	valid_0's multi_log

In [50]:
# CatBoost settings for the multiclass problem, followed by a 7-fold
# stratified cross-validated fit on the full training set.
cb_params = {
    "n_estimators": 5000,
    "loss_function": "MultiClass",
    "eval_metric": "MultiClass",
    # Requires a GPU runtime.
    "task_type": "GPU",
    "learning_rate": 0.04992656195498482,
    "reg_lambda": 37.77866189042851,
    "subsample": 0.6044483085727145,
    "random_strength": 0.6855495663972144,
    "depth": 3,
    "min_data_in_leaf": 25,
    "leaf_estimation_iterations": 1,
    "bootstrap_type": "Bernoulli",
    "leaf_estimation_method": "Newton",
    "random_state": 42,
}

cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

# catboost_cv_fit is defined earlier in the notebook; it returns the fitted
# per-fold estimators and the out-of-fold predictions.
cb_estimators, cb_oof = catboost_cv_fit(cb_params, x_train, y_train, cv)

# Fixed: this used to score lgb_oof, so the LightGBM log loss was reported
# as the CatBoost result.
print(f"Out of fold log loss {metrics.log_loss(y_train, cb_oof)}")

Wed May 19 19:09:44 2021, Cross-Validation, 100000 rows, 50 cols




0:	learn: 1.3570696	test: 1.3571273	best: 1.3571273 (0)	total: 9.07ms	remaining: 45.3s
10:	learn: 1.2001790	test: 1.2004731	best: 1.2004731 (10)	total: 92.9ms	remaining: 42.2s
20:	learn: 1.1455296	test: 1.1461818	best: 1.1461818 (20)	total: 178ms	remaining: 42.2s
30:	learn: 1.1241185	test: 1.1249118	best: 1.1249118 (30)	total: 268ms	remaining: 42.9s
40:	learn: 1.1151455	test: 1.1161105	best: 1.1161105 (40)	total: 345ms	remaining: 41.7s
50:	learn: 1.1110716	test: 1.1121235	best: 1.1121235 (50)	total: 422ms	remaining: 41s
60:	learn: 1.1088484	test: 1.1100391	best: 1.1100391 (60)	total: 502ms	remaining: 40.7s
70:	learn: 1.1074304	test: 1.1087829	best: 1.1087829 (70)	total: 581ms	remaining: 40.3s
80:	learn: 1.1063289	test: 1.1078099	best: 1.1078099 (80)	total: 661ms	remaining: 40.1s
90:	learn: 1.1053639	test: 1.1069056	best: 1.1069056 (90)	total: 744ms	remaining: 40.1s
100:	learn: 1.1045733	test: 1.1062809	best: 1.1062809 (100)	total: 823ms	remaining: 39.9s
110:	learn: 1.1038375	test: 1.10



0:	learn: 1.3570923	test: 1.3571067	best: 1.3571067 (0)	total: 8.33ms	remaining: 41.6s
10:	learn: 1.2002142	test: 1.2002782	best: 1.2002782 (10)	total: 77.9ms	remaining: 35.3s
20:	learn: 1.1456235	test: 1.1458424	best: 1.1458424 (20)	total: 154ms	remaining: 36.6s
30:	learn: 1.1242168	test: 1.1244741	best: 1.1244741 (30)	total: 234ms	remaining: 37.5s
40:	learn: 1.1152313	test: 1.1156049	best: 1.1156049 (40)	total: 304ms	remaining: 36.8s
50:	learn: 1.1110831	test: 1.1114454	best: 1.1114454 (50)	total: 374ms	remaining: 36.3s
60:	learn: 1.1088948	test: 1.1093680	best: 1.1093680 (60)	total: 451ms	remaining: 36.5s
70:	learn: 1.1074926	test: 1.1080307	best: 1.1080307 (70)	total: 526ms	remaining: 36.5s
80:	learn: 1.1063995	test: 1.1069496	best: 1.1069496 (80)	total: 597ms	remaining: 36.2s
90:	learn: 1.1055386	test: 1.1061392	best: 1.1061392 (90)	total: 672ms	remaining: 36.3s
100:	learn: 1.1047667	test: 1.1054388	best: 1.1054388 (100)	total: 742ms	remaining: 36s
110:	learn: 1.1040121	test: 1.10



0:	learn: 1.3570830	test: 1.3570965	best: 1.3570965 (0)	total: 8.32ms	remaining: 41.6s
10:	learn: 1.2000125	test: 1.2003488	best: 1.2003488 (10)	total: 80.4ms	remaining: 36.5s
20:	learn: 1.1454423	test: 1.1461273	best: 1.1461273 (20)	total: 158ms	remaining: 37.4s
30:	learn: 1.1240225	test: 1.1249501	best: 1.1249501 (30)	total: 236ms	remaining: 37.8s
40:	learn: 1.1150217	test: 1.1164545	best: 1.1164545 (40)	total: 307ms	remaining: 37.1s
50:	learn: 1.1108722	test: 1.1124519	best: 1.1124519 (50)	total: 384ms	remaining: 37.3s
60:	learn: 1.1087179	test: 1.1105639	best: 1.1105639 (60)	total: 461ms	remaining: 37.4s
70:	learn: 1.1072487	test: 1.1093213	best: 1.1093213 (70)	total: 532ms	remaining: 37s
80:	learn: 1.1060955	test: 1.1084023	best: 1.1084023 (80)	total: 605ms	remaining: 36.7s
90:	learn: 1.1051665	test: 1.1076518	best: 1.1076518 (90)	total: 686ms	remaining: 37s
100:	learn: 1.1043274	test: 1.1069367	best: 1.1069367 (100)	total: 772ms	remaining: 37.5s
110:	learn: 1.1035577	test: 1.1063



0:	learn: 1.3570884	test: 1.3570943	best: 1.3570943 (0)	total: 8.23ms	remaining: 41.2s
10:	learn: 1.2001809	test: 1.2004024	best: 1.2004024 (10)	total: 78.9ms	remaining: 35.8s
20:	learn: 1.1456050	test: 1.1458625	best: 1.1458625 (20)	total: 162ms	remaining: 38.5s
30:	learn: 1.1242387	test: 1.1246252	best: 1.1246252 (30)	total: 237ms	remaining: 38s
40:	learn: 1.1152205	test: 1.1157168	best: 1.1157168 (40)	total: 307ms	remaining: 37.1s
50:	learn: 1.1110965	test: 1.1116739	best: 1.1116739 (50)	total: 375ms	remaining: 36.4s
60:	learn: 1.1089181	test: 1.1095335	best: 1.1095335 (60)	total: 447ms	remaining: 36.2s
70:	learn: 1.1074983	test: 1.1082656	best: 1.1082656 (70)	total: 515ms	remaining: 35.8s
80:	learn: 1.1063389	test: 1.1071523	best: 1.1071523 (80)	total: 587ms	remaining: 35.6s
90:	learn: 1.1053935	test: 1.1062164	best: 1.1062164 (90)	total: 664ms	remaining: 35.8s
100:	learn: 1.1045987	test: 1.1055867	best: 1.1055867 (100)	total: 732ms	remaining: 35.5s
110:	learn: 1.1037876	test: 1.10



0:	learn: 1.3570800	test: 1.3571169	best: 1.3571169 (0)	total: 8.47ms	remaining: 42.3s
10:	learn: 1.2002421	test: 1.2003685	best: 1.2003685 (10)	total: 90.8ms	remaining: 41.2s
20:	learn: 1.1455926	test: 1.1459235	best: 1.1459235 (20)	total: 172ms	remaining: 40.7s
30:	learn: 1.1241795	test: 1.1247424	best: 1.1247424 (30)	total: 255ms	remaining: 40.8s
40:	learn: 1.1151312	test: 1.1158211	best: 1.1158211 (40)	total: 325ms	remaining: 39.3s
50:	learn: 1.1109649	test: 1.1117578	best: 1.1117578 (50)	total: 396ms	remaining: 38.4s
60:	learn: 1.1087776	test: 1.1097587	best: 1.1097587 (60)	total: 474ms	remaining: 38.4s
70:	learn: 1.1072962	test: 1.1084773	best: 1.1084773 (70)	total: 553ms	remaining: 38.4s
80:	learn: 1.1061702	test: 1.1075518	best: 1.1075518 (80)	total: 630ms	remaining: 38.3s
90:	learn: 1.1051842	test: 1.1067701	best: 1.1067701 (90)	total: 719ms	remaining: 38.8s
100:	learn: 1.1043629	test: 1.1061030	best: 1.1061030 (100)	total: 799ms	remaining: 38.7s
110:	learn: 1.1035035	test: 1.



0:	learn: 1.3570794	test: 1.3570411	best: 1.3570411 (0)	total: 8.34ms	remaining: 41.7s
10:	learn: 1.2002440	test: 1.2003618	best: 1.2003618 (10)	total: 78.5ms	remaining: 35.6s
20:	learn: 1.1456533	test: 1.1457270	best: 1.1457270 (20)	total: 168ms	remaining: 39.9s
30:	learn: 1.1242741	test: 1.1244638	best: 1.1244638 (30)	total: 246ms	remaining: 39.4s
40:	learn: 1.1152883	test: 1.1156173	best: 1.1156173 (40)	total: 326ms	remaining: 39.4s
50:	learn: 1.1111663	test: 1.1116303	best: 1.1116303 (50)	total: 396ms	remaining: 38.4s
60:	learn: 1.1089614	test: 1.1095643	best: 1.1095643 (60)	total: 473ms	remaining: 38.3s
70:	learn: 1.1075227	test: 1.1082408	best: 1.1082408 (70)	total: 544ms	remaining: 37.7s
80:	learn: 1.1064573	test: 1.1073995	best: 1.1073995 (80)	total: 613ms	remaining: 37.2s
90:	learn: 1.1054713	test: 1.1065468	best: 1.1065468 (90)	total: 688ms	remaining: 37.1s
100:	learn: 1.1045190	test: 1.1056410	best: 1.1056410 (100)	total: 757ms	remaining: 36.7s
110:	learn: 1.1036852	test: 1.



0:	learn: 1.3570836	test: 1.3570374	best: 1.3570374 (0)	total: 8.26ms	remaining: 41.3s
10:	learn: 1.2003324	test: 1.2002527	best: 1.2002527 (10)	total: 75.2ms	remaining: 34.1s
20:	learn: 1.1458366	test: 1.1456441	best: 1.1456441 (20)	total: 153ms	remaining: 36.3s
30:	learn: 1.1244550	test: 1.1242604	best: 1.1242604 (30)	total: 238ms	remaining: 38.2s
40:	learn: 1.1154320	test: 1.1151865	best: 1.1151865 (40)	total: 314ms	remaining: 37.9s
50:	learn: 1.1113153	test: 1.1109760	best: 1.1109760 (50)	total: 383ms	remaining: 37.2s
60:	learn: 1.1091773	test: 1.1088360	best: 1.1088360 (60)	total: 452ms	remaining: 36.6s
70:	learn: 1.1076930	test: 1.1074743	best: 1.1074743 (70)	total: 516ms	remaining: 35.8s
80:	learn: 1.1065895	test: 1.1063393	best: 1.1063393 (80)	total: 580ms	remaining: 35.2s
90:	learn: 1.1056996	test: 1.1055807	best: 1.1055807 (90)	total: 643ms	remaining: 34.7s
100:	learn: 1.1048401	test: 1.1047606	best: 1.1047606 (100)	total: 722ms	remaining: 35s
110:	learn: 1.1040735	test: 1.10

In [51]:
# XGBoost settings for the 4-class problem, followed by a 7-fold stratified
# cross-validated fit on the full training set.
xgb_params = {
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "random_state": 13,
    "learning_rate": 0.08356451010151393,
    "gamma": 0.02911685058980812,
    "max_depth": 3,
    "min_child_weight": 10.748514454096288,
    "max_delta_step": 2.4474818433727927,
    "subsample": 0.6445037550866027,
    "colsample_bytree": 0.07634753656242108,
    "lambda": 13.663280761461781,
    "alpha": 21.521205761694137,
    "max_leaves": 48,
    "n_estimators": 2000,
    "num_class": 4,
    # Requires a GPU runtime.
    "tree_method": "gpu_hist",
}

cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

# xgboost_cv_fit is defined earlier in the notebook; it returns the fitted
# per-fold estimators, encoders and the out-of-fold predictions.
xgb_estimators, xgb_encoders, xgb_oof = xgboost_cv_fit(xgb_params, x_train, y_train, cv)

# Fixed: this used to score lgb_oof, so the LightGBM log loss was reported
# as the XGBoost result.
print(f"Out of fold log loss {metrics.log_loss(y_train, xgb_oof)}")

Wed May 19 19:11:34 2021, Cross-Validation, 100000 rows, 50 cols
[0]	validation_0-mlogloss:1.35378
Will train until validation_0-mlogloss hasn't improved in 50 rounds.
[10]	validation_0-mlogloss:1.18522
[20]	validation_0-mlogloss:1.13534
[30]	validation_0-mlogloss:1.11928
[40]	validation_0-mlogloss:1.1132
[50]	validation_0-mlogloss:1.1101
[60]	validation_0-mlogloss:1.10823
[70]	validation_0-mlogloss:1.10675
[80]	validation_0-mlogloss:1.10563
[90]	validation_0-mlogloss:1.10452
[100]	validation_0-mlogloss:1.10353
[110]	validation_0-mlogloss:1.10262
[120]	validation_0-mlogloss:1.10194
[130]	validation_0-mlogloss:1.10134
[140]	validation_0-mlogloss:1.10067
[150]	validation_0-mlogloss:1.10012
[160]	validation_0-mlogloss:1.09961
[170]	validation_0-mlogloss:1.09898
[180]	validation_0-mlogloss:1.09844
[190]	validation_0-mlogloss:1.09803
[200]	validation_0-mlogloss:1.09759
[210]	validation_0-mlogloss:1.09704
[220]	validation_0-mlogloss:1.09664
[230]	validation_0-mlogloss:1.09624
[240]	validatio

In [52]:
# First-level predictions for the test set, using the models trained on the
# full training data (the `id` column is not a feature).
result_xgb_test = multi_estimators_predict(xgb_estimators, test.drop(columns="id"))
result_lgb_test = multi_estimators_predict(lgb_estimators, test.drop(columns="id"))
result_cb_test = multi_estimators_predict(cb_estimators, test.drop(columns="id"))

# Stack the per-model class probabilities side by side in the order
# LightGBM, XGBoost, CatBoost.
result_lgb_test_df = pd.DataFrame(result_lgb_test).rename(
    columns={0: 'lgb_class_1', 1: 'lgb_class_2', 2: 'lgb_class_3', 3: 'lgb_class_4'}
)
result_xgb_test_df = pd.DataFrame(result_xgb_test).rename(
    columns={0: 'xgb_class_1', 1: 'xgb_class_2', 2: 'xgb_class_3', 3: 'xgb_class_4'}
)
result_cb_test_df = pd.DataFrame(result_cb_test).rename(
    columns={0: 'cb_class_1', 1: 'cb_class_2', 2: 'cb_class_3', 3: 'cb_class_4'}
)
result_for_logreg = pd.concat(
    [result_lgb_test_df, result_xgb_test_df, result_cb_test_df],
    axis=1,
)

# NOTE(review): `logreg_ensamble_for_test` is not created in this section;
# it is presumably a second-level model fitted in an earlier cell - confirm
# it exists on a fresh "Restart & Run All".
y_pred_logreg = logreg_ensamble_for_test.predict_proba(result_for_logreg)

test_pred_df = pd.DataFrame(y_pred_logreg).rename(
    columns={0: 'class_1', 1: 'class_2', 2: 'class_3', 3: 'class_4'}
)

# Keep only the first column of `test` (presumably `id` - confirm) and attach
# the class probabilities next to it.
result = pd.concat(
    [test.drop(columns=test.columns.to_list()[1:]), test_pred_df],
    axis=1,
)
result.to_csv('result.csv', index=False)

Результат 0.8581 получен при использовании двухуровневой модели с читерством. При этом первый уровень для обучения использовал весь датасет. Второй уровень (логистическая регрессия) для обучения использовал треть тренировочного датасета, состоящего из предсказаний модели первого уровня, обученной на 2/3 датасета. Это позволяет повысить результат на LB, но отчасти неправильно. Правильно делать как в предыдущем пункте, но там результат выше. Выход: использовать кросс-валидированную двухуровневую модель.