## Загрузка данных и подключение библиотек

In [1]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
!kaggle competitions download -c tabular-playground-series-may-2021
!pip install catboost
!pip install eli5
!pip install optuna
!pip install shap
!pip install scikit-learn-extra
!unzip /content/test.csv.zip
!unzip /content/train.csv.zip
!unzip /content/sample_submission.csv.zip

kaggle.json
Downloading train.csv.zip to /content
  0% 0.00/1.72M [00:00<?, ?B/s]
100% 1.72M/1.72M [00:00<00:00, 57.1MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/128k [00:00<?, ?B/s]
100% 128k/128k [00:00<00:00, 138MB/s]
Downloading test.csv.zip to /content
  0% 0.00/851k [00:00<?, ?B/s]
100% 851k/851k [00:00<00:00, 55.5MB/s]
Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/47/80/8e9c57ec32dfed6ba2922bc5c96462cbf8596ce1a6f5de532ad1e43e53fe/catboost-0.25.1-cp37-none-manylinux1_x86_64.whl (67.3MB)
[K     |████████████████████████████████| 67.3MB 47kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.25.1
Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/d1/54/04cab6e1c0ae535bec93f795d8403fdf6caf66fa5a6512263202dbb14ea6/eli5-0.11.0-py2.py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 6.8MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.11

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import catboost as cb
import sklearn
import xgboost as xgb
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.decomposition import PCA, FactorAnalysis as FA
from typing import List, Optional
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold, train_test_split
from typing import List, Tuple
import scipy.stats as ss
from sklearn_extra.cluster import KMedoids
import math
from sklearn.utils.validation import check_is_fitted
import eli5
from sklearn.base import BaseEstimator, TransformerMixin
import time
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import shap
import missingno as msno
from sklearn.inspection import permutation_importance
from eli5.sklearn import PermutationImportance
import optuna
from sklearn.metrics import log_loss
# Register pandas' converters with matplotlib.
pd.plotting.register_matplotlib_converters()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seaborn defaults plus a 10-colour "viridis" palette for all subsequent plots.
sns.set(color_codes=True)
pal = sns.color_palette("viridis", 10)
sns.set_palette(pal)

## Используемые функции

In [3]:
def get_input(data_path: str, base_path: str = "/content") -> pd.DataFrame:
  """
  Read a CSV file, lower-case its column names and print its shape.

  Parameters
  ----------
  data_path: str
      File name, relative to ``base_path``.

  base_path: str, optional, default = "/content"
      Directory that contains the file. The default keeps the previous
      hard-coded location.

  Returns
  -------
  data: pandas.DataFrame
      The loaded data set with lower-cased column names.
  """
  data = pd.read_csv(f"{base_path}/{data_path}")
  data.columns = [col.lower() for col in data.columns]
  print(f"{data_path}: shape = {data.shape[0]} rows, {data.shape[1]} cols")
  return data

In [4]:
def plot_feature_importance(importance, names, model_type, figsize=(10,8)):
  """
  Draw a horizontal bar chart of feature importances, largest first.

  Parameters
  ----------
  importance: array-like
      Importance value of each feature.
  names: array-like
      Feature names, in the same order as ``importance``.
  model_type: str
      Prefix for the chart title.
  figsize: tuple, optional
      Size of the matplotlib figure.

  Returns
  -------
  list
      Feature names ordered by decreasing importance.
  """
  # One row per feature, ordered from most to least important.
  ranking = pd.DataFrame({
      'feature_names': np.array(names),
      'feature_importance': np.array(importance),
  })
  ranking = ranking.sort_values(by=['feature_importance'], ascending=False)

  plt.figure(figsize=figsize)
  sns.barplot(x=ranking['feature_importance'], y=ranking['feature_names'])

  plt.title(model_type + ' FEATURE IMPORTANCE')
  plt.xlabel('FEATURE IMPORTANCE')
  plt.ylabel('FEATURE NAMES')

  return list(ranking['feature_names'].to_numpy())

In [5]:
def multi_estimators_predict(estimators: List,
                             x_valid: pd.DataFrame,
                             y_valid = None,
                             metric: callable = None,
                             scalers: List = None):
  """
  Average the predictions of several fitted estimators.

  Parameters
  ----------
  estimators: List
      Fitted models. Each must expose ``predict_proba``; ``xgb.core.Booster``
      instances are called through ``predict`` instead.
  x_valid: pd.DataFrame
      Features to predict on.
  y_valid: optional
      True targets. Scores are printed only when both ``y_valid`` and
      ``metric`` are given.
  metric: callable, optional
      Called as ``metric(y_valid, prediction)``.
  scalers: List, optional
      One fitted transformer per estimator; ``scalers[i].transform`` is
      applied to ``x_valid`` before ``estimators[i]`` predicts.

  Returns
  -------
  result: np.ndarray
      Element-wise mean of the individual predictions.

  Raises
  ------
  ValueError
      If ``estimators`` is empty.
  """
  if len(estimators) == 0:
    raise ValueError("estimators must contain at least one fitted model")
  if scalers:
    assert len(estimators) == len(scalers)

  evaluate = (y_valid is not None) and (metric is not None)
  preds, scores = [], []

  for i, estimator in enumerate(estimators):
    if scalers:
      pred = estimator.predict_proba(scalers[i].transform(x_valid))
    elif isinstance(estimator, xgb.core.Booster):
      # Decide per estimator, not from the first element only.
      pred = estimator.predict(x_valid)
    else:
      pred = estimator.predict_proba(x_valid)

    preds.append(pred)
    if evaluate:
      scores.append(metric(y_valid, pred))

  # Stack to (n_estimators, ...) and average over the estimator axis.
  result = np.mean(np.stack(preds, axis=0), axis=0)

  if evaluate:
    for i, score in enumerate(scores):
      print(f"Model {i} metric: {score:.7}")
    print(f"Result model metric: {metric(y_valid, result):.7}")

  return result

In [6]:
def catboost_cv_fit(params, X, y, cv, categorical = None):
    """
    Cross-validate a CatBoost classifier and collect out-of-fold predictions.

    Parameters
    ----------
    params: dict
        Hyperparameters passed to ``cb.CatBoostClassifier``.

    X: pandas.DataFrame
        Feature matrix.

    y: pandas.Series or array-like
        Target vector, aligned with ``X`` row by row.

    cv: KFold or StratifiedKFold generator.
        Splitter whose ``split(X, y)`` yields positional train/validation
        indices.

    categorical: list, optional, default = None
        Categorical features, forwarded to CatBoost as ``cat_features``.
        When empty or None, no categorical features are declared.

    Returns
    -------
    estimators: list
        One fitted model per fold.

    oof_preds: np.array
        Out-of-fold class probabilities, shape (n_samples, n_classes).

    """
    cat_features = categorical if categorical else None
    y = pd.Series(y)
    # Size the OOF matrix from the target instead of assuming four classes.
    classes = np.unique(y)

    estimators, folds_scores = [], []
    oof_preds = np.zeros((X.shape[0], len(classes)))
    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        # cv.split yields positions, so select with .iloc; .loc only works
        # when the index happens to be a fresh RangeIndex.
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)], verbose=10, early_stopping_rounds=100,
            cat_features=cat_features,
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)
        score = metrics.log_loss(y_valid, oof_preds[valid_idx], labels=classes)
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

In [7]:
def lightgbm_cv_fit(params, X, y, cv, categorical = None):
    """
    Cross-validate a LightGBM classifier and collect out-of-fold predictions.

    Parameters
    ----------
    params: dict
        Hyperparameters passed to ``lgb.LGBMClassifier``.

    X: pandas.DataFrame
        Feature matrix.

    y: pandas.Series or array-like
        Target vector, aligned with ``X`` row by row.

    cv: KFold or StratifiedKFold generator.
        Splitter whose ``split(X, y)`` yields positional train/validation
        indices.

    categorical: list, optional, default = None
        Categorical features, forwarded as ``categorical_feature``.
        When empty or None, ``"auto"`` is passed instead.

    Returns
    -------
    estimators: list
        One fitted model per fold.

    oof_preds: np.array
        Out-of-fold class probabilities, shape (n_samples, n_classes).

    """
    if not categorical:
        categorical = "auto"

    y = pd.Series(y)
    # Size the OOF matrix from the target instead of assuming four classes.
    classes = np.unique(y)

    estimators, folds_scores = [], []
    oof_preds = np.zeros((X.shape[0], len(classes)))
    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        # cv.split yields positions, so select with .iloc; .loc only works
        # when the index happens to be a fresh RangeIndex.
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = lgb.LGBMClassifier(**params)
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            eval_metric="multi_logloss", verbose=10, early_stopping_rounds=50,
            categorical_feature=categorical
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)
        score = metrics.log_loss(y_valid, oof_preds[valid_idx], labels=classes)
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

In [8]:
def xgboost_cv_fit(params, X, y, cv, categorical = None):
    """
    Cross-validate an XGBoost classifier and collect out-of-fold predictions.

    Parameters
    ----------
    params: dict
        Hyperparameters passed to ``xgb.XGBClassifier``.

    X: pandas.DataFrame
        Feature matrix. It is not modified; label encoding is applied to a copy.

    y: pandas.Series or array-like
        Target vector, aligned with ``X`` row by row.

    cv: KFold or StratifiedKFold generator.
        Splitter whose ``split(X, y)`` yields positional train/validation
        indices.

    categorical: list, optional, default = None
        Columns to label-encode before training. Not used by default.

    Returns
    -------
    estimators: list
        One fitted model per fold.

    encoders: dict
        Fitted ``LabelEncoder`` per categorical column (empty when
        ``categorical`` is not given).

    oof_preds: np.array
        Out-of-fold class probabilities, shape (n_samples, n_classes).

    """
    estimators, encoders = [], {}
    y = pd.Series(y)
    # Size the OOF matrix from the target instead of assuming four classes.
    classes = np.unique(y)
    oof_preds = np.zeros((X.shape[0], len(classes)))

    if categorical:
        # LabelEncoder is not among the notebook-level imports.
        from sklearn.preprocessing import LabelEncoder

        # Encode on a copy so the caller's frame is left untouched.
        X = X.copy()
        for feature in categorical:
            encoder = LabelEncoder()
            # NOTE(review): astype("str") already turns NaN into "nan", so the
            # fillna is a no-op; kept as is so the fitted encoder classes do
            # not change for existing callers.
            X[feature] = encoder.fit_transform(X[feature].astype("str").fillna("NA"))
            encoders[feature] = encoder

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        # cv.split yields positions, so select with .iloc; .loc only works
        # when the index happens to be a fresh RangeIndex.
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = xgb.XGBClassifier(**params)
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            eval_metric="mlogloss", 
            verbose=10, 
            early_stopping_rounds=50,
        )

        oof_preds[valid_idx] = model.predict_proba(x_valid)
        score = metrics.log_loss(y_valid, oof_preds[valid_idx], labels=classes)
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        estimators.append(model)

    return estimators, encoders, oof_preds

## Загрузка данных и построение моделей

### lightgbm + optuna

In [None]:
# Baseline: 2-fold LightGBM CV on a 70% training split with hand-picked parameters.
data = get_input("train.csv")
data.drop(columns='id', inplace=True)
mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
data['target'] = data['target'].map(mapper)

lgbm_params = {
    # Key previously had a trailing space ("boosting_type "), which is not a
    # LightGBM parameter name.
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    'max_depth': 8,
    'num_class': 4,
    'num_leaves': 6, 
}

train, valid = train_test_split(
    data, train_size=0.7, shuffle=True, random_state=1,
)

train.reset_index(inplace=True, drop=True)
valid.reset_index(inplace=True, drop=True)

y_train = train['target']
x_train = train.drop(columns=['target'])
y_valid = valid['target']
x_valid = valid.drop(columns=['target'])

dtrain = lgb.Dataset(data=x_train, label=y_train)
dvalid = lgb.Dataset(data=x_valid, label=y_valid)

result = lgb.cv(
    params=lgbm_params,
    train_set=dtrain,
    num_boost_round=1000,
    early_stopping_rounds=100,
    verbose_eval=10,
    stratified=True,
    seed=42,
    metrics="multi_logloss",
    shuffle=True,
    nfold=2
)

train.csv: shape = 100000 rows, 52 cols


In [None]:
# Pessimistic CV score: last recorded mean multi-logloss plus its standard deviation.
result['multi_logloss-mean'][-1] + result['multi_logloss-stdv'][-1]

1.098969788749625

In [None]:
def objective(trial):
    """
    Optuna objective: train LightGBM on a hold-out split and return its log loss.

    Parameters
    ----------
    trial: optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Multi-class log loss on the 35% validation split (lower is better).
    """
    data = get_input("train.csv")
    data.drop(columns='id', inplace=True)
    mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
    data['target'] = data['target'].map(mapper)
    target = data['target']
    data = data.drop(columns=['target'])

    # Fixed seed: every trial is scored on the same split, so differences
    # between trials come from the hyperparameters rather than the split.
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.35, random_state=42
    )
    dtrain = lgb.Dataset(train_x, label=train_y)
    dvalid = lgb.Dataset(valid_x, label=valid_y)

    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        'num_class': 4,

        'max_depth': trial.suggest_int('max_depth', 3,7),
        # Lower bound raised from 0: LightGBM requires learning_rate > 0.
        'learning_rate': trial.suggest_uniform('learning_rate', 1e-3, 0.13),
    }

    # Report the validation metric to Optuna each round so the study's pruner
    # can stop unpromising trials.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "multi_logloss")
    gbm = lgb.train(
        param, dtrain, valid_sets=[dvalid], verbose_eval=False, callbacks=[pruning_callback]
    )

    preds = gbm.predict(valid_x)
    return sklearn.metrics.log_loss(valid_y, preds)

In [None]:
# Only Dataset and cv are used below, so bind `lgb` to plain LightGBM rather
# than to optuna.integration.lightgbm, which would rebind the global `lgb`
# for every later cell.
import lightgbm as lgb
def objective(trial):
    """
    Optuna objective: 5-fold LightGBM CV, scored as mean + std of the log loss.

    Parameters
    ----------
    trial: optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Last recorded ``multi_logloss-mean`` plus ``multi_logloss-stdv``
        (lower is better).
    """
    data = get_input("train.csv")
    data.drop(columns='id', inplace=True)
    mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
    data['target'] = data['target'].map(mapper)
    target = data['target']
    data = data.drop(columns=['target'])

    # Cross-validation runs on the 65% training part only. Fixed seed so every
    # trial is scored on the same rows.
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.35, random_state=42
    )
    dtrain = lgb.Dataset(train_x, label=train_y)

    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        'num_class': 4,

        'max_depth': trial.suggest_int('max_depth', 3,7),
        # Lower bound raised from 0: LightGBM requires learning_rate > 0.
        'learning_rate': trial.suggest_uniform('learning_rate', 1e-3, 0.13),
    }

    # NOTE(review): no pruning callback is passed to lgb.cv (the original built
    # one but never used it), so the study's pruner has no effect on these trials.
    result = lgb.cv(
      params=param,
      train_set=dtrain,
      num_boost_round=10000,
      early_stopping_rounds=100,
      verbose_eval=10,
      stratified=True,
      seed=42,
      metrics="multi_logloss",
      shuffle=True,
      nfold=5
    )

    return result['multi_logloss-mean'][-1] + result['multi_logloss-stdv'][-1]

In [None]:
# Minimise the objective over 200 trials with median-based pruning.
median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=20)
study = optuna.create_study(direction="minimize", pruner=median_pruner)
study.optimize(objective, n_trials=200)

# Report the hyperparameters of the best trial.
print("Best trial:")
trial = study.best_trial

print("  Params: ")
for key, value in trial.params.items():
  print(f"    {key}: {value}")

In [None]:
# Objective value per trial for the study above.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Objective value against each sampled hyperparameter.
optuna.visualization.plot_slice(study)

In [None]:
# Estimated importance of each hyperparameter for the objective value.
optuna.visualization.plot_param_importances(study)

In [None]:
# Hyperparameters of the best trial.
print(study.best_params)

{'lambda_l1': 1.45120566383297e-06, 'lambda_l2': 0.003368552965821498, 'num_leaves': 32, 'feature_fraction': 0.4471131328810426, 'bagging_fraction': 0.5888198372400193, 'bagging_freq': 4, 'min_child_samples': 32}


In [None]:
# Objective value reached by the best trial.
study.best_value

1.0833801288748757

In [72]:
import lightgbm as lgb

# Load the competition files.
data = get_input("train.csv")
test = get_input("test.csv")
sample_submission = get_input("sample_submission.csv")

# Drop the identifier and encode the class labels as integers 0..3.
data = data.drop(columns='id')
mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
data = data.assign(target=data['target'].map(mapper))

# 70/30 train / hold-out split with a fresh 0..n-1 index on both parts.
train, valid = train_test_split(data, train_size=0.7, shuffle=True, random_state=1)
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

x_train, y_train = train.drop(columns=['target']), train['target']
x_valid, y_valid = valid.drop(columns=['target']), valid['target']

train.csv: shape = 100000 rows, 52 cols
test.csv: shape = 50000 rows, 51 cols
sample_submission.csv: shape = 50000 rows, 5 cols


In [10]:
# The first lgbm_params dict that used to precede this one was overwritten
# immediately and never used, so only the effective parameter set is kept.
lgbm_params = {
    # Key previously had a trailing space ("boosting_type "), which is not a
    # LightGBM parameter name.
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    'n_estimators': 5000,
    'reg_alpha': 11.159097442782404, 
    'reg_lambda': 6.133258909357973e-05, 
    'colsample_bytree': 0.3, 
    'subsample': 0.40110232869776463, 
    'learning_rate': 0.07591597509135133, 
    'max_depth': 8, 
    'num_leaves': 6, 
    'min_child_samples': 165, 
    'min_child_weight': 0.0029014788191160327, 
    'cat_smooth': 79, 
    'cat_l2': 3,
    'random_state': 42,
}

# 7-fold stratified CV; returns one model per fold and the OOF probabilities.
cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

lgb_estimators, lgb_oof = lightgbm_cv_fit(
    lgbm_params, x_train, y_train, cv,
)

Sun May 23 19:42:35 2021, Cross-Validation, 70000 rows, 50 cols
Training until validation scores don't improve for 50 rounds.
[10]	valid_0's multi_logloss: 1.115
[20]	valid_0's multi_logloss: 1.11173
[30]	valid_0's multi_logloss: 1.10944
[40]	valid_0's multi_logloss: 1.10752
[50]	valid_0's multi_logloss: 1.10572
[60]	valid_0's multi_logloss: 1.10412
[70]	valid_0's multi_logloss: 1.1028
[80]	valid_0's multi_logloss: 1.10162
[90]	valid_0's multi_logloss: 1.10055
[100]	valid_0's multi_logloss: 1.09961
[110]	valid_0's multi_logloss: 1.09884
[120]	valid_0's multi_logloss: 1.09813
[130]	valid_0's multi_logloss: 1.09751
[140]	valid_0's multi_logloss: 1.09696
[150]	valid_0's multi_logloss: 1.09646
[160]	valid_0's multi_logloss: 1.09598
[170]	valid_0's multi_logloss: 1.09556
[180]	valid_0's multi_logloss: 1.09526
[190]	valid_0's multi_logloss: 1.09492
[200]	valid_0's multi_logloss: 1.09458
[210]	valid_0's multi_logloss: 1.0942
[220]	valid_0's multi_logloss: 1.09394
[230]	valid_0's multi_logloss

In [11]:
# Log loss of the LightGBM out-of-fold predictions on the training split.
print(f"Out of fold log loss {metrics.log_loss(y_train, lgb_oof)}")

Out of fold log loss 1.0927355848269298


In [12]:
# Average the LightGBM fold models on the hold-out set; prints per-model and averaged log loss.
result_lgb_valid = multi_estimators_predict(lgb_estimators, x_valid, y_valid, metrics.log_loss)

Model 0 metric: 1.090214
Model 1 metric: 1.089329
Model 2 metric: 1.089982
Model 3 metric: 1.090136
Model 4 metric: 1.090564
Model 5 metric: 1.089468
Model 6 metric: 1.090378
Result model metric: 1.089295


In [9]:
import lightgbm as lgb

# Load the competition files.
data = get_input("train.csv")
test = get_input("test.csv")
sample_submission = get_input("sample_submission.csv")

# Drop the identifier and encode the class labels as integers 0..3.
data = data.drop(columns='id')
mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
data = data.assign(target=data['target'].map(mapper))

# 70/30 train / hold-out split (seed 100) with a fresh 0..n-1 index on both parts.
train, valid = train_test_split(data, train_size=0.7, shuffle=True, random_state=100)
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

x_train, y_train = train.drop(columns=['target']), train['target']
x_valid, y_valid = valid.drop(columns=['target']), valid['target']

train.csv: shape = 100000 rows, 52 cols
test.csv: shape = 50000 rows, 51 cols
sample_submission.csv: shape = 50000 rows, 5 cols


In [10]:
lgbm_params = {
    # Key previously had a trailing space ("boosting_type "), which is not a
    # LightGBM parameter name.
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    'n_estimators': 5000,
    'reg_alpha': 11.159097442782404, 
    'reg_lambda': 6.133258909357973e-05, 
    'colsample_bytree': 0.3, 
    'subsample': 0.40110232869776463, 
    'learning_rate': 0.07591597509135133, 
    'max_depth': 8, 
    'num_leaves': 6, 
    'min_child_samples': 165, 
    'min_child_weight': 0.0029014788191160327, 
    'cat_smooth': 79, 
    'cat_l2': 3,
    'random_state': 42,
}

# 7-fold stratified CV; returns one model per fold and the OOF probabilities.
cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

lgb_estimators, lgb_oof = lightgbm_cv_fit(
    lgbm_params, x_train, y_train, cv,
)

Sun May 23 21:41:52 2021, Cross-Validation, 70000 rows, 50 cols
Training until validation scores don't improve for 50 rounds.
[10]	valid_0's multi_logloss: 1.11273
[20]	valid_0's multi_logloss: 1.10964
[30]	valid_0's multi_logloss: 1.10719
[40]	valid_0's multi_logloss: 1.10532
[50]	valid_0's multi_logloss: 1.10367
[60]	valid_0's multi_logloss: 1.1023
[70]	valid_0's multi_logloss: 1.10096
[80]	valid_0's multi_logloss: 1.09979
[90]	valid_0's multi_logloss: 1.09866
[100]	valid_0's multi_logloss: 1.09783
[110]	valid_0's multi_logloss: 1.09689
[120]	valid_0's multi_logloss: 1.0962
[130]	valid_0's multi_logloss: 1.09564
[140]	valid_0's multi_logloss: 1.09505
[150]	valid_0's multi_logloss: 1.09447
[160]	valid_0's multi_logloss: 1.09391
[170]	valid_0's multi_logloss: 1.0934
[180]	valid_0's multi_logloss: 1.09292
[190]	valid_0's multi_logloss: 1.09243
[200]	valid_0's multi_logloss: 1.09206
[210]	valid_0's multi_logloss: 1.09173
[220]	valid_0's multi_logloss: 1.09143
[230]	valid_0's multi_loglos

In [11]:
# Log loss of the LightGBM out-of-fold predictions on the training split.
print(f"Out of fold log loss {metrics.log_loss(y_train, lgb_oof)}")

Out of fold log loss 1.089564904210907


In [17]:
# Average the LightGBM fold models on the hold-out set; prints per-model and averaged log loss.
result_lgb_valid = multi_estimators_predict(lgb_estimators, x_valid, y_valid, metrics.log_loss)

Model 0 metric: 1.09676
Model 1 metric: 1.096874
Model 2 metric: 1.097303
Model 3 metric: 1.096696
Model 4 metric: 1.096749
Model 5 metric: 1.096947
Model 6 metric: 1.096998
Result model metric: 1.096104


### Catboost

In [None]:
def objective(trial):
    """
    Optuna objective: train CatBoost on a hold-out split and return its log loss.

    Parameters
    ----------
    trial: optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Multi-class log loss on the 35% validation split (lower is better).
    """
    data = get_input("train.csv")
    data.drop(columns='id', inplace=True)
    mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
    data['target'] = data['target'].map(mapper)
    target = data['target']
    data = data.drop(columns=['target'])

    # Fixed seed: every trial is scored on the same split, so differences
    # between trials come from the hyperparameters rather than the split.
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.35, random_state=42
    )

    param = {
        "loss_function": "MultiClass",
        "eval_metric": "MultiClass",
        "task_type": "GPU",

        'min_data_in_leaf': 25,
        'depth': trial.suggest_int('depth', 3,7),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.13 ),
        'random_strength' :trial.suggest_int('random_strength', 0, 100),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-8, 100),

        'grow_policy': "SymmetricTree",
    }

    # NOTE(review): the original built an unused LightGBMPruningCallback here.
    # It was never attached to CatBoost, so these trials are not pruned.
    gbm = cb.CatBoostClassifier(**param)

    gbm.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0, early_stopping_rounds=100)
    preds = gbm.predict_proba(valid_x)

    return sklearn.metrics.log_loss(valid_y, preds)

In [None]:
# Minimise the objective over 200 trials with median-based pruning.
median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=20)
study = optuna.create_study(direction="minimize", pruner=median_pruner)
study.optimize(objective, n_trials=200)

# Report the hyperparameters of the best trial.
print("Best trial:")
trial = study.best_trial

print("  Params: ")
for key, value in trial.params.items():
  print(f"    {key}: {value}")

In [None]:
# Estimated importance of each hyperparameter for the objective value.
optuna.visualization.plot_param_importances(study)

In [13]:
# Load the competition files.
data = get_input("train.csv")
test = get_input("test.csv")
sample_submission = get_input("sample_submission.csv")

# Drop the identifier and encode the class labels as integers 0..3.
data = data.drop(columns='id')
mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
data = data.assign(target=data['target'].map(mapper))

# 70/30 train / hold-out split (seed 100) with a fresh 0..n-1 index on both parts.
train, valid = train_test_split(data, train_size=0.7, shuffle=True, random_state=100)
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

x_train, y_train = train.drop(columns=['target']), train['target']
x_valid, y_valid = valid.drop(columns=['target']), valid['target']

train.csv: shape = 100000 rows, 52 cols
test.csv: shape = 50000 rows, 51 cols
sample_submission.csv: shape = 50000 rows, 5 cols


In [14]:
# CatBoost hyperparameters for the final cross-validated fit.
cb_params = dict(
    n_estimators=5000,
    loss_function="MultiClass",
    eval_metric="MultiClass",
    task_type="GPU",
    learning_rate=0.04992656195498482,
    reg_lambda=37.77866189042851,
    subsample=0.6044483085727145,
    random_strength=0.6855495663972144,
    depth=3,
    min_data_in_leaf=25,
    leaf_estimation_iterations=1,
    bootstrap_type='Bernoulli',
    leaf_estimation_method='Newton',
    random_state=42,
)

# 7-fold stratified CV; returns one model per fold and the OOF probabilities.
cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

cb_estimators, cb_oof = catboost_cv_fit(cb_params, x_train, y_train, cv)

Sun May 23 21:46:48 2021, Cross-Validation, 70000 rows, 50 cols
0:	learn: 1.3570484	test: 1.3569765	best: 1.3569765 (0)	total: 6.77ms	remaining: 33.8s
10:	learn: 1.1992988	test: 1.1994396	best: 1.1994396 (10)	total: 62.1ms	remaining: 28.2s
20:	learn: 1.1440779	test: 1.1445191	best: 1.1445191 (20)	total: 112ms	remaining: 26.6s
30:	learn: 1.1222859	test: 1.1229320	best: 1.1229320 (30)	total: 163ms	remaining: 26.1s
40:	learn: 1.1130544	test: 1.1139148	best: 1.1139148 (40)	total: 222ms	remaining: 26.9s
50:	learn: 1.1088443	test: 1.1098447	best: 1.1098447 (50)	total: 272ms	remaining: 26.4s
60:	learn: 1.1066470	test: 1.1077947	best: 1.1077947 (60)	total: 333ms	remaining: 27s
70:	learn: 1.1052060	test: 1.1064326	best: 1.1064326 (70)	total: 388ms	remaining: 26.9s
80:	learn: 1.1041491	test: 1.1054440	best: 1.1054440 (80)	total: 446ms	remaining: 27.1s
90:	learn: 1.1031810	test: 1.1045230	best: 1.1045230 (90)	total: 495ms	remaining: 26.7s
100:	learn: 1.1023573	test: 1.1038536	best: 1.1038536 (100

In [15]:
# Log loss of the CatBoost out-of-fold predictions on the training split.
print(f"Out of fold log loss {metrics.log_loss(y_train, cb_oof)}")

Out of fold log loss 1.089640747806268


In [16]:
# Average the CatBoost fold models on the hold-out set; prints per-model and averaged log loss.
result_cb_valid = multi_estimators_predict(cb_estimators, x_valid, y_valid, metrics.log_loss)

Model 0 metric: 1.097091
Model 1 metric: 1.096699
Model 2 metric: 1.097202
Model 3 metric: 1.096606
Model 4 metric: 1.096465
Model 5 metric: 1.09692
Model 6 metric: 1.097099
Result model metric: 1.096138


### XGBoost

In [17]:
def objective(trial):
    """
    Optuna objective: train XGBoost on a hold-out split and return its log loss.

    Parameters
    ----------
    trial: optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Multi-class log loss on the 35% validation split (lower is better).
    """
    data = get_input("train.csv")
    data.drop(columns='id', inplace=True)
    mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
    data['target'] = data['target'].map(mapper)
    target = data['target']
    data = data.drop(columns=['target'])

    # Fixed seed: every trial is scored on the same split, so differences
    # between trials come from the hyperparameters rather than the split.
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.35, random_state=42
    )
    # XGBoost's native container is DMatrix; xgb.Dataset does not exist.
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(valid_x, label=valid_y)

    param = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        # XGBoost verbosity ranges from 0 (silent) to 3; -1 is not valid.
        'verbosity': 0,

        'num_class': 4,

        'max_depth': trial.suggest_int('max_depth', 3,7),
        # Lower bound raised from 0: a zero learning rate learns nothing.
        'learning_rate': trial.suggest_uniform('learning_rate', 1e-3, 0.13),
    }

    # The observation key is "<eval name>-<metric>", so it must match the name
    # given in `evals` and the `eval_metric` above.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-mlogloss")
    # Train with xgb.train, not lgb.train. num_boost_round=100 is the default
    # that the lgb.train call would have applied.
    gbm = xgb.train(
        param,
        dtrain,
        num_boost_round=100,
        evals=[(dvalid, "validation")],
        verbose_eval=False,
        callbacks=[pruning_callback],
    )

    # A Booster predicts on a DMatrix; multi:softprob yields class probabilities.
    preds = gbm.predict(dvalid)
    return sklearn.metrics.log_loss(valid_y, preds)

In [None]:
# Minimise the objective over 200 trials with median-based pruning.
median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=20)
study = optuna.create_study(direction="minimize", pruner=median_pruner)
study.optimize(objective, n_trials=200)

# Report the hyperparameters of the best trial.
print("Best trial:")
trial = study.best_trial

print("  Params: ")
for key, value in trial.params.items():
  print(f"    {key}: {value}")

In [18]:
# Load the competition files.
data = get_input("train.csv")
test = get_input("test.csv")
sample_submission = get_input("sample_submission.csv")

# Drop the identifier and encode the class labels as integers 0..3.
data = data.drop(columns='id')
mapper = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
data = data.assign(target=data['target'].map(mapper))

# 70/30 train / hold-out split (seed 100) with a fresh 0..n-1 index on both parts.
train, valid = train_test_split(data, train_size=0.7, shuffle=True, random_state=100)
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

x_train, y_train = train.drop(columns=['target']), train['target']
x_valid, y_valid = valid.drop(columns=['target']), valid['target']

# The first xgb_params dict that used to precede this one was overwritten
# immediately and never used, so only the effective parameter set is kept.
xgb_params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'random_state': 13,
    'learning_rate': 0.08356451010151393, 
    'gamma': 0.02911685058980812, 
    'max_depth': 3, 
    'min_child_weight': 10.748514454096288, 
    'max_delta_step': 2.4474818433727927, 
    'subsample': 0.6445037550866027, 
    'colsample_bytree': 0.07634753656242108, 
    'lambda': 13.663280761461781, 
    'alpha': 21.521205761694137, 
    'max_leaves': 48,
    'n_estimators': 2000,
    'num_class': 4,
    'tree_method': 'gpu_hist'
}

# 7-fold stratified CV; returns the fold models, the (empty) encoder dict and
# the OOF probabilities.
cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

xgb_estimators, xgb_encoders, xgb_oof = xgboost_cv_fit(
    xgb_params, x_train, y_train, cv,
)

train.csv: shape = 100000 rows, 52 cols
test.csv: shape = 50000 rows, 51 cols
sample_submission.csv: shape = 50000 rows, 5 cols
Sun May 23 21:49:51 2021, Cross-Validation, 70000 rows, 50 cols
[0]	validation_0-mlogloss:1.35355
Will train until validation_0-mlogloss hasn't improved in 50 rounds.
[10]	validation_0-mlogloss:1.18386
[20]	validation_0-mlogloss:1.1332
[30]	validation_0-mlogloss:1.11722
[40]	validation_0-mlogloss:1.11115
[50]	validation_0-mlogloss:1.10809
[60]	validation_0-mlogloss:1.10624
[70]	validation_0-mlogloss:1.10489
[80]	validation_0-mlogloss:1.10363
[90]	validation_0-mlogloss:1.10258
[100]	validation_0-mlogloss:1.10153
[110]	validation_0-mlogloss:1.10056
[120]	validation_0-mlogloss:1.0996
[130]	validation_0-mlogloss:1.09888
[140]	validation_0-mlogloss:1.09813
[150]	validation_0-mlogloss:1.09744
[160]	validation_0-mlogloss:1.09675
[170]	validation_0-mlogloss:1.09617
[180]	validation_0-mlogloss:1.09557
[190]	validation_0-mlogloss:1.09501
[200]	validation_0-mlogloss:1.09

In [19]:
# Log loss of the XGBoost out-of-fold predictions on the training split.
print(f"Out of fold log loss {metrics.log_loss(y_train, xgb_oof)}")

Out of fold log loss 1.0902780185917647


In [20]:
# Average the XGBoost fold models on the hold-out set; prints per-model and averaged log loss.
result_xgb_valid = multi_estimators_predict(xgb_estimators, x_valid, y_valid, metrics.log_loss)

Model 0 metric: 1.097828
Model 1 metric: 1.097387
Model 2 metric: 1.097642
Model 3 metric: 1.09769
Model 4 metric: 1.097575
Model 5 metric: 1.097152
Model 6 metric: 1.09767
Result model metric: 1.096631


## Валидация ансамблей

In [21]:
# Hold-out log loss of the averaged CatBoost fold models.
metrics.log_loss(y_valid, result_cb_valid)

1.0961384172615571

In [22]:
# Hold-out log loss of the averaged XGBoost fold models.
metrics.log_loss(y_valid, result_xgb_valid)

1.0966314097315073

In [23]:
# Hold-out log loss of the averaged LightGBM fold models.
metrics.log_loss(y_valid, result_lgb_valid)

1.0961035940329105

In [24]:
# Blend LightGBM and XGBoost: stack along a third axis, then average over it.
result_lgb_xgb_valid = np.dstack([result_lgb_valid, result_xgb_valid])
result_lgb_xgb_valid_mean = result_lgb_xgb_valid.mean(axis=-1)
print(f"result_lgb_xgb: {metrics.log_loss(y_valid, result_lgb_xgb_valid_mean)}")

result_lgb_xgb: 1.0958163799695186


In [25]:
# Blend LightGBM and CatBoost: stack along a third axis, then average over it.
result_lgb_cb_valid = np.dstack([result_lgb_valid, result_cb_valid])
result_lgb_cb_valid_mean = result_lgb_cb_valid.mean(axis=-1)
print(f"result_lgb_cb: {metrics.log_loss(y_valid, result_lgb_cb_valid_mean)}")

result_lgb_cb: 1.0956951509068418


In [26]:
# Blend XGBoost and CatBoost: stack along a third axis, then average over it.
result_xgb_cb_valid = np.dstack([result_xgb_valid, result_cb_valid])
result_xgb_cb_valid_mean = result_xgb_cb_valid.mean(axis=-1)
print(f"result_xgb_cb: {metrics.log_loss(y_valid, result_xgb_cb_valid_mean)}")

result_xgb_cb: 1.0960269161599514


In [27]:
# Blend all three libraries: stack along a third axis, then average over it.
result_lgb_xgb_cb_valid = np.dstack([result_lgb_valid, result_xgb_valid, result_cb_valid])
result_lgb_xgb_cb_valid_mean = result_lgb_xgb_cb_valid.mean(axis=-1)
print(f"result_lgb_xgb_cb: {metrics.log_loss(y_valid, result_lgb_xgb_cb_valid_mean)}")

result_lgb_xgb_cb: 1.0956974487751212


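Вывод: лучшее качество (log loss ≈ 1.09570) у ансамбля lightgbm и catboost при усреднении предсказаний; ансамбль из трёх моделей (lightgbm, xgboost, catboost) даёт практически тот же результат, а пара lightgbm + xgboost немного хуже (≈ 1.09582).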

## Модель второго уровня (логистическая регрессия на ансамблях)

In [28]:
# One frame of hold-out class probabilities per library, with prefixed column
# names, joined side by side as features for the second-level model.
result_lgbm_df = pd.DataFrame(result_lgb_valid).rename(
    columns={k: f'lgb_class_{k + 1}' for k in range(4)}
)
result_xgb_df = pd.DataFrame(result_xgb_valid).rename(
    columns={k: f'xgb_class_{k + 1}' for k in range(4)}
)
result_cb_df = pd.DataFrame(result_cb_valid).rename(
    columns={k: f'cb_class_{k + 1}' for k in range(4)}
)
result_valid = pd.concat([result_lgbm_df, result_xgb_df, result_cb_df], axis=1)

In [29]:
# Split the stacked hold-out predictions 70/30 to fit and score the meta-model.
(x_train_logreg, x_valid_logreg,
 y_train_logreg, y_valid_logreg) = train_test_split(
    result_valid, y_valid, train_size=0.7, shuffle=True, random_state=3,
)

params = dict(
    random_state=0,
    max_iter=300,
    multi_class='multinomial',
    solver='lbfgs',
)

# Multinomial logistic regression on the averaged class probabilities.
logreg = LogisticRegression(**params).fit(x_train_logreg, y_train_logreg)

y_pred_logreg = logreg.predict_proba(x_valid_logreg)
metrics.log_loss(y_valid_logreg, y_pred_logreg)

1.0869137473340493

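Вывод: наилучший результат (log loss ≈ 1.0869 против ≈ 1.0957 у простого усреднения) получен при построении модели второго уровня (логистической регрессии) на предсказаниях ансамблей бустинговых алгоритмов. Замечание: эта оценка получена на 30% валидационной выборки, тогда как ансамбли оценивались на всей валидационной выборке, поэтому сравнение приблизительное.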

In [30]:
# Fit on the ensembles' predictions for the whole validation set; this model
# will be used to produce the final result on the test set.
logreg_ensamble_for_test = LogisticRegression(**params)
logreg_ensamble_for_test.fit(result_valid, y_valid)

LogisticRegression(max_iter=300, multi_class='multinomial', random_state=0)

## Модель второго уровня (логистическая регрессия на предсказаниях каждого отдельного алгоритма бустинга)

In [46]:
def get_df_results_from_estimators(lgb_estimators: List,
                                   xgb_estimators: List,
                                   cb_estimators: List,
                                   data: pd.DataFrame):
  """
  Build a DataFrame with the class probabilities predicted by every estimator for data.

  For each position i, the predictions of lgb_estimators[i], xgb_estimators[i]
  and cb_estimators[i] are appended (in that order) as column groups named
  '<lib>_<i>_class_<k>', where <lib> is 'lgb', 'xgb' or 'cb' and k starts at 1.
  The number of class columns is taken from the width of each predict_proba
  output, so any number of classes is labelled consistently.

  Parameters
  ----------
  lgb_estimators: List
      Fitted lightgbm models exposing predict_proba.

  xgb_estimators: List
      Fitted xgboost models exposing predict_proba.

  cb_estimators: List
      Fitted catboost models exposing predict_proba.

  data: pd.DataFrame
      Feature matrix passed unchanged to every estimator's predict_proba.

  Returns
  -------
  result: pd.DataFrame
      One row per row of data, one column per (estimator, class) pair.

  Raises
  ------
  AssertionError
      If the three estimator lists differ in length.
  ValueError
      If the estimator lists are empty.
  """
  assert len(lgb_estimators) == len(xgb_estimators) == len(cb_estimators)
  if len(lgb_estimators) == 0:
    raise ValueError("at least one estimator of each kind is required")

  # Collect all per-model frames first and concatenate once, instead of
  # re-copying the growing result on every iteration.
  frames = []
  for i, models in enumerate(zip(lgb_estimators, xgb_estimators, cb_estimators)):
    for prefix, model in zip(('lgb', 'xgb', 'cb'), models):
      frame = pd.DataFrame(model.predict_proba(data))
      frame.columns = [f'{prefix}_{i}_class_{k + 1}' for k in range(frame.shape[1])]
      frames.append(frame)
  return pd.concat(frames, axis=1)

In [47]:
# Per-estimator predictions on x_valid, used as inputs of the second-level model.
result_valid_each = get_df_results_from_estimators(
    lgb_estimators=lgb_estimators,
    xgb_estimators=xgb_estimators,
    cb_estimators=cb_estimators,
    data=x_valid,
)

In [49]:
# Split the per-estimator predictions 70/30 with the same seed as the previous
# second-level experiment.
x_train_logreg, x_valid_logreg, y_train_logreg, y_valid_logreg = train_test_split(
    result_valid_each,
    y_valid,
    train_size=0.7,
    shuffle=True,
    random_state=3,
)

# This variant has many more input columns, so it uses stronger regularisation
# (C=0.2) and a larger iteration budget.
params = dict(
    random_state=0,
    max_iter=500,
    multi_class='multinomial',
    solver='lbfgs',
    C=0.2,
)

logreg = LogisticRegression(**params).fit(x_train_logreg, y_train_logreg)

# The log loss on the held-out part is the cell's displayed value.
y_pred_logreg = logreg.predict_proba(x_valid_logreg)
metrics.log_loss(y_valid_logreg, y_pred_logreg)

1.0836126940478514

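Вывод: логистическая регрессия над результатами каждого отдельного алгоритма бустинга дала log loss 1.0836 против 1.0869 у регрессии над результатами ансамблей, то есть немного лучший (меньший) результат; при этом здесь использованы другие гиперпараметры (C=0.2, max_iter=500), поэтому сравнение не вполне строгое.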

## Получение результата с усреднением выходов ансамблей

In [None]:
# Read the competition files; the `id` column is dropped from the training frame only.
data = get_input("train.csv")
test = get_input("test.csv")
data = data.drop(columns='id')
sample_submission = get_input("sample_submission.csv")

# Disabled feature-engineering experiments, kept for reference.
#data['sum'] = data[data.columns.to_list()[:50]].sum(axis=1) # ok
#data['max'] = data[data.columns.to_list()[:50]].max(axis=1) # ok
#data['not_nul_features'] = (data!=0)[data.columns.to_list()[:50]].sum(axis=1)
#data['2+13'] = data['feature_2'] + data['feature_13']
#data['6+15'] = data['feature_6'] + data['feature_15']

#test['sum'] = test[test.columns.to_list()[:50]].sum(axis=1) # ok
#test['max'] = test[test.columns.to_list()[:50]].max(axis=1) # ok
#test['not_nul_features'] = (test!=0)[test.columns.to_list()[:50]].sum(axis=1)
#test['2+13'] = test['feature_2'] + test['feature_13']
#test['6+15'] = test['feature_6'] + test['feature_15']

# Encode the class labels 'Class_1'..'Class_4' as the integers 1..4.
mapper = {f'Class_{k}': k for k in range(1, 5)}
data = data.assign(target=data['target'].map(mapper))

x_train, y_train = data.drop(columns=['target']), data['target']

train.csv: shape = 100000 rows, 52 cols
test.csv: shape = 50000 rows, 51 cols
sample_submission.csv: shape = 50000 rows, 5 cols


In [None]:
# LightGBM hyperparameters for the averaging experiment.
lgbm_params = {
    # The key is spelled without trailing whitespace so that it is the actual
    # parameter name rather than an unknown key.
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    'n_estimators': 5000,
    'reg_alpha': 11.159097442782404,
    'reg_lambda': 6.133258909357973e-05,
    'colsample_bytree': 0.3,
    'subsample': 0.40110232869776463,
    'learning_rate': 0.07591597509135133,
    # Alternative tree-size settings that were tried and disabled:
    #'max_depth': 92,
    #'num_leaves': 867,
    'max_depth': 8,
    'num_leaves': 6,
    'min_child_samples': 165,
    'min_child_weight': 0.0029014788191160327,
    'cat_smooth': 79,
    'cat_l2': 3,
    'random_state': 42,
}

# 7-fold stratified cross-validation with shuffling and a fixed seed.
cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

lgb_estimators, lgb_oof = lightgbm_cv_fit(
    lgbm_params, x_train, y_train, cv, #categorical=categorical_feature_names
)

print(f"Out of fold log loss {metrics.log_loss(y_train, lgb_oof)}")
# Predict the test set (without its `id` column) with the fold models.
result_lgb = multi_estimators_predict(lgb_estimators, test.drop(columns='id'))

Wed May 19 17:39:14 2021, Cross-Validation, 100000 rows, 50 cols
Training until validation scores don't improve for 50 rounds.
[10]	valid_0's multi_logloss: 1.11463
[20]	valid_0's multi_logloss: 1.11174
[30]	valid_0's multi_logloss: 1.10928
[40]	valid_0's multi_logloss: 1.10737
[50]	valid_0's multi_logloss: 1.10574
[60]	valid_0's multi_logloss: 1.10432
[70]	valid_0's multi_logloss: 1.10316
[80]	valid_0's multi_logloss: 1.10201
[90]	valid_0's multi_logloss: 1.10105
[100]	valid_0's multi_logloss: 1.10029
[110]	valid_0's multi_logloss: 1.09952
[120]	valid_0's multi_logloss: 1.0989
[130]	valid_0's multi_logloss: 1.09833
[140]	valid_0's multi_logloss: 1.09786
[150]	valid_0's multi_logloss: 1.09742
[160]	valid_0's multi_logloss: 1.09698
[170]	valid_0's multi_logloss: 1.09653
[180]	valid_0's multi_logloss: 1.09612
[190]	valid_0's multi_logloss: 1.09581
[200]	valid_0's multi_logloss: 1.09555
[210]	valid_0's multi_logloss: 1.09527
[220]	valid_0's multi_logloss: 1.09497
[230]	valid_0's multi_log

In [None]:
# CatBoost hyperparameters for the averaging experiment (trained on GPU).
cb_params = {
    "n_estimators": 5000,
    "loss_function": "MultiClass",
    "eval_metric": "MultiClass",
    "task_type": "GPU",
    'learning_rate': 0.04992656195498482,
    'reg_lambda': 37.77866189042851,
    'subsample': 0.6044483085727145,
    'random_strength': 0.6855495663972144,
    'depth': 3,
    'min_data_in_leaf': 25,
    'leaf_estimation_iterations': 1,
    'bootstrap_type': 'Bernoulli',
    'leaf_estimation_method': 'Newton',
    'random_state': 42,
}

# 7-fold stratified cross-validation with shuffling and a fixed seed.
cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

cb_estimators, cb_oof = catboost_cv_fit(
    cb_params, x_train, y_train, cv, #categorical=categorical_feature_names
)

# Score CatBoost's own out-of-fold predictions (cb_oof), not the LightGBM ones.
print(f"Out of fold log loss {metrics.log_loss(y_train, cb_oof)}")
# Predict the test set (without its `id` column) with the fold models.
result_cb = multi_estimators_predict(cb_estimators, test.drop(columns='id'))

Wed May 19 17:44:35 2021, Cross-Validation, 100000 rows, 50 cols




0:	learn: 1.3569208	test: 1.3570112	best: 1.3570112 (0)	total: 9.81ms	remaining: 49s
10:	learn: 1.1999903	test: 1.2004217	best: 1.2004217 (10)	total: 88.6ms	remaining: 40.2s
20:	learn: 1.1454962	test: 1.1461476	best: 1.1461476 (20)	total: 172ms	remaining: 40.8s
30:	learn: 1.1241088	test: 1.1250662	best: 1.1250662 (30)	total: 257ms	remaining: 41.1s
40:	learn: 1.1151137	test: 1.1162494	best: 1.1162494 (40)	total: 335ms	remaining: 40.5s
50:	learn: 1.1109877	test: 1.1121634	best: 1.1121634 (50)	total: 413ms	remaining: 40.1s
60:	learn: 1.1087906	test: 1.1101227	best: 1.1101227 (60)	total: 499ms	remaining: 40.4s
70:	learn: 1.1073104	test: 1.1087635	best: 1.1087635 (70)	total: 585ms	remaining: 40.6s
80:	learn: 1.1062240	test: 1.1078849	best: 1.1078849 (80)	total: 663ms	remaining: 40.3s
90:	learn: 1.1052214	test: 1.1069812	best: 1.1069812 (90)	total: 746ms	remaining: 40.3s
100:	learn: 1.1044037	test: 1.1062325	best: 1.1062325 (100)	total: 823ms	remaining: 39.9s
110:	learn: 1.1036808	test: 1.10



0:	learn: 1.3570102	test: 1.3570398	best: 1.3570398 (0)	total: 8.94ms	remaining: 44.7s
10:	learn: 1.2002096	test: 1.2003323	best: 1.2003323 (10)	total: 82.2ms	remaining: 37.3s
20:	learn: 1.1457335	test: 1.1459537	best: 1.1459537 (20)	total: 160ms	remaining: 38s
30:	learn: 1.1243465	test: 1.1244288	best: 1.1244288 (30)	total: 245ms	remaining: 39.2s
40:	learn: 1.1153421	test: 1.1155581	best: 1.1155581 (40)	total: 319ms	remaining: 38.6s
50:	learn: 1.1111900	test: 1.1115351	best: 1.1115351 (50)	total: 391ms	remaining: 38s
60:	learn: 1.1089609	test: 1.1093891	best: 1.1093891 (60)	total: 469ms	remaining: 37.9s
70:	learn: 1.1075517	test: 1.1080455	best: 1.1080455 (70)	total: 539ms	remaining: 37.4s
80:	learn: 1.1065999	test: 1.1071061	best: 1.1071061 (80)	total: 615ms	remaining: 37.4s
90:	learn: 1.1055562	test: 1.1061780	best: 1.1061780 (90)	total: 693ms	remaining: 37.4s
100:	learn: 1.1047160	test: 1.1052911	best: 1.1052911 (100)	total: 768ms	remaining: 37.3s
110:	learn: 1.1038849	test: 1.1045



0:	learn: 1.3570089	test: 1.3570284	best: 1.3570284 (0)	total: 9.09ms	remaining: 45.5s
10:	learn: 1.2000543	test: 1.2003992	best: 1.2003992 (10)	total: 82.3ms	remaining: 37.3s
20:	learn: 1.1454413	test: 1.1462312	best: 1.1462312 (20)	total: 161ms	remaining: 38.2s
30:	learn: 1.1240990	test: 1.1253116	best: 1.1253116 (30)	total: 253ms	remaining: 40.6s
40:	learn: 1.1150410	test: 1.1165267	best: 1.1165267 (40)	total: 329ms	remaining: 39.8s
50:	learn: 1.1109029	test: 1.1125472	best: 1.1125472 (50)	total: 401ms	remaining: 38.9s
60:	learn: 1.1087165	test: 1.1105858	best: 1.1105858 (60)	total: 479ms	remaining: 38.8s
70:	learn: 1.1072984	test: 1.1094224	best: 1.1094224 (70)	total: 552ms	remaining: 38.3s
80:	learn: 1.1062192	test: 1.1085464	best: 1.1085464 (80)	total: 630ms	remaining: 38.2s
90:	learn: 1.1051664	test: 1.1075909	best: 1.1075909 (90)	total: 710ms	remaining: 38.3s
100:	learn: 1.1042531	test: 1.1068103	best: 1.1068103 (100)	total: 783ms	remaining: 38s
110:	learn: 1.1034807	test: 1.10



0:	learn: 1.3569908	test: 1.3570099	best: 1.3570099 (0)	total: 8.11ms	remaining: 40.5s
10:	learn: 1.2001747	test: 1.2002862	best: 1.2002862 (10)	total: 90.6ms	remaining: 41.1s
20:	learn: 1.1456662	test: 1.1459829	best: 1.1459829 (20)	total: 165ms	remaining: 39.2s
30:	learn: 1.1242656	test: 1.1246157	best: 1.1246157 (30)	total: 248ms	remaining: 39.8s
40:	learn: 1.1152577	test: 1.1156518	best: 1.1156518 (40)	total: 322ms	remaining: 38.9s
50:	learn: 1.1111032	test: 1.1116097	best: 1.1116097 (50)	total: 391ms	remaining: 38s
60:	learn: 1.1088524	test: 1.1094821	best: 1.1094821 (60)	total: 465ms	remaining: 37.7s
70:	learn: 1.1074366	test: 1.1081726	best: 1.1081726 (70)	total: 534ms	remaining: 37.1s
80:	learn: 1.1063346	test: 1.1071923	best: 1.1071923 (80)	total: 601ms	remaining: 36.5s
90:	learn: 1.1053466	test: 1.1062540	best: 1.1062540 (90)	total: 675ms	remaining: 36.4s
100:	learn: 1.1044869	test: 1.1055540	best: 1.1055540 (100)	total: 743ms	remaining: 36s
110:	learn: 1.1037442	test: 1.1049



0:	learn: 1.3569875	test: 1.3569956	best: 1.3569956 (0)	total: 7.76ms	remaining: 38.8s
10:	learn: 1.2002222	test: 1.2003771	best: 1.2003771 (10)	total: 77.8ms	remaining: 35.3s
20:	learn: 1.1456908	test: 1.1459861	best: 1.1459861 (20)	total: 150ms	remaining: 35.6s
30:	learn: 1.1241815	test: 1.1247121	best: 1.1247121 (30)	total: 224ms	remaining: 35.9s
40:	learn: 1.1151979	test: 1.1158953	best: 1.1158953 (40)	total: 289ms	remaining: 35s
50:	learn: 1.1110443	test: 1.1118836	best: 1.1118836 (50)	total: 355ms	remaining: 34.5s
60:	learn: 1.1087215	test: 1.1098233	best: 1.1098233 (60)	total: 421ms	remaining: 34.1s
70:	learn: 1.1072960	test: 1.1085265	best: 1.1085265 (70)	total: 487ms	remaining: 33.8s
80:	learn: 1.1061152	test: 1.1075917	best: 1.1075917 (80)	total: 556ms	remaining: 33.7s
90:	learn: 1.1051178	test: 1.1067808	best: 1.1067808 (90)	total: 631ms	remaining: 34s
100:	learn: 1.1042019	test: 1.1060670	best: 1.1060670 (100)	total: 709ms	remaining: 34.4s
110:	learn: 1.1034475	test: 1.1054



0:	learn: 1.3570821	test: 1.3571302	best: 1.3571302 (0)	total: 9.25ms	remaining: 46.3s
10:	learn: 1.2002321	test: 1.2003625	best: 1.2003625 (10)	total: 82.4ms	remaining: 37.4s
20:	learn: 1.1456481	test: 1.1458113	best: 1.1458113 (20)	total: 174ms	remaining: 41.2s
30:	learn: 1.1242939	test: 1.1245906	best: 1.1245906 (30)	total: 254ms	remaining: 40.7s
40:	learn: 1.1152803	test: 1.1157208	best: 1.1157208 (40)	total: 325ms	remaining: 39.3s
50:	learn: 1.1111075	test: 1.1116048	best: 1.1116048 (50)	total: 401ms	remaining: 38.9s
60:	learn: 1.1088592	test: 1.1095260	best: 1.1095260 (60)	total: 483ms	remaining: 39.1s
70:	learn: 1.1074711	test: 1.1082637	best: 1.1082637 (70)	total: 556ms	remaining: 38.6s
80:	learn: 1.1063964	test: 1.1073407	best: 1.1073407 (80)	total: 628ms	remaining: 38.1s
90:	learn: 1.1053967	test: 1.1064539	best: 1.1064539 (90)	total: 711ms	remaining: 38.4s
100:	learn: 1.1044791	test: 1.1057881	best: 1.1057881 (100)	total: 784ms	remaining: 38s
110:	learn: 1.1036625	test: 1.10



0:	learn: 1.3570015	test: 1.3569846	best: 1.3569846 (0)	total: 8.74ms	remaining: 43.7s
10:	learn: 1.2002481	test: 1.2000149	best: 1.2000149 (10)	total: 92.5ms	remaining: 41.9s
20:	learn: 1.1457516	test: 1.1453474	best: 1.1453474 (20)	total: 173ms	remaining: 40.9s
30:	learn: 1.1243981	test: 1.1240389	best: 1.1240389 (30)	total: 250ms	remaining: 40s
40:	learn: 1.1154136	test: 1.1150437	best: 1.1150437 (40)	total: 324ms	remaining: 39.1s
50:	learn: 1.1112984	test: 1.1109366	best: 1.1109366 (50)	total: 401ms	remaining: 38.9s
60:	learn: 1.1090224	test: 1.1087442	best: 1.1087442 (60)	total: 485ms	remaining: 39.2s
70:	learn: 1.1076110	test: 1.1073951	best: 1.1073951 (70)	total: 556ms	remaining: 38.6s
80:	learn: 1.1065649	test: 1.1063147	best: 1.1063147 (80)	total: 626ms	remaining: 38s
90:	learn: 1.1056041	test: 1.1053692	best: 1.1053692 (90)	total: 703ms	remaining: 37.9s
100:	learn: 1.1047949	test: 1.1046326	best: 1.1046326 (100)	total: 772ms	remaining: 37.4s
110:	learn: 1.1040626	test: 1.1040

In [None]:
# XGBoost hyperparameters for the averaging experiment (GPU histogram tree method).
xgb_params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'random_state': 13,
    'learning_rate': 0.08356451010151393,
    'gamma': 0.02911685058980812,
    'max_depth': 3,
    'min_child_weight': 10.748514454096288,
    'max_delta_step': 2.4474818433727927,
    'subsample': 0.6445037550866027,
    'colsample_bytree': 0.07634753656242108,
    'lambda': 13.663280761461781,
    'alpha': 21.521205761694137,
    'max_leaves': 48,
    'n_estimators': 2000,
    'num_class': 4,
    'tree_method': 'gpu_hist'
}

# 7-fold stratified cross-validation with shuffling and a fixed seed.
cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

xgb_estimators, xgb_encoders, xgb_oof = xgboost_cv_fit(
    xgb_params, x_train, y_train, cv, #categorical=categorical_feature_names
)

# Score XGBoost's own out-of-fold predictions (xgb_oof), not the LightGBM ones.
print(f"Out of fold log loss {metrics.log_loss(y_train, xgb_oof)}")
# Predict the test set (without its `id` column) with the fold models.
result_xgb = multi_estimators_predict(xgb_estimators, test.drop(columns='id'))

Wed May 19 17:46:37 2021, Cross-Validation, 100000 rows, 50 cols
[0]	validation_0-mlogloss:1.35378
Will train until validation_0-mlogloss hasn't improved in 50 rounds.
[10]	validation_0-mlogloss:1.18522
[20]	validation_0-mlogloss:1.13534
[30]	validation_0-mlogloss:1.11928
[40]	validation_0-mlogloss:1.1132
[50]	validation_0-mlogloss:1.1101
[60]	validation_0-mlogloss:1.10823
[70]	validation_0-mlogloss:1.10675
[80]	validation_0-mlogloss:1.10563
[90]	validation_0-mlogloss:1.10452
[100]	validation_0-mlogloss:1.10353
[110]	validation_0-mlogloss:1.10262
[120]	validation_0-mlogloss:1.10194
[130]	validation_0-mlogloss:1.10134
[140]	validation_0-mlogloss:1.10067
[150]	validation_0-mlogloss:1.10012
[160]	validation_0-mlogloss:1.09961
[170]	validation_0-mlogloss:1.09898
[180]	validation_0-mlogloss:1.09844
[190]	validation_0-mlogloss:1.09803
[200]	validation_0-mlogloss:1.09759
[210]	validation_0-mlogloss:1.09704
[220]	validation_0-mlogloss:1.09664
[230]	validation_0-mlogloss:1.09624
[240]	validatio

In [None]:
# Stack the test predictions of the three ensembles along a third axis and
# average over it to get one probability matrix.
result_lgb_xgb_cb = np.dstack([result_lgb, result_xgb, result_cb])
result_lgb_xgb_cb_mean = result_lgb_xgb_cb.mean(axis=2)

In [None]:
# Build the submission: the first column of `test` followed by the averaged
# class probabilities, written to result.csv without the index.
test_pred_df = pd.DataFrame(result_lgb_xgb_cb_mean).rename(
    columns={k: f'class_{k + 1}' for k in range(4)}
)
result = pd.concat(
    [test.drop(columns=test.columns.to_list()[1:]), test_pred_df],
    axis=1,
)
result.to_csv('result.csv', index=False)

In [None]:
# Display the submission frame as the cell output.
result

Unnamed: 0,id,class_1,class_2,class_3,class_4
0,100000,0.093071,0.619720,0.165824,0.121385
1,100001,0.077836,0.690752,0.141058,0.090354
2,100002,0.084690,0.643264,0.175257,0.096789
3,100003,0.085663,0.535393,0.292535,0.086409
4,100004,0.074546,0.623293,0.186189,0.115972
...,...,...,...,...,...
49995,149995,0.088292,0.696105,0.156804,0.058799
49996,149996,0.077911,0.655623,0.131053,0.135413
49997,149997,0.081790,0.523733,0.229251,0.165226
49998,149998,0.081625,0.594241,0.165784,0.158350


Вывод: результат 0.8643. Надо использовать двухуровневую модель

## Получение результата двухуровневой модели

In [None]:
# Read the competition files; the `id` column is dropped from the training frame only.
data = get_input("train.csv")
test = get_input("test.csv")
data = data.drop(columns='id')

# Encode the class labels 'Class_1'..'Class_4' as the integers 0..3.
mapper = {f'Class_{k + 1}': k for k in range(4)}
data = data.assign(target=data['target'].map(mapper))

# 70% of the rows train the first-level models; the remaining 30% are kept to
# produce the inputs of the second-level model.
train, valid = train_test_split(data, train_size=0.7, shuffle=True, random_state=1)
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

x_train, y_train = train.drop(columns=['target']), train['target']
x_valid, y_valid = valid.drop(columns=['target']), valid['target']

train.csv: shape = 100000 rows, 52 cols
test.csv: shape = 50000 rows, 51 cols


In [None]:
# LightGBM hyperparameters for the first level of the two-level model.
lgbm_params = {
    # The key is spelled without trailing whitespace so that it is the actual
    # parameter name rather than an unknown key.
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    'n_estimators': 5000,
    'reg_alpha': 11.159097442782404,
    'reg_lambda': 6.133258909357973e-05,
    'colsample_bytree': 0.3,
    'subsample': 0.40110232869776463,
    'learning_rate': 0.07591597509135133,
    # Alternative tree-size settings that were tried and disabled:
    #'max_depth': 92,
    #'num_leaves': 867,
    'max_depth': 8,
    'num_leaves': 6,
    'min_child_samples': 165,
    'min_child_weight': 0.0029014788191160327,
    'cat_smooth': 79,
    'cat_l2': 3,
    'random_state': 42,
}

# 7-fold stratified cross-validation with shuffling and a fixed seed.
cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

lgb_estimators, lgb_oof = lightgbm_cv_fit(
    lgbm_params, x_train, y_train, cv, #categorical=categorical_feature_names
)

In [None]:
# Log loss of the LightGBM out-of-fold predictions against the training labels.
print(f"Out of fold log loss {metrics.log_loss(y_train, lgb_oof)}")

Out of fold log loss 1.0927355848269298


In [None]:
# Predict the held-out validation part with the LightGBM fold models; with
# y_valid and a metric passed, the helper prints per-model and combined scores
# (see the cell output).
result_lgb_valid = multi_estimators_predict(lgb_estimators, x_valid, y_valid, metrics.log_loss)

Model 0 metric: 1.090214
Model 1 metric: 1.089329
Model 2 metric: 1.089982
Model 3 metric: 1.090136
Model 4 metric: 1.090564
Model 5 metric: 1.089468
Model 6 metric: 1.090378
Result model metric: 1.089295


In [None]:
# CatBoost hyperparameters for the first level of the two-level model (trained on GPU).
cb_params = dict(
    n_estimators=5000,
    loss_function="MultiClass",
    eval_metric="MultiClass",
    task_type="GPU",
    learning_rate=0.04992656195498482,
    reg_lambda=37.77866189042851,
    subsample=0.6044483085727145,
    random_strength=0.6855495663972144,
    depth=3,
    min_data_in_leaf=25,
    leaf_estimation_iterations=1,
    bootstrap_type="Bernoulli",
    leaf_estimation_method="Newton",
    random_state=42,
)

# 7-fold stratified cross-validation with shuffling and a fixed seed.
cv = StratifiedKFold(shuffle=True, n_splits=7, random_state=435)

cb_estimators, cb_oof = catboost_cv_fit(cb_params, x_train, y_train, cv)

In [None]:
# Log loss of the CatBoost out-of-fold predictions against the training labels.
print(f"Out of fold log loss {metrics.log_loss(y_train, cb_oof)}")

Out of fold log loss 1.0928152109200098


In [None]:
# Predict the held-out validation part with the CatBoost fold models; with
# y_valid and a metric passed, the helper prints per-model and combined scores
# (see the cell output).
result_cb_valid = multi_estimators_predict(cb_estimators, x_valid, y_valid, metrics.log_loss)

Model 0 metric: 1.090639
Model 1 metric: 1.090936
Model 2 metric: 1.09082
Model 3 metric: 1.09097
Model 4 metric: 1.09134
Model 5 metric: 1.090482
Model 6 metric: 1.090903
Result model metric: 1.090346


In [None]:
# XGBoost hyperparameters for the first level of the two-level model.
# A dict literal is required here because "lambda" is a Python keyword.
xgb_params = {
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "random_state": 13,
    "learning_rate": 0.08356451010151393,
    "gamma": 0.02911685058980812,
    "max_depth": 3,
    "min_child_weight": 10.748514454096288,
    "max_delta_step": 2.4474818433727927,
    "subsample": 0.6445037550866027,
    "colsample_bytree": 0.07634753656242108,
    "lambda": 13.663280761461781,
    "alpha": 21.521205761694137,
    "max_leaves": 48,
    "n_estimators": 2000,
    "num_class": 4,
    "tree_method": "gpu_hist",
}

# 7-fold stratified cross-validation with shuffling and a fixed seed.
cv = StratifiedKFold(shuffle=True, n_splits=7, random_state=435)

xgb_estimators, xgb_encoders, xgb_oof = xgboost_cv_fit(xgb_params, x_train, y_train, cv)

In [None]:
# Log loss of the XGBoost out-of-fold predictions against the training labels.
print(f"Out of fold log loss {metrics.log_loss(y_train, xgb_oof)}")

Out of fold log loss 1.0931592292917518


In [None]:
# Predict the held-out validation part with the XGBoost fold models; with
# y_valid and a metric passed, the helper prints per-model and combined scores
# (see the cell output).
result_xgb_valid = multi_estimators_predict(xgb_estimators, x_valid, y_valid, metrics.log_loss)

Model 0 metric: 1.090678
Model 1 metric: 1.09052
Model 2 metric: 1.091207
Model 3 metric: 1.091059
Model 4 metric: 1.091386
Model 5 metric: 1.090657
Model 6 metric: 1.091556
Result model metric: 1.090071


In [None]:
# Label each ensemble's validation predictions "<model>_class_<k>" and join them
# column-wise; `result` is the training input of the second-level model.
result_lgb_valid_df = pd.DataFrame(result_lgb_valid).rename(
    columns={k: f'lgb_class_{k + 1}' for k in range(4)}
)
result_xgb_valid_df = pd.DataFrame(result_xgb_valid).rename(
    columns={k: f'xgb_class_{k + 1}' for k in range(4)}
)
result_cb_valid_df = pd.DataFrame(result_cb_valid).rename(
    columns={k: f'cb_class_{k + 1}' for k in range(4)}
)
result = pd.concat([result_lgb_valid_df, result_xgb_valid_df, result_cb_valid_df], axis=1)

In [None]:
# Second-level model: multinomial logistic regression fitted on the stacked
# validation predictions of the three ensembles.
params = dict(
    random_state=0,
    max_iter=300,
    multi_class='multinomial',
    solver='lbfgs',
)

logreg = LogisticRegression(**params)
# Kept as the last expression so the fitted estimator's repr is the cell output.
logreg.fit(result, y_valid)

LogisticRegression(max_iter=300, multi_class='multinomial', random_state=0)

In [None]:
# First level: predict the test set (without its `id` column) with every ensemble.
result_xgb_test = multi_estimators_predict(xgb_estimators, test.drop(columns='id'))
result_lgb_test = multi_estimators_predict(lgb_estimators, test.drop(columns='id'))
result_cb_test = multi_estimators_predict(cb_estimators, test.drop(columns='id'))

# Use the same column names and order as the frame the logistic regression was fitted on.
result_lgb_test_df = pd.DataFrame(result_lgb_test).rename(
    columns={k: f'lgb_class_{k + 1}' for k in range(4)}
)
result_xgb_test_df = pd.DataFrame(result_xgb_test).rename(
    columns={k: f'xgb_class_{k + 1}' for k in range(4)}
)
result_cb_test_df = pd.DataFrame(result_cb_test).rename(
    columns={k: f'cb_class_{k + 1}' for k in range(4)}
)
result_for_logreg = pd.concat([result_lgb_test_df, result_xgb_test_df, result_cb_test_df], axis=1)

# Second level: final class probabilities from the logistic regression.
y_pred_logreg = logreg.predict_proba(result_for_logreg)

# Submission: the first column of `test` followed by the predicted probabilities.
test_pred_df = pd.DataFrame(y_pred_logreg).rename(
    columns={k: f'class_{k + 1}' for k in range(4)}
)
result = pd.concat(
    [test.drop(columns=test.columns.to_list()[1:]), test_pred_df],
    axis=1,
)
result.to_csv('result.csv', index=False)

Вывод: результат 0.8653. Недостаток: приходится разбивать тренировочный датасет на две части для обучения двухуровневой модели, поэтому каждый из уровней обучается на усечённом наборе данных. Выход: использовать «читерство» — обучать модель второго уровня на предсказаниях не итоговой модели первого уровня, а модели, обученной на усечённом датасете. Тогда модель первого уровня можно обучить на всём тренировочном датасете.

## Получение результата двухуровневой модели с читерством

In [31]:
# Read the competition files; the `id` column is dropped from the training frame only.
data = get_input("train.csv")
test = get_input("test.csv")
data = data.drop(columns='id')
sample_submission = get_input("sample_submission.csv")

# Encode the class labels 'Class_1'..'Class_4' as the integers 1..4.
mapper = {f'Class_{k}': k for k in range(1, 5)}
data = data.assign(target=data['target'].map(mapper))

# The first-level models of this section are trained on the full training set.
x_train, y_train = data.drop(columns=['target']), data['target']

train.csv: shape = 100000 rows, 52 cols
test.csv: shape = 50000 rows, 51 cols
sample_submission.csv: shape = 50000 rows, 5 cols


In [32]:
# LightGBM hyperparameters for the first-level models trained on the full training set.
lgbm_params = {
    # The key is spelled without trailing whitespace so that it is the actual
    # parameter name rather than an unknown key.
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    'n_estimators': 5000,
    'reg_alpha': 11.159097442782404,
    'reg_lambda': 6.133258909357973e-05,
    'colsample_bytree': 0.3,
    'subsample': 0.40110232869776463,
    'learning_rate': 0.07591597509135133,
    # Alternative tree-size settings that were tried and disabled:
    #'max_depth': 92,
    #'num_leaves': 867,
    'max_depth': 8,
    'num_leaves': 6,
    'min_child_samples': 165,
    'min_child_weight': 0.0029014788191160327,
    'cat_smooth': 79,
    'cat_l2': 3,
    'random_state': 42,
}

# 7-fold stratified cross-validation with shuffling and a fixed seed.
cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

lgb_estimators, lgb_oof = lightgbm_cv_fit(
    lgbm_params, x_train, y_train, cv, #categorical=categorical_feature_names
)

print(f"Out of fold log loss {metrics.log_loss(y_train, lgb_oof)}")

Sun May 23 21:54:18 2021, Cross-Validation, 100000 rows, 50 cols
Training until validation scores don't improve for 50 rounds.
[10]	valid_0's multi_logloss: 1.11492
[20]	valid_0's multi_logloss: 1.11183
[30]	valid_0's multi_logloss: 1.10949
[40]	valid_0's multi_logloss: 1.10757
[50]	valid_0's multi_logloss: 1.10592
[60]	valid_0's multi_logloss: 1.10454
[70]	valid_0's multi_logloss: 1.1033
[80]	valid_0's multi_logloss: 1.10229
[90]	valid_0's multi_logloss: 1.10132
[100]	valid_0's multi_logloss: 1.10035
[110]	valid_0's multi_logloss: 1.09963
[120]	valid_0's multi_logloss: 1.09893
[130]	valid_0's multi_logloss: 1.09829
[140]	valid_0's multi_logloss: 1.09778
[150]	valid_0's multi_logloss: 1.09736
[160]	valid_0's multi_logloss: 1.09698
[170]	valid_0's multi_logloss: 1.09649
[180]	valid_0's multi_logloss: 1.09613
[190]	valid_0's multi_logloss: 1.09579
[200]	valid_0's multi_logloss: 1.09551
[210]	valid_0's multi_logloss: 1.09525
[220]	valid_0's multi_logloss: 1.09494
[230]	valid_0's multi_log

In [33]:
# CatBoost hyperparameters for the first-level models trained on the full training set (GPU).
cb_params = {
    "n_estimators": 5000,
    "loss_function": "MultiClass",
    "eval_metric": "MultiClass",
    "task_type": "GPU",
    'learning_rate': 0.04992656195498482,
    'reg_lambda': 37.77866189042851,
    'subsample': 0.6044483085727145,
    'random_strength': 0.6855495663972144,
    'depth': 3,
    'min_data_in_leaf': 25,
    'leaf_estimation_iterations': 1,
    'bootstrap_type': 'Bernoulli',
    'leaf_estimation_method': 'Newton',
    'random_state': 42,
}

# 7-fold stratified cross-validation with shuffling and a fixed seed.
cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

cb_estimators, cb_oof = catboost_cv_fit(
    cb_params, x_train, y_train, cv, #categorical=categorical_feature_names
)

# Score CatBoost's own out-of-fold predictions (cb_oof), not the LightGBM ones.
print(f"Out of fold log loss {metrics.log_loss(y_train, cb_oof)}")

Sun May 23 21:58:20 2021, Cross-Validation, 100000 rows, 50 cols




0:	learn: 1.3570698	test: 1.3571273	best: 1.3571273 (0)	total: 6.64ms	remaining: 33.2s
10:	learn: 1.2002427	test: 1.2004564	best: 1.2004564 (10)	total: 60.9ms	remaining: 27.6s
20:	learn: 1.1456121	test: 1.1461392	best: 1.1461392 (20)	total: 114ms	remaining: 27s
30:	learn: 1.1242443	test: 1.1249285	best: 1.1249285 (30)	total: 171ms	remaining: 27.5s
40:	learn: 1.1152335	test: 1.1161681	best: 1.1161681 (40)	total: 227ms	remaining: 27.5s
50:	learn: 1.1111282	test: 1.1121635	best: 1.1121635 (50)	total: 289ms	remaining: 28.1s
60:	learn: 1.1089046	test: 1.1101243	best: 1.1101243 (60)	total: 349ms	remaining: 28.2s
70:	learn: 1.1074949	test: 1.1089526	best: 1.1089526 (70)	total: 406ms	remaining: 28.2s
80:	learn: 1.1063878	test: 1.1079417	best: 1.1079417 (80)	total: 465ms	remaining: 28.3s
90:	learn: 1.1054969	test: 1.1071523	best: 1.1071523 (90)	total: 518ms	remaining: 27.9s
100:	learn: 1.1046405	test: 1.1063909	best: 1.1063909 (100)	total: 567ms	remaining: 27.5s
110:	learn: 1.1038997	test: 1.10



0:	learn: 1.3570922	test: 1.3571065	best: 1.3571065 (0)	total: 10.7ms	remaining: 53.4s
10:	learn: 1.2002726	test: 1.2003430	best: 1.2003430 (10)	total: 53.4ms	remaining: 24.2s
20:	learn: 1.1456523	test: 1.1458817	best: 1.1458817 (20)	total: 96ms	remaining: 22.8s
30:	learn: 1.1242379	test: 1.1245143	best: 1.1245143 (30)	total: 139ms	remaining: 22.3s
40:	learn: 1.1152072	test: 1.1155727	best: 1.1155727 (40)	total: 177ms	remaining: 21.5s
50:	learn: 1.1110572	test: 1.1114532	best: 1.1114532 (50)	total: 231ms	remaining: 22.4s
60:	learn: 1.1088406	test: 1.1093327	best: 1.1093327 (60)	total: 272ms	remaining: 22s
70:	learn: 1.1074657	test: 1.1080232	best: 1.1080232 (70)	total: 311ms	remaining: 21.6s
80:	learn: 1.1064139	test: 1.1069345	best: 1.1069345 (80)	total: 351ms	remaining: 21.3s
90:	learn: 1.1055885	test: 1.1061525	best: 1.1061525 (90)	total: 390ms	remaining: 21s
100:	learn: 1.1047811	test: 1.1053977	best: 1.1053977 (100)	total: 433ms	remaining: 21s
110:	learn: 1.1040212	test: 1.1047725



0:	learn: 1.3570831	test: 1.3570965	best: 1.3570965 (0)	total: 5.2ms	remaining: 26s
10:	learn: 1.2000123	test: 1.2003488	best: 1.2003488 (10)	total: 44.1ms	remaining: 20s
20:	learn: 1.1455067	test: 1.1461465	best: 1.1461465 (20)	total: 83.2ms	remaining: 19.7s
30:	learn: 1.1240811	test: 1.1249584	best: 1.1249584 (30)	total: 128ms	remaining: 20.6s
40:	learn: 1.1150410	test: 1.1164276	best: 1.1164276 (40)	total: 171ms	remaining: 20.7s
50:	learn: 1.1108950	test: 1.1124311	best: 1.1124311 (50)	total: 214ms	remaining: 20.8s
60:	learn: 1.1087177	test: 1.1105663	best: 1.1105663 (60)	total: 253ms	remaining: 20.5s
70:	learn: 1.1072408	test: 1.1093039	best: 1.1093039 (70)	total: 291ms	remaining: 20.2s
80:	learn: 1.1061032	test: 1.1084012	best: 1.1084012 (80)	total: 330ms	remaining: 20s
90:	learn: 1.1051976	test: 1.1076779	best: 1.1076779 (90)	total: 367ms	remaining: 19.8s
100:	learn: 1.1043605	test: 1.1069793	best: 1.1069793 (100)	total: 418ms	remaining: 20.3s
110:	learn: 1.1035859	test: 1.106403



0:	learn: 1.3570884	test: 1.3570946	best: 1.3570946 (0)	total: 10.9ms	remaining: 54.4s
10:	learn: 1.2001809	test: 1.2004024	best: 1.2004024 (10)	total: 49.9ms	remaining: 22.6s
20:	learn: 1.1456103	test: 1.1458483	best: 1.1458483 (20)	total: 89.1ms	remaining: 21.1s
30:	learn: 1.1242309	test: 1.1245780	best: 1.1245780 (30)	total: 139ms	remaining: 22.3s
40:	learn: 1.1152185	test: 1.1157211	best: 1.1157211 (40)	total: 177ms	remaining: 21.4s
50:	learn: 1.1110833	test: 1.1117016	best: 1.1117016 (50)	total: 221ms	remaining: 21.4s
60:	learn: 1.1088770	test: 1.1095019	best: 1.1095019 (60)	total: 259ms	remaining: 21s
70:	learn: 1.1074529	test: 1.1082154	best: 1.1082154 (70)	total: 303ms	remaining: 21.1s
80:	learn: 1.1062858	test: 1.1070411	best: 1.1070411 (80)	total: 342ms	remaining: 20.8s
90:	learn: 1.1053926	test: 1.1062093	best: 1.1062093 (90)	total: 384ms	remaining: 20.7s
100:	learn: 1.1045917	test: 1.1055949	best: 1.1055949 (100)	total: 427ms	remaining: 20.7s
110:	learn: 1.1037660	test: 1.1



0:	learn: 1.3570801	test: 1.3571166	best: 1.3571166 (0)	total: 5.15ms	remaining: 25.7s
10:	learn: 1.2002420	test: 1.2003685	best: 1.2003685 (10)	total: 56.8ms	remaining: 25.8s
20:	learn: 1.1455926	test: 1.1459235	best: 1.1459235 (20)	total: 95.2ms	remaining: 22.6s
30:	learn: 1.1241975	test: 1.1247792	best: 1.1247792 (30)	total: 139ms	remaining: 22.3s
40:	learn: 1.1151560	test: 1.1158498	best: 1.1158498 (40)	total: 181ms	remaining: 21.9s
50:	learn: 1.1109836	test: 1.1117874	best: 1.1117874 (50)	total: 225ms	remaining: 21.8s
60:	learn: 1.1087755	test: 1.1097397	best: 1.1097397 (60)	total: 266ms	remaining: 21.5s
70:	learn: 1.1072922	test: 1.1084087	best: 1.1084087 (70)	total: 305ms	remaining: 21.1s
80:	learn: 1.1061231	test: 1.1074550	best: 1.1074550 (80)	total: 343ms	remaining: 20.8s
90:	learn: 1.1051523	test: 1.1066819	best: 1.1066819 (90)	total: 389ms	remaining: 21s
100:	learn: 1.1043385	test: 1.1059331	best: 1.1059331 (100)	total: 434ms	remaining: 21.1s
110:	learn: 1.1034775	test: 1.1



0:	learn: 1.3570794	test: 1.3570411	best: 1.3570411 (0)	total: 5.07ms	remaining: 25.3s
10:	learn: 1.2002550	test: 1.2003801	best: 1.2003801 (10)	total: 43.8ms	remaining: 19.9s
20:	learn: 1.1456861	test: 1.1457655	best: 1.1457655 (20)	total: 82.2ms	remaining: 19.5s
30:	learn: 1.1242916	test: 1.1244469	best: 1.1244469 (30)	total: 132ms	remaining: 21.2s
40:	learn: 1.1152838	test: 1.1155562	best: 1.1155562 (40)	total: 187ms	remaining: 22.6s
50:	learn: 1.1111418	test: 1.1115936	best: 1.1115936 (50)	total: 239ms	remaining: 23.2s
60:	learn: 1.1089637	test: 1.1095439	best: 1.1095439 (60)	total: 278ms	remaining: 22.5s
70:	learn: 1.1075050	test: 1.1081681	best: 1.1081681 (70)	total: 316ms	remaining: 21.9s
80:	learn: 1.1064485	test: 1.1072692	best: 1.1072692 (80)	total: 354ms	remaining: 21.5s
90:	learn: 1.1054601	test: 1.1064306	best: 1.1064306 (90)	total: 392ms	remaining: 21.2s
100:	learn: 1.1045452	test: 1.1055486	best: 1.1055486 (100)	total: 446ms	remaining: 21.6s
110:	learn: 1.1036988	test: 1



0:	learn: 1.3570834	test: 1.3570374	best: 1.3570374 (0)	total: 5.01ms	remaining: 25.1s
10:	learn: 1.2003464	test: 1.2002271	best: 1.2002271 (10)	total: 53.7ms	remaining: 24.4s
20:	learn: 1.1458723	test: 1.1456774	best: 1.1456774 (20)	total: 93.6ms	remaining: 22.2s
30:	learn: 1.1245007	test: 1.1243187	best: 1.1243187 (30)	total: 140ms	remaining: 22.4s
40:	learn: 1.1154821	test: 1.1152229	best: 1.1152229 (40)	total: 178ms	remaining: 21.5s
50:	learn: 1.1113581	test: 1.1110494	best: 1.1110494 (50)	total: 222ms	remaining: 21.5s
60:	learn: 1.1092277	test: 1.1089261	best: 1.1089261 (60)	total: 259ms	remaining: 21s
70:	learn: 1.1077631	test: 1.1075481	best: 1.1075481 (70)	total: 297ms	remaining: 20.6s
80:	learn: 1.1066278	test: 1.1064544	best: 1.1064544 (80)	total: 339ms	remaining: 20.6s
90:	learn: 1.1057175	test: 1.1056260	best: 1.1056260 (90)	total: 377ms	remaining: 20.3s
100:	learn: 1.1048832	test: 1.1048612	best: 1.1048612 (100)	total: 418ms	remaining: 20.3s
110:	learn: 1.1040987	test: 1.1

In [34]:
# XGBoost hyper-parameters for the 4-class task. 'multi:softprob' makes the
# booster return per-class probabilities; 'mlogloss' is the metric reported
# during training. The fractional values presumably come from a tuning run
# (e.g. Optuna) -- TODO confirm their origin before changing them.
xgb_params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'random_state': 13,
    'learning_rate': 0.08356451010151393,
    'gamma': 0.02911685058980812,
    'max_depth': 3,
    'min_child_weight': 10.748514454096288,
    'max_delta_step': 2.4474818433727927,
    'subsample': 0.6445037550866027,
    'colsample_bytree': 0.07634753656242108,
    'lambda': 13.663280761461781,
    'alpha': 21.521205761694137,
    'max_leaves': 48,
    'n_estimators': 2000,
    'num_class': 4,
    'tree_method': 'gpu_hist'
}

# 7 shuffled, stratified folds with a fixed seed so the split is reproducible.
cv = StratifiedKFold(n_splits=7, random_state=435, shuffle=True)

# xgboost_cv_fit is defined in an earlier cell; it returns the per-fold
# estimators, the encoders and the out-of-fold predictions.
xgb_estimators, xgb_encoders, xgb_oof = xgboost_cv_fit(
    xgb_params, x_train, y_train, cv, #categorical=categorical_feature_names
)

# Score the out-of-fold predictions of THIS model. The original cell passed
# `lgb_oof` here (copy-paste slip), which reported LightGBM's loss instead.
print(f"Out of fold log loss {metrics.log_loss(y_train, xgb_oof)}")

Sun May 23 21:59:37 2021, Cross-Validation, 100000 rows, 50 cols
[0]	validation_0-mlogloss:1.35378
Will train until validation_0-mlogloss hasn't improved in 50 rounds.
[10]	validation_0-mlogloss:1.18522
[20]	validation_0-mlogloss:1.13534
[30]	validation_0-mlogloss:1.11928
[40]	validation_0-mlogloss:1.1132
[50]	validation_0-mlogloss:1.1101
[60]	validation_0-mlogloss:1.10823
[70]	validation_0-mlogloss:1.10675
[80]	validation_0-mlogloss:1.10563
[90]	validation_0-mlogloss:1.10452
[100]	validation_0-mlogloss:1.10353
[110]	validation_0-mlogloss:1.10262
[120]	validation_0-mlogloss:1.10194
[130]	validation_0-mlogloss:1.10134
[140]	validation_0-mlogloss:1.10067
[150]	validation_0-mlogloss:1.10012
[160]	validation_0-mlogloss:1.09961
[170]	validation_0-mlogloss:1.09898
[180]	validation_0-mlogloss:1.09844
[190]	validation_0-mlogloss:1.09803
[200]	validation_0-mlogloss:1.09759
[210]	validation_0-mlogloss:1.09704
[220]	validation_0-mlogloss:1.09664
[230]	validation_0-mlogloss:1.09624
[240]	validatio

In [35]:
def _class_column_names(prefix):
    """Map positional columns 0..3 to '<prefix>_class_1' .. '<prefix>_class_4'."""
    return {position: f'{prefix}_class_{position + 1}' for position in range(4)}


# First-level predictions on the test set; the 'id' column is dropped because
# it is passed to none of the estimators here.
result_xgb_test = multi_estimators_predict(
    xgb_estimators, test.drop(columns='id')
)
result_lgb_test = multi_estimators_predict(
    lgb_estimators, test.drop(columns='id')
)
result_cb_test = multi_estimators_predict(
    cb_estimators, test.drop(columns='id')
)

# Wrap each prediction array in a DataFrame with model-prefixed column names.
result_lgb_test_df = pd.DataFrame(result_lgb_test).rename(
    columns=_class_column_names('lgb')
)
result_xgb_test_df = pd.DataFrame(result_xgb_test).rename(
    columns=_class_column_names('xgb')
)
result_cb_test_df = pd.DataFrame(result_cb_test).rename(
    columns=_class_column_names('cb')
)

# Side-by-side feature matrix for the second-level model; the block order
# (lgb, xgb, cb) is kept exactly as in the original cell.
result_for_logreg = pd.concat(
    [result_lgb_test_df, result_xgb_test_df, result_cb_test_df],
    axis=1,
)

In [67]:
# Disabled experiment (kept for reference only): class probabilities from three
# separate logistic regressions, logreg_1 .. logreg_3, on the stacked features.
#y_pred_logreg_1 = logreg_1.predict_proba(result_for_logreg)
#y_pred_logreg_2 = logreg_2.predict_proba(result_for_logreg)
#y_pred_logreg_3 = logreg_3.predict_proba(result_for_logreg)

In [68]:
# Disabled experiment (kept for reference only): stack three probability arrays
# along a third axis and average over that axis.
#y_pred_logreg = np.dstack((y_pred_logreg_1, y_pred_logreg_2, y_pred_logreg_3))
#y_pred_logreg_mean = np.mean(y_pred_logreg, axis=2)

In [36]:
# Second-level prediction: class probabilities for the test set, computed from
# the concatenated first-level predictions in `result_for_logreg`.
# `logreg_ensamble_for_test` is defined in an earlier cell.
y_pred_logreg = logreg_ensamble_for_test.predict_proba(result_for_logreg)

In [37]:
# Label the four probability columns class_1 .. class_4.
test_pred_df = pd.DataFrame(y_pred_logreg).rename(
    columns={position: f'class_{position + 1}' for position in range(4)}
)

# Keep only the first column of `test` (presumably `id` -- verify against the
# loaded file) and append the predicted probabilities next to it.
result = pd.concat(
    [test.drop(columns=test.columns.to_list()[1:]), test_pred_df],
    axis=1,
)

# Write the submission file without the pandas index.
result.to_csv('result.csv', index=False)

Результат 1.08564 получен при использовании двухуровневой модели с читерством. При этом первый уровень для обучения использовал весь датасет. Второй уровень (логистическая регрессия) для обучения использовал треть тренировочного датасета, состоящую из предсказаний модели первого уровня, обученной на 2/3 датасета. Это позволяет улучшить результат на LB, но отчасти неправильно. Правильно делать как в предыдущем пункте, но там результат хуже (значение log loss выше). Выход: использовать кросс-валидированную двухуровневую модель.