In [27]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import optuna

from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

from secondary_functions.get_metrics import concat_metrics, check_overfitting
import yaml

from tqdm import tqdm_notebook
import warnings
from typing import Any
warnings.filterwarnings('ignore')

SEED = 1618
N_FOLDS = 5
SAMPLER = optuna.samplers.TPESampler(seed=SEED)
PRUNER = optuna.pruners.MedianPruner(n_warmup_steps=10)
optuna.logging.set_verbosity(optuna.logging.WARNING)
pd.options.display.float_format ='{:,.3f}'.format

In [35]:
with open('preprocessing.yml', 'r', encoding='utf-8') as file:
    config = yaml.load(file, Loader=yaml.FullLoader)
config

{'change_type_columns': {'Рейтинг': 'float32',
  'Мощность_двигателя_лс': 'int16',
  'Пробег_км': 'int32',
  'Год_выпуска': 'int16',
  'Цена': 'int32',
  'Объём_двигателя_л': 'float32',
  'Обмен': 'int8',
  'Много_владельцев': 'int8',
  'Срок_эксплуатации': 'int8',
  'История_пробега_кол_во_записей': 'int8',
  'Владельцев_по_ПТС': 'int8',
  'Выпуск_кол_во_лет': 'int8'},
 'columns_to_drop': ['Название_авто',
  'Расположение',
  'Наименование_поколения'],
 'catb_params': {'boosting_type': 'Ordered',
  'bootstrap_type': 'Bernoulli',
  'cat_features': ['Поколение',
   'Состояние',
   'Модификация',
   'Тип_двигателя',
   'Коробка_передач',
   'Привод',
   'Комплектация',
   'Тип_кузова',
   'Цвет',
   'Руль',
   'Управление_климатом',
   'ПТС',
   'Бренд_авто',
   'Модель_авто',
   'Город',
   'Федеральный_округ',
   'Степень_износа'],
  'colsample_bylevel': 0.07016296475032051,
  'depth': 12,
  'eval_metric': 'MAE',
  'iterations': 3000,
  'l2_leaf_reg': 12.236324946000956,
  'learning_ra

# Загрузка данных, оптимизация данных по памяти.

Используем уже найденные optun'ой параметры для каждой из моделей.

In [3]:
df_train = pd.read_csv('./data_store/train.csv').astype(
    config['change_type_columns'])
df_test = pd.read_csv('./data_store/test.csv').astype(
    config['change_type_columns'])
df_train[df_train.select_dtypes('object').columns] = df_train[
    df_train.select_dtypes('object').columns].astype('category')
df_test[df_test.select_dtypes('object').columns] = df_test[
    df_test.select_dtypes('object').columns].astype('category')

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31547 entries, 0 to 31546
Data columns (total 29 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   Рейтинг                         31547 non-null  float32 
 1   Цена                            31547 non-null  int32   
 2   Год_выпуска                     31547 non-null  int16   
 3   Пробег_км                       31547 non-null  int32   
 4   История_пробега_кол_во_записей  31547 non-null  int8    
 5   Владельцев_по_ПТС               31547 non-null  int8    
 6   Объём_двигателя_л               31547 non-null  float32 
 7   Выпуск_кол_во_лет               31547 non-null  int8    
 8   Мощность_двигателя_лс           31547 non-null  int16   
 9   Поколение                       31547 non-null  category
 10  Состояние                       31547 non-null  category
 11  Модификация                     31547 non-null  category
 12  Тип_двигателя     

In [5]:
X_train, y_train = df_train.drop('Цена', axis=1),  df_train['Цена'] 
X_test, y_test = df_test.drop('Цена', axis=1), df_test['Цена']

X_train_, X_val, y_train_, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=0.1,
                                                    shuffle=True,
                                                    random_state=SEED)

# Stacking ручной.

Учитывая опыт на бейзлайнах, попробуем стакнуть Tuned_Catboost + Tuned_LGBM + Baseline_Catboost + Baseline_LGBM. В качестве метамоделей протестируем Ridge, LinearRegression, LGBM и KNeighbors.

In [28]:
cv_meta = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
meta_X, meta_X_test = pd.DataFrame(), pd.DataFrame()

In [7]:
catb_tuned_preds_stack = []

for idx, (train_idx,
          test_idx) in tqdm_notebook(enumerate(cv_meta.split(X_train,
                                                             y_train)),
                                     total=cv_meta.get_n_splits()):
    X_train_x, X_val_x = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_x, y_val_x = y_train.values[train_idx], y_train.values[test_idx]

    model = CatBoostRegressor(**config['catb_params'])
    model.fit(X_train_x,
              y_train_x,
              eval_set=[(X_val_x, y_val_x)],
              early_stopping_rounds=50)
    cv_pred = model.predict(X_val_x)[:, 0]
    catb_tuned_preds_stack.append(cv_pred)
    print(f'FOLD {idx + 1}: MAE {mean_absolute_error(y_val_x, cv_pred)}')

model.fit(X_train, y_train, early_stopping_rounds=50)
meta_X['catb_tuned'] = np.concatenate(catb_tuned_preds_stack)
meta_X_test['catb_tuned'] = model.predict(X_test)[:, 0]

  0%|          | 0/5 [00:00<?, ?it/s]

FOLD 1: MAE 221288.12036226896
FOLD 2: MAE 227894.2906961858
FOLD 3: MAE 236864.1912495638
FOLD 4: MAE 215549.0124084577
FOLD 5: MAE 225114.7023138939


In [9]:
lgbm_tuned_preds_stack = []

for idx, (train_idx,
          test_idx) in tqdm_notebook(enumerate(cv_meta.split(X_train,
                                                             y_train)),
                                     total=cv_meta.get_n_splits()):
    X_train_l, X_val_l = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_l, y_val_l = y_train.values[train_idx], y_train.values[test_idx]

    model = LGBMRegressor(**config['lgbm_params'])
    model.fit(X_train_l,
              y_train_l,
              eval_set=[(X_val_l, y_val_l)],
              eval_metric='mae')
    cv_pred = model.predict(X_val_l)
    lgbm_tuned_preds_stack.append(cv_pred)
    print(f'FOLD {idx + 1}: MAE {mean_absolute_error(y_val_l, cv_pred)}')

model.fit(X_train_, y_train_, eval_set=[(X_val, y_val)], eval_metric='mae')
meta_X['lgbm_tuned'] = np.concatenate(lgbm_tuned_preds_stack)
meta_X_test['lgbm_tuned'] = model.predict(X_test)

  0%|          | 0/5 [00:00<?, ?it/s]

FOLD 1: MAE 223419.02977671305
FOLD 2: MAE 230052.95082671646
FOLD 3: MAE 232319.51134048216
FOLD 4: MAE 221853.53292211835
FOLD 5: MAE 222493.5259852807


In [10]:
catb_preds_stack = []

for idx, (train_idx,
          test_idx) in tqdm_notebook(enumerate(cv_meta.split(X_train,
                                                             y_train)),
                                     total=cv_meta.get_n_splits()):
    X_train_c, X_val_c = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_c, y_val_c = y_train.values[train_idx], y_train.values[test_idx]

    model = CatBoostRegressor(
        cat_features=config['catb_params']['cat_features'],
        verbose=0,
        random_state=SEED)
    model.fit(X_train_c,
              y_train_c,
              eval_set=[(X_val_c, y_val_c)],
              early_stopping_rounds=50)
    cv_pred = model.predict(X_val_c)
    catb_preds_stack.append(cv_pred)
    print(f'FOLD {idx + 1}: MAE {mean_absolute_error(y_val_c, cv_pred)}')

model.fit(X_train, y_train, early_stopping_rounds=50)
meta_X['catb'] = np.concatenate(catb_preds_stack)
meta_X_test['catb'] = model.predict(X_test)

  0%|          | 0/5 [00:00<?, ?it/s]

FOLD 1: MAE 226534.08061193782
FOLD 2: MAE 235332.36293761258
FOLD 3: MAE 246107.40019626528
FOLD 4: MAE 225043.1507534035
FOLD 5: MAE 249333.71108076654


In [11]:
lgbm_preds_stack = []

for idx, (train_idx, test_idx) in tqdm_notebook(enumerate(
        cv_meta.split(X_train, y_train)),
                                                total=cv_meta.get_n_splits()):
    X_train_l, X_val_l = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_l, y_val_l = y_train.values[train_idx], y_train.values[test_idx]

    model = LGBMRegressor(verbose=-1, random_state=SEED)
    model.fit(X_train_l,
              y_train_l,
              eval_set=[(X_val_l, y_val_l)],
              eval_metric='mae')
    cv_pred = model.predict(X_val_l)
    lgbm_preds_stack.append(cv_pred)
    print(f'FOLD {idx + 1}: MAE {mean_absolute_error(y_val_l, cv_pred)}')

model.fit(X_train, y_train)
meta_X['lgbm'] = np.concatenate(lgbm_preds_stack)
meta_X_test['lgbm'] = model.predict(X_test)

  0%|          | 0/5 [00:00<?, ?it/s]

FOLD 1: MAE 230944.73607184945
FOLD 2: MAE 237565.33410586594
FOLD 3: MAE 240152.99794118266
FOLD 4: MAE 224787.86057816262
FOLD 5: MAE 233378.24900778738


In [12]:
meta_X[:5]

Unnamed: 0,catb_tuned,lgbm_tuned,catb,lgbm
0,2053157.032,2037879.854,2165014.553,2019299.27
1,2283266.41,2244964.796,2173685.501,2396618.302
2,2726615.314,2735100.349,2598047.057,2636163.64
3,2045108.44,1968837.705,2035189.762,1851131.851
4,2236819.012,2091728.388,2369865.619,2177800.41


In [13]:
meta_X_test[:5]

Unnamed: 0,catb_tuned,lgbm_tuned,catb,lgbm
0,928297.991,895997.319,880556.051,902180.656
1,728641.38,723832.085,710318.811,624275.019
2,1818581.474,1828556.12,1894409.576,1928935.235
3,696554.692,662530.327,745434.785,536918.941
4,2805477.298,2848402.046,2986726.044,2985155.823


In [14]:
meta_X.shape, y_train.shape

((31547, 4), (31547,))

In [15]:
sc = StandardScaler()
final_meta_model = Ridge()
final_meta_model.fit(sc.fit_transform(meta_X), np.log(y_train + 1))

In [16]:
metrics = pd.DataFrame()
metrics = concat_metrics(
    metrics,
    y_train.values,
    np.exp(final_meta_model.predict(meta_X)) - 1,
    meta_X,
    model_name=f'{final_meta_model.__class__.__name__}_tuned_train_stacked')
metrics = concat_metrics(
    metrics,
    y_test.values,
    np.exp(final_meta_model.predict(meta_X_test)) - 1,
    meta_X_test,
    model_name=f'{final_meta_model.__class__.__name__}_tuned_test_stacked')
metrics

Unnamed: 0,Модель,MAE,R2_Adjusted,WAPE,RMSE
0,Ridge_tuned_train_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,Ridge_tuned_test_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...


In [17]:
check_overfitting(meta_X, y_train, meta_X_test, y_test,
                  final_meta_model)

Ridge 
MAE train: 1828136.4106328106 
MAE test: 1830318.792494094 
MAE diff: 0.12 %


Ridge отработал с огромной ошибкой, попробуем другие метамодели.

In [18]:
lr_meta = LinearRegression()
lr_meta.fit(sc.transform(meta_X), np.log(y_train + 1))
metrics = concat_metrics(
    metrics,
    y_train.values,
    np.exp(lr_meta.predict(meta_X)) - 1,
    meta_X,
    model_name=f'{lr_meta.__class__.__name__}_train_stacked')
metrics = concat_metrics(
    metrics,
    y_test.values,
    np.exp(lr_meta.predict(meta_X_test)) - 1,
    meta_X_test,
    model_name=f'{lr_meta.__class__.__name__}_test_stacked')
metrics

Unnamed: 0,Модель,MAE,R2_Adjusted,WAPE,RMSE
0,Ridge_tuned_train_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,Ridge_tuned_test_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,LinearRegression_train_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,LinearRegression_test_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...


In [19]:
for num in tqdm_notebook([5, 10, 30, 100, 500, 1000]):
    knn = KNeighborsRegressor(n_neighbors=num)
    knn.fit(sc.transform(meta_X), np.log(y_train + 1))
    metrics = concat_metrics(metrics, 
                             y_test.values,
                             np.exp(knn.predict(meta_X_test)) - 1, 
                             meta_X_test,
                             f'KNN_{num}_neighbors')
metrics

  0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,Модель,MAE,R2_Adjusted,WAPE,RMSE
0,Ridge_tuned_train_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,Ridge_tuned_test_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,LinearRegression_train_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,LinearRegression_test_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,KNN_5_neighbors,1219297.903,-0.022,412.183,2247949.612
0,KNN_10_neighbors,1274803.452,-0.001,430.946,2224614.702
0,KNN_30_neighbors,1200748.007,-0.043,405.912,2271204.922
0,KNN_100_neighbors,1196244.814,-0.084,404.390,2315646.034
0,KNN_500_neighbors,1196224.520,-0.087,404.383,2318456.811
0,KNN_1000_neighbors,1195539.022,-0.079,404.151,2310125.164


Линейные алгоритмы проваливаются в бесконечность скорее всего из-за большой ошибки и, как следствие, предсказания слишком большой целевой переменной, метрический также отрабатывает с огромной ошибкой. Попробуем LGBM в качестве метамодели.

In [20]:
def objective_lgbm_meta(trial,
                        X: pd.DataFrame,
                        y: pd.Series,
                        N_FOLDS: int,
                        random_state: int = SEED) -> float:
    ''' Функция для подбора гиперпараметров через фреймворк 
        Optuna для LGBMRegressor. '''

    params = {
        'n_estimators':
        trial.suggest_int('n_estimators', 100, 5000),
        'learning_rate':
        trial.suggest_float('learning_rate', 1e-4, 0.2, log=True),
        'lambda_l1':
        trial.suggest_float('lambda_l1', 0, 100),
        'lambda_l2':
        trial.suggest_float('lambda_l2', 0, 100),
        'max_depth':
        trial.suggest_int('max_depth', 3, 15),
        'feature_fraction':
        trial.suggest_float('feature_fraction', 0.2, 0.9),
        'bagging_freq':
        trial.suggest_int('bagging_freq', 0, 15),
        'bagging_fraction':
        trial.suggest_float('bagging_fraction', 0.05, 0.9),
        'colsample_bytree':
        trial.suggest_float('colsample_bytree', 0.1, 1),
        'min_split_gain':
        trial.suggest_int('min_split_gain', 0, 20),
        'num_leaves':
        trial.suggest_int('num_leaves', 2, 2**15),
        'verbose':
        trial.suggest_categorical('verbose', [-1]),
        'random_state':
        trial.suggest_categorical('random_state', [random_state]),
    }

    cv_opt = StratifiedKFold(n_splits=N_FOLDS,
                             shuffle=True,
                             random_state=random_state)
    cv_predicts = np.empty(N_FOLDS)

    for idx, (train_idx, test_idx) in enumerate(cv_opt.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        pruning_callback = optuna.integration.LightGBMPruningCallback(
            trial, metric='l1')

        model = LGBMRegressor(**params)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='mae',
                  callbacks=[pruning_callback])

        y_pred = model.predict(X_val)

        cv_predicts[idx] = mean_absolute_error(y_val, y_pred)

    return np.mean(cv_predicts)

In [21]:
study_lgbm_meta = optuna.create_study(
    direction='minimize',
    pruner=PRUNER,
    sampler=SAMPLER,
    study_name='LGBMRegressor_Meta_Optuna')
func_lgbm_meta = lambda trial: objective_lgbm_meta(
    trial, meta_X, y_train, N_FOLDS=N_FOLDS)

In [22]:
study_lgbm_meta.optimize(func_lgbm_meta,
                         n_trials=25,
                         show_progress_bar=True,
                         n_jobs=-1)

  0%|          | 0/25 [00:00<?, ?it/s]

In [23]:
lgbm_meta = LGBMRegressor(**study_lgbm_meta.best_params)
lgbm_meta.fit(meta_X, y_train)
meta_preds = lgbm_meta.predict(meta_X_test)

metrics = concat_metrics(
    metrics,
    y_train.values,
    lgbm_meta.predict(meta_X),
    meta_X,
    model_name=f'{lgbm_meta.__class__.__name__}_tuned_train_stacked')
metrics = concat_metrics(
    metrics,
    y_test.values,
    meta_preds,
    meta_X_test,
    model_name=f'{lgbm_meta.__class__.__name__}_tuned_test_stacked')
metrics

Unnamed: 0,Модель,MAE,R2_Adjusted,WAPE,RMSE
0,Ridge_tuned_train_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,Ridge_tuned_test_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,LinearRegression_train_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,LinearRegression_test_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,KNN_5_neighbors,1219297.903,-0.022,412.183,2247949.612
0,KNN_10_neighbors,1274803.452,-0.001,430.946,2224614.702
0,KNN_30_neighbors,1200748.007,-0.043,405.912,2271204.922
0,KNN_100_neighbors,1196244.814,-0.084,404.390,2315646.034
0,KNN_500_neighbors,1196224.520,-0.087,404.383,2318456.811
0,KNN_1000_neighbors,1195539.022,-0.079,404.151,2310125.164


LGBM как и остальные к сожалению не даёт улучшения качества метрик.

# Stacking коробочный.

Попробуем коробочную версию стэкинга от sklearn. Уберём у LGBM гиперпараметр, который потребует применения валидационного сета.

In [36]:
lgbm_params_without_early_stopping = config['lgbm_params'].copy()
lgbm_params_without_early_stopping.pop('early_stopping_rounds', None)
lgbm_params_without_early_stopping.pop('eval_metric', None)

estimators = [
    ('tuned_lgbm', LGBMRegressor(**lgbm_params_without_early_stopping)),
    ('tuned_catb', CatBoostRegressor(**config['catb_params'])),
    ('baseline_catb',
     CatBoostRegressor(cat_features=config['catb_params']['cat_features'],
                       verbose=0,
                       random_state=SEED)),
    ('baseline_lgbm', LGBMRegressor(verbose=-1, random_state=SEED))
]

meta = StackingRegressor(
    estimators=estimators,
    cv=cv_meta,
    final_estimator=RandomForestRegressor(random_state=SEED))

meta.fit(X_train, y_train)

metrics = concat_metrics(metrics,
                         y_train.values,
                         meta.predict(X_train),
                         X_train,
                         model_name=f'{meta.__class__.__name__}_train_stacked')
metrics = concat_metrics(metrics,
                         y_test.values,
                         meta.predict(X_test),
                         X_test,
                         model_name=f'{meta.__class__.__name__}_test_stacked')
metrics

Unnamed: 0,Модель,MAE,R2_Adjusted,WAPE,RMSE
0,Ridge_tuned_train_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,Ridge_tuned_test_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,LinearRegression_train_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,LinearRegression_test_stacked,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...,Input contains infinity or a value too large f...
0,KNN_5_neighbors,1219297.903,-0.022,412.183,2247949.612
0,KNN_10_neighbors,1274803.452,-0.001,430.946,2224614.702
0,KNN_30_neighbors,1200748.007,-0.043,405.912,2271204.922
0,KNN_100_neighbors,1196244.814,-0.084,404.390,2315646.034
0,KNN_500_neighbors,1196224.520,-0.087,404.383,2318456.811
0,KNN_1000_neighbors,1195539.022,-0.079,404.151,2310125.164


In [None]:
check_overfitting(X_train, y_train, X_test, y_test, meta)

Коробочный стэкинг достаточно неплохо улучшил наши метрики. До этого наш фаворит был LGBMRegressor с MAE: 218,012.288, R2: 0.960, WAPE: 73.699, RMSE: 441,183.265.