# Поиск и обучение наиболее подходящей модели для задачи регрессии

# Dependencies

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import  GridSearchCV, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

## Models

В качестве моделей регрессий, среди которых будет искаться наилучшая, выбраны следующие:

* Линейная регрессия
* Линейная регрессия со стохастическим градиентным спуском
* Дерево решений
* Случайный лес
* Adaptive Boosting
* Варианты реализаций градиентного бустинга (catboost, lgbm, xgb)

In [2]:
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost.sklearn import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

## Custom utils

In [3]:
from custom_transformers import CyclesToFailureAdder
from utils import show_slice, get_rmse, get_r2, cross_val_evaluation

# Data Loading

In [4]:
df = pd.read_csv('data.csv', sep=',')
origin_cycles = df
origin_cycles = CyclesToFailureAdder().transform(X=origin_cycles) # добавление колонки со значением циклов до отказа и удаление лишних величин
origin_cycles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16138 entries, 0 to 16137
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 16138 non-null  int64  
 1   cycle              16138 non-null  int64  
 2   p02                16138 non-null  float64
 3   p03                16138 non-null  float64
 4   p04                16138 non-null  float64
 5   p05                16138 non-null  float64
 6   p06                16138 non-null  float64
 7   p08                16138 non-null  int64  
 8   p11                16138 non-null  float64
 9   p12                16138 non-null  float64
 10  p13                16138 non-null  float64
 11  p14                16138 non-null  float64
 12  p15                16138 non-null  float64
 13  p17                16138 non-null  float64
 14  p18                16138 non-null  float64
 15  p19                16138 non-null  float64
 16  p20                161

# Split the data

Для того, чтобы обучающая и тестовая выборка были одинаково репрезентативны, необходимо учесть, что в исходном наборе данных содержатся результаты измерений с нескольких моторов. Это позволяет предположить, что эту выборку можно разбить на области-страты (по номеру мотора) и затем осуществить стратифицированную выборку.

In [5]:
train_set = None
test_set = None

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(origin_cycles, origin_cycles["id"]):
    train_set = origin_cycles.loc[train_index]
    test_set = origin_cycles.loc[test_index]

train_set = train_set.drop(columns=['id'])
test_set = test_set.drop(columns=['id'])

Так как представленные показатели измерений имеют разные масштабы, необходимо стандартизировать значения показателей.

In [6]:
data_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])

In [7]:
cycles = train_set.drop('cycles_to_failure', axis=1)
cycles_labels = train_set['cycles_to_failure'].copy()

In [8]:
cycles_prepared = data_pipeline.fit_transform(cycles)

# Model choosing and training

In [9]:
best_model = None
lowest_error = float('inf')

for model in [LinearRegression(), DecisionTreeRegressor(), AdaBoostRegressor(), SGDRegressor(),RandomForestRegressor(), LGBMRegressor(), XGBRegressor(), CatBoostRegressor(verbose=False, od_type='IncToDec')]:
    print(f'Current model is {model}')
    regressor = model
    regressor.fit(cycles_prepared, cycles_labels)
    mean_error, std = cross_val_evaluation(cycles_prepared, cycles_labels, regressor)
    print(f'RMSE on the train set: {mean_error}')
    if mean_error < lowest_error:
        lowest_error = mean_error
        best_model = model

    print('---' * 15)
else:
    print(f'The best model is {best_model} with the error value: {lowest_error}')

Current model is LinearRegression()
RMSE on the train set: 36.61458740815899
---------------------------------------------
Current model is DecisionTreeRegressor()
RMSE on the train set: 46.96130274163806
---------------------------------------------
Current model is AdaBoostRegressor()
RMSE on the train set: 37.42345306129563
---------------------------------------------
Current model is SGDRegressor()
RMSE on the train set: 36.687262746892074
---------------------------------------------
Current model is RandomForestRegressor()
RMSE on the train set: 32.64340897297463
---------------------------------------------
Current model is LGBMRegressor()
RMSE on the train set: 32.42144776516712
---------------------------------------------
Current model is XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None,

Наилучшим образом себя показала модель CatBoost от Яндекса. Далее имеет смысл уменьшить значения ошибки пуьтем подбора парметров.

# Parameter tuning

В [документации](https://catboost.ai/en/docs/concepts/parameter-tuning) указано, какие параметры могут быть изменены для получения более приемлемого результата:

* *Number of trees (iterations)* -- если было обнаружено переобучение на тестовой выборке, рекомендуется изменять в большую сторону и указать детектор переобучения (od_Type)
* *Learning rate* -- скорость обучения, чем она меньше, тем больше итераций требуется для обучения. Если обнаружено переобучение, рекомендуется уменьшить значение
* *Tree depth* -- глубина дерева, оптимальное значение которого находитя в диапазоне [4: 10], рекомендуется изменять значения в диапазоне [6:10]

Проверим полученную модель на тестовой выборке на факт переобучения.

In [10]:
X_test = test_set.drop('cycles_to_failure', axis=1)
y_test = test_set["cycles_to_failure"].copy()
X_test_prepared = data_pipeline.transform(X_test)

In [11]:
get_rmse(X_test_prepared, y_test, best_model)

32.20177074493797

In [32]:
get_r2(X_test_prepared, y_test, best_model)

0.722212001064028

Попробуем улучшить имеющийся результат путем поиска других параметров.

In [12]:
best_model.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'RMSE',
 'iterations': 1000,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'od_pval': 0,
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'od_type': 'IncToDec',
 'rsm': 1,
 'boost_from_average': True,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.800000011920929,
 'use_best_model': False,
 'od_wait': 20,
 'random_seed': 0,
 'depth': 6,
 'posterior_sampling': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_function': 'RMSE',
 'learning_rate': 0.06133

In [13]:
param_grid = [
     {
        'depth' : [6, 10, 12],
        'learning_rate' : [0.01, 0.0613],
        'iterations': [1000, 2000]
      }

]
grid_search = GridSearchCV(best_model, param_grid, cv=2,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(cycles_prepared, cycles_labels)

GridSearchCV(cv=2,
             estimator=<catboost.core.CatBoostRegressor object at 0x000001E15541FE80>,
             param_grid=[{'depth': [6, 10, 12], 'iterations': [1000, 2000],
                          'learning_rate': [0.01, 0.0613]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [14]:
grid_search.best_params_

{'depth': 10, 'iterations': 1000, 'learning_rate': 0.01}

In [15]:
grid_search.best_estimator_

<catboost.core.CatBoostRegressor at 0x1e155477c40>

In [16]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

32.19111992106524 {'depth': 6, 'iterations': 1000, 'learning_rate': 0.01}
32.78445374668167 {'depth': 6, 'iterations': 1000, 'learning_rate': 0.0613}
32.239367074420564 {'depth': 6, 'iterations': 2000, 'learning_rate': 0.01}
33.24256200839746 {'depth': 6, 'iterations': 2000, 'learning_rate': 0.0613}
32.18460081397331 {'depth': 10, 'iterations': 1000, 'learning_rate': 0.01}
32.77707213954453 {'depth': 10, 'iterations': 1000, 'learning_rate': 0.0613}
32.264397647737724 {'depth': 10, 'iterations': 2000, 'learning_rate': 0.01}
32.884402360806774 {'depth': 10, 'iterations': 2000, 'learning_rate': 0.0613}
32.24751775471402 {'depth': 12, 'iterations': 1000, 'learning_rate': 0.01}
32.64271150139105 {'depth': 12, 'iterations': 1000, 'learning_rate': 0.0613}
32.30039103329568 {'depth': 12, 'iterations': 2000, 'learning_rate': 0.01}
32.67810769773072 {'depth': 12, 'iterations': 2000, 'learning_rate': 0.0613}


# Evaluation on the test set

In [17]:
final_model = grid_search.best_estimator_
final_model

<catboost.core.CatBoostRegressor at 0x1e155477c40>

In [18]:
X_test = test_set.drop('cycles_to_failure', axis=1)
y_test = test_set["cycles_to_failure"].copy()
X_test_prepared = data_pipeline.transform(X_test)

In [19]:
get_rmse(X_test_prepared, y_test, final_model)

31.88204737951477

In [20]:
get_r2(X_test_prepared, y_test, final_model)

0.68796437004812

In [35]:
final_model.save_model(fname='failure_prediction.cbm')

# P.S. Проверка гипотезы

Гипотеза заключается в том, что если модель обучить на одной выборке моторов (при этом в ней сохранить параметр id), а проверить на другой, которые она раньше никогда не видела, произойдет переобучение.

In [21]:
df = pd.read_csv('data.csv', sep=',')
origin_cycles = df
origin_cycles = CyclesToFailureAdder().transform(X=origin_cycles) # добавление колонки со значением циклов до отказа и удаление лишних величин
origin_cycles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16138 entries, 0 to 16137
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 16138 non-null  int64  
 1   cycle              16138 non-null  int64  
 2   p02                16138 non-null  float64
 3   p03                16138 non-null  float64
 4   p04                16138 non-null  float64
 5   p05                16138 non-null  float64
 6   p06                16138 non-null  float64
 7   p08                16138 non-null  int64  
 8   p11                16138 non-null  float64
 9   p12                16138 non-null  float64
 10  p13                16138 non-null  float64
 11  p14                16138 non-null  float64
 12  p15                16138 non-null  float64
 13  p17                16138 non-null  float64
 14  p18                16138 non-null  float64
 15  p19                16138 non-null  float64
 16  p20                161

In [22]:
from random import sample

nums = [i for i in range (1, 81)]
train_nums = sample(nums, k=int(80 * 0.8))
test_nums = list(set(nums).difference(set(train_nums)))

train_set = origin_cycles.loc[origin_cycles['id'].isin(train_nums)]
test_set = origin_cycles.loc[origin_cycles['id'].isin(test_nums)]

print(len(nums), len(train_nums), len(test_nums))

80 64 16


In [23]:
from sklearn.utils import shuffle

train_set = shuffle(train_set)
test_set = shuffle(test_set)

In [24]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12864 entries, 4080 to 15094
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 12864 non-null  int64  
 1   cycle              12864 non-null  int64  
 2   p02                12864 non-null  float64
 3   p03                12864 non-null  float64
 4   p04                12864 non-null  float64
 5   p05                12864 non-null  float64
 6   p06                12864 non-null  float64
 7   p08                12864 non-null  int64  
 8   p11                12864 non-null  float64
 9   p12                12864 non-null  float64
 10  p13                12864 non-null  float64
 11  p14                12864 non-null  float64
 12  p15                12864 non-null  float64
 13  p17                12864 non-null  float64
 14  p18                12864 non-null  float64
 15  p19                12864 non-null  float64
 16  p20                

In [25]:
train_set['id'].unique()

array([20, 48, 75, 80, 41, 21, 14,  2,  1, 28, 32,  7, 30, 10, 50, 72, 73,
       65, 67, 16, 15, 58, 70, 33, 64, 42,  9,  3, 55, 56,  5, 51, 36, 53,
       27, 62, 69, 59, 13, 44, 37, 71, 25, 19,  4, 12, 45, 34, 11, 77, 40,
       39, 61, 18, 29, 23, 66, 49, 17, 24, 60, 68,  8,  6], dtype=int64)

In [26]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3274 entries, 9082 to 4409
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 3274 non-null   int64  
 1   cycle              3274 non-null   int64  
 2   p02                3274 non-null   float64
 3   p03                3274 non-null   float64
 4   p04                3274 non-null   float64
 5   p05                3274 non-null   float64
 6   p06                3274 non-null   float64
 7   p08                3274 non-null   int64  
 8   p11                3274 non-null   float64
 9   p12                3274 non-null   float64
 10  p13                3274 non-null   float64
 11  p14                3274 non-null   float64
 12  p15                3274 non-null   float64
 13  p17                3274 non-null   float64
 14  p18                3274 non-null   float64
 15  p19                3274 non-null   float64
 16  p20                32

In [27]:
test_set['id'].unique()

array([47, 46, 43, 35, 78, 57, 54, 76, 26, 52, 63, 22, 38, 79, 31, 74],
      dtype=int64)

In [28]:
data_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])
cycles = train_set.drop('cycles_to_failure', axis=1)
cycles_labels = train_set['cycles_to_failure'].copy()
cycles_prepared = data_pipeline.fit_transform(cycles)

In [29]:
best_model = None
lowest_error = float('inf')

for model in [LinearRegression(), DecisionTreeRegressor(), AdaBoostRegressor(), SGDRegressor(),RandomForestRegressor(), LGBMRegressor(), XGBRegressor(), CatBoostRegressor(verbose=False, od_type='IncToDec')]:
    print(f'Current model is {model}')
    regressor = model
    regressor.fit(cycles_prepared, cycles_labels)
    mean_error, std = cross_val_evaluation(cycles_prepared, cycles_labels, regressor)
    print(f'RMSE on the train set: {mean_error}')
    if mean_error < lowest_error:
        lowest_error = mean_error
        best_model = model

    print('---' * 15)
else:
    print(f'The best model is {best_model} with the error value: {lowest_error}')

Current model is LinearRegression()
RMSE on the train set: 38.244882174778866
---------------------------------------------
Current model is DecisionTreeRegressor()
RMSE on the train set: 21.19005570772807
---------------------------------------------
Current model is AdaBoostRegressor()
RMSE on the train set: 34.53787298169793
---------------------------------------------
Current model is SGDRegressor()
RMSE on the train set: 38.3075249633565
---------------------------------------------
Current model is RandomForestRegressor()
RMSE on the train set: 13.550285578588571
---------------------------------------------
Current model is LGBMRegressor()
RMSE on the train set: 8.6819547288274
---------------------------------------------
Current model is XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, g

In [30]:
X_test = test_set.drop('cycles_to_failure', axis=1)
y_test = test_set["cycles_to_failure"].copy()
X_test_prepared = data_pipeline.transform(X_test)

In [31]:
get_rmse(X_test_prepared, y_test, best_model)

30.486477738623808

Как и ожидалось, произошло значительное переобучение модели.