<a href="https://colab.research.google.com/github/evk2103/prediction_of_car_prices/blob/main/Selection_of_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Подержанные автомобили

Датасет содержит информацию о характеристиках и ценах подержанных автомобилей в некоторой стране

## Импорт библиотек, константы

In [None]:
!pip install category_encoders -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.pipeline import Pipeline

from sklearn.metrics import r2_score

from category_encoders.target_encoder import TargetEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder


In [None]:
# Shared seed passed as `random_state` wherever this notebook asks for reproducible randomness.
RANDOM_STATE = 42

In [None]:
# URL of the CSV file that is read into `df`.
DATASET_PATH = 'https://raw.githubusercontent.com/evk2103/prediction_of_car_prices/main/clean_cars.csv'

In [None]:
# Load the dataset
df = pd.read_csv(DATASET_PATH)

In [None]:
# Column overview: names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6769 entries, 0 to 6768
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6769 non-null   object 
 1   year           6769 non-null   int64  
 2   selling_price  6769 non-null   int64  
 3   km_driven      6769 non-null   int64  
 4   fuel           6769 non-null   object 
 5   seller_type    6769 non-null   object 
 6   transmission   6769 non-null   object 
 7   owner          6769 non-null   object 
 8   mileage        6769 non-null   float64
 9   engine         6769 non-null   int64  
 10  max_power      6769 non-null   float64
 11  seats          6769 non-null   int64  
 12  torque_1       6769 non-null   float64
 13  torque_2       6769 non-null   int64  
dtypes: float64(3), int64(6), object(5)
memory usage: 740.5+ KB


In [None]:
# Three random rows from the dataset; seeded so the preview is the same on every run
df.sample(3, random_state=RANDOM_STATE)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats,torque_1,torque_2
6344,Renault KWID Climber 1.0 MT BSIV,2019,300000,35000,Petrol,Individual,Manual,First Owner,23.01,999,67.0,5,91.0,4250
1067,Chevrolet Optra Magnum 1.6 LT Petrol,2009,250000,80000,Petrol,Individual,Manual,Second Owner,12.9,1598,102.5,5,148.0,4000
4794,Hyundai Sonata 2.4 GDi MT,2012,525000,70000,Petrol,Individual,Manual,Second Owner,13.44,2359,198.25,5,250.0,4250


In [None]:
# Feature matrix: every column except the target
X = df.drop(columns=['selling_price'])
# Target: the selling price
y = df['selling_price']

In [None]:
# The models are trained on log(1 + price); predictions are mapped back with np.expm1 before scoring.
y_log = np.log1p(y)

In [None]:
# List the distinct `owner` categories
X.owner.unique()

array(['First Owner', 'Second Owner', 'Third Owner',
       'Fourth & Above Owner', 'Test Drive Car'], dtype=object)

In [None]:
# Ordinal-encode the `owner` column: each category becomes an integer rank.
owner = {'Test Drive Car': 0, 'First Owner': 1, 'Second Owner': 2, 'Third Owner': 3, 'Fourth & Above Owner': 4}
# Read the labels from the untouched `df` (same index as X) rather than from X itself,
# so re-running this cell does not fail with a KeyError on already-encoded integers.
X['owner'] = df['owner'].apply(lambda x: owner[x])

In [None]:
# One-hot encode the fuel, seller_type and transmission columns (drop_first=True omits the first level of each)
X = pd.get_dummies(X, columns=['fuel', 'seller_type', 'transmission'], drop_first=True)

In [None]:
# Three random rows of the encoded feature matrix; seeded so the preview is the same on every run
X.sample(3, random_state=RANDOM_STATE)

Unnamed: 0,name,year,km_driven,owner,mileage,engine,max_power,seats,torque_1,torque_2,fuel_Diesel,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual
745,Tata Indigo CS LX (TDI) BS III,2012,120000,2,19.09,1396,69.0,5,135.0,2500,1,0,0,1,0,1
41,Toyota Innova 2.5 G (Diesel) 7 Seater BS IV,2013,99000,1,12.99,2494,100.0,7,200.0,2400,1,0,0,0,0,1
4497,Chevrolet Optra Magnum 1.6 LT Petrol,2010,60000,2,12.9,1598,102.5,5,148.0,4000,0,0,1,1,0,1


In [None]:
# Hold out 25% of the rows as a test set; the target is the log-transformed price (y_log).
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y_log, test_size=0.25, random_state=RANDOM_STATE)

# Подбор модели

In [None]:
# Columns passed as `cols` to the category encoders in every pipeline below (only `name`).
cat_cols = ['name']

In [None]:
# Nine LinearRegression pipelines that differ only in preprocessing:
#   p0-p2: TargetEncoder (default, smoothing=1, smoothing=100) + StandardScaler
#   p3-p5: TargetEncoder (smoothing=1, 10, 100) + MinMaxScaler
#   p6-p7: LeaveOneOutEncoder + StandardScaler / MinMaxScaler
#   p8:    LeaveOneOutEncoder + degree-2 PolynomialFeatures + MinMaxScaler
# Each row of the table is (encoder, extra steps inserted after the encoder, scaler).
p0, p1, p2, p3, p4, p5, p6, p7, p8 = (
    Pipeline([('encoder_', encoder), *extra_steps, ('scaler_', scaler), ('model_', LinearRegression())])
    for encoder, extra_steps, scaler in [
        (TargetEncoder(cols=cat_cols), [], StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=1), [], StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=100), [], StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=1), [], MinMaxScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=10), [], MinMaxScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=100), [], MinMaxScaler()),
        (LeaveOneOutEncoder(cols=cat_cols), [], StandardScaler()),
        (LeaveOneOutEncoder(cols=cat_cols), [], MinMaxScaler()),
        (LeaveOneOutEncoder(cols=cat_cols), [('pf_', PolynomialFeatures(degree=2))], MinMaxScaler()),
    ]
)

In [None]:
# Fit each LinearRegression pipeline on the training split and print its test R^2.
# Scores are computed on the original price scale: expm1 inverts the log1p applied to the target.
test = np.expm1(ytest)
for idx, pipe in enumerate([p0, p1, p2, p3, p4, p5, p6, p7, p8]):
    pipe.fit(Xtrain, ytrain)
    pred = np.expm1(pipe.predict(Xtest))
    print(f'p{idx}', r2_score(test, pred))

p0 0.8850877387662601
p1 0.8852332385710323
p2 0.8795513366763279
p3 0.8852332385710329
p4 0.8850877387662605
p5 0.879551336676327
p6 0.8922921776984953
p7 0.892292177698497
p8 0.9246038834267057


In [None]:
# Nine Ridge pipelines that differ only in preprocessing:
#   p10-p12: TargetEncoder (default, smoothing=1, smoothing=100) + StandardScaler
#   p13-p15: TargetEncoder (smoothing=1, 10, 100) + MinMaxScaler
#   p16-p17: LeaveOneOutEncoder + StandardScaler / MinMaxScaler
#   p18:     LeaveOneOutEncoder + degree-2 PolynomialFeatures + StandardScaler
# Each row of the table is (encoder, extra steps inserted after the encoder, scaler).
p10, p11, p12, p13, p14, p15, p16, p17, p18 = (
    Pipeline([('encoder_', encoder), *extra_steps, ('scaler_', scaler), ('model_', Ridge())])
    for encoder, extra_steps, scaler in [
        (TargetEncoder(cols=cat_cols), [], StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=1), [], StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=100), [], StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=1), [], MinMaxScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=10), [], MinMaxScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=100), [], MinMaxScaler()),
        (LeaveOneOutEncoder(cols=cat_cols), [], StandardScaler()),
        (LeaveOneOutEncoder(cols=cat_cols), [], MinMaxScaler()),
        (LeaveOneOutEncoder(cols=cat_cols), [('pf_', PolynomialFeatures(degree=2))], StandardScaler()),
    ]
)

In [None]:
# Fit each Ridge pipeline on the training split and print its test R^2.
# Scores are computed on the original price scale: expm1 inverts the log1p applied to the target.
test = np.expm1(ytest)
for idx, pipe in enumerate([p10, p11, p12, p13, p14, p15, p16, p17, p18], start=10):
    pipe.fit(Xtrain, ytrain)
    pred = np.expm1(pipe.predict(Xtest))
    print(f'p{idx}', r2_score(test, pred))

p10 0.885095360133078
p11 0.8852231586527526
p12 0.8797806473883044
p13 0.8836955729899518
p14 0.8878577134447536
p15 0.887268314018754
p16 0.8922179127277438
p17 0.889224165350222
p18 0.9194517766034083


In [None]:
# Nine SVR pipelines that differ only in preprocessing:
#   p20-p22: TargetEncoder (default, smoothing=1, smoothing=100) + StandardScaler
#   p23-p25: TargetEncoder (smoothing=1, 10, 100) + MinMaxScaler
#   p26-p27: LeaveOneOutEncoder + StandardScaler / MinMaxScaler
#   p28:     LeaveOneOutEncoder + degree-2 PolynomialFeatures + StandardScaler
# Each row of the table is (encoder, extra steps inserted after the encoder, scaler).
p20, p21, p22, p23, p24, p25, p26, p27, p28 = (
    Pipeline([('encoder_', encoder), *extra_steps, ('scaler_', scaler), ('model_', SVR())])
    for encoder, extra_steps, scaler in [
        (TargetEncoder(cols=cat_cols), [], StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=1), [], StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=100), [], StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=1), [], MinMaxScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=10), [], MinMaxScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=100), [], MinMaxScaler()),
        (LeaveOneOutEncoder(cols=cat_cols), [], StandardScaler()),
        (LeaveOneOutEncoder(cols=cat_cols), [], MinMaxScaler()),
        (LeaveOneOutEncoder(cols=cat_cols), [('pf_', PolynomialFeatures(degree=2))], StandardScaler()),
    ]
)

In [None]:
# Fit each SVR pipeline on the training split and print its test R^2.
# Scores are computed on the original price scale: expm1 inverts the log1p applied to the target.
test = np.expm1(ytest)
for idx, pipe in enumerate([p20, p21, p22, p23, p24, p25, p26, p27, p28], start=20):
    pipe.fit(Xtrain, ytrain)
    pred = np.expm1(pipe.predict(Xtest))
    print(f'p{idx}', r2_score(test, pred))

p20 0.9329773702295554
p21 0.9421851702687488
p22 0.9028121466847722
p23 0.9281103158390008
p24 0.9262788802360423
p25 0.8982843440027111
p26 0.943995943361664
p27 0.925336824345091
p28 0.9346871292065217


In [None]:
# Nine KNeighborsRegressor pipelines that differ only in preprocessing:
#   p30-p32: TargetEncoder (default, smoothing=1, smoothing=100) + StandardScaler
#   p33-p35: TargetEncoder (smoothing=1, 10, 100) + MinMaxScaler
#   p36-p37: LeaveOneOutEncoder + StandardScaler / MinMaxScaler
#   p38:     TargetEncoder (default) + degree-2 PolynomialFeatures + StandardScaler
# Each row of the table is (encoder, extra steps inserted after the encoder, scaler).
p30, p31, p32, p33, p34, p35, p36, p37, p38 = (
    Pipeline([('encoder_', encoder), *extra_steps, ('scaler_', scaler), ('model_', KNeighborsRegressor())])
    for encoder, extra_steps, scaler in [
        (TargetEncoder(cols=cat_cols), [], StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=1), [], StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=100), [], StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=1), [], MinMaxScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=10), [], MinMaxScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=100), [], MinMaxScaler()),
        (LeaveOneOutEncoder(cols=cat_cols), [], StandardScaler()),
        (LeaveOneOutEncoder(cols=cat_cols), [], MinMaxScaler()),
        (TargetEncoder(cols=cat_cols), [('pf_', PolynomialFeatures(degree=2))], StandardScaler()),
    ]
)

In [None]:
# Fit each KNeighborsRegressor pipeline on the training split and print its test R^2.
# Scores are computed on the original price scale: expm1 inverts the log1p applied to the target.
test = np.expm1(ytest)
for idx, pipe in enumerate([p30, p31, p32, p33, p34, p35, p36, p37, p38], start=30):
    pipe.fit(Xtrain, ytrain)
    pred = np.expm1(pipe.predict(Xtest))
    print(f'p{idx}', r2_score(test, pred))

p30 0.9278698715845578
p31 0.9270174343166939
p32 0.9317957798507289
p33 0.9250149323441588
p34 0.9254165673683419
p35 0.9234739739097139
p36 0.9148460491645939
p37 0.9150650107400459
p38 0.9208314931207646


In [None]:
# Nine RandomForestRegressor pipelines that differ only in preprocessing:
#   p40-p42: TargetEncoder (default, smoothing=1, smoothing=100) + StandardScaler
#   p43-p45: TargetEncoder (smoothing=1, 10, 100) + MinMaxScaler
#   p46-p47: LeaveOneOutEncoder + StandardScaler / MinMaxScaler
#   p48:     TargetEncoder (smoothing=1) + degree-2 PolynomialFeatures + MinMaxScaler
# Every forest is seeded with RANDOM_STATE: an unseeded forest gives a different score on
# each run, so differences between these configurations could not be told apart from noise.
# Each row of the table is (encoder, extra steps inserted after the encoder, scaler).
p40, p41, p42, p43, p44, p45, p46, p47, p48 = (
    Pipeline([
        ('encoder_', encoder),
        *extra_steps,
        ('scaler_', scaler),
        ('model_', RandomForestRegressor(random_state=RANDOM_STATE)),
    ])
    for encoder, extra_steps, scaler in [
        (TargetEncoder(cols=cat_cols), [], StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=1), [], StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=100), [], StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=1), [], MinMaxScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=10), [], MinMaxScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=100), [], MinMaxScaler()),
        (LeaveOneOutEncoder(cols=cat_cols), [], StandardScaler()),
        (LeaveOneOutEncoder(cols=cat_cols), [], MinMaxScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=1), [('pf_', PolynomialFeatures(degree=2))], MinMaxScaler()),
    ]
)

In [None]:
# Fit each RandomForestRegressor pipeline on the training split and print its test R^2.
# Scores are computed on the original price scale: expm1 inverts the log1p applied to the target.
test = np.expm1(ytest)
for idx, pipe in enumerate([p40, p41, p42, p43, p44, p45, p46, p47, p48], start=40):
    pipe.fit(Xtrain, ytrain)
    pred = np.expm1(pipe.predict(Xtest))
    print(f'p{idx}', r2_score(test, pred))

p40 0.8914411105599639
p41 0.9731492332905674
p42 0.8882997420786055
p43 0.9740202250805873
p44 0.8903505399062325
p45 0.8886069333516519
p46 0.9627179231241763
p47 0.9621979595814528
p48 0.9706711044685429


In [None]:
# Eight GradientBoostingRegressor pipelines that differ only in preprocessing:
#   p50-p52: TargetEncoder (default, smoothing=1, smoothing=100) + StandardScaler
#   p53-p55: TargetEncoder (smoothing=1, 10, 100) + MinMaxScaler
#   p56-p57: LeaveOneOutEncoder + StandardScaler / MinMaxScaler
# Every booster is seeded with RANDOM_STATE so the printed scores are reproducible
# and the configurations are compared under identical randomness.
# Each row of the table is (encoder, scaler).
p50, p51, p52, p53, p54, p55, p56, p57 = (
    Pipeline([
        ('encoder_', encoder),
        ('scaler_', scaler),
        ('model_', GradientBoostingRegressor(random_state=RANDOM_STATE)),
    ])
    for encoder, scaler in [
        (TargetEncoder(cols=cat_cols), StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=1), StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=100), StandardScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=1), MinMaxScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=10), MinMaxScaler()),
        (TargetEncoder(cols=cat_cols, smoothing=100), MinMaxScaler()),
        (LeaveOneOutEncoder(cols=cat_cols), StandardScaler()),
        (LeaveOneOutEncoder(cols=cat_cols), MinMaxScaler()),
    ]
)


In [None]:
# Fit each GradientBoostingRegressor pipeline on the training split and print its test R^2.
# Scores are computed on the original price scale: expm1 inverts the log1p applied to the target.
test = np.expm1(ytest)
for idx, pipe in enumerate([p50, p51, p52, p53, p54, p55, p56, p57], start=50):
    pipe.fit(Xtrain, ytrain)
    pred = np.expm1(pipe.predict(Xtest))
    print(f'p{idx}', r2_score(test, pred))

p50 0.8951817542160954
p51 0.9528691318941845
p52 0.8894216153609452
p53 0.951943035906946
p54 0.8954192928043735
p55 0.8894394859867611
p56 0.9496288759132577
p57 0.9496467776537714


Сравним лучшие результаты:

* RandomForestRegressor: 0.9740202250805873
* GradientBoostingRegressor: 0.9528691318941845
* SVR: 0.943995943361664
* KNeighborsRegressor: 0.9317957798507289
* LinearRegression: 0.9246038834267057
* Ridge: 0.9194517766034083

Лучшие результаты у RandomForestRegressor, GradientBoostingRegressor и SVR. К этим моделям будем подбирать оптимальные гиперпараметры

# Улучшение модели

In [None]:
# Base SVR pipeline for the grid search; the final step is named 'svr' so that
# its hyperparameters can be addressed as 'svr__<param>'.
p_svr = Pipeline(steps=[
    ('encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('scaler_', StandardScaler()),
    ('svr', SVR()),
])

In [None]:
# Grid over the kernel type and the regularisation strength C.
# 'precomputed' is intentionally left out: that kernel expects a square kernel matrix as input,
# so every fit on the ordinary feature matrix raises ValueError and only adds NaN scores and warnings.
params = {'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'svr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# 3-fold cross-validated search on the training split, scored by R^2, using all CPU cores.
gs = GridSearchCV(p_svr, params, cv=3, scoring='r2', verbose=0, n_jobs=-1)

gs.fit(Xtrain, ytrain)

print(gs.best_score_)
print(gs.best_params_)

21 fits failed out of a total of 105.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
21 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py", line 217, in fit
    raise ValueError(
ValueError: Precomputed matrix must be a square matrix. Input is a 3384x16 matrix.

             nan  8.77175557e-01  7.59920848e-01  7.18755878e-01
  6.9412747

0.916337441801371
{'svr__C': 10, 'svr__kernel': 'rbf'}


In [None]:
# SVR pipeline with the hyperparameters selected by the grid search (kernel='rbf', C=10).
p_svr_best = Pipeline(steps=[
    ('encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('scaler_', StandardScaler()),
    ('svr', SVR(kernel='rbf', C=10)),
])

In [None]:
# Train the tuned SVR pipeline and print its test R^2 on the original price scale
# (expm1 inverts the log1p applied to the target).
p_svr_best.fit(Xtrain, ytrain)
test = np.expm1(ytest)
pred = np.expm1(p_svr_best.predict(Xtest))
print(r2_score(test, pred))

0.9594770517808918


Подбор гиперпараметров для SVR позволил повысить качество на 0,0155 (R² на тестовой выборке: с 0,9440 до 0,9595).

In [None]:
!pip install optuna -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m390.6/390.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import optuna

In [None]:
def objective_rf(trial):
    """Optuna objective for the random-forest pipeline.

    Samples `max_depth` (2-20), the split `criterion` and `n_estimators` (20-1000)
    from `trial`, builds a TargetEncoder(smoothing=1) -> MinMaxScaler ->
    RandomForestRegressor pipeline and returns its mean 3-fold cross-validated R^2
    on the training split.

    Named `objective_rf` because it tunes the random forest; the gradient-boosting
    search defines its own `objective_gb`.
    """
    max_depth = trial.suggest_int("max_depth", 2, 20)
    criterion = trial.suggest_categorical("criterion", ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'])
    n_estimators = trial.suggest_int("n_estimators", 20, 1000)

    p_rf = Pipeline([
        ('encoder_',TargetEncoder(cols=cat_cols, smoothing=1)),
        ('scaler_', MinMaxScaler()),
        ('rf', RandomForestRegressor(max_depth=max_depth, criterion=criterion, n_estimators=n_estimators, random_state=RANDOM_STATE))
        ])

    score = cross_val_score(p_rf, Xtrain, ytrain, cv=3, scoring='r2', n_jobs=-1).mean()
    return score

# Seed the sampler so the sequence of trials, and therefore study.best_params, is reproducible.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
study.optimize(objective_rf, n_trials=50)

[I 2023-06-28 18:54:17,773] A new study created in memory with name: no-name-74e1bf3f-e035-423a-bcb6-de6319c7c52a
[I 2023-06-28 18:54:47,292] Trial 0 finished with value: 0.934622070949377 and parameters: {'max_depth': 13, 'criterion': 'squared_error', 'n_estimators': 784}. Best is trial 0 with value: 0.934622070949377.
[I 2023-06-28 18:54:55,738] Trial 1 finished with value: 0.9330215836372155 and parameters: {'max_depth': 11, 'criterion': 'friedman_mse', 'n_estimators': 286}. Best is trial 0 with value: 0.934622070949377.
[I 2023-06-28 18:54:58,927] Trial 2 finished with value: 0.8860656074804765 and parameters: {'max_depth': 5, 'criterion': 'squared_error', 'n_estimators': 214}. Best is trial 0 with value: 0.934622070949377.
[I 2023-06-28 18:58:00,826] Trial 3 finished with value: 0.9327917326923115 and parameters: {'max_depth': 11, 'criterion': 'absolute_error', 'n_estimators': 322}. Best is trial 0 with value: 0.934622070949377.
[I 2023-06-28 18:58:01,752] Trial 4 finished with va

In [None]:
# Hyperparameters of the best trial in the current `study`
study.best_params

{'max_depth': 17, 'criterion': 'absolute_error', 'n_estimators': 998}

In [None]:
# Random-forest pipeline with the hyperparameters reported by study.best_params.
p_rf_best = Pipeline(steps=[
    ('encoder_', TargetEncoder(cols=cat_cols, smoothing=1)),
    ('scaler_', MinMaxScaler()),
    ('rf', RandomForestRegressor(
        n_estimators=998,
        max_depth=17,
        criterion='absolute_error',
        random_state=RANDOM_STATE,
    )),
])

In [None]:
# Train the tuned random-forest pipeline and print its test R^2 on the original price scale
# (expm1 inverts the log1p applied to the target).
p_rf_best.fit(Xtrain, ytrain)
test = np.expm1(ytest)
pred = np.expm1(p_rf_best.predict(Xtest))
print(r2_score(test, pred))

0.9748775572278443


Подбор гиперпараметров для RandomForestRegressor повысил метрику R^2 всего лишь на 0,0009

In [None]:
def objective_gb(trial):
    """Optuna objective for the gradient-boosting pipeline.

    Samples `max_depth` (2-20) and `n_estimators` (20-1000) from `trial`, builds a
    TargetEncoder(smoothing=1) -> StandardScaler -> GradientBoostingRegressor pipeline
    and returns its mean 3-fold cross-validated R^2 on the training split.
    """
    max_depth = trial.suggest_int("max_depth", 2, 20)
    n_estimators = trial.suggest_int("n_estimators", 20, 1000)

    p_gb = Pipeline([
        ('encoder_',TargetEncoder(cols=cat_cols, smoothing=1)),
        ('scaler_', StandardScaler()),
        # Seeded like the final p_gb_best model, so a trial's score depends only on its hyperparameters.
        ('gb', GradientBoostingRegressor(max_depth=max_depth, n_estimators=n_estimators, random_state=RANDOM_STATE))
        ])

    score = cross_val_score(p_gb, Xtrain, ytrain, cv=3, scoring='r2', n_jobs=-1).mean()
    return score

# Seed the sampler so the sequence of trials, and therefore study.best_params, is reproducible.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
study.optimize(objective_gb, n_trials=50)

[I 2023-06-28 18:32:39,006] A new study created in memory with name: no-name-e9f143ef-007f-4833-9a06-5b77e124a100
[I 2023-06-28 18:32:53,562] Trial 0 finished with value: 0.9382215986088004 and parameters: {'max_depth': 3, 'n_estimators': 920}. Best is trial 0 with value: 0.9382215986088004.
[I 2023-06-28 18:32:55,851] Trial 1 finished with value: 0.9219905640189271 and parameters: {'max_depth': 2, 'n_estimators': 251}. Best is trial 0 with value: 0.9382215986088004.
[I 2023-06-28 18:33:14,508] Trial 2 finished with value: 0.9299934487675175 and parameters: {'max_depth': 8, 'n_estimators': 581}. Best is trial 0 with value: 0.9382215986088004.
[I 2023-06-28 18:33:41,712] Trial 3 finished with value: 0.922234034695745 and parameters: {'max_depth': 11, 'n_estimators': 615}. Best is trial 0 with value: 0.9382215986088004.
[I 2023-06-28 18:33:53,938] Trial 4 finished with value: 0.9371830281765364 and parameters: {'max_depth': 5, 'n_estimators': 553}. Best is trial 0 with value: 0.938221598

In [None]:
# Gradient-boosting pipeline with the hyperparameters chosen for the final model
# (presumably the best trial of the Optuna study; its log is truncated in the saved output).
p_gb_best = Pipeline(steps=[
    ('encoder_', TargetEncoder(cols=cat_cols, smoothing=1)),
    ('scaler_', StandardScaler()),
    ('gb', GradientBoostingRegressor(
        n_estimators=561,
        max_depth=4,
        random_state=RANDOM_STATE,
    )),
])

In [None]:
# Train the tuned gradient-boosting pipeline and print its test R^2 on the original price scale
# (expm1 inverts the log1p applied to the target).
p_gb_best.fit(Xtrain, ytrain)
test = np.expm1(ytest)
pred = np.expm1(p_gb_best.predict(Xtest))
print(r2_score(test, pred))

0.9790927602162188


Подбор гиперпараметров для GradientBoostingRegressor повысил метрику качества на 0,0262.


Из трех моделей наилучшая метрика у GradientBoostingRegressor.

Проверим, даст ли повышение качества сочетание моделей.

In [None]:
# Test-set predictions of the three tuned pipelines, converted back to the price scale with expm1.
pred_rf, pred_gb, pred_svr = (
    np.expm1(fitted.predict(Xtest))
    for fitted in (p_rf_best, p_gb_best, p_svr_best)
)

In [None]:
# Blend random forest and gradient boosting: weight w on the forest, (1 - w) on the booster,
# for w = 0.1 ... 0.9, printing the test R^2 of each blend.
for raw_weight in np.arange(0.1, 1.0, 0.1):
    weight = round(raw_weight, 1)  # remove floating-point noise accumulated by arange
    pred = weight * pred_rf + (1 - weight) * pred_gb
    print(weight, '--', r2_score(test, pred))

0.1 -- 0.9795196916390837
0.2 -- 0.9797592589744456
0.3 -- 0.9798114622223046
0.4 -- 0.9796763013826606
0.5 -- 0.9793537764555136
0.6 -- 0.9788438874408637
0.7 -- 0.9781466343387107
0.8 -- 0.9772620171490549
0.9 -- 0.976190035871896


In [None]:
# Blend SVR and gradient boosting: weight w on the SVR, (1 - w) on the booster,
# for w = 0.1 ... 0.9, printing the test R^2 of each blend.
for raw_weight in np.arange(0.1, 1.0, 0.1):
    weight = round(raw_weight, 1)  # remove floating-point noise accumulated by arange
    pred = weight * pred_svr + (1 - weight) * pred_gb
    print(weight, '--', r2_score(test, pred))

0.1 -- 0.9790955692174361
0.2 -- 0.9787716314584238
0.3 -- 0.9779837193214829
0.4 -- 0.9767318328066131
0.5 -- 0.9750159719138147
0.6 -- 0.9728361366430875
0.7 -- 0.9701923269944317
0.8 -- 0.9670845429678471
0.9 -- 0.9635127845633338


In [None]:
# Blend SVR and random forest: weight w on the SVR, (1 - w) on the forest,
# for w = 0.1 ... 0.9, printing the test R^2 of each blend.
for raw_weight in np.arange(0.1, 1.0, 0.1):
    weight = round(raw_weight, 1)  # remove floating-point noise accumulated by arange
    pred = weight * pred_svr + (1 - weight) * pred_rf
    print(weight, '--', r2_score(test, pred))

0.1 -- 0.9758572830232164
0.2 -- 0.9762345518972838
0.3 -- 0.9760624971294365
0.4 -- 0.9753411187196744
0.5 -- 0.9740704166679975
0.6 -- 0.9722503909744059
0.7 -- 0.9698810416388995
0.8 -- 0.9669623686614783
0.9 -- 0.9634943720421425


Максимальную прибавку дало сочетание градиентного бустинга и случайного леса, но она незначительная: 0,0007.

Для итоговой модели будем использовать GradientBoostingRegressor с гиперпараметрами: max_depth=4, n_estimators=561.