In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns

In [2]:
%pip install gdown



In [5]:
CARS_FILE_ID = '1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI'
RANDOM_STATE = 42

random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

!gdown --id {CARS_FILE_ID}

Downloading...
From (original): https://drive.google.com/uc?id=1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI
From (redirected): https://drive.google.com/uc?id=1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI&confirm=t&uuid=1bcc1dde-3644-4036-8071-a720fb80b589
To: /content/dataset.csv
100% 1.01G/1.01G [00:10<00:00, 95.2MB/s]


In [6]:
df = pd.read_csv('dataset.csv')
print(df.shape)

(604047, 24)


In [7]:
df_with_na_column = df[df['engine_displacement'].isna()]
df_with_na_column

Unnamed: 0,production_year,mileage,condition,owners_number,pts_original,horse_power,accidents_resolution,region,seller_type,brand,...,engine_displacement,engine_power,fuel_rate,steering_wheel,price,price_segment,tags,auto_class,equipment,complectation_available_options
210905,2003,175000,CONDITION_OK,0,True,,,Чебаркуль,PRIVATE,Hyundai,...,,,,LEFT,380000,MEDIUM,available_for_checkup;pts_original;real_photo;...,,seats-5,


In [8]:
df.drop(210905, inplace=True)

In [9]:
df['fuel_rate'].fillna(df['fuel_rate'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['fuel_rate'].fillna(df['fuel_rate'].median(), inplace=True)


In [10]:
df['pts_original'].fillna(True, inplace=True)
df['accidents_resolution'].fillna('OK', inplace=True)
df['auto_class'].fillna('NOT SPECIFIED', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['pts_original'].fillna(True, inplace=True)
  df['pts_original'].fillna(True, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['accidents_resolution'].fillna('OK', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because 

In [11]:
df.drop('horse_power', axis=1, inplace=True)

In [12]:
def get_unique_values(series, sep=';'):
    uniq = set()
    for cell in series.dropna():
        for piece in cell.split(sep):
            s = piece.strip()
            if s:
                uniq.add(s)
    return np.array(list(uniq))

all_tags = get_unique_values(df['tags'])
all_options = get_unique_values(df['complectation_available_options'])
all_equipments = get_unique_values(df['equipment'])

In [13]:
def create_binary_features(df, column, unique_values, sep=';'):
    return (
        df[column]
        .str.get_dummies(sep=sep)
        .reindex(columns=unique_values, fill_value=0)
        .astype('int8')
    )

tags_dummies = create_binary_features(df, 'tags', all_tags)
options_dummies = create_binary_features(df, 'complectation_available_options', all_options)
equipment_dummies = create_binary_features(df, 'equipment', all_equipments)

In [14]:
tags_dummies = tags_dummies.astype('Sparse[int]')
options_dummies = options_dummies.astype('Sparse[int]')
equipment_dummies = equipment_dummies.astype('Sparse[int]')

In [15]:
full_df = pd.concat([
    df.drop(columns=['tags', 'complectation_available_options', 'equipment']),
    tags_dummies,
    options_dummies.drop(columns=['condition']),
    equipment_dummies.drop(columns=['condition'])
], axis=1)

In [16]:
duplicated_cols = full_df.columns[full_df.columns.duplicated()].unique()
for col in duplicated_cols:
    full_df[col] = full_df[col].astype("int8")

In [17]:
import numpy as np

names = full_df.columns[full_df.columns.duplicated()].unique()
new_cols = {}

for name in names:
    cols_i = [col for col in full_df.columns if col == name]
    sub_df = full_df.loc[:, cols_i].astype('int8')   # DataFrame shape (n_rows, k)

    # берём максимум по строкам → Series shape (n_rows,)
    max_series = sub_df.max(axis=1).astype('int8')

    new_cols[name] = max_series

    # удаляем *все* старые колонки с этим именем
    full_df.drop(columns=cols_i, inplace=True)

# теперь присваиваем новые колонки–Series (они одномерны!)
for name, s in new_cols.items():
    full_df[name] = s


  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s


In [18]:
full_df.sample(10)

Unnamed: 0,production_year,mileage,condition,owners_number,accidents_resolution,region,seller_type,brand,model,body_type,...,rcta,body-kit,auto-mirrors,multi-wheel,ptf,combo-interior,drl,wheel-configuration2,17-inch-wheels,wheel-memory
436947,2024,0,CONDITION_OK,0,OK,Санкт-Петербург,COMMERCIAL,Skoda,Karoq,ALLROAD_5_DOORS,...,0,0,1,1,0,1,1,1,0,0
329729,2020,40460,CONDITION_OK,1,ERROR,Москва,PRIVATE,Mercedes-Benz,E-Класс,COUPE_HARDTOP,...,0,0,1,1,0,0,0,1,0,0
343497,2000,380000,CONDITION_OK,4,OK,Москва,PRIVATE,Mercedes-Benz,M-Класс,ALLROAD_5_DOORS,...,0,0,0,0,0,0,0,0,0,0
439784,2019,170000,CONDITION_OK,1,ERROR,Москва,PRIVATE,Skoda,Octavia,LIFTBACK,...,0,0,1,1,1,0,1,1,1,0
550845,2023,107,CONDITION_OK,1,OK,Краснодар,PRIVATE,Lada (ВАЗ),Largus,WAGON_5_DOORS,...,0,0,0,1,0,0,1,0,0,0
431808,2015,110727,CONDITION_OK,1,OK,Уфа,COMMERCIAL,Renault,Sandero,HATCHBACK_5_DOORS,...,0,0,0,0,1,0,0,0,0,0
167063,2024,0,CONDITION_OK,0,OK,Химки,COMMERCIAL,Genesis,GV80 Coupe,ALLROAD_5_DOORS,...,1,0,1,1,0,0,1,1,0,0
391523,2022,13000,CONDITION_OK,0,OK,Владивосток,COMMERCIAL,Nissan,X-Trail,ALLROAD_5_DOORS,...,0,0,0,0,0,0,0,0,0,0
210052,2024,100,CONDITION_OK,1,OK,Новосибирск,PRIVATE,Hyundai,Elantra,SEDAN,...,0,0,0,0,0,0,0,0,0,0
46816,2024,34,CONDITION_OK,1,OK,Москва,COMMERCIAL,BMW,X5,ALLROAD_5_DOORS,...,0,0,0,0,0,0,0,0,0,0


In [19]:
import pandas as pd
import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

numeric_df = full_df.select_dtypes(include=[np.number])
numeric_df = numeric_df.loc[:, numeric_df.nunique() > 1]

corr = numeric_df.corr().abs().fillna(0)

dist = 1 - corr

dist = (dist + dist.T) / 2
np.fill_diagonal(dist.values, 0)

dist = dist.clip(lower=0)

dist_vect = squareform(dist)
Z = hierarchy.linkage(dist_vect, method='average')

max_d = 0.4
clusters = hierarchy.fcluster(Z, t=max_d, criterion='distance')

cluster_df = pd.DataFrame({'feature': numeric_df.columns, 'cluster': clusters})
to_keep = []
for c in cluster_df['cluster'].unique():
    members = cluster_df.loc[cluster_df['cluster'] == c, 'feature']
    avg_corr = corr.loc[members, members].mean().sort_values(ascending=False)
    to_keep.append(avg_corr.index[0])
to_keep = list(dict.fromkeys(to_keep))

to_drop = [col for col in numeric_df.columns if col not in to_keep]
df_reduced = full_df.drop(columns=to_drop)

print(f"Удалено числовых коррелирующих признаков: {len(to_drop)}")
print(f"Осталось признаков в df_reduced: {df_reduced.shape[1]} из {full_df.shape[1]}")

Удалено числовых коррелирующих признаков: 115
Осталось признаков в df_reduced: 301 из 416


In [20]:
df_reduced.sample(10)

Unnamed: 0,mileage,condition,accidents_resolution,region,seller_type,brand,model,body_type,doors_count,seats,...,body-mouldings,third-row-seats,auto-park,door-sill-panel,rcta,body-kit,multi-wheel,ptf,combo-interior,17-inch-wheels
518746,33000,CONDITION_OK,ERROR,Аксай,PRIVATE,Lada (ВАЗ),2107,SEDAN,4,5,...,0,0,0,0,0,0,0,0,0,0
224119,23211,CONDITION_OK,OK,Москва,PRIVATE,Hyundai,Solaris,SEDAN,4,5,...,1,0,0,0,0,0,1,1,0,0
351020,6122,CONDITION_OK,OK,Уссурийск,COMMERCIAL,Mini,Countryman,ALLROAD_5_DOORS,5,5,...,0,0,0,0,0,0,0,0,0,0
424138,4500,CONDITION_OK,ERROR,Погар,PRIVATE,Renault,Kaptur,ALLROAD_5_DOORS,5,5,...,1,0,0,0,0,0,0,0,0,0
89776,245600,CONDITION_OK,OK,Тверь,PRIVATE,Chevrolet,Aveo,SEDAN,4,5,...,0,0,0,0,0,0,0,0,0,0
297634,18300,CONDITION_OK,OK,Москва,PRIVATE,Lexus,RX,ALLROAD_5_DOORS,5,5,...,1,0,0,0,0,0,1,1,0,0
449720,0,CONDITION_OK,OK,Москва,COMMERCIAL,Soueast,S07,ALLROAD_5_DOORS,5,5;7,...,0,0,0,0,1,0,1,1,0,0
49847,121000,CONDITION_OK,OK,Москва,PRIVATE,BMW,X6,ALLROAD_5_DOORS,5,5,...,0,0,1,1,0,1,1,1,0,0
48872,0,CONDITION_OK,OK,Краснодар,COMMERCIAL,BMW,X6,ALLROAD_5_DOORS,5,5,...,0,0,0,0,0,0,0,0,0,0
846,363500,CONDITION_OK,OK,Уфа,PRIVATE,Alfa Romeo,166,SEDAN,4,5,...,0,0,0,1,0,0,0,1,0,0


## Обучим LightGBM и CatBoost на df_reduced

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
X = df_reduced.drop('price', axis=1)
y = df_reduced['price']

In [None]:
num_features = X.select_dtypes(include=['int64', 'float64', 'int8']).columns.tolist()
cat_features = X.select_dtypes(include=['object']).columns.tolist()

In [None]:
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features),
])

**LightGBM**

In [None]:
lgbm_pipeline = Pipeline([
    ('preproc', preprocessor),
    ('model', LGBMRegressor(objective='regression', random_state=42))
])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [None]:
lgbm_pipeline.fit(X_train, y_train)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.600399 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6396
[LightGBM] [Info] Number of data points in the train set: 483236, number of used features: 2692
[LightGBM] [Info] Start training from score 2807543.358738


In [None]:
y_pred_lgbm = lgbm_pipeline.predict(X_test)



In [None]:
mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
r2_lgbm  = r2_score(y_test, y_pred_lgbm)

In [None]:
print("LightGBM:")
print(f"  MSE:  {mse_lgbm:.2f}")
print(f"  MAE:  {mae_lgbm:.2f}")
print(f"  R^2:  {r2_lgbm:.2f}")

LightGBM:
  MSE:  7009385199619.20
  MAE:  474050.08
  R^2:  0.75


**CatBoost**

In [None]:
cat_pipeline = Pipeline([
    ('preproc', preprocessor),
    ('model', CatBoostRegressor(loss_function='RMSE', verbose=0, random_seed=RANDOM_STATE))
])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [None]:
cat_pipeline.fit(X_train, y_train)



In [None]:
y_pred_cat  = cat_pipeline.predict(X_test)



In [None]:
mse_cat = mean_squared_error(y_test, y_pred_cat)
mae_cat = mean_absolute_error(y_test, y_pred_cat)
r2_cat  = r2_score(y_test, y_pred_cat)

In [None]:
print("LightGBM:")
print(f"  MSE:  {mse_lgbm:.2f}")
print(f"  MAE:  {mae_lgbm:.2f}")
print(f"  R^2:  {r2_lgbm:.2f}")

print("CatBoost:")
print(f"  MSE:  {mse_cat:.2f}")
print(f"  MAE:  {mae_cat:.2f}")
print(f"  R^2:  {r2_cat:.2f}")

LightGBM:
  MSE:  7009385199619.20
  MAE:  474050.08
  R^2:  0.75
CatBoost:
  MSE:  8188312842766.43
  MAE:  417499.35
  R^2:  0.71


## Попробуем подобрать гиперпараметры с помощью Bayesian Optimization

**CatBoost**

In [24]:
import optuna
from optuna.trial import TrialState
from catboost import CatBoostRegressor, utils as cb_utils
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import r2_score
import joblib

In [25]:
X = df_reduced.drop('price', axis=1)
y = df_reduced['price']

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [26]:
num_features = X.select_dtypes(include=['int64','float64','int8']).columns.tolist()
cat_features = X.select_dtypes(include=['object']).columns.tolist()

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), cat_features),
])

In [27]:
X_train_proc = preprocessor.fit_transform(X_train)
X_valid_proc = preprocessor.transform(X_valid)



In [28]:
n_gpus = cb_utils.get_gpu_device_count()
if n_gpus == 0:
    raise RuntimeError("GPU не найдена!")
devices = ",".join(str(i) for i in range(n_gpus))

In [29]:
N_TRIALS = 50

def objective(trial):
    params = {
        'task_type':     'GPU',
        'devices':       devices,
        'gpu_ram_part':  0.8,
        'iterations':    trial.suggest_int('iterations',   200, 2000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
        'depth':         trial.suggest_int('depth',        4,  10),
        'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),
        'border_count':  trial.suggest_int('border_count', 32, 255),
        'random_seed':   42,
        'verbose':       False,
    }
    model = CatBoostRegressor(**params)
    model.fit(
        X_train_proc, y_train,
        eval_set=(X_valid_proc, y_valid),
        early_stopping_rounds=50
    )
    preds = model.predict(X_valid_proc)
    return r2_score(y_valid, preds)

def progress_cb(study, trial):
    done = len([t for t in study.trials if t.state == TrialState.COMPLETE])
    print(f"Completed {done}/{N_TRIALS} trials")

In [31]:
study = optuna.create_study(direction='maximize')
study.optimize(
    objective,
    n_trials=N_TRIALS,
    n_jobs=1,
    show_progress_bar=True,
    callbacks=[progress_cb]
)

best = study.best_params
best.update({'task_type':'GPU','devices':devices,'gpu_ram_part':0.8,'random_seed':RANDOM_STATE,'verbose':False})
final_model = CatBoostRegressor(**best)
final_model.fit(X_train_proc, y_train, eval_set=(X_valid_proc, y_valid), early_stopping_rounds=N_TRIALS)

joblib.dump(final_model, 'best_catboost_model.pkl')
joblib.dump(preprocessor,      'preprocessor.pkl')
print("Models saved to best_catboost_model.pkl and preprocessor.pkl")

[I 2025-06-09 12:57:30,850] A new study created in memory with name: no-name-7a57941d-220e-4e34-8401-883bd68ef5d8


  0%|          | 0/50 [00:00<?, ?it/s]

  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 12:59:05,155] Trial 0 finished with value: 0.6932672294177961 and parameters: {'iterations': 794, 'learning_rate': 0.06569379466901307, 'depth': 7, 'l2_leaf_reg': 1.3630312367860937, 'border_count': 94}. Best is trial 0 with value: 0.6932672294177961.
Completed 1/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:00:37,241] Trial 1 finished with value: 0.6418232833303776 and parameters: {'iterations': 1464, 'learning_rate': 0.0023250134557111285, 'depth': 4, 'l2_leaf_reg': 0.23038251992682315, 'border_count': 243}. Best is trial 0 with value: 0.6932672294177961.
Completed 2/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:02:06,839] Trial 2 finished with value: 0.7188220748832538 and parameters: {'iterations': 666, 'learning_rate': 0.03544615615552673, 'depth': 8, 'l2_leaf_reg': 3.1495975520154897, 'border_count': 110}. Best is trial 2 with value: 0.7188220748832538.
Completed 3/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:03:35,112] Trial 3 finished with value: 0.2894551062317464 and parameters: {'iterations': 357, 'learning_rate': 0.001084924803388012, 'depth': 9, 'l2_leaf_reg': 2.3711364736160943, 'border_count': 140}. Best is trial 2 with value: 0.7188220748832538.
Completed 4/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:05:14,661] Trial 4 finished with value: 0.7005345355009212 and parameters: {'iterations': 1908, 'learning_rate': 0.005207692383574777, 'depth': 6, 'l2_leaf_reg': 0.19438432589249957, 'border_count': 105}. Best is trial 2 with value: 0.7188220748832538.
Completed 5/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:06:46,816] Trial 5 finished with value: 0.6677416597187018 and parameters: {'iterations': 1209, 'learning_rate': 0.0057769684304295535, 'depth': 4, 'l2_leaf_reg': 0.2763763806247014, 'border_count': 193}. Best is trial 2 with value: 0.7188220748832538.
Completed 6/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:08:14,775] Trial 6 finished with value: 0.7087576758204766 and parameters: {'iterations': 397, 'learning_rate': 0.07804964454628835, 'depth': 6, 'l2_leaf_reg': 3.6246516280436167, 'border_count': 80}. Best is trial 2 with value: 0.7188220748832538.
Completed 7/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:09:42,470] Trial 7 finished with value: 0.7036058394400189 and parameters: {'iterations': 1343, 'learning_rate': 0.08165148678851036, 'depth': 8, 'l2_leaf_reg': 0.11167994996514394, 'border_count': 96}. Best is trial 2 with value: 0.7188220748832538.
Completed 8/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:11:17,745] Trial 8 finished with value: 0.7207027669732917 and parameters: {'iterations': 914, 'learning_rate': 0.012385822150096412, 'depth': 9, 'l2_leaf_reg': 3.419409274956161, 'border_count': 246}. Best is trial 8 with value: 0.7207027669732917.
Completed 9/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:12:53,597] Trial 9 finished with value: 0.6811746986058143 and parameters: {'iterations': 1423, 'learning_rate': 0.008302883544976668, 'depth': 6, 'l2_leaf_reg': 0.12714521145799562, 'border_count': 34}. Best is trial 8 with value: 0.7207027669732917.
Completed 10/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:14:30,596] Trial 10 finished with value: 0.7222775259423375 and parameters: {'iterations': 891, 'learning_rate': 0.020791240761497996, 'depth': 10, 'l2_leaf_reg': 8.71268825527187, 'border_count': 227}. Best is trial 10 with value: 0.7222775259423375.
Completed 11/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:16:09,977] Trial 11 finished with value: 0.7171746368638499 and parameters: {'iterations': 927, 'learning_rate': 0.01924569379817668, 'depth': 10, 'l2_leaf_reg': 9.931256660725575, 'border_count': 254}. Best is trial 10 with value: 0.7222775259423375.
Completed 12/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:17:49,147] Trial 12 finished with value: 0.7070219951281792 and parameters: {'iterations': 1015, 'learning_rate': 0.017838511472750493, 'depth': 10, 'l2_leaf_reg': 9.326507901977456, 'border_count': 199}. Best is trial 10 with value: 0.7222775259423375.
Completed 13/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:19:21,400] Trial 13 finished with value: 0.7127860511564188 and parameters: {'iterations': 634, 'learning_rate': 0.017995843864554443, 'depth': 9, 'l2_leaf_reg': 5.297580648791713, 'border_count': 209}. Best is trial 10 with value: 0.7222775259423375.
Completed 14/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:21:05,508] Trial 14 finished with value: 0.723451227922383 and parameters: {'iterations': 1704, 'learning_rate': 0.03949554934169496, 'depth': 10, 'l2_leaf_reg': 0.7173170682324248, 'border_count': 222}. Best is trial 14 with value: 0.723451227922383.
Completed 15/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:22:39,950] Trial 15 finished with value: 0.7260927327806069 and parameters: {'iterations': 1781, 'learning_rate': 0.03592650543586629, 'depth': 10, 'l2_leaf_reg': 0.7358560715805967, 'border_count': 167}. Best is trial 15 with value: 0.7260927327806069.
Completed 16/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:24:10,193] Trial 16 finished with value: 0.7085333757844618 and parameters: {'iterations': 1968, 'learning_rate': 0.03848771292759677, 'depth': 8, 'l2_leaf_reg': 0.60044880898334, 'border_count': 167}. Best is trial 15 with value: 0.7260927327806069.
Completed 17/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:25:55,335] Trial 17 finished with value: 0.7206738334491698 and parameters: {'iterations': 1680, 'learning_rate': 0.03848832138782188, 'depth': 10, 'l2_leaf_reg': 0.5313778671265103, 'border_count': 165}. Best is trial 15 with value: 0.7260927327806069.
Completed 18/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:27:25,585] Trial 18 finished with value: 0.7128944089183412 and parameters: {'iterations': 1712, 'learning_rate': 0.04749903665213908, 'depth': 9, 'l2_leaf_reg': 0.8760434688274236, 'border_count': 175}. Best is trial 15 with value: 0.7260927327806069.
Completed 19/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:28:55,758] Trial 19 finished with value: 0.7171643097817783 and parameters: {'iterations': 1661, 'learning_rate': 0.029917088686737207, 'depth': 7, 'l2_leaf_reg': 1.4117038790666794, 'border_count': 134}. Best is trial 15 with value: 0.7260927327806069.
Completed 20/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:30:32,409] Trial 20 finished with value: 0.7093819766990517 and parameters: {'iterations': 1800, 'learning_rate': 0.011148601745147032, 'depth': 5, 'l2_leaf_reg': 0.42928781257433946, 'border_count': 214}. Best is trial 15 with value: 0.7260927327806069.
Completed 21/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:32:16,497] Trial 21 finished with value: 0.7313771967164437 and parameters: {'iterations': 1202, 'learning_rate': 0.02419156726656339, 'depth': 10, 'l2_leaf_reg': 0.9128633915743343, 'border_count': 227}. Best is trial 21 with value: 0.7313771967164437.
Completed 22/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:33:49,314] Trial 22 finished with value: 0.7367231914101138 and parameters: {'iterations': 1541, 'learning_rate': 0.05918691960505747, 'depth': 10, 'l2_leaf_reg': 0.9268978224691037, 'border_count': 223}. Best is trial 22 with value: 0.7367231914101138.
Completed 23/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:35:21,603] Trial 23 finished with value: 0.7192436836373248 and parameters: {'iterations': 1244, 'learning_rate': 0.05743962247541503, 'depth': 9, 'l2_leaf_reg': 1.196817283983042, 'border_count': 185}. Best is trial 22 with value: 0.7367231914101138.
Completed 24/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:37:11,356] Trial 24 finished with value: 0.7231396346706171 and parameters: {'iterations': 1541, 'learning_rate': 0.027354159245114123, 'depth': 10, 'l2_leaf_reg': 1.9540304095803396, 'border_count': 156}. Best is trial 22 with value: 0.7367231914101138.
Completed 25/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:38:37,946] Trial 25 finished with value: 0.6979692276606861 and parameters: {'iterations': 1205, 'learning_rate': 0.08712835272181155, 'depth': 8, 'l2_leaf_reg': 0.3709424477554205, 'border_count': 231}. Best is trial 22 with value: 0.7367231914101138.
Completed 26/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:40:07,533] Trial 26 finished with value: 0.716120116267411 and parameters: {'iterations': 1554, 'learning_rate': 0.05384850448854245, 'depth': 9, 'l2_leaf_reg': 0.8478983487984078, 'border_count': 201}. Best is trial 22 with value: 0.7367231914101138.
Completed 27/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:41:40,303] Trial 27 finished with value: 0.7247690858001132 and parameters: {'iterations': 1089, 'learning_rate': 0.09598184717080638, 'depth': 10, 'l2_leaf_reg': 1.8706847286571462, 'border_count': 128}. Best is trial 22 with value: 0.7367231914101138.
Completed 28/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:43:13,291] Trial 28 finished with value: 0.7171460515077622 and parameters: {'iterations': 1833, 'learning_rate': 0.026304252773581307, 'depth': 9, 'l2_leaf_reg': 0.9806036712374689, 'border_count': 186}. Best is trial 22 with value: 0.7367231914101138.
Completed 29/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:44:47,420] Trial 29 finished with value: 0.7091157396425245 and parameters: {'iterations': 1355, 'learning_rate': 0.013660041310541065, 'depth': 7, 'l2_leaf_reg': 1.2122847009443958, 'border_count': 231}. Best is trial 22 with value: 0.7367231914101138.
Completed 30/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:46:16,181] Trial 30 finished with value: 0.6969192320934086 and parameters: {'iterations': 1593, 'learning_rate': 0.06480951004245614, 'depth': 7, 'l2_leaf_reg': 0.35237850133370724, 'border_count': 152}. Best is trial 22 with value: 0.7367231914101138.
Completed 31/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:47:43,972] Trial 31 finished with value: 0.7241511098693275 and parameters: {'iterations': 1069, 'learning_rate': 0.09951747194790757, 'depth': 10, 'l2_leaf_reg': 1.7762099957267539, 'border_count': 126}. Best is trial 22 with value: 0.7367231914101138.
Completed 32/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:49:16,403] Trial 32 finished with value: 0.7028373375422016 and parameters: {'iterations': 1138, 'learning_rate': 0.053527780437242205, 'depth': 10, 'l2_leaf_reg': 1.5373844086486348, 'border_count': 49}. Best is trial 22 with value: 0.7367231914101138.
Completed 33/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:50:48,145] Trial 33 finished with value: 0.7143363977172361 and parameters: {'iterations': 705, 'learning_rate': 0.06959594226927819, 'depth': 10, 'l2_leaf_reg': 0.6229759795004837, 'border_count': 116}. Best is trial 22 with value: 0.7367231914101138.
Completed 34/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:52:16,961] Trial 34 finished with value: 0.7102205705592477 and parameters: {'iterations': 1429, 'learning_rate': 0.04736908867072914, 'depth': 9, 'l2_leaf_reg': 1.1520125751959203, 'border_count': 70}. Best is trial 22 with value: 0.7367231914101138.
Completed 35/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:53:44,248] Trial 35 finished with value: 0.7237112815130023 and parameters: {'iterations': 1310, 'learning_rate': 0.0995354762422134, 'depth': 8, 'l2_leaf_reg': 2.4216979652706434, 'border_count': 144}. Best is trial 22 with value: 0.7367231914101138.
Completed 36/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:55:31,946] Trial 36 finished with value: 0.6737699927555946 and parameters: {'iterations': 1482, 'learning_rate': 0.001889417258107899, 'depth': 10, 'l2_leaf_reg': 2.383744039240437, 'border_count': 240}. Best is trial 22 with value: 0.7367231914101138.
Completed 37/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:57:00,428] Trial 37 finished with value: 0.6883505394071694 and parameters: {'iterations': 499, 'learning_rate': 0.007683417768454198, 'depth': 9, 'l2_leaf_reg': 0.4489167497650603, 'border_count': 126}. Best is trial 22 with value: 0.7367231914101138.
Completed 38/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 13:58:53,927] Trial 38 finished with value: 0.7234102237773827 and parameters: {'iterations': 1835, 'learning_rate': 0.0035293534089552016, 'depth': 10, 'l2_leaf_reg': 0.7525174456937069, 'border_count': 214}. Best is trial 22 with value: 0.7367231914101138.
Completed 39/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 14:00:23,194] Trial 39 finished with value: 0.7057353299449285 and parameters: {'iterations': 1011, 'learning_rate': 0.03136174372408727, 'depth': 5, 'l2_leaf_reg': 0.25878545822783333, 'border_count': 175}. Best is trial 22 with value: 0.7367231914101138.
Completed 40/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 14:01:49,897] Trial 40 finished with value: 0.7175773137397401 and parameters: {'iterations': 237, 'learning_rate': 0.07094213097194198, 'depth': 9, 'l2_leaf_reg': 4.274828135554094, 'border_count': 85}. Best is trial 22 with value: 0.7367231914101138.
Completed 41/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 14:03:18,852] Trial 41 finished with value: 0.7147691367995074 and parameters: {'iterations': 1098, 'learning_rate': 0.09915192619031886, 'depth': 10, 'l2_leaf_reg': 1.7245732090340047, 'border_count': 121}. Best is trial 22 with value: 0.7367231914101138.
Completed 42/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 14:04:55,760] Trial 42 finished with value: 0.7298153592769658 and parameters: {'iterations': 783, 'learning_rate': 0.07616518813512438, 'depth': 10, 'l2_leaf_reg': 1.9591659811606947, 'border_count': 133}. Best is trial 22 with value: 0.7367231914101138.
Completed 43/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 14:06:33,142] Trial 43 finished with value: 0.7181260360066611 and parameters: {'iterations': 791, 'learning_rate': 0.02436988739992741, 'depth': 10, 'l2_leaf_reg': 2.7089838037813796, 'border_count': 108}. Best is trial 22 with value: 0.7367231914101138.
Completed 44/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 14:08:05,778] Trial 44 finished with value: 0.7205052614464946 and parameters: {'iterations': 854, 'learning_rate': 0.04357245938012274, 'depth': 9, 'l2_leaf_reg': 1.0305552874659607, 'border_count': 143}. Best is trial 22 with value: 0.7367231914101138.
Completed 45/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 14:09:42,348] Trial 45 finished with value: 0.7368262842322233 and parameters: {'iterations': 599, 'learning_rate': 0.05951760389847296, 'depth': 10, 'l2_leaf_reg': 2.04641012682822, 'border_count': 253}. Best is trial 45 with value: 0.7368262842322233.
Completed 46/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 14:11:14,101] Trial 46 finished with value: 0.7176739954739549 and parameters: {'iterations': 537, 'learning_rate': 0.03349059798049235, 'depth': 8, 'l2_leaf_reg': 1.414067416989258, 'border_count': 251}. Best is trial 45 with value: 0.7368262842322233.
Completed 47/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 14:12:49,077] Trial 47 finished with value: 0.7203897307286713 and parameters: {'iterations': 759, 'learning_rate': 0.014814267610488948, 'depth': 10, 'l2_leaf_reg': 3.260095243198889, 'border_count': 241}. Best is trial 45 with value: 0.7368262842322233.
Completed 48/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 14:14:21,005] Trial 48 finished with value: 0.7286226655734462 and parameters: {'iterations': 542, 'learning_rate': 0.06191130781824827, 'depth': 9, 'l2_leaf_reg': 5.347437704500289, 'border_count': 255}. Best is trial 45 with value: 0.7368262842322233.
Completed 49/50 trials


  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'l2_leaf_reg':   trial.suggest_loguniform('l2_leaf_reg',  1e-1, 10.0),


[I 2025-06-09 14:15:52,667] Trial 49 finished with value: 0.7230531860931899 and parameters: {'iterations': 538, 'learning_rate': 0.06027005210870467, 'depth': 9, 'l2_leaf_reg': 5.964187730957609, 'border_count': 255}. Best is trial 45 with value: 0.7368262842322233.
Completed 50/50 trials
Models saved to best_catboost_model.pkl and preprocessor.pkl


In [32]:
print("Best R²:", study.best_value)
print("Best params:", study.best_params)

Best R²: 0.7368262842322233
Best params: {'iterations': 599, 'learning_rate': 0.05951760389847296, 'depth': 10, 'l2_leaf_reg': 2.04641012682822, 'border_count': 253}


### Выводы

$R^2 = 0.7368$

In [None]:
param = {
    'iterations': 599, 
    'learning_rate': 0.05951760389847296, 
    'depth': 10, 
    'l2_leaf_reg': 2.04641012682822, 
    'border_count': 253
}