In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns

In [34]:
%pip install gdown

Note: you may need to restart the kernel to use updated packages.


In [35]:
CARS_FILE_ID = '1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI'
RANDOM_STATE = 42

random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

!gdown --id {CARS_FILE_ID}

Downloading...
From (original): https://drive.google.com/uc?id=1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI
From (redirected): https://drive.google.com/uc?id=1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI&confirm=t&uuid=33c69fad-410a-4176-862a-498918c33127
To: /Users/ilolss/HSE/ml_hse/ml_project/part_6/dataset.csv
100%|██████████████████████████████████████| 1.01G/1.01G [02:02<00:00, 8.29MB/s]


In [36]:
df = pd.read_csv('dataset.csv')
print(df.shape)

(604047, 24)


In [37]:
df_with_na_column = df[df['engine_displacement'].isna()]
df_with_na_column

Unnamed: 0,production_year,mileage,condition,owners_number,pts_original,horse_power,accidents_resolution,region,seller_type,brand,...,engine_displacement,engine_power,fuel_rate,steering_wheel,price,price_segment,tags,auto_class,equipment,complectation_available_options
210905,2003,175000,CONDITION_OK,0,True,,,Чебаркуль,PRIVATE,Hyundai,...,,,,LEFT,380000,MEDIUM,available_for_checkup;pts_original;real_photo;...,,seats-5,


In [38]:
df.drop(210905, inplace=True)

In [39]:
df['fuel_rate'].fillna(df['fuel_rate'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['fuel_rate'].fillna(df['fuel_rate'].median(), inplace=True)


In [40]:
df['pts_original'].fillna(True, inplace=True)
df['accidents_resolution'].fillna('OK', inplace=True)
df['auto_class'].fillna('NOT SPECIFIED', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['pts_original'].fillna(True, inplace=True)
  df['pts_original'].fillna(True, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['accidents_resolution'].fillna('OK', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because 

In [41]:
df.drop('horse_power', axis=1, inplace=True)

In [42]:
def get_unique_values(series, sep=';'):
    uniq = set()
    for cell in series.dropna():
        for piece in cell.split(sep):
            s = piece.strip()
            if s:
                uniq.add(s)
    return np.array(list(uniq))

all_tags = get_unique_values(df['tags'])
all_options = get_unique_values(df['complectation_available_options'])
all_equipments = get_unique_values(df['equipment'])

In [43]:
def create_binary_features(df, column, unique_values, sep=';'):
    return (
        df[column]
        .str.get_dummies(sep=sep)
        .reindex(columns=unique_values, fill_value=0)
        .astype('int8')
    )

tags_dummies = create_binary_features(df, 'tags', all_tags)
options_dummies = create_binary_features(df, 'complectation_available_options', all_options)
equipment_dummies = create_binary_features(df, 'equipment', all_equipments)

In [44]:
tags_dummies = tags_dummies.astype('Sparse[int]')
options_dummies = options_dummies.astype('Sparse[int]')
equipment_dummies = equipment_dummies.astype('Sparse[int]')

In [45]:
full_df = pd.concat([
    df.drop(columns=['tags', 'complectation_available_options', 'equipment']),
    tags_dummies,
    options_dummies.drop(columns=['condition']),
    equipment_dummies.drop(columns=['condition'])
], axis=1)

In [46]:
duplicated_cols = full_df.columns[full_df.columns.duplicated()].unique()
for col in duplicated_cols:
    full_df[col] = full_df[col].astype("int8")

In [47]:
import numpy as np

names = full_df.columns[full_df.columns.duplicated()].unique()
new_cols = {}

for name in names:
    cols_i = [col for col in full_df.columns if col == name]
    sub_df = full_df.loc[:, cols_i].astype('int8')   # DataFrame shape (n_rows, k)
    
    # берём максимум по строкам → Series shape (n_rows,)
    max_series = sub_df.max(axis=1).astype('int8')
    
    new_cols[name] = max_series
    
    # удаляем *все* старые колонки с этим именем
    full_df.drop(columns=cols_i, inplace=True)

# теперь присваиваем новые колонки–Series (они одномерны!)
for name, s in new_cols.items():
    full_df[name] = s


  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s


In [48]:
full_df.sample(10)

Unnamed: 0,production_year,mileage,condition,owners_number,accidents_resolution,region,seller_type,brand,model,body_type,...,body-kit,android-auto,wheel-power,high-beam-assist,black-roof,climate-control-1,ashtray-and-cigarette-lighter,17-inch-wheels,24-inch-wheels,front-seats-heat-vent
436947,2024,0,CONDITION_OK,0,OK,Санкт-Петербург,COMMERCIAL,Skoda,Karoq,ALLROAD_5_DOORS,...,0,0,1,0,0,1,0,0,0,0
329729,2020,40460,CONDITION_OK,1,ERROR,Москва,PRIVATE,Mercedes-Benz,E-Класс,COUPE_HARDTOP,...,0,0,0,0,1,1,1,0,0,0
343497,2000,380000,CONDITION_OK,4,OK,Москва,PRIVATE,Mercedes-Benz,M-Класс,ALLROAD_5_DOORS,...,0,0,0,0,0,0,0,0,0,0
439784,2019,170000,CONDITION_OK,1,ERROR,Москва,PRIVATE,Skoda,Octavia,LIFTBACK,...,0,1,0,0,0,1,1,1,0,0
550845,2023,107,CONDITION_OK,1,OK,Краснодар,PRIVATE,Lada (ВАЗ),Largus,WAGON_5_DOORS,...,0,0,1,0,0,0,0,0,0,0
431808,2015,110727,CONDITION_OK,1,OK,Уфа,COMMERCIAL,Renault,Sandero,HATCHBACK_5_DOORS,...,0,0,1,0,0,0,1,0,0,0
167063,2024,0,CONDITION_OK,0,OK,Химки,COMMERCIAL,Genesis,GV80 Coupe,ALLROAD_5_DOORS,...,0,1,1,1,1,1,0,0,0,1
391523,2022,13000,CONDITION_OK,0,OK,Владивосток,COMMERCIAL,Nissan,X-Trail,ALLROAD_5_DOORS,...,0,0,0,0,0,0,0,0,0,0
210052,2024,100,CONDITION_OK,1,OK,Новосибирск,PRIVATE,Hyundai,Elantra,SEDAN,...,0,0,0,0,0,0,0,0,0,0
46816,2024,34,CONDITION_OK,1,OK,Москва,COMMERCIAL,BMW,X5,ALLROAD_5_DOORS,...,0,0,0,0,0,0,0,0,0,0


In [49]:
import pandas as pd
import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

numeric_df = full_df.select_dtypes(include=[np.number])
numeric_df = numeric_df.loc[:, numeric_df.nunique() > 1]

corr = numeric_df.corr().abs().fillna(0)

dist = 1 - corr

dist = (dist + dist.T) / 2
np.fill_diagonal(dist.values, 0)

dist = dist.clip(lower=0)

dist_vect = squareform(dist)
Z = hierarchy.linkage(dist_vect, method='average')

max_d = 0.4
clusters = hierarchy.fcluster(Z, t=max_d, criterion='distance')

cluster_df = pd.DataFrame({'feature': numeric_df.columns, 'cluster': clusters})
to_keep = []
for c in cluster_df['cluster'].unique():
    members = cluster_df.loc[cluster_df['cluster'] == c, 'feature']
    avg_corr = corr.loc[members, members].mean().sort_values(ascending=False)
    to_keep.append(avg_corr.index[0])
to_keep = list(dict.fromkeys(to_keep))

to_drop = [col for col in numeric_df.columns if col not in to_keep]
df_reduced = full_df.drop(columns=to_drop)

print(f"Удалено числовых коррелирующих признаков: {len(to_drop)}")
print(f"Осталось признаков в df_reduced: {df_reduced.shape[1]} из {full_df.shape[1]}")

Удалено числовых коррелирующих признаков: 115
Осталось признаков в df_reduced: 301 из 416


In [50]:
df_reduced.sample(10)

Unnamed: 0,mileage,condition,accidents_resolution,region,seller_type,brand,model,body_type,doors_count,seats,...,body-mouldings,steering-wheel-gear-shift-paddles,velvet-seats,body-kit,wheel-power,black-roof,ashtray-and-cigarette-lighter,17-inch-wheels,24-inch-wheels,front-seats-heat-vent
518746,33000,CONDITION_OK,ERROR,Аксай,PRIVATE,Lada (ВАЗ),2107,SEDAN,4,5,...,0,0,0,0,0,0,0,0,0,0
224119,23211,CONDITION_OK,OK,Москва,PRIVATE,Hyundai,Solaris,SEDAN,4,5,...,1,0,0,0,1,0,0,0,0,0
351020,6122,CONDITION_OK,OK,Уссурийск,COMMERCIAL,Mini,Countryman,ALLROAD_5_DOORS,5,5,...,0,0,0,0,0,0,0,0,0,0
424138,4500,CONDITION_OK,ERROR,Погар,PRIVATE,Renault,Kaptur,ALLROAD_5_DOORS,5,5,...,1,0,0,0,1,0,0,0,0,0
89776,245600,CONDITION_OK,OK,Тверь,PRIVATE,Chevrolet,Aveo,SEDAN,4,5,...,0,0,0,0,0,0,0,0,0,0
297634,18300,CONDITION_OK,OK,Москва,PRIVATE,Lexus,RX,ALLROAD_5_DOORS,5,5,...,1,1,0,0,1,0,0,0,0,1
449720,0,CONDITION_OK,OK,Москва,COMMERCIAL,Soueast,S07,ALLROAD_5_DOORS,5,5;7,...,0,1,0,0,1,0,0,0,0,1
49847,121000,CONDITION_OK,OK,Москва,PRIVATE,BMW,X6,ALLROAD_5_DOORS,5,5,...,0,1,0,1,0,1,1,0,0,0
48872,0,CONDITION_OK,OK,Краснодар,COMMERCIAL,BMW,X6,ALLROAD_5_DOORS,5,5,...,0,0,0,0,1,0,0,0,0,0
846,363500,CONDITION_OK,OK,Уфа,PRIVATE,Alfa Romeo,166,SEDAN,4,5,...,0,0,0,0,1,0,0,0,0,0


## Обучим LightGBM и CatBoost на df_reduced

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
X = df_reduced.drop('price', axis=1)
y = df_reduced['price']

In [None]:
num_features = X.select_dtypes(include=['int64', 'float64', 'int8']).columns.tolist()
cat_features = X.select_dtypes(include=['object']).columns.tolist()

In [None]:
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features),
])

**LightGBM**

In [None]:
lgbm_pipeline = Pipeline([
    ('preproc', preprocessor),
    ('model', LGBMRegressor(objective='regression', random_state=42))
])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [None]:
lgbm_pipeline.fit(X_train, y_train)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.600399 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6396
[LightGBM] [Info] Number of data points in the train set: 483236, number of used features: 2692
[LightGBM] [Info] Start training from score 2807543.358738


In [None]:
y_pred_lgbm = lgbm_pipeline.predict(X_test)



In [None]:
mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
r2_lgbm  = r2_score(y_test, y_pred_lgbm)

In [None]:
print("LightGBM:")
print(f"  MSE:  {mse_lgbm:.2f}")
print(f"  MAE:  {mae_lgbm:.2f}")
print(f"  R^2:  {r2_lgbm:.2f}")

LightGBM:
  MSE:  7009385199619.20
  MAE:  474050.08
  R^2:  0.75


**CatBoost**

In [None]:
cat_pipeline = Pipeline([
    ('preproc', preprocessor),
    ('model', CatBoostRegressor(loss_function='RMSE', verbose=0, random_seed=RANDOM_STATE))
])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [None]:
cat_pipeline.fit(X_train, y_train)



In [None]:
y_pred_cat  = cat_pipeline.predict(X_test)



In [None]:
mse_cat = mean_squared_error(y_test, y_pred_cat)
mae_cat = mean_absolute_error(y_test, y_pred_cat)
r2_cat  = r2_score(y_test, y_pred_cat)

In [None]:
print("LightGBM:")
print(f"  MSE:  {mse_lgbm:.2f}")
print(f"  MAE:  {mae_lgbm:.2f}")
print(f"  R^2:  {r2_lgbm:.2f}")

print("CatBoost:")
print(f"  MSE:  {mse_cat:.2f}")
print(f"  MAE:  {mae_cat:.2f}")
print(f"  R^2:  {r2_cat:.2f}")

LightGBM:
  MSE:  7009385199619.20
  MAE:  474050.08
  R^2:  0.75
CatBoost:
  MSE:  8188312842766.43
  MAE:  417499.35
  R^2:  0.71


## Попробуем подобрать гиперпараметры с помощью Bayesian Optimization

**LightGBM**

In [51]:
import optuna
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor

In [52]:
RANDOM_STATE = 42

In [53]:
X = df_reduced.drop('price', axis=1)
y = df_reduced['price']
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

num_features = X.select_dtypes(include=['int64','float64','int8']).columns.tolist()
cat_features = X.select_dtypes(include=['object']).columns.tolist()

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features),
])

In [54]:
X_train_proc = preprocessor.fit_transform(X_train)
X_valid_proc = preprocessor.transform(X_valid)



In [57]:
import lightgbm as lgb

def objective(trial):
    params = {
        'n_estimators':     trial.suggest_int('n_estimators', 100, 2000),
        'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
        'num_leaves':       trial.suggest_int('num_leaves', 31, 256),
        'max_depth':        trial.suggest_int('max_depth', 5, 30),
        'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
        'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        'random_state':     RANDOM_STATE,
        'verbose':          -1,
    }
    model = LGBMRegressor(**params)
    model.fit(
        X_train_proc, y_train,
        eval_set=[(X_valid_proc, y_valid)],
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )
    preds = model.predict(X_valid_proc)
    return r2_score(y_valid, preds)

In [58]:
study = optuna.create_study(direction='maximize', study_name='lgbm_opt_r2')
study.optimize(
    objective,
    n_trials=50,
    n_jobs=-1,
    show_progress_bar=True,
)

print("Best R²:", study.best_value)
print("Best params:", study.best_params)

[I 2025-06-10 09:43:13,366] A new study created in memory with name: lgbm_opt_r2
  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_unif

Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[564]	valid_0's l2: 7.0614e+12


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


[I 2025-06-10 10:05:18,118] Trial 2 finished with value: 0.7478592528022807 and parameters: {'n_estimators': 912, 'learning_rate': 0.06844899583344552, 'num_leaves': 181, 'max_depth': 5, 'subsample': 0.8942898882484294, 'colsample_bytree': 0.6031165535850378, 'reg_alpha': 0.2520936777453948, 'reg_lambda': 3.51357760339482e-06, 'min_child_weight': 31}. Best is trial 2 with value: 0.7478592528022807.
Did not meet early stopping. Best iteration is:
[481]	valid_0's l2: 8.56527e+12


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


[I 2025-06-10 10:05:51,968] Trial 11 finished with value: 0.694160646548931 and parameters: {'n_estimators': 481, 'learning_rate': 0.007153171829912592, 'num_leaves': 109, 'max_depth': 6, 'subsample': 0.6552567454545344, 'colsample_bytree': 0.9854532651623256, 'reg_alpha': 0.010040831856806496, 'reg_lambda': 4.658574306014227e-07, 'min_child_weight': 24}. Best is trial 2 with value: 0.7478592528022807.
Did not meet early stopping. Best iteration is:
[298]	valid_0's l2: 8.91161e+12


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


[I 2025-06-10 10:06:43,080] Trial 5 finished with value: 0.6817940732858276 and parameters: {'n_estimators': 298, 'learning_rate': 0.008731674127486976, 'num_leaves': 73, 'max_depth': 17, 'subsample': 0.5962895051195349, 'colsample_bytree': 0.9138980464849036, 'reg_alpha': 0.006594515762454404, 'reg_lambda': 0.010361284469065608, 'min_child_weight': 29}. Best is trial 2 with value: 0.7478592528022807.
Did not meet early stopping. Best iteration is:
[199]	valid_0's l2: 1.85987e+13


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


[I 2025-06-10 10:07:26,539] Trial 4 finished with value: 0.33589968129758785 and parameters: {'n_estimators': 199, 'learning_rate': 0.0020443842698193997, 'num_leaves': 172, 'max_depth': 9, 'subsample': 0.6748807836393333, 'colsample_bytree': 0.7260009937856174, 'reg_alpha': 5.952550789519932e-07, 'reg_lambda': 0.004459797819419052, 'min_child_weight': 35}. Best is trial 2 with value: 0.7478592528022807.
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[322]	valid_0's l2: 8.42572e+12




Early stopping, best iteration is:
[352]	valid_0's l2: 6.62691e+12


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


[I 2025-06-10 10:10:56,836] Trial 10 finished with value: 0.699143706821973 and parameters: {'n_estimators': 322, 'learning_rate': 0.008607769459454405, 'num_leaves': 152, 'max_depth': 23, 'subsample': 0.9644560496247303, 'colsample_bytree': 0.9293829220478151, 'reg_alpha': 2.619306495867171e-07, 'reg_lambda': 5.643587330603666e-08, 'min_child_weight': 22}. Best is trial 1 with value: 0.7633735937431443.
[I 2025-06-10 10:10:56,839] Trial 1 finished with value: 0.7633735937431443 and parameters: {'n_estimators': 750, 'learning_rate': 0.06282204949878002, 'num_leaves': 255, 'max_depth': 17, 'subsample': 0.5545130740972319, 'colsample_bytree': 0.9990589471851096, 'reg_alpha': 5.036986190760829e-06, 'reg_lambda': 0.00029356916342483614, 'min_child_weight': 7}. Best is trial 1 with value: 0.7633735937431443.
Did not meet early stopping. Best iteration is:
[175]	valid_0's l2: 2.24716e+13
Training until validation scores don't improve for 50 rounds


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


[I 2025-06-10 10:12:44,363] Trial 12 finished with value: 0.19760742199840375 and parameters: {'n_estimators': 175, 'learning_rate': 0.0012719897608666276, 'num_leaves': 229, 'max_depth': 7, 'subsample': 0.9949293042479022, 'colsample_bytree': 0.6350039045882901, 'reg_alpha': 0.01828763754988252, 'reg_lambda': 0.0002791904756001193, 'min_child_weight': 50}. Best is trial 1 with value: 0.7633735937431443.
Did not meet early stopping. Best iteration is:
[1253]	valid_0's l2: 1.16026e+13




Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[532]	valid_0's l2: 8.59615e+12


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


[I 2025-06-10 10:13:54,192] Trial 8 finished with value: 0.6930579694701897 and parameters: {'n_estimators': 532, 'learning_rate': 0.007191916965314556, 'num_leaves': 125, 'max_depth': 15, 'subsample': 0.5173601299920085, 'colsample_bytree': 0.6293305735108923, 'reg_alpha': 2.2828835733305324e-06, 'reg_lambda': 0.0003371021221051116, 'min_child_weight': 49}. Best is trial 1 with value: 0.7633735937431443.
[I 2025-06-10 10:13:54,204] Trial 3 finished with value: 0.5857087483195459 and parameters: {'n_estimators': 1253, 'learning_rate': 0.0011567790305289818, 'num_leaves': 38, 'max_depth': 21, 'subsample': 0.7676071348093482, 'colsample_bytree': 0.5618921653827519, 'reg_alpha': 0.004201043994919595, 'reg_lambda': 0.0003098209036801655, 'min_child_weight': 42}. Best is trial 1 with value: 0.7633735937431443.
Did not meet early stopping. Best iteration is:
[160]	valid_0's l2: 7.53395e+12


Best trial: 1. Best value: 0.763374:  20%|██        | 10/50 [32:29<59:41, 89.53s/it]  

[I 2025-06-10 10:15:42,956] Trial 13 finished with value: 0.7309859582459326 and parameters: {'n_estimators': 160, 'learning_rate': 0.02508072135975103, 'num_leaves': 220, 'max_depth': 24, 'subsample': 0.7178804220744573, 'colsample_bytree': 0.8490830053850906, 'reg_alpha': 8.377271317448566e-08, 'reg_lambda': 2.2285103629510546e-06, 'min_child_weight': 3}. Best is trial 1 with value: 0.7633735937431443.


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1153]	valid_0's l2: 6.48367e+12


Best trial: 9. Best value: 0.768488:  22%|██▏       | 11/50 [1:27:41<9:12:00, 849.24s/it]

[I 2025-06-10 11:10:54,938] Trial 9 finished with value: 0.7684880904375512 and parameters: {'n_estimators': 1165, 'learning_rate': 0.019125201080463873, 'num_leaves': 86, 'max_depth': 30, 'subsample': 0.7350943258461548, 'colsample_bytree': 0.7715755654612473, 'reg_alpha': 2.412594714489861e-06, 'reg_lambda': 5.97518398232865e-07, 'min_child_weight': 7}. Best is trial 9 with value: 0.7684880904375512.


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[414]	valid_0's l2: 1.89079e+13


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),


[I 2025-06-10 11:15:16,493] Trial 17 finished with value: 0.32485861249027104 and parameters: {'n_estimators': 414, 'learning_rate': 0.0010133005578156153, 'num_leaves': 220, 'max_depth': 6, 'subsample': 0.997552446413138, 'colsample_bytree': 0.6520366837014528, 'reg_alpha': 2.064188344460927e-06, 'reg_lambda': 3.5829838233353234e-08, 'min_child_weight': 5}. Best is trial 9 with value: 0.7684880904375512.


  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[167]	valid_0's l2: 6.60553e+12


Best trial: 9. Best value: 0.768488:  26%|██▌       | 13/50 [1:34:03<5:37:30, 547.30s/it]

[I 2025-06-10 11:17:16,259] Trial 16 finished with value: 0.7641370874377332 and parameters: {'n_estimators': 907, 'learning_rate': 0.08254740947853166, 'num_leaves': 212, 'max_depth': 16, 'subsample': 0.6199952957226187, 'colsample_bytree': 0.5965075595808604, 'reg_alpha': 0.0051151187363833095, 'reg_lambda': 0.00560597253021469, 'min_child_weight': 15}. Best is trial 9 with value: 0.7684880904375512.


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[269]	valid_0's l2: 6.64128e+12




Did not meet early stopping. Best iteration is:
[1531]	valid_0's l2: 1.06508e+13


Best trial: 9. Best value: 0.768488:  28%|██▊       | 14/50 [1:36:32<4:22:55, 438.21s/it]

[I 2025-06-10 11:19:45,948] Trial 21 finished with value: 0.7628605178286624 and parameters: {'n_estimators': 1934, 'learning_rate': 0.09058084165613556, 'num_leaves': 256, 'max_depth': 12, 'subsample': 0.5320309989516921, 'colsample_bytree': 0.7935411581147781, 'reg_alpha': 3.4871901446199586e-05, 'reg_lambda': 0.7372265587167655, 'min_child_weight': 2}. Best is trial 9 with value: 0.7684880904375512.


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),


[I 2025-06-10 11:20:56,648] Trial 0 finished with value: 0.6196943494459659 and parameters: {'n_estimators': 1531, 'learning_rate': 0.0010750490844027387, 'num_leaves': 97, 'max_depth': 26, 'subsample': 0.9779295735109266, 'colsample_bytree': 0.8147836596813225, 'reg_alpha': 2.2455546937634212e-08, 'reg_lambda': 7.588969001539502e-05, 'min_child_weight': 44}. Best is trial 9 with value: 0.7684880904375512.


  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[285]	valid_0's l2: 6.59468e+12


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),


[I 2025-06-10 11:23:00,583] Trial 22 finished with value: 0.7645244559656967 and parameters: {'n_estimators': 1866, 'learning_rate': 0.0870978913912941, 'num_leaves': 76, 'max_depth': 29, 'subsample': 0.8046324020781959, 'colsample_bytree': 0.7932723799508875, 'reg_alpha': 3.307639336983905e-05, 'reg_lambda': 7.922472126346382, 'min_child_weight': 4}. Best is trial 9 with value: 0.7684880904375512.


  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1301]	valid_0's l2: 6.52043e+12


Best trial: 9. Best value: 0.768488:  34%|███▍      | 17/50 [1:41:58<2:07:55, 232.60s/it]

[I 2025-06-10 11:25:11,603] Trial 20 finished with value: 0.7671757661153945 and parameters: {'n_estimators': 1301, 'learning_rate': 0.014080482672486145, 'num_leaves': 144, 'max_depth': 10, 'subsample': 0.7196053886970883, 'colsample_bytree': 0.904417700707203, 'reg_alpha': 5.452535934228771e-07, 'reg_lambda': 3.227700583167892e-05, 'min_child_weight': 10}. Best is trial 9 with value: 0.7684880904375512.


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Early stopping, best iteration is:
[287]	valid_0's l2: 6.66807e+12




Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds


Best trial: 9. Best value: 0.768488:  36%|███▌      | 18/50 [1:43:53<1:45:44, 198.28s/it]

[I 2025-06-10 11:27:07,009] Trial 23 finished with value: 0.7619037855772276 and parameters: {'n_estimators': 1814, 'learning_rate': 0.08756798044846023, 'num_leaves': 84, 'max_depth': 29, 'subsample': 0.812003820740669, 'colsample_bytree': 0.7969459350281677, 'reg_alpha': 5.063748637597755e-05, 'reg_lambda': 1.5503961378179554, 'min_child_weight': 14}. Best is trial 9 with value: 0.7684880904375512.


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[731]	valid_0's l2: 1.36652e+13




Training until validation scores don't improve for 50 rounds


Best trial: 9. Best value: 0.768488:  38%|███▊      | 19/50 [1:48:57<1:58:32, 229.44s/it]

[I 2025-06-10 11:32:10,963] Trial 14 finished with value: 0.5120577973735424 and parameters: {'n_estimators': 731, 'learning_rate': 0.0011418079672858299, 'num_leaves': 235, 'max_depth': 18, 'subsample': 0.9009483890160705, 'colsample_bytree': 0.9781049549840541, 'reg_alpha': 7.131153983599412, 'reg_lambda': 6.73796423787652e-07, 'min_child_weight': 50}. Best is trial 9 with value: 0.7684880904375512.


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Did not meet early stopping. Best iteration is:
[1907]	valid_0's l2: 8.83797e+12


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),


[I 2025-06-10 11:34:59,111] Trial 7 finished with value: 0.6844233887123341 and parameters: {'n_estimators': 1907, 'learning_rate': 0.0015513523420011693, 'num_leaves': 127, 'max_depth': 17, 'subsample': 0.9438205181175421, 'colsample_bytree': 0.7063241191274227, 'reg_alpha': 4.740413245659029e-06, 'reg_lambda': 0.004572600961170778, 'min_child_weight': 35}. Best is trial 9 with value: 0.7684880904375512.


  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[650]	valid_0's l2: 6.36083e+12


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),


[I 2025-06-10 11:36:16,358] Trial 24 finished with value: 0.772874539578923 and parameters: {'n_estimators': 1786, 'learning_rate': 0.025564332963318472, 'num_leaves': 83, 'max_depth': 29, 'subsample': 0.8259772997763104, 'colsample_bytree': 0.5089082700589327, 'reg_alpha': 0.0001281458649912783, 'reg_lambda': 1.4509575467710583, 'min_child_weight': 15}. Best is trial 24 with value: 0.772874539578923.


  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Did not meet early stopping. Best iteration is:
[1605]	valid_0's l2: 6.42505e+12




Early stopping, best iteration is:
[915]	valid_0's l2: 6.48685e+12


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),


[I 2025-06-10 11:37:52,274] Trial 6 finished with value: 0.7705815628653898 and parameters: {'n_estimators': 1605, 'learning_rate': 0.008768858075800112, 'num_leaves': 158, 'max_depth': 23, 'subsample': 0.8692281559507183, 'colsample_bytree': 0.6026883782018413, 'reg_alpha': 1.3965498569988246, 'reg_lambda': 3.8333678699173923, 'min_child_weight': 11}. Best is trial 24 with value: 0.772874539578923.


  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),


[I 2025-06-10 11:38:47,429] Trial 27 finished with value: 0.7683747735600442 and parameters: {'n_estimators': 2000, 'learning_rate': 0.02944529711358971, 'num_leaves': 55, 'max_depth': 30, 'subsample': 0.807634735461208, 'colsample_bytree': 0.7009135485811593, 'reg_alpha': 0.00019831415511183695, 'reg_lambda': 8.847892287943662, 'min_child_weight': 12}. Best is trial 24 with value: 0.772874539578923.


  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Early stopping, best iteration is:
[1306]	valid_0's l2: 6.51282e+12


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),


[I 2025-06-10 11:40:20,693] Trial 19 finished with value: 0.7674473142582819 and parameters: {'n_estimators': 1978, 'learning_rate': 0.01676437657980308, 'num_leaves': 158, 'max_depth': 18, 'subsample': 0.5244397513972039, 'colsample_bytree': 0.841780898017594, 'reg_alpha': 0.0039617849632545355, 'reg_lambda': 3.889621672763634e-05, 'min_child_weight': 15}. Best is trial 24 with value: 0.772874539578923.


  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1168]	valid_0's l2: 6.51346e+12




Early stopping, best iteration is:
[744]	valid_0's l2: 6.36527e+12


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),


[I 2025-06-10 11:41:40,407] Trial 25 finished with value: 0.7727160889000249 and parameters: {'n_estimators': 1310, 'learning_rate': 0.028023172531255924, 'num_leaves': 85, 'max_depth': 29, 'subsample': 0.8098123025577627, 'colsample_bytree': 0.5005452248587219, 'reg_alpha': 9.696311324942695e-05, 'reg_lambda': 4.922205286849812, 'min_child_weight': 13}. Best is trial 24 with value: 0.772874539578923.
[I 2025-06-10 11:41:40,432] Trial 29 finished with value: 0.7674244939476581 and parameters: {'n_estimators': 1333, 'learning_rate': 0.019223525220592733, 'num_leaves': 45, 'max_depth': 12, 'subsample': 0.8805979371656245, 'colsample_bytree': 0.6987190760648816, 'reg_alpha': 0.00021359007396614435, 'reg_lambda': 1.6640857548549888e-05, 'min_child_weight': 12}. Best is trial 24 with value: 0.772874539578923.


  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Did not meet early stopping. Best iteration is:
[1905]	valid_0's l2: 7.67797e+12




Training until validation scores don't improve for 50 rounds


Best trial: 24. Best value: 0.772875:  54%|█████▍    | 27/50 [2:01:29<37:20, 97.43s/it] 

[I 2025-06-10 11:44:42,729] Trial 18 finished with value: 0.7258436314350125 and parameters: {'n_estimators': 1905, 'learning_rate': 0.004217926068761126, 'num_leaves': 90, 'max_depth': 28, 'subsample': 0.6049225929321832, 'colsample_bytree': 0.8421979554897251, 'reg_alpha': 9.242263543679215e-06, 'reg_lambda': 1.8738784105881845e-08, 'min_child_weight': 45}. Best is trial 24 with value: 0.772874539578923.


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Did not meet early stopping. Best iteration is:
[1188]	valid_0's l2: 6.34398e+12




Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),


[I 2025-06-10 11:47:10,019] Trial 26 finished with value: 0.7734762814937777 and parameters: {'n_estimators': 1213, 'learning_rate': 0.024496439436048064, 'num_leaves': 63, 'max_depth': 30, 'subsample': 0.7875446509821259, 'colsample_bytree': 0.5020615944067544, 'reg_alpha': 4.509220298477112, 'reg_lambda': 6.926904100118654, 'min_child_weight': 14}. Best is trial 26 with value: 0.7734762814937777.


  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Did not meet early stopping. Best iteration is:
[1271]	valid_0's l2: 6.58403e+12




Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds


Best trial: 26. Best value: 0.773476:  58%|█████▊    | 29/50 [2:05:17<35:43, 102.07s/it]

[I 2025-06-10 11:48:30,366] Trial 30 finished with value: 0.7649048410168859 and parameters: {'n_estimators': 1302, 'learning_rate': 0.018630926377458022, 'num_leaves': 44, 'max_depth': 12, 'subsample': 0.7219953660145989, 'colsample_bytree': 0.7128047226849252, 'reg_alpha': 0.0003763991558961714, 'reg_lambda': 2.2413615409795623e-05, 'min_child_weight': 14}. Best is trial 26 with value: 0.7734762814937777.


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Early stopping, best iteration is:
[1206]	valid_0's l2: 6.64302e+12




Training until validation scores don't improve for 50 rounds


Best trial: 26. Best value: 0.773476:  60%|██████    | 30/50 [2:09:16<46:30, 139.53s/it]

[I 2025-06-10 11:52:30,255] Trial 31 finished with value: 0.7627985370043929 and parameters: {'n_estimators': 1334, 'learning_rate': 0.019163560993567127, 'num_leaves': 33, 'max_depth': 12, 'subsample': 0.7533391804898018, 'colsample_bytree': 0.8766694156680578, 'reg_alpha': 0.0003928978466716268, 'reg_lambda': 2.18988923330997e-05, 'min_child_weight': 11}. Best is trial 26 with value: 0.7734762814937777.


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1171]	valid_0's l2: 6.27708e+12
Early stopping, best iteration is:
[1059]	valid_0's l2: 6.46581e+12


Best trial: 32. Best value: 0.775865:  62%|██████▏   | 31/50 [2:19:12<1:24:33, 267.02s/it]

[I 2025-06-10 12:02:25,453] Trial 32 finished with value: 0.7758649506904582 and parameters: {'n_estimators': 1293, 'learning_rate': 0.018979006632263765, 'num_leaves': 52, 'max_depth': 30, 'subsample': 0.7308899210502896, 'colsample_bytree': 0.5048094059104028, 'reg_alpha': 0.0002724921985019624, 'reg_lambda': 1.3261732166212643e-05, 'min_child_weight': 12}. Best is trial 32 with value: 0.7758649506904582.


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
Best trial: 32. Best value: 0.775865:  64%|██████▍   | 32/50 [2:19:12<57:15, 190.89s/it]  

[I 2025-06-10 12:02:25,746] Trial 28 finished with value: 0.76912611519222 and parameters: {'n_estimators': 1286, 'learning_rate': 0.019519056328355665, 'num_leaves': 141, 'max_depth': 30, 'subsample': 0.8095055672856472, 'colsample_bytree': 0.7164357336148057, 'reg_alpha': 7.11388928094855, 'reg_lambda': 1.0340627176411394e-05, 'min_child_weight': 13}. Best is trial 32 with value: 0.7758649506904582.


  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Did not meet early stopping. Best iteration is:
[1613]	valid_0's l2: 7.12068e+12


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),


[I 2025-06-10 12:03:14,862] Trial 34 finished with value: 0.7457425397306388 and parameters: {'n_estimators': 1614, 'learning_rate': 0.004341055611167815, 'num_leaves': 32, 'max_depth': 27, 'subsample': 0.8695497888991097, 'colsample_bytree': 0.5047996250932033, 'reg_alpha': 5.016047953257169, 'reg_lambda': 0.06454312863083596, 'min_child_weight': 18}. Best is trial 32 with value: 0.7758649506904582.


  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Early stopping, best iteration is:
[1271]	valid_0's l2: 6.31564e+12


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),


[I 2025-06-10 12:04:35,332] Trial 33 finished with value: 0.7744882480557582 and parameters: {'n_estimators': 1655, 'learning_rate': 0.02734829818013816, 'num_leaves': 49, 'max_depth': 27, 'subsample': 0.8497905111357599, 'colsample_bytree': 0.5364881443073006, 'reg_alpha': 0.0004489837283865065, 'reg_lambda': 0.08511764209514812, 'min_child_weight': 18}. Best is trial 32 with value: 0.7758649506904582.


  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Early stopping, best iteration is:
[861]	valid_0's l2: 6.38551e+12


Best trial: 32. Best value: 0.775865:  70%|███████   | 35/50 [2:24:08<35:07, 140.53s/it]

[I 2025-06-10 12:07:22,228] Trial 39 finished with value: 0.7719933377946379 and parameters: {'n_estimators': 1436, 'learning_rate': 0.04162617946093205, 'num_leaves': 57, 'max_depth': 26, 'subsample': 0.8524296896526506, 'colsample_bytree': 0.5078228137218741, 'reg_alpha': 0.0006503762012489656, 'reg_lambda': 0.18729065821335142, 'min_child_weight': 19}. Best is trial 32 with value: 0.7758649506904582.


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1874]	valid_0's l2: 8.14476e+12




Training until validation scores don't improve for 50 rounds


Best trial: 32. Best value: 0.775865:  72%|███████▏  | 36/50 [2:27:53<38:37, 165.52s/it]

[I 2025-06-10 12:11:07,060] Trial 15 finished with value: 0.7091757084074103 and parameters: {'n_estimators': 1874, 'learning_rate': 0.0014207088364904163, 'num_leaves': 155, 'max_depth': 26, 'subsample': 0.5125401354369957, 'colsample_bytree': 0.559332541904994, 'reg_alpha': 0.0400043095520283, 'reg_lambda': 0.013727168404130456, 'min_child_weight': 22}. Best is trial 32 with value: 0.7758649506904582.


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[384]	valid_0's l2: 6.46802e+12


Best trial: 32. Best value: 0.775865:  74%|███████▍  | 37/50 [2:31:28<39:00, 180.02s/it]

[I 2025-06-10 12:14:41,303] Trial 38 finished with value: 0.7690471498448759 and parameters: {'n_estimators': 1603, 'learning_rate': 0.041310333239868394, 'num_leaves': 192, 'max_depth': 25, 'subsample': 0.8542866027931505, 'colsample_bytree': 0.5152483119404845, 'reg_alpha': 0.11673686916186878, 'reg_lambda': 0.18700134937223917, 'min_child_weight': 18}. Best is trial 32 with value: 0.7758649506904582.


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Early stopping, best iteration is:
[649]	valid_0's l2: 6.22253e+12


  'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),


[I 2025-06-10 12:15:57,164] Trial 40 finished with value: 0.7778125884898708 and parameters: {'n_estimators': 1600, 'learning_rate': 0.04230796771458062, 'num_leaves': 109, 'max_depth': 26, 'subsample': 0.7744391789491678, 'colsample_bytree': 0.5013855637546327, 'reg_alpha': 0.08261606688965385, 'reg_lambda': 0.2539614097187018, 'min_child_weight': 18}. Best is trial 40 with value: 0.7778125884898708.


  'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha':        trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[383]	valid_0's l2: 6.49981e+12
Early stopping, best iteration is:
[383]	valid_0's l2: 6.47801e+12


Best trial: 40. Best value: 0.777813:  78%|███████▊  | 39/50 [2:36:08<30:22, 165.66s/it]

[I 2025-06-10 12:19:22,089] Trial 42 finished with value: 0.7679120536187543 and parameters: {'n_estimators': 1525, 'learning_rate': 0.04618472688111098, 'num_leaves': 64, 'max_depth': 26, 'subsample': 0.7828745453497874, 'colsample_bytree': 0.5086204849104204, 'reg_alpha': 0.05164314697485174, 'reg_lambda': 0.3923740192791511, 'min_child_weight': 19}. Best is trial 40 with value: 0.7778125884898708.
[I 2025-06-10 12:19:22,093] Trial 44 finished with value: 0.7686901795952471 and parameters: {'n_estimators': 1518, 'learning_rate': 0.04269020052097826, 'num_leaves': 55, 'max_depth': 26, 'subsample': 0.839720866456937, 'colsample_bytree': 0.5068852399088224, 'reg_alpha': 0.0010669883295691583, 'reg_lambda': 0.22624620072026666, 'min_child_weight': 20}. Best is trial 40 with value: 0.7778125884898708.
Early stopping, best iteration is:
[1174]	valid_0's l2: 6.29252e+12




Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds


Best trial: 40. Best value: 0.777813:  82%|████████▏ | 41/50 [2:36:52<14:55, 99.54s/it] 

[I 2025-06-10 12:20:06,278] Trial 41 finished with value: 0.775313784587046 and parameters: {'n_estimators': 1534, 'learning_rate': 0.038668735023195, 'num_leaves': 59, 'max_depth': 26, 'subsample': 0.8430391937585485, 'colsample_bytree': 0.5068329036269448, 'reg_alpha': 0.07671638061460652, 'reg_lambda': 0.15446675394443224, 'min_child_weight': 19}. Best is trial 40 with value: 0.7778125884898708.
Early stopping, best iteration is:
[383]	valid_0's l2: 6.49469e+12




Early stopping, best iteration is:
[514]	valid_0's l2: 6.58308e+12
Early stopping, best iteration is:
[299]	valid_0's l2: 6.50392e+12


Best trial: 40. Best value: 0.777813:  84%|████████▍ | 42/50 [2:38:01<12:14, 91.83s/it]

[I 2025-06-10 12:21:14,646] Trial 45 finished with value: 0.7680948402255096 and parameters: {'n_estimators': 1460, 'learning_rate': 0.04360271931749821, 'num_leaves': 62, 'max_depth': 26, 'subsample': 0.8411135708806825, 'colsample_bytree': 0.533604259979626, 'reg_alpha': 0.05335578264962451, 'reg_lambda': 0.22899010125441294, 'min_child_weight': 19}. Best is trial 40 with value: 0.7778125884898708.


Best trial: 40. Best value: 0.777813:  86%|████████▌ | 43/50 [2:38:12<08:15, 70.75s/it]

[I 2025-06-10 12:21:25,789] Trial 46 finished with value: 0.7677652776373887 and parameters: {'n_estimators': 1664, 'learning_rate': 0.04643376181074901, 'num_leaves': 63, 'max_depth': 26, 'subsample': 0.7740609259097677, 'colsample_bytree': 0.5400830350407335, 'reg_alpha': 0.10404305886690518, 'reg_lambda': 0.32082097133253223, 'min_child_weight': 19}. Best is trial 40 with value: 0.7778125884898708.


Best trial: 40. Best value: 0.777813:  88%|████████▊ | 44/50 [2:38:12<05:09, 51.63s/it]

[I 2025-06-10 12:21:26,189] Trial 43 finished with value: 0.7649385999470593 and parameters: {'n_estimators': 1516, 'learning_rate': 0.040443271028389266, 'num_leaves': 64, 'max_depth': 26, 'subsample': 0.861032207703214, 'colsample_bytree': 0.5199855231827348, 'reg_alpha': 0.06037030778469395, 'reg_lambda': 0.23882269665536499, 'min_child_weight': 18}. Best is trial 40 with value: 0.7778125884898708.
Early stopping, best iteration is:
[408]	valid_0's l2: 6.47265e+12


Best trial: 40. Best value: 0.777813:  90%|█████████ | 45/50 [2:39:52<05:25, 65.18s/it]

[I 2025-06-10 12:23:06,251] Trial 47 finished with value: 0.768881850291976 and parameters: {'n_estimators': 1717, 'learning_rate': 0.04052495267129907, 'num_leaves': 66, 'max_depth': 25, 'subsample': 0.7729802187432796, 'colsample_bytree': 0.5455126072782969, 'reg_alpha': 0.12184133384898485, 'reg_lambda': 0.32375647575629546, 'min_child_weight': 18}. Best is trial 40 with value: 0.7778125884898708.
Early stopping, best iteration is:
[513]	valid_0's l2: 6.43423e+12


Best trial: 40. Best value: 0.777813:  92%|█████████▏| 46/50 [2:40:50<04:11, 62.93s/it]

[I 2025-06-10 12:24:03,557] Trial 48 finished with value: 0.7702537143569905 and parameters: {'n_estimators': 1133, 'learning_rate': 0.04047031988711441, 'num_leaves': 65, 'max_depth': 21, 'subsample': 0.9151620901382381, 'colsample_bytree': 0.5503232541331031, 'reg_alpha': 0.0010137326501190227, 'reg_lambda': 1.567629279771473, 'min_child_weight': 8}. Best is trial 40 with value: 0.7778125884898708.
Early stopping, best iteration is:
[738]	valid_0's l2: 6.29026e+12


Best trial: 40. Best value: 0.777813:  94%|█████████▍| 47/50 [2:41:49<03:05, 61.93s/it]

[I 2025-06-10 12:25:03,044] Trial 49 finished with value: 0.7753941796961696 and parameters: {'n_estimators': 1027, 'learning_rate': 0.05243223180046229, 'num_leaves': 64, 'max_depth': 21, 'subsample': 0.7774079142106206, 'colsample_bytree': 0.5444319687499145, 'reg_alpha': 0.21456603000344204, 'reg_lambda': 0.0014461090075524937, 'min_child_weight': 8}. Best is trial 40 with value: 0.7778125884898708.
Did not meet early stopping. Best iteration is:
[1596]	valid_0's l2: 6.74168e+12




Did not meet early stopping. Best iteration is:
[1647]	valid_0's l2: 6.61988e+12


Best trial: 40. Best value: 0.777813:  96%|█████████▌| 48/50 [2:43:35<02:29, 74.84s/it]

[I 2025-06-10 12:26:49,062] Trial 36 finished with value: 0.7592754069633155 and parameters: {'n_estimators': 1596, 'learning_rate': 0.004842247865373332, 'num_leaves': 107, 'max_depth': 27, 'subsample': 0.8467179252355471, 'colsample_bytree': 0.5115879279731358, 'reg_alpha': 0.10240305080763626, 'reg_lambda': 0.1371609257714269, 'min_child_weight': 19}. Best is trial 40 with value: 0.7778125884898708.


Best trial: 40. Best value: 0.777813:  98%|█████████▊| 49/50 [2:43:42<00:54, 54.77s/it]

[I 2025-06-10 12:26:55,844] Trial 37 finished with value: 0.7636245025078996 and parameters: {'n_estimators': 1647, 'learning_rate': 0.0048377350959822206, 'num_leaves': 128, 'max_depth': 27, 'subsample': 0.832995490079824, 'colsample_bytree': 0.5030219368974044, 'reg_alpha': 0.3134846652717163, 'reg_lambda': 0.07327730901442155, 'min_child_weight': 18}. Best is trial 40 with value: 0.7778125884898708.
Did not meet early stopping. Best iteration is:
[1670]	valid_0's l2: 6.7134e+12


Best trial: 40. Best value: 0.777813: 100%|██████████| 50/50 [2:44:39<00:00, 197.59s/it]

[I 2025-06-10 12:27:52,699] Trial 35 finished with value: 0.7602852339333606 and parameters: {'n_estimators': 1670, 'learning_rate': 0.004113125424886741, 'num_leaves': 192, 'max_depth': 27, 'subsample': 0.8744357434867911, 'colsample_bytree': 0.515274665210104, 'reg_alpha': 7.6015414896103835, 'reg_lambda': 0.10782316486854308, 'min_child_weight': 19}. Best is trial 40 with value: 0.7778125884898708.
Best R²: 0.7778125884898708
Best params: {'n_estimators': 1600, 'learning_rate': 0.04230796771458062, 'num_leaves': 109, 'max_depth': 26, 'subsample': 0.7744391789491678, 'colsample_bytree': 0.5013855637546327, 'reg_alpha': 0.08261606688965385, 'reg_lambda': 0.2539614097187018, 'min_child_weight': 18}





In [59]:
print("Best R²:", study.best_value)
print("Best params:", study.best_params)

Best R²: 0.7778125884898708
Best params: {'n_estimators': 1600, 'learning_rate': 0.04230796771458062, 'num_leaves': 109, 'max_depth': 26, 'subsample': 0.7744391789491678, 'colsample_bytree': 0.5013855637546327, 'reg_alpha': 0.08261606688965385, 'reg_lambda': 0.2539614097187018, 'min_child_weight': 18}


In [60]:
final_model = LGBMRegressor(**study.best_params, random_state=RANDOM_STATE)

In [65]:
import joblib

final_model.fit(X_train_proc, y_train)

joblib.dump(final_model, 'best_lightgbm_model.pkl')
joblib.dump(preprocessor, 'lightgbm_preprocessor.pkl')
print("Models saved to best_lightgbm_model.pkl and lightgbm_preprocessor.pkl")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.508726 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6396
[LightGBM] [Info] Number of data points in the train set: 483236, number of used features: 2692
[LightGBM] [Info] Start training from score 2807543.358738
Models saved to best_lightgbm_model.pkl and lightgbm_preprocessor.pkl


In [66]:
y_pred = final_model.predict(X_valid_proc)



In [64]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print(f"Test MAE: {mean_absolute_error(y_valid, y_pred):.4f}")
print(f"Test MSE: {mean_squared_error(y_valid, y_pred):.4f}")
print(f"Test R^2: {r2_score(y_valid, y_pred):.4f}")

Test MAE: 283821.9014
Test MSE: 6138210709233.8760
Test R^2: 0.7808


### Выводы

$R^2 = 0.7808$

In [None]:
param = {
    'n_estimators': 1600,
    'learning_rate': 0.04230796771458062, 
    'num_leaves': 109, 
    'max_depth': 26, 
    'subsample': 0.7744391789491678, 
    'colsample_bytree': 0.5013855637546327, 
    'reg_alpha': 0.08261606688965385, 
    'reg_lambda': 0.2539614097187018, 
    'min_child_weight': 18
}