## Загрузим данные

In [62]:
# Install gdown, the command-line tool used below to download the dataset from Google Drive.
%pip install gdown



In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns

In [64]:
# Single seed shared by the RNG seeding, the train/test split and the model.
RANDOM_STATE = 42

In [65]:
# CARS_FILE_ID = '1zl7HAtBCxTFYkaj871a7BkT9X3CVSpME'
CARS_FILE_ID = '1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI'

random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

!gdown --id {CARS_FILE_ID}

Downloading...
From (original): https://drive.google.com/uc?id=1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI
From (redirected): https://drive.google.com/uc?id=1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI&confirm=t&uuid=b760210f-bd7e-4c72-8384-fc475705f885
To: /content/dataset.csv
100% 1.01G/1.01G [00:07<00:00, 134MB/s]


In [66]:
import pandas as pd

# Load the downloaded CSV into memory. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk, which avoids
# mixed-type columns at the cost of higher peak memory.
df = pd.read_csv('dataset.csv', low_memory=False)
# Quick sanity check of the loaded size as (rows, columns).
print(df.shape)

(604047, 24)


In [67]:
# Remove the row with index label 210905.
# NOTE(review): the reason this particular row is excluded is not documented here — confirm.
# Rebinding with errors='ignore' keeps the cell safe to re-run: the original
# in-place drop raised KeyError on a second execution because the label was gone.
df = df.drop(index=210905, errors='ignore')

In [68]:
# Remove the 'horse_power' column from the frame.
# errors='ignore' makes the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
df = df.drop(columns='horse_power', errors='ignore')

In [69]:
import importlib
import app.missing_imputer
import app.custom_preprocessor

# Reload the project modules so edits made to them on disk are picked up
# without restarting the kernel.
importlib.reload(app.missing_imputer)
importlib.reload(app.custom_preprocessor)

# Import the classes only after the reload so these names bind to the
# freshly reloaded definitions.
from app.missing_imputer import MissingValueImputer
from app.custom_preprocessor import CustomPreprocessor


In [70]:
!pip install lightgbm --install-option=--gpu --install-option="--opencl-include-dir=/usr/local/cuda/include/" --install-option="--opencl-library=/usr/local/cuda/lib64/libOpenCL.so"


Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: --install-option


In [71]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [72]:
# Replace the raw price with log1p(price), which becomes the regression target.
# The guard makes the cell idempotent: once 'price' has been removed, re-running
# is a no-op instead of a KeyError on df['price'].
if 'price' in df.columns:
    df['log_price'] = np.log1p(df['price'])
    df = df.drop(columns=['price'])

In [73]:
# The target is the log-transformed price; every other column is a model input.
y = df['log_price']
X = df.drop(columns=['log_price'])

In [74]:
# Column name -> kind ('numeric' / 'categorical') handed to MissingValueImputer.
# NOTE(review): how the imputer uses this mapping is defined in app.missing_imputer — confirm there.
# NOTE(review): 'horse_power' is listed here although that column is dropped from
# df in an earlier cell — verify the imputer tolerates a missing column.
feature_types = {
    'production_year': 'numeric',
    'mileage': 'numeric',
    'condition': 'categorical',
    'owners_number': 'numeric',
    'horse_power': 'numeric',
    'region': 'categorical',
    'seller_type': 'categorical',
    'brand': 'categorical',
    'model': 'categorical',
    'body_type': 'categorical',
    'doors_count': 'numeric',
    'seats': 'numeric',
    'engine_displacement': 'numeric',
    'engine_power': 'numeric',
    'fuel_rate': 'numeric',
    'steering_wheel': 'categorical',
    'auto_class': 'categorical',
}

# Column lists by dtype, taken from the raw feature frame.
# NOTE(review): neither list is referenced by the pipeline built below.
num_features = X.select_dtypes(include=['int64', 'float64', 'int8']).columns.tolist()
cat_features = X.select_dtypes(include=['object']).columns.tolist()

# Project-specific steps (see app.missing_imputer / app.custom_preprocessor).
imputer = MissingValueImputer(feature_types)
preprocessor = CustomPreprocessor()
# Standardize numeric columns and ordinal-encode object columns; categories not
# seen during fit are encoded as -1. Columns matching neither selector are
# passed through unchanged.
feature_transform = ColumnTransformer([
    ('scale', StandardScaler(), selector(dtype_include=np.number)),
    ('encode', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), selector(dtype_include=object))
], remainder='passthrough')

# Hyperparameters for LGBMRegressor.
# NOTE(review): the values look like the output of a hyperparameter search — source not shown here.
# NOTE(review): per the LightGBM docs, row subsampling only takes effect when the
# bagging frequency (subsample_freq) is non-zero; it is not set here, so
# 'subsample' presumably has no effect — confirm.
# NOTE(review): 'device': 'GPU' needs a GPU-enabled LightGBM build in the runtime.
param = {
    'n_estimators': 1828,
    'learning_rate': 0.05930396196813868,
    'num_leaves': 231,
    'max_depth': 20,
    'subsample': 0.5848815693903193,
    'colsample_bytree': 0.7082366853043143,
    'reg_alpha': 0.00027775655371692815,
    'reg_lambda': 3.7302584109152244e-06,
    'min_child_weight': 1,
    'device':'GPU',
    'random_seed':RANDOM_STATE,
    'verbose':-1
}


# End-to-end pipeline: impute -> custom preprocessing -> scale/encode -> LightGBM.
# `param` is the LGBMRegressor parameter dict defined above.
lgbm_pipeline = Pipeline([
    ('impute', imputer),
    ('preproc', preprocessor),
    ('feature_transform', feature_transform),
    ('model', LGBMRegressor(**param))
])

In [75]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE, test_size=0.2)

In [76]:
# Fit every pipeline step (imputation, preprocessing, feature transform, model)
# on the training split only.
lgbm_pipeline.fit(X_train, y_train)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['pts_original'].fillna(True, inplace=True)
  X['pts_original'].fillna(True, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['accidents_resolution'].fillna('OK', inplace=True)
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_

In [77]:
import joblib
# Persist the fitted pipeline (preprocessing steps + model) as one artifact.
# compress=3 is passed to joblib to write a compressed file.
joblib.dump(lgbm_pipeline, 'lgbm_model_with_preproc.pkl', compress=3)

['lgbm_model_with_preproc.pkl']

In [None]:
# Predict on the held-out test split; predictions are on the same log1p(price)
# scale as the training target.
y_pred_lgbm = lgbm_pipeline.predict(X_test)

In [79]:
# Regression metrics on the log1p(price) scale, in the order MSE, MAE, R^2.
mse_lgbm, mae_lgbm, r2_lgbm = (
    metric(y_test, y_pred_lgbm)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [80]:
# Convert targets and predictions back to prices.
# The target was built with np.log1p, so its exact inverse is np.expm1;
# np.exp would return price + 1 for every row.
y_test_rub = np.expm1(y_test)
y_pred_lgbm_rub = np.expm1(y_pred_lgbm)

In [81]:
def business_mae(y_true_log, y_pred_log):
    """Mean absolute error expressed in price units (rubles).

    Args:
        y_true_log: True targets on the ``log1p(price)`` scale (array-like).
        y_pred_log: Predicted targets on the ``log1p(price)`` scale (array-like).

    Returns:
        Mean of ``|price_true - price_pred|`` after inverting the log transform.
    """
    # Coerce to plain arrays so the subtraction is positional; two pandas Series
    # with different indexes would otherwise be aligned by label.
    # expm1 is the exact inverse of the log1p transform applied to the target.
    y_true_rub = np.expm1(np.asarray(y_true_log, dtype=float))
    y_pred_rub = np.expm1(np.asarray(y_pred_log, dtype=float))
    return np.mean(np.abs(y_true_rub - y_pred_rub))

def acceptable_rate(y_true_log, y_pred_log, threshold=0.1):
    """Percentage of predictions whose relative price error is within ``threshold``.

    Args:
        y_true_log: True targets on the ``log1p(price)`` scale (array-like).
        y_pred_log: Predicted targets on the ``log1p(price)`` scale (array-like).
        threshold: Maximum allowed relative error as a fraction
            (``0.1`` means within ±10% of the true price).

    Returns:
        Share of predictions with ``|true - pred| / true <= threshold``,
        expressed in percent (0-100).
    """
    # expm1 is the exact inverse of the log1p transform applied to the target.
    # np.exp would yield price + 1, inflating the denominator and understating
    # the relative error (noticeably so for small prices).
    y_true_rub = np.expm1(np.asarray(y_true_log, dtype=float))
    y_pred_rub = np.expm1(np.asarray(y_pred_log, dtype=float))
    rel_error = np.abs((y_true_rub - y_pred_rub) / y_true_rub)
    return np.mean(rel_error <= threshold) * 100


In [82]:
# Business-facing metrics; both helpers take log-scale inputs.
business_mae_lgbm, acc_rate_lgbm = (
    metric(y_test, y_pred_lgbm) for metric in (business_mae, acceptable_rate)
)

# Standard regression metrics on the ruble-scale values, in the order MSE, MAE, R^2.
mse_lgbm_rub, mae_lgbm_rub, r2_lgbm_rub = (
    metric(y_test_rub, y_pred_lgbm_rub)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [83]:
# Report the log-scale metrics, one per line.
print(
    "LightGBM:",
    f"  MSE:  {mse_lgbm:.2f}",
    f"  MAE:  {mae_lgbm:.2f}",
    f"  R^2:  {r2_lgbm:.2f}",
    sep="\n",
)

LightGBM:
  MSE:  0.02
  MAE:  0.08
  R^2:  0.98


In [84]:
# Report the ruble-scale and business metrics, one per line.
print(
    "LightGBM:",
    f"  BUSINESS MAE:     {business_mae_lgbm:.2f}",
    f"  ACCEPTABLE RATE:  {acc_rate_lgbm:.2f}",
    f"  MSE RUB:          {mse_lgbm_rub:.2f}",
    f"  MAE RUB:          {mae_lgbm_rub:.2f}",
    f"  R^2 RUB:          {r2_lgbm_rub:.2f}",
    sep="\n",
)

LightGBM:
  BUSINESS MAE:     183946.67
  ACCEPTABLE RATE:  77.01
  MSE RUB:          4850397708044.72
  MAE RUB:          183946.67
  R^2 RUB:          0.83


In [85]:
import joblib
# Save the fitted pipeline to 'lgbm_model_with_preproc.pkl'.
# NOTE(review): an earlier cell already dumps the same object to the same path
# right after fitting; this call overwrites that file with identical content.
joblib.dump(lgbm_pipeline, 'lgbm_model_with_preproc.pkl', compress=3)

['lgbm_model_with_preproc.pkl']