## Загрузим данные

In [1]:
# Install gdown into the kernel's environment; it is used below to fetch the dataset by Google Drive file id.
%pip install gdown



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns

In [3]:
# Single seed shared by the RNG seeding and the train/test split further down.
RANDOM_STATE = 42

In [4]:
# CARS_FILE_ID = '1zl7HAtBCxTFYkaj871a7BkT9X3CVSpME'
CARS_FILE_ID = '1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI'

random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

!gdown --id {CARS_FILE_ID}

Downloading...
From (original): https://drive.google.com/uc?id=1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI
From (redirected): https://drive.google.com/uc?id=1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI&confirm=t&uuid=3acdf8b9-62fb-4320-b355-4cf08b1e3236
To: /content/dataset.csv
100% 1.01G/1.01G [00:10<00:00, 94.4MB/s]


In [5]:
# pandas is already imported in the notebook's import cell, so it is not re-imported here.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, avoiding mixed-type columns.
df = pd.read_csv('dataset.csv', low_memory=False)
print(df.shape)

(604047, 24)


In [6]:
# Exclude the record with index label 210905.
# NOTE(review): the reason this row is excluded is not documented in the notebook; confirm
# (presumably a corrupt or outlier record).
# Reassigning with errors='ignore' instead of dropping in place keeps the cell re-runnable:
# a second execution no longer raises KeyError for the already-removed label.
df = df.drop(index=210905, errors='ignore')

In [7]:
import importlib
import app.missing_imputer
import app.custom_preprocessor

# Remove the `horse_power` column from the working frame.
# NOTE(review): the notebook does not say why it is dropped; presumably it duplicates `engine_power` (confirm).
# errors='ignore' plus reassignment makes the cell safe to re-run after the column is already gone.
df = df.drop(columns='horse_power', errors='ignore')

# Reload the project modules so that edits to their source are picked up without restarting the kernel.
importlib.reload(app.missing_imputer)
importlib.reload(app.custom_preprocessor)

In [8]:
from app.missing_imputer import MissingValueImputer
from app.custom_preprocessor import CustomPreprocessor

In [9]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [11]:
# Replace `price` with its log1p transform, which becomes the regression target.
# Predictions must be mapped back to prices with np.expm1, the inverse of log1p.
# The guard makes the cell re-runnable: once `price` has been replaced there is nothing left to do.
if 'price' in df.columns:
    df = df.assign(log_price=np.log1p(df['price'])).drop(columns=['price'])

In [12]:
# Target is the log-transformed price; the features are every remaining column.
y = df['log_price']
X = df.drop(columns=['log_price'])

In [13]:
# Maps a column name to its kind ('numeric' or 'categorical'); passed to MissingValueImputer below.
# NOTE(review): 'horse_power' is listed here although that column is dropped from `df` earlier in
# the notebook — confirm that MissingValueImputer tolerates entries for absent columns.
feature_types = {
    'production_year': 'numeric',
    'mileage': 'numeric',
    'condition': 'categorical',
    'owners_number': 'numeric',
    'horse_power': 'numeric',
    'region': 'categorical',
    'seller_type': 'categorical',
    'brand': 'categorical',
    'model': 'categorical',
    'body_type': 'categorical',
    'doors_count': 'numeric',
    'seats': 'numeric',
    'engine_displacement': 'numeric',
    'engine_power': 'numeric',
    'fuel_rate': 'numeric',
    'steering_wheel': 'categorical',
    'auto_class': 'categorical',
}

# Column names grouped by dtype: integer/float columns and object (string) columns.
# NOTE(review): neither list is referenced by the cells visible in this notebook.
num_features = list(X.select_dtypes(include=['int64', 'float64', 'int8']).columns)
cat_features = list(X.select_dtypes(include=['object']).columns)

# Project-specific pipeline steps; the imputer is configured with the column-kind mapping.
imputer = MissingValueImputer(feature_types)
preprocessor = CustomPreprocessor()

# Standardise numeric columns, integer-encode object columns (categories unseen at fit time
# are encoded as -1) and pass every other column through unchanged.
feature_transform = ColumnTransformer(
    transformers=[
        ('scale', StandardScaler(), selector(dtype_include=np.number)),
        (
            'encode',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            selector(dtype_include=object),
        ),
    ],
    remainder='passthrough',
)


# Keyword arguments for CatBoostRegressor.
# NOTE(review): the non-round values look like the result of a hyper-parameter search — confirm their origin.
param = {
    'iterations': 1003,
    'learning_rate': 0.09702603811009403,
    'depth': 10,
    'l2_leaf_reg': 0.11642963038174498,
    'loss_function': 'RMSE',
    'border_count': 62,
    # Reuse the notebook-wide seed instead of repeating the literal.
    'random_seed': RANDOM_STATE,
    'verbose': False,
    # GPU training on device 0; these three settings have to be changed to train on a CPU-only machine.
    'gpu_ram_part': 0.8,
    'task_type': 'GPU',
    'devices': '0'
}

# End-to-end pipeline: imputation -> custom preprocessing -> scaling/encoding -> CatBoost.
# `param` is the dictionary of CatBoostRegressor keyword arguments defined earlier in this cell.
cat_pipeline = Pipeline(
    steps=[
        ('impute', imputer),
        ('preproc', preprocessor),
        ('feature_transform', feature_transform),
        ('model', CatBoostRegressor(**param)),
    ]
)

In [14]:
# Hold out 20 % of the rows for evaluation; the split is seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [15]:
# Fit every pipeline step (imputation, preprocessing, feature transform, CatBoost) on the training split only.
# NOTE(review): the recorded output of this cell is a long run of repeated warning lines
# (`full_df[name] = s`, `.fillna(True)`), apparently raised inside the custom steps — worth fixing at the source.
cat_pipeline.fit(X_train, y_train)

  X['pts_original'] = X['pts_original'].fillna(True)
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_

In [16]:
import joblib
# Persist the whole fitted pipeline (preprocessing steps plus model) with compression level 3.
# NOTE(review): the file name says "lgbm" although the pipeline's final step is a CatBoostRegressor;
# it is left unchanged here because other code may load the model by this path.
joblib.dump(cat_pipeline, 'lgbm_model_with_preproc.pkl', compress=3)

['lgbm_model_with_preproc.pkl']

In [17]:
# Predictions for the held-out rows, on the same log1p(price) scale as the training target.
# NOTE(review): the `_lgbm` suffix is a misnomer — `cat_pipeline` wraps a CatBoostRegressor.
y_pred_lgbm = cat_pipeline.predict(X_test)

  X['pts_original'] = X['pts_original'].fillna(True)
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_df[name] = s
  full_

In [18]:
# Quality on the log1p(price) scale the model was trained on: MSE, MAE and R^2, in that order.
mse_lgbm, mae_lgbm, r2_lgbm = (
    metric(y_test, y_pred_lgbm)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [19]:
# Map the log-scale target and predictions back to prices.
# The target was built with np.log1p, so the exact inverse is np.expm1;
# np.exp would return price + 1 for every row.
y_test_rub = np.expm1(y_test)
y_pred_lgbm_rub = np.expm1(y_pred_lgbm)

In [20]:
def business_mae(y_true_log, y_pred_log):
    """Mean absolute error expressed in roubles.

    Parameters
    ----------
    y_true_log, y_pred_log : array-like
        True and predicted prices on the ``log1p`` scale. The two inputs are
        compared element by element in positional order (the same convention
        as the scikit-learn metrics), regardless of any pandas index.

    Returns
    -------
    float
        Mean of ``|true_price - predicted_price|`` after back-transformation.
    """
    # expm1 is the exact inverse of the log1p transform applied to the target.
    # Converting to plain arrays prevents pandas index alignment, which would
    # turn the difference into NaN when the two inputs carry different indexes.
    y_true_rub = np.expm1(np.asarray(y_true_log, dtype=float))
    y_pred_rub = np.expm1(np.asarray(y_pred_log, dtype=float))
    return np.mean(np.abs(y_true_rub - y_pred_rub))

def acceptable_rate(y_true_log, y_pred_log, threshold=0.1):
    """Percentage of predictions whose relative price error is within ``threshold``.

    Parameters
    ----------
    y_true_log, y_pred_log : array-like
        True and predicted prices on the ``log1p`` scale, compared in
        positional order.
    threshold : float, default 0.1
        Maximum accepted relative error as a fraction (0.1 means +/-10 %).

    Returns
    -------
    float
        Share of rows with ``|true - pred| / |true| <= threshold``, in percent
        (0-100). Rows whose true price is 0 have an undefined or infinite
        relative error and are counted as not acceptable.
    """
    # expm1 is the exact inverse of the log1p target transform. With np.exp
    # both prices are shifted by +1, which inflates the denominator and
    # understates the relative error.
    y_true_rub = np.expm1(np.asarray(y_true_log, dtype=float))
    y_pred_rub = np.expm1(np.asarray(y_pred_log, dtype=float))
    # A true price of 0 yields inf/nan here; both compare False against the threshold.
    with np.errstate(divide='ignore', invalid='ignore'):
        rel_error = np.abs(y_true_rub - y_pred_rub) / np.abs(y_true_rub)
    return np.mean(rel_error <= threshold) * 100


In [21]:
# Business metrics; both helpers take log-scale inputs and convert to prices internally.
business_mae_lgbm = business_mae(y_test, y_pred_lgbm)
acc_rate_lgbm = acceptable_rate(y_test, y_pred_lgbm)

# MSE, MAE and R^2 (in that order) on the back-transformed prices.
mse_lgbm_rub, mae_lgbm_rub, r2_lgbm_rub = (
    metric(y_test_rub, y_pred_lgbm_rub)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [22]:
# Log-scale metrics. The heading names the model that was actually trained (CatBoost);
# the previous "LightGBM:" heading did not match the pipeline.
print("CatBoost:")
print(f"  MSE:  {mse_lgbm:.2f}")
print(f"  MAE:  {mae_lgbm:.2f}")
print(f"  R^2:  {r2_lgbm:.2f}")

LightGBM:
  MSE:  0.03
  MAE:  0.09
  R^2:  0.98


In [23]:
# Price-scale metrics. The heading names the model that was actually trained (CatBoost);
# the previous "LightGBM:" heading did not match the pipeline.
# ACCEPTABLE RATE is a percentage (0-100).
print("CatBoost:")
print(f"  BUSINESS MAE:     {business_mae_lgbm:.2f}")
print(f"  ACCEPTABLE RATE:  {acc_rate_lgbm:.2f}")
print(f"  MSE RUB:          {mse_lgbm_rub:.2f}")
print(f"  MAE RUB:          {mae_lgbm_rub:.2f}")
print(f"  R^2 RUB:          {r2_lgbm_rub:.2f}")

LightGBM:
  BUSINESS MAE:     234203.29
  ACCEPTABLE RATE:  70.21
  MSE RUB:          6752939513936.13
  MAE RUB:          234203.29
  R^2 RUB:          0.76
