# Загружаем данные

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns

In [None]:
%pip install gdown

In [6]:
CARS_FILE_ID = '1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI'
RANDOM_STATE = 42

random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# !gdown --id {CARS_FILE_ID}

In [7]:
df = pd.read_csv('dataset.csv')
print(df.shape)

(604047, 24)


In [8]:
df.sample(10)

Unnamed: 0,production_year,mileage,condition,owners_number,pts_original,horse_power,accidents_resolution,region,seller_type,brand,...,engine_displacement,engine_power,fuel_rate,steering_wheel,price,price_segment,tags,auto_class,equipment,complectation_available_options
436946,2024,0,CONDITION_OK,0,True,150.0,,Санкт-Петербург,COMMERCIAL,Skoda,...,1395.0,150.0,7.2,LEFT,4600000,MEDIUM,all-terrain;big-trunk;dealer_quarantine_calls;...,C,tyre-pressure;start-stop-function;roof-rails;w...,cruise-control;multi-wheel;auto-park;heated-wa...
329728,2018,220000,CONDITION_OK,0,,,OK,Минск,PRIVATE,Mercedes-Benz,...,1950.0,194.0,4.6,LEFT,2500000,PREMIUM,allowed_for_credit;autoru_exclusive;available_...,E,electro-trunk;voice-recognition;android-auto;g...,
343496,2006,326000,CONDITION_OK,4,,224.0,OK,Санкт-Петербург,PRIVATE,Mercedes-Benz,...,2987.0,224.0,9.3,LEFT,999999,PREMIUM,all-terrain;allowed_for_credit;auction_call_fr...,E,wheel-power;lock;light-sensor;seats-5;electro-...,
439783,2008,164233,CONDITION_OK,4,,102.0,ERROR,Тверь,PRIVATE,Skoda,...,1595.0,102.0,7.1,LEFT,650000,MEDIUM,allowed_for_credit;auction_call_free_report;au...,C,seats-5,
550845,2023,107,CONDITION_OK,1,True,89.0,OK,Краснодар,PRIVATE,Lada (ВАЗ),...,1596.0,90.0,7.5,LEFT,1800000,ECONOMY,allowed_for_credit;almost_new;auction_call_fre...,C,audiosystem-cd;front-seats-heat;driver-seat-up...,cruise-control;multi-wheel;airbag-passenger;ro...
431807,2015,44450,CONDITION_OK,1,True,102.0,ERROR,Екатеринбург,PRIVATE,Renault,...,1598.0,102.0,8.3,LEFT,1044000,ECONOMY,affordable;allowed_for_credit;auction_call_fre...,B,cruise-control;wheel-power;wheel-configuration...,cruise-control;airbag-passenger;lock;seat-tran...
167063,2024,0,CONDITION_OK,0,True,304.0,,Химки,COMMERCIAL,Genesis,...,2497.0,304.0,12.2,LEFT,15840000,PREMIUM,big;dealer_quarantine_calls;discount_options;f...,E,projection-display;audiosystem-cd;airbag-side;...,e-adjustment-wheel;multi-wheel;airbag-passenge...
391522,2021,43000,CONDITION_OK,0,,204.0,OK,Владивосток,COMMERCIAL,Nissan,...,1497.0,204.0,7.3,LEFT,2843000,MEDIUM,allowed_for_credit;autoru_exclusive;big-trunk;...,D,seats-5,
210052,2024,100,CONDITION_OK,1,True,115.0,OK,Новосибирск,PRIVATE,Hyundai,...,1497.0,115.0,5.3,LEFT,2530000,MEDIUM,allowed_for_credit;almost_new;auction_call_fre...,C,seats-5,
46816,2024,34,CONDITION_OK,1,True,286.0,OK,Москва,COMMERCIAL,BMW,...,2993.0,286.0,8.2,LEFT,16150000,PREMIUM,allowed_for_credit;almost_new;auction_call_fre...,E,seats-5,


# Делаем предобработку

In [9]:
df_with_na_column = df[df['engine_displacement'].isna()]
df_with_na_column

Unnamed: 0,production_year,mileage,condition,owners_number,pts_original,horse_power,accidents_resolution,region,seller_type,brand,...,engine_displacement,engine_power,fuel_rate,steering_wheel,price,price_segment,tags,auto_class,equipment,complectation_available_options
210905,2003,175000,CONDITION_OK,0,True,,,Чебаркуль,PRIVATE,Hyundai,...,,,,LEFT,380000,MEDIUM,available_for_checkup;pts_original;real_photo;...,,seats-5,


In [10]:
df.drop(210905, inplace=True)

In [None]:
df['fuel_rate'].fillna(df['fuel_rate'].median(), inplace=True)

In [None]:
df['pts_original'].fillna(True, inplace=True)
df['accidents_resolution'].fillna('OK', inplace=True)
df['auto_class'].fillna('NOT SPECIFIED', inplace=True)

In [13]:
columns = ['production_year', 'mileage', 'condition', 'owners_number', 'pts_original', 'accidents_resolution', 'region', 'seller_type', 'brand', 'model', 'body_type', 'doors_count', 'seats', 'engine_displacement', 'engine_power', 'fuel_rate', 'steering_wheel', 'price', 'price_segment', 'auto_class']
features = ['production_year', 'mileage', 'condition', 'owners_number', 'pts_original', 'accidents_resolution', 'region', 'seller_type', 'brand', 'model', 'body_type', 'doors_count', 'seats', 'engine_displacement', 'engine_power', 'fuel_rate', 'steering_wheel', 'price_segment', 'auto_class']

In [14]:
df[columns].isnull().sum()

production_year         0
mileage                 0
condition               0
owners_number           0
pts_original            0
accidents_resolution    0
region                  0
seller_type             0
brand                   0
model                   0
body_type               0
doors_count             0
seats                   0
engine_displacement     0
engine_power            0
fuel_rate               0
steering_wheel          0
price                   0
price_segment           0
auto_class              0
dtype: int64

In [15]:
df.drop('horse_power', axis=1, inplace=True)

In [16]:
df.drop(columns=["tags", "complectation_available_options", "equipment"], inplace=True)

In [17]:
import numpy as np

def convert_seats(seats_str):
    # Разбиваем строку по разделителю ";" и преобразуем каждую часть в целое число
    try:
        numbers = list(map(int, seats_str.split(';')))
    except Exception as e:
        # Если не удалось преобразовать, возвращаем NaN
        return np.nan
    # Возвращаем среднее значение (можно выбрать max, min или другое агрегирующее значение)
    return np.max(numbers)

# Применяем функцию к столбцу и создаём новый числовой признак
df["seats_numeric"] = df["seats"].apply(convert_seats)


In [18]:
df.drop(columns=["seats"], inplace=True)

In [19]:
categorical_features = ['condition',
 'accidents_resolution',
 'region',
 'seller_type',
 'brand',
 'model',
 'body_type',
 'steering_wheel',
 'price_segment',
 'auto_class']

In [None]:
!pip install category-encoders

In [21]:
for col in df.select_dtypes(include=['int64']):
    df[col] = df[col].astype('int32')

for col in df.select_dtypes(include=['float64']):
    df[col] = df[col].astype('float32')

for col in categorical_features:
    df[col] = df[col].astype('category')

In [22]:
y = df["price"]
X = df.drop(columns=["price"])

In [23]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
auto_ohe_features = [col for col in categorical_features if X[col].nunique() <= 30]
auto_cb_features  = [col for col in categorical_features if X[col].nunique() > 30]
auto_te_features = [col for col in categorical_features if X[col].nunique() > 30]

low_cardinality_features = [col for col in categorical_features if X[col].nunique() <= 30]
high_cardinality_features = [col for col in categorical_features if X[col].nunique() > 30]

In [24]:
len(df["model"].value_counts())

2575

# Делим данные на обучающую и тестовую выборки

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
pd.concat([X_train, y_train], axis=1).to_csv("train_data.csv", index=False)


# Начинаем обучать модели

#### Предобработка данных:
   - OneHot для категориальных
   - StandardScaler() для числовых

## SGD Regression

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import TruncatedSVD

In [None]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [None]:
full_pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

full_pipeline.fit(X_train)

In [None]:
X_transformed = full_pipeline.transform(X_train)

In [27]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import pandas as pd

model = SGDRegressor(max_iter=3000, tol=1e-3)
model.fit(X_transformed, y_train)

In [None]:
y_pred = model.predict(full_pipeline.transform(X_test))

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"R²: {r2_score(y_test, y_pred):.2f}")

MAE: 1088447.63 руб.
R²: 0.49


## Lasso Regression (L1-регуляризация)

In [29]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import Lasso

Было принято решение кодировать высококардинальные признаки с помощью `OrdinalEncoder` вместо `OneHotEncoder`, чтобы уменьшить размерность данных и сэкономить оперативную память

In [30]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), low_cardinality_features),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), high_cardinality_features)
])

In [31]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', Lasso(random_state=RANDOM_STATE, max_iter=10000, tol=1e-4))
])

In [32]:
param_grid = {
    'model__alpha': np.logspace(-3, 1, num=8)
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, scoring='neg_mean_absolute_error', verbose=1)

In [33]:
grid_search.fit(X_train, y_train)
print("Лучшие параметры: ", grid_search.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Лучшие параметры:  {'model__alpha': 10.0}


  model = cd_fast.enet_coordinate_descent(


In [34]:
import pickle

with open('model_l1_regression.pkl', 'wb') as f:
    pickle.dump(grid_search, f)

In [35]:
y_pred = grid_search.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f} руб.")
print(f"R²: {r2_score(y_test, y_pred):.2f}")

MAE: 1521742.21 руб.
R²: 0.25


**Вывод**: \
Метрики довольно плачевные даже для baseline. Данная модель с текущей предобработкой данных явно не справляется с нашей задачей. Скорее всего, нужно улучшить качественную проработку признаков, возможно, попробовать другие кодировщики или попробовать более мощные модели для повышения точности прогнозирования.

## Decision Tree Regression

In [59]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

In [60]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [61]:
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=RANDOM_STATE))
])

In [62]:
param_grid = {
    'regressor__max_depth': [None, 5, 10, 20],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=1)

In [63]:
grid_search.fit(X_train, y_train)
print("Лучшие параметры:", grid_search.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Лучшие параметры: {'regressor__max_depth': None, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2}


In [64]:
y_pred = grid_search.predict(X_test)

In [65]:
from sklearn.metrics import mean_absolute_error, r2_score

print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f} руб.")
print(f"R²: {r2_score(y_test, y_pred):.2f}")

MAE: 616943.20 руб.
R²: 0.48


In [66]:
import pickle

with open('model_tree_1_version.pkl', 'wb') as f:
    pickle.dump(grid_search, f)

Как мы видим r2_score особо не улучшился, поэтому попробуем немного поменять гиперпараметры:

In [32]:
param_grid_2 = {
    'regressor__max_depth': [5, 10, 15],
    'regressor__min_samples_split': [5, 10, 15],
    'regressor__min_samples_leaf': [3, 5, 7],
    'regressor__max_features': ['sqrt', 'log2', None],
    'regressor__ccp_alpha': [0.0, 0.001, 0.01, 0.1]
}

grid_search_2 = GridSearchCV(model_pipeline, param_grid_2, cv=5, scoring='r2', n_jobs=-1, verbose=1)

In [None]:
grid_search_2.fit(X_train, y_train)
print("Лучшие параметры:", grid_search_2.best_params_)

In [None]:
import pickle

with open('model_tree_2_version.pkl', 'wb') as f:
    pickle.dump(grid_search_2, f)

In [35]:
y_pred = grid_search_2.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f} руб.")
print(f"R²: {r2_score(y_test, y_pred):.2f}")

MAE: 970054.78 руб.
R²: 0.57


**Выводы**: \
Мы смогли добиться некоторого прироста качества предсказаний по сравнению с базовой моделью.
Это подтверждает, что гиперпараметрическая настройка повысила качество, но остаётся значительная доля дисперсии, которую модель не объясняет. \
Я думаю, что это неплохой резльтат для `baseline`, который можно будет еще улучшить с помощью дополнительного фичер инжиниринга или перехода на более мощные модели.