In [1]:
import torch.multiprocessing as mp
mp.set_start_method('spawn', force=True)

# Бейзлайн для предсказателя цены автомобиля

**В этом ноутбуке мы последовательно:**
- Установим и импортируем нужные библиотеки
- Очистим данные и добавим новые информативные признаки
- Подготовим выборки, закодируем и масштабируем признаки
- Построим бейзлайн

## 1. Установка и импорт

In [2]:
import pandas as pd, numpy as np, math, joblib, gc, warnings, os, random, json, pickle, time
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm, trange
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from typing import Tuple, Dict, Any

warnings.filterwarnings("ignore")

# Импортируем библиотеки, фиксируем сиды и выбираем устройство.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tqdm.pandas()

 ## 2. Загрузка данных

In [3]:
CARS_FILE_ID = '1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI'
RANDOM_STATE = 42

random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

!gdown --id {CARS_FILE_ID}

Downloading...
From (original): https://drive.google.com/uc?id=1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI
From (redirected): https://drive.google.com/uc?id=1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI&confirm=t&uuid=7237e4ee-2070-4423-8c55-d0566eac0d22
To: /kaggle/working/dataset.csv
100%|███████████████████████████████████████| 1.01G/1.01G [00:07<00:00, 137MB/s]


In [4]:
def mem_usage(df):
    return f'{df.memory_usage(deep=True).sum()/1024**2:.1f} MB'

df = pd.read_csv('dataset.csv', low_memory=False)
print(f'RAW shape        : {df.shape}, mem: {mem_usage(df)}')

print('\nHead:')
display(df.head())

RAW shape        : (604047, 24), mem: 1458.8 MB

Head:


Unnamed: 0,production_year,mileage,condition,owners_number,pts_original,horse_power,accidents_resolution,region,seller_type,brand,...,engine_displacement,engine_power,fuel_rate,steering_wheel,price,price_segment,tags,auto_class,equipment,complectation_available_options
0,2020,31000,CONDITION_OK,0,,170.0,OK,Оренбург,COMMERCIAL,Abarth,...,1368.0,170.0,6.6,LEFT,1900000,MEDIUM,allowed_for_credit;auction_call_free_report;au...,S,sport-suspension;seats-2,
1,2017,96000,CONDITION_OK,0,,170.0,OK,Оренбург,COMMERCIAL,Abarth,...,1368.0,170.0,6.4,LEFT,2300000,MEDIUM,allowed_for_credit;auction_call_free_report;au...,S,seats-2,
2,2019,42500,CONDITION_OK,1,True,179.0,OK,Санкт-Петербург,PRIVATE,Abarth,...,1368.0,180.0,5.8,LEFT,2895000,MEDIUM,allowed_for_credit;auction_call_free_report;au...,A,seats-4,
3,2013,130000,CONDITION_OK,2,True,160.0,OK,Москва,PRIVATE,Abarth,...,1368.0,160.0,5.4,LEFT,1750000,MEDIUM,allowed_for_credit;auction_call_free_report;av...,A,automatic-lighting-control;voice-recognition;l...,
4,2009,47800,CONDITION_OK,4,True,135.0,OK,Москва,PRIVATE,Abarth,...,1368.0,135.0,6.5,LEFT,2000000,MEDIUM,allowed_for_credit;auction_call_free_report;au...,A,leather-gear-stick;seats-4;apple-carplay;usb;a...,


## 3. Предобработка и признаки

In [5]:
# АНОМАЛИЯ
df.drop(210905, inplace=True)

df["fuel_rate"].fillna(df["fuel_rate"].median(), inplace=True)
df["pts_original"].fillna(True, inplace=True)
df["accidents_resolution"].fillna("OK", inplace=True)
df["auto_class"].fillna("NOT SPECIFIED", inplace=True)
df.drop("horse_power", axis=1, inplace=True)


def optimize_types(d):
    for col in d.select_dtypes(include=["int64"]).columns:
        d[col] = pd.to_numeric(d[col], downcast="integer")
    for col in d.select_dtypes(include=["float64"]).columns:
        d[col] = pd.to_numeric(d[col], downcast="float")
    return d


df = optimize_types(df)
print(f"After dtype optimisation mem: {mem_usage(df)}")

After dtype optimisation mem: 1420.9 MB


In [6]:
def split_max(x):
    try:
        return max(map(int, x.split(';')))
    except:
        return np.nan

df['seats_numeric'] = df['seats'].progress_apply(split_max)
df.drop('seats', axis=1, inplace=True)

  0%|          | 0/604046 [00:00<?, ?it/s]

### Преобразование мульти­категорий в дамми-признаки  
В этой ячейке мы:  
- Собираем все уникальные значения из колонок с «;»  
- Создаём бинарные столбцы для каждого уникального тега/опции/оборудования  

In [7]:
def get_unique_values(series, sep=';'):
    uniq = set()
    for cell in series.dropna():
        for piece in cell.split(sep):
            s = piece.strip()
            if s:
                uniq.add(s)
    return np.array(list(uniq))

all_tags = get_unique_values(df['tags'])
all_options = get_unique_values(df['complectation_available_options'])
all_equipments = get_unique_values(df['equipment'])

def create_binary_features(df, column, unique_values, sep=';'):
    return (
        df[column]
        .str.get_dummies(sep=sep)
        .reindex(columns=unique_values, fill_value=0)
        .astype('int8')
        )
    
tags_dummies = create_binary_features(df, 'tags', all_tags)
options_dummies = create_binary_features(df, 'complectation_available_options', all_options)
equipment_dummies = create_binary_features(df, 'equipment', all_equipments)


full_df = pd.concat([
    df.drop(columns=['tags', 'complectation_available_options', 'equipment']),
    tags_dummies,
    options_dummies.drop(columns=['condition']),
    equipment_dummies.drop(columns=['condition'])
], axis=1)


### Удаление дубликатных колонок

In [8]:
names = full_df.columns[full_df.columns.duplicated()].unique()
new_cols = {}

for name in names:
    cols_i = [col for col in full_df.columns if col == name]
    sub_df = full_df.loc[:, cols_i].astype('int8')
    
    max_series = sub_df.max(axis=1).astype('int8')
    
    new_cols[name] = max_series
    
    full_df.drop(columns=cols_i, inplace=True)

for name, s in new_cols.items():
    full_df[name] = s

### Отбраковка сильно коррелирующих признаков
Здесь мы:

- Вычисляем корреляцию всех числовых признаков между собой

- Удаляем одну из пары признаков с корреляцией > 0.8

In [9]:
num = full_df.select_dtypes(include=["number"]).drop(
    columns=["price"], errors="ignore"
)

corr = num.corr().abs()

mask = np.triu(np.ones(corr.shape), 1).astype(bool)
upper = corr.where(mask)
mean_corr = corr.mean()

drop = set()
for i, j in zip(*np.where(upper > 0.80)):
    a, b = corr.index[i], corr.columns[j]
    drop.add(a if mean_corr[a] > mean_corr[b] else b)

# защищаем важные признаки от удаления
essential_cols = {
    "price",
    "production_year",  # уже были
    "tags_cnt",
    "equipment_cnt",
    "complectation_available_options_cnt",  # NEW
}
drop -= essential_cols

f = full_df.drop(columns=drop, errors="ignore").copy()
print("After corr-pruning shape:", f.shape)

After corr-pruning shape: (604046, 362)


### Создание новых признаков

Добавляем полиномиальные, логарифмические, булевы и дробные признаки.


In [10]:
mod_df = f.copy()
mod_df['production_year_square'] = mod_df['production_year'] ** 2
mod_df['age'] = 2025 - mod_df['production_year']
mod_df['mileage_per_year'] = mod_df['mileage'] / mod_df['age'].clip(lower=1)
mod_df['log_mileage']      = np.log1p(mod_df['mileage'].astype(float))
mod_df['log_engine_disp']  = np.log1p(mod_df['engine_displacement'].astype(float))
mod_df['power_per_liter'] = mod_df['engine_power'] / mod_df['engine_displacement'].clip(lower=0.1)
mod_df['age_x_power'] = mod_df['age'] * mod_df['engine_power']
mod_df['age_x_mileage'] = mod_df['age'] * mod_df['mileage']
mod_df['disp_per_door'] = mod_df['engine_displacement'] / mod_df['doors_count'].clip(lower=1)
mod_df['is_one_owner'] = (mod_df['owners_number'] == 1).astype(int)
mod_df['is_new'] = ((mod_df['mileage'] < 1000) & (mod_df['owners_number'] <= 1)).astype(int)
mod_df['is_very_old'] = (mod_df['age'] > 20).astype(int)
mod_df['power_per_year'] = mod_df['engine_power'] / mod_df['age'].clip(lower=0.1)
for a,b in [('age','mileage'),('age','power_per_liter')]:
    mod_df[f"{a}_x_{b}"] = mod_df[a] * mod_df[b]
mod_df.drop(columns=['production_year','engine_displacement'], inplace=True)

## 4. Обучение модели и оценка

Разделяем на train/test, собираем Pipeline с препроцессором и Ridge.

In [11]:
start_numeric_features = ['age',
                          'power_per_year',
                          'age_x_mileage',
                          "mileage",
                          'age_x_power_per_liter',
                          'production_year_square',
                          'disp_per_door',
                          'log_engine_disp',
                          'age_x_power',
                          'age_x_mileage',
                          'log_mileage',
                          'power_per_liter',                        
                          'mileage_per_year',
                          'owners_number',
                          'doors_count',
                          #'engine_displacement',
                          'engine_power',
                          'fuel_rate']

### Разделяем на train/test, собираем Pipeline с препроцессором и Ridge.

In [12]:
y = mod_df['price'].astype(float)
X = mod_df.drop(columns=['price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=SEED)

numeric_feats = X.select_dtypes(include=['int32','int64','float']).columns.tolist()
cat_feats     = X.select_dtypes(include=['object','category']).columns.tolist()
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), start_numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_feats)
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('ridge', Ridge(alpha=1, solver='lsqr'))
])

### Grid Search

In [13]:
def run_grid_search(X, y, pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error'):
    gs = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        verbose=1
    )
    gs.fit(X, np.log1p(y))
    print(f"Best params: {gs.best_params_}")
    print(f"Best CV score (RMSE): {-gs.best_score_:.4f}")
    return gs.best_estimator_

grid_params = {
    'ridge__alpha': [0.1, 1, 10],
    'ridge__solver': ['lsqr', 'sag']
}

best_pipe = run_grid_search(X_train, y_train, pipeline, grid_params)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best params: {'ridge__alpha': 1, 'ridge__solver': 'lsqr'}
Best CV score (RMSE): 0.2583


### Результаты на тесте

Вычисляем MSE, MAE и R² в исходном масштабе цен.

In [14]:
def business_mae(y_test, y_pred):
    """Средняя абсолютная ошибка в рублях"""
    y_true_rub = y_test
    y_pred_rub = y_pred
    return np.mean(np.abs(y_true_rub - y_pred_rub))

def acceptable_rate(y_test, y_pred, threshold=0.1):
    """Доля предсказаний в пределах ±threshold%"""
    y_true_rub = y_test
    y_pred_rub = y_pred
    rel_error = np.abs((y_true_rub - y_pred_rub) / y_true_rub)
    return np.mean(rel_error <= threshold) * 100

y_pred = np.expm1(best_pipe.predict(X_test))
print("Тестовые метрики в исходной шкале:")
print(f"- MSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"- MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"- R²: {r2_score(y_test, y_pred):.4f}")
print(f"- business_mae:{business_mae(y_test, y_pred):.2f}")
print(f"- acceptable_rate:{acceptable_rate(y_test, y_pred):.2f}")

Тестовые метрики в исходной шкале:
- MSE: 9621822319085.46
- MAE: 420725.29
- R²: 0.6082
- business_mae:420725.29
- acceptable_rate:46.66


## 5. Выводы

- Создан богатый набор признаков.
- Ridge-солвер `lsqr` успешно обрабатывает разрежённые данные.
- Базовые метрики показывают уровень качества, на котором можно строить более сложные модели.
