# Загружаем данные

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns

In [None]:
%pip install gdown

In [None]:
CARS_FILE_ID = '1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI'
RANDOM_STATE = 42

random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

!gdown --id {CARS_FILE_ID}

In [4]:
df = pd.read_csv('dataset.csv')
print(df.shape)

(604047, 24)


In [None]:
df.sample(10)

Unnamed: 0,production_year,mileage,condition,owners_number,pts_original,horse_power,accidents_resolution,region,seller_type,brand,...,engine_displacement,engine_power,fuel_rate,steering_wheel,price,price_segment,tags,auto_class,equipment,complectation_available_options
436946,2024,0,CONDITION_OK,0,True,150.0,,Санкт-Петербург,COMMERCIAL,Skoda,...,1395.0,150.0,7.2,LEFT,4600000,MEDIUM,all-terrain;big-trunk;dealer_quarantine_calls;...,C,tyre-pressure;start-stop-function;roof-rails;w...,cruise-control;multi-wheel;auto-park;heated-wa...
329728,2018,220000,CONDITION_OK,0,,,OK,Минск,PRIVATE,Mercedes-Benz,...,1950.0,194.0,4.6,LEFT,2500000,PREMIUM,allowed_for_credit;autoru_exclusive;available_...,E,electro-trunk;voice-recognition;android-auto;g...,
343496,2006,326000,CONDITION_OK,4,,224.0,OK,Санкт-Петербург,PRIVATE,Mercedes-Benz,...,2987.0,224.0,9.3,LEFT,999999,PREMIUM,all-terrain;allowed_for_credit;auction_call_fr...,E,wheel-power;lock;light-sensor;seats-5;electro-...,
439783,2008,164233,CONDITION_OK,4,,102.0,ERROR,Тверь,PRIVATE,Skoda,...,1595.0,102.0,7.1,LEFT,650000,MEDIUM,allowed_for_credit;auction_call_free_report;au...,C,seats-5,
550845,2023,107,CONDITION_OK,1,True,89.0,OK,Краснодар,PRIVATE,Lada (ВАЗ),...,1596.0,90.0,7.5,LEFT,1800000,ECONOMY,allowed_for_credit;almost_new;auction_call_fre...,C,audiosystem-cd;front-seats-heat;driver-seat-up...,cruise-control;multi-wheel;airbag-passenger;ro...
431807,2015,44450,CONDITION_OK,1,True,102.0,ERROR,Екатеринбург,PRIVATE,Renault,...,1598.0,102.0,8.3,LEFT,1044000,ECONOMY,affordable;allowed_for_credit;auction_call_fre...,B,cruise-control;wheel-power;wheel-configuration...,cruise-control;airbag-passenger;lock;seat-tran...
167063,2024,0,CONDITION_OK,0,True,304.0,,Химки,COMMERCIAL,Genesis,...,2497.0,304.0,12.2,LEFT,15840000,PREMIUM,big;dealer_quarantine_calls;discount_options;f...,E,projection-display;audiosystem-cd;airbag-side;...,e-adjustment-wheel;multi-wheel;airbag-passenge...
391522,2021,43000,CONDITION_OK,0,,204.0,OK,Владивосток,COMMERCIAL,Nissan,...,1497.0,204.0,7.3,LEFT,2843000,MEDIUM,allowed_for_credit;autoru_exclusive;big-trunk;...,D,seats-5,
210052,2024,100,CONDITION_OK,1,True,115.0,OK,Новосибирск,PRIVATE,Hyundai,...,1497.0,115.0,5.3,LEFT,2530000,MEDIUM,allowed_for_credit;almost_new;auction_call_fre...,C,seats-5,
46816,2024,34,CONDITION_OK,1,True,286.0,OK,Москва,COMMERCIAL,BMW,...,2993.0,286.0,8.2,LEFT,16150000,PREMIUM,allowed_for_credit;almost_new;auction_call_fre...,E,seats-5,


# Делаем предобработку

In [5]:
df_with_na_column = df[df['engine_displacement'].isna()]
df_with_na_column

Unnamed: 0,production_year,mileage,condition,owners_number,pts_original,horse_power,accidents_resolution,region,seller_type,brand,...,engine_displacement,engine_power,fuel_rate,steering_wheel,price,price_segment,tags,auto_class,equipment,complectation_available_options
210905,2003,175000,CONDITION_OK,0,True,,,Чебаркуль,PRIVATE,Hyundai,...,,,,LEFT,380000,MEDIUM,available_for_checkup;pts_original;real_photo;...,,seats-5,


In [6]:
df.drop(210905, inplace=True)

In [None]:
df['fuel_rate'].fillna(df['fuel_rate'].median(), inplace=True)

In [None]:
df['pts_original'].fillna(True, inplace=True)
df['accidents_resolution'].fillna('OK', inplace=True)
df['auto_class'].fillna('NOT SPECIFIED', inplace=True)

In [9]:
columns = ['production_year', 'mileage', 'condition', 'owners_number', 'pts_original', 'accidents_resolution', 'region', 'seller_type', 'brand', 'model', 'body_type', 'doors_count', 'seats', 'engine_displacement', 'engine_power', 'fuel_rate', 'steering_wheel', 'price', 'price_segment', 'auto_class']
features = ['production_year', 'mileage', 'condition', 'owners_number', 'pts_original', 'accidents_resolution', 'region', 'seller_type', 'brand', 'model', 'body_type', 'doors_count', 'seats', 'engine_displacement', 'engine_power', 'fuel_rate', 'steering_wheel', 'price_segment', 'auto_class']

In [10]:
df[columns].isnull().sum()

Unnamed: 0,0
production_year,0
mileage,0
condition,0
owners_number,0
pts_original,0
accidents_resolution,0
region,0
seller_type,0
brand,0
model,0


In [11]:
df.drop('horse_power', axis=1, inplace=True)

In [12]:
df.drop(columns=["tags", "complectation_available_options", "equipment"], inplace=True)

In [13]:
import numpy as np

def convert_seats(seats_str):
    # Разбиваем строку по разделителю ";" и преобразуем каждую часть в целое число
    try:
        numbers = list(map(int, seats_str.split(';')))
    except Exception as e:
        # Если не удалось преобразовать, возвращаем NaN
        return np.nan
    # Возвращаем среднее значение (можно выбрать max, min или другое агрегирующее значение)
    return np.max(numbers)

# Применяем функцию к столбцу и создаём новый числовой признак
df["seats_numeric"] = df["seats"].apply(convert_seats)


In [14]:
df.drop(columns=["seats"], inplace=True)

In [15]:
categorical_features = ['condition',
 'accidents_resolution',
 'region',
 'seller_type',
 'brand',
 'model',
 'body_type',
 'steering_wheel',
 'price_segment',
 'auto_class']

In [None]:
!pip install category-encoders

In [17]:
for col in df.select_dtypes(include=['int64']):
    df[col] = df[col].astype('int32')

for col in df.select_dtypes(include=['float64']):
    df[col] = df[col].astype('float32')

for col in categorical_features:
    df[col] = df[col].astype('category')

In [18]:
y = df["price"]
X = df.drop(columns=["price"])

In [19]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
auto_ohe_features = [col for col in categorical_features if X[col].nunique() <= 30]
auto_cb_features  = [col for col in categorical_features if X[col].nunique() > 30]
auto_te_features = [col for col in categorical_features if X[col].nunique() > 30]

low_cardinality_features = [col for col in categorical_features if X[col].nunique() <= 30]
high_cardinality_features = [col for col in categorical_features if X[col].nunique() > 30]

In [None]:
len(df["model"].value_counts())

2575

# Делим данные на обучающую и тестовую выборки

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [21]:
pd.concat([X_train, y_train], axis=1).to_csv("train_data.csv", index=False)


# Начинаем обучать модели

## ElasticNet

In [25]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from category_encoders import TargetEncoder
from sklearn.model_selection import RandomizedSearchCV

In [26]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('te', TargetEncoder(), high_cardinality_features),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=True), low_cardinality_features)
])

In [38]:
elastic_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', ElasticNet(max_iter=3000, tol=1e-3, random_state=42))
])


In [45]:
param_dist = {
    'regressor__alpha': [0.89, 0.9, 0.91, 0.93, 100000],
    'regressor__l1_ratio': [0.79, 0.8, 0.81, 0.83]
}


In [47]:
elastic_search = RandomizedSearchCV(
    elastic_pipeline,
    param_dist,
    n_iter=20,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)

In [48]:
X_sample, _, y_sample, _ = train_test_split(X_train, y_train, train_size=0.5, random_state=42)

In [None]:
elastic_search.fit(X_sample, y_sample)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [43]:
print("Лучшие параметры:", elastic_search.best_params_)
#Лучшие параметры: {'regressor__l1_ratio': 0.8, 'regressor__alpha': 0.9}

Лучшие параметры: {'regressor__l1_ratio': 0.8, 'regressor__alpha': 0.9}


In [None]:
y_pred_elastic = elastic_search.predict(X_test)
print(f"ElasticNet MAE: {mean_absolute_error(y_test, y_pred_elastic):.2f} руб.")
print(f"ElasticNet R²: {r2_score(y_test, y_pred_elastic):.2f}")

ElasticNet MAE: 1135296.80 руб.
ElasticNet R²: 0.44
