In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns

In [None]:
%pip install gdown

In [3]:
"""CARS_FILE_ID = '1liFEe1-yFISPSpRSvbv1wIH_avYNGmBI'
RANDOM_STATE = 42

random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

!gdown --id {CARS_FILE_ID}"""

df = pd.read_csv('dataset.csv')
print(df.shape)


(604047, 24)


In [None]:
df_with_na_column = df[df['engine_displacement'].isna()]
df_with_na_column

In [None]:
df.drop(210905, inplace=True)

In [None]:
df['fuel_rate'].fillna(df['fuel_rate'].median(), inplace=True)

In [None]:
df['pts_original'].fillna(True, inplace=True)
df['accidents_resolution'].fillna('OK', inplace=True)
df['auto_class'].fillna('NOT SPECIFIED', inplace=True)

In [9]:
columns = ['production_year', 'mileage', 'condition', 'owners_number', 'pts_original', 'accidents_resolution', 'region', 'seller_type', 'brand', 'model', 'body_type', 'doors_count', 'seats', 'engine_displacement', 'engine_power', 'fuel_rate', 'steering_wheel', 'price', 'price_segment', 'auto_class']
features = ['production_year', 'mileage', 'condition', 'owners_number', 'pts_original', 'accidents_resolution', 'region', 'seller_type', 'brand', 'model', 'body_type', 'doors_count', 'seats', 'engine_displacement', 'engine_power', 'fuel_rate', 'steering_wheel', 'price_segment', 'auto_class']

In [10]:
df[columns].isnull().sum()

production_year         0
mileage                 0
condition               0
owners_number           0
pts_original            0
accidents_resolution    0
region                  0
seller_type             0
brand                   0
model                   0
body_type               0
doors_count             0
seats                   0
engine_displacement     0
engine_power            0
fuel_rate               0
steering_wheel          0
price                   0
price_segment           0
auto_class              0
dtype: int64

In [11]:
df.drop('horse_power', axis=1, inplace=True)

In [12]:
stripped_down_df = df.drop(columns=["tags", "complectation_available_options", "equipment"], inplace=False)

In [13]:
import numpy as np

def convert_seats(seats_str):
    try:
        numbers = list(map(int, seats_str.split(';')))
    except Exception as e:
        return np.nan
    return np.max(numbers)

stripped_down_df["seats_numeric"] = stripped_down_df["seats"].apply(convert_seats)


In [14]:
stripped_down_df.drop(columns=["seats"], inplace=True)

In [15]:
categorical_features = ['condition',
 'accidents_resolution',
 'region',
 'seller_type',
 'brand',
 'model',
 'body_type',
 'steering_wheel',
 'price_segment',
 'auto_class']

In [None]:
!pip install category-encoders

In [16]:
for col in stripped_down_df.select_dtypes(include=['int64']):
    stripped_down_df[col] = stripped_down_df[col].astype('int32')

for col in stripped_down_df.select_dtypes(include=['float64']):
    stripped_down_df[col] = stripped_down_df[col].astype('float32')

for col in categorical_features:
    stripped_down_df[col] = stripped_down_df[col].astype('category')

In [17]:
y = stripped_down_df["price"]
X = stripped_down_df.drop(columns=["price"])

In [18]:
numeric_features = X.select_dtypes(include=['int64', 'float64', 'int32', 'float32']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
auto_ohe_features = [col for col in categorical_features if X[col].nunique() <= 30]
auto_te_features = [col for col in categorical_features if X[col].nunique() > 30]

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])


# Обучение Ridge Regression с исключением специфических признаков

## Особенности эксперимента

**Исключенные признаки** (не использовались в обучении):
   - `tags`
   - `complectation_available_options` 
   - `equipment`

### 1. Предобработка данных
   - OneHot для категориальных
   - StandardScaler() для числовых

### 2. Подбор гиперпараметров с помощью GridSearchCV для l2 регуляризации

In [29]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [22]:
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('ridge', Ridge())
])

param_grid = {
    'ridge__alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
}

grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [23]:
best_params = grid_search.best_params_

In [24]:
print("Лучшие параметры:", best_params)
print("Лучшая оценка (MSE):", -grid_search.best_score_)

Лучшие параметры: {'ridge__alpha': 0.1}
Лучшая оценка (MSE): 9496177053885.768


In [25]:
best_model = grid_search.best_estimator_

In [26]:
y_test_pred = best_model.predict(X_test)

In [28]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [28]:
print("Тестовые метрики:")
print("- MSE:", mean_squared_error(y_test, y_test_pred))
print("- MAE:", mean_absolute_error(y_test, y_test_pred))
print("- R²:", r2_score(y_test, y_test_pred))

Тестовые метрики:
- MSE: 10342307889739.17
- MAE: 875712.9958824995
- R²: 0.5788330220389966


# Анализ тестовых метрик
## Ключевые выводы:
### 1. **Проблема с масштабом данных** 
    - возможно стоит логарифмировать price

### 2. **Качество модели (R²) маловато** 

### 3. **Проблемы с признаками**
    - Возможные причины:
        1. Исключенные признаки содержали важную информацию
        2. стоит заняться инженерией признаков
        3. Высокая мультиколлинеарность

In [36]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('ridge', Ridge())
])

# 4. Сетка параметров для GridSearch
param_grid = {
    'ridge__alpha': [0.01, 0.1, 1, 10, 100, 1000],
    'ridge__solver': ['auto', 'lbfgs', 'sparse_cg', 'lsqr']
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=4,
    verbose=1
)
grid_search.fit(X_train, np.log1p(y_train))

best_model = grid_search.best_estimator_

y_pred_log = best_model.predict(X_test)
y_pred = np.expm1(y_pred_log)

print("\nТестовые метрики в исходной шкале:")
print(f"- MSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"- MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"- R²: {r2_score(y_test, y_pred):.4f}")


Fitting 4 folds for each of 24 candidates, totalling 96 fits


24 fits failed out of a total of 96.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "/mnt/c/Users/ravil/Desktop/SUPER-MEGA-PROJECT/myenv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/mnt/c/Users/ravil/Desktop/SUPER-MEGA-PROJECT/myenv/lib/python3.10/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/mnt/c/Users/ravil/Desktop/SUPER-MEGA-PROJECT/myenv/lib/python3.10/site-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File 


Тестовые метрики в исходной шкале:
- MSE: 10201800394187.23
- MAE: 470973.25
- R²: 0.5846


## Результаты после логарифмирования целевой переменной

### 🟢 Положительные эффекты:
- **Снижение влияния выбросов**:
  - MSE и MAE чуть уменьшились
  - немного увеличился R²

Хоть и был положительный эффект, но пока что, модель все равно требует фундаментальной переработки
 

# Обучение пайплайна с использованием колонок `tags`, `complectation_available_options`, `equipment`

In [12]:
def get_unique_values(series, sep=';'):
    return (
        series
        .str.split(sep, expand=True)
        .stack()
        .str.strip()
        .unique()
    )

all_tags = get_unique_values(df['tags'])
all_options = get_unique_values(df['complectation_available_options'])
all_equipments = get_unique_values(df['equipment'])

In [13]:
def create_binary_features(df, column, unique_values, sep=';'):
    return (
        df[column]
        .str.get_dummies(sep=sep)
        .reindex(columns=unique_values, fill_value=0)
        .astype('int8')
    )

tags_dummies = create_binary_features(df, 'tags', all_tags)
options_dummies = create_binary_features(df, 'complectation_available_options', all_options)
equipment_dummies = create_binary_features(df, 'equipment', all_equipments)

In [14]:
full_df = pd.concat([
    df.drop(columns=['tags', 'complectation_available_options', 'equipment']),
    tags_dummies,
    options_dummies.drop(columns=['condition']),
    equipment_dummies.drop(columns=['condition'])
], axis=1)

In [15]:
duplicated_cols = full_df.columns[full_df.columns.duplicated()].unique()
full_df[duplicated_cols] = full_df[duplicated_cols].astype("int8")

In [None]:
full_df = full_df.groupby(axis=1, level=0, sort=False).max()

In [17]:
duplicated_cols = full_df.columns[full_df.columns.duplicated()].unique()
duplicated_cols

Index([], dtype='object')

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

y = full_df["price"]

X = full_df.drop(columns=["price"])

numeric_features = X.select_dtypes(include=['int64', 'float64', 'int32', 'float32']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [None]:
start_numeric_features = ['production_year',
 'mileage',
 'owners_number',
 'pts_original',
 'doors_count',
 'engine_displacement',
 'engine_power',
 'fuel_rate']

numeric_features = [
    col for col in numeric_features 
    if col not in start_numeric_features
]

for col in numeric_features:
    col_min = X[col].min()
    col_max = X[col].max()
    if col_min < -128 or col_max > 127:
        print(f"Колонка {col} пропущена")
    else:
        X[col] = X[col].astype("int8")

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [32]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('ridge', Ridge())
])

param_grid = {
    'ridge__alpha': [0.01, 0.1, 1, 10],
    'ridge__solver': ['auto', 'lbfgs', 'sparse_cg', 'lsqr']
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1
)
grid_search.fit(X_train, np.log1p(y_train))

best_model = grid_search.best_estimator_

y_pred_log = best_model.predict(X_test)
y_pred = np.expm1(y_pred_log)

print("\nТестовые метрики в исходной шкале:")
print(f"- MSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"- MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"- R²: {r2_score(y_test, y_pred):.4f}")


Fitting 3 folds for each of 16 candidates, totalling 48 fits


12 fits failed out of a total of 48.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/mnt/c/Users/ravil/Desktop/SUPER-MEGA-PROJECT/myenv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/mnt/c/Users/ravil/Desktop/SUPER-MEGA-PROJECT/myenv/lib/python3.10/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/mnt/c/Users/ravil/Desktop/SUPER-MEGA-PROJECT/myenv/lib/python3.10/site-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File 


Тестовые метрики в исходной шкале:
- MSE: 11048279801286.63
- MAE: 585853.76
- R²: 0.5501


### Анализ результатов

#### Сравнение с базовой моделью:
- **R² без текстовых колонок**: 0.584
- **R² с новыми колонками: 0.5501

**Выводы**:
1. **Ухудшение качества**:
- Добавление текстовых признаков снизило R² на ~0.3, возможно из-за:
- перешумление признаков
- Неоптимальная обработка новых колонок

2. **Высокая ошибка предсказания MSE и MAE**

3. **Планы на будущее**:
- Стоит попробовать избавится от сильно коррелирующих признаков
- попробовать другие методы обработки, например MultiLabelBinarizer или еще что-нибудь