In [1]:
import numpy as np
import pandas as pd

In [2]:
loaded_data = pd.read_csv('../data/topic21_v23_train.csv')

loaded_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7962 entries, 0 to 7961
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   0                   7501 non-null   float64
 1   1                   7472 non-null   float64
 2   2                   7359 non-null   float64
 3   3                   7556 non-null   float64
 4   4                   7495 non-null   float64
 5   brand               7962 non-null   object 
 6   model               7962 non-null   object 
 7   trim                7951 non-null   object 
 8   body_type           7962 non-null   object 
 9   fuel_type           7962 non-null   object 
 10  transmission_type   7962 non-null   object 
 11  engine_capacity_cc  6362 non-null   object 
 12  horsepower          7584 non-null   object 
 13  exterior_color      7962 non-null   object 
 14  interior_color      7962 non-null   object 
 15  warranty            7962 non-null   object 
 16  city  

In [19]:
print(df['price'].describe())
print("\nstd:", df['price'].std())


print("median:", df['price'].median())

q99 = df['price'].quantile(0.99)
print("\n99-й quantile:", q99)
print("not in 99-го quantile:", (df['price'] > q99).sum())

Основные статистики по price:
count      7962.000000
mean     163331.653228
std      188397.899880
min        1000.000000
25%       44999.250000
50%       93000.000000
75%      207000.000000
max      999900.000000
Name: price, dtype: float64

Стандартное отклонение (std): 188397.89988031407

Количество пропусков в price: 0
Медиана (median): 93000.0

99-й перцентиль: 891340.000000002
Количество цен выше 99-го перцентиля: 80


In [3]:
def add_outlier_column(df, threshold=2.5):
    df_copy = df.copy()

    if 'price' not in df.columns:
        numeric_cols = df.select_dtypes(include=['float64', 'int64'])
    else:
        numeric_cols = df.select_dtypes(include=['float64', 'int64']).drop('price', axis=1).columns

    df_copy['is_outlier'] = 0
    for col in numeric_cols:
        z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
        df_copy['is_outlier'] = (df_copy['is_outlier'] | (z_scores > threshold)).astype(int)

    return df_copy



In [4]:
df = loaded_data.copy()


def parse_mid(r):
    if pd.isna(r) or r == 'Unknown': return np.nan
    s = ''.join(c for c in r if c.isdigit() or c == '-')
    if '-' in s:
        lo, hi = map(int, s.split('-'))
        return (lo + hi) / 2
    return float(s)


for col in ['engine_capacity_cc', 'horsepower']:
    df[col + '_num'] = df[col].apply(parse_mid)
    df[col + '_miss'] = df[col + '_num'].isna().astype(int)
    df[col + '_num'] = df[col + '_num'].fillna(df[col + '_num'].median())
    df.drop(columns=[col], inplace=True)

df['is_automatic'] = (df['transmission_type'] == 'Automatic Transmission').astype(int)
df = df.drop('transmission_type', axis=1)

df['hp_per_cc'] = df['horsepower_num'] / df['engine_capacity_cc_num']

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7962 entries, 0 to 7961
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   0                        7501 non-null   float64
 1   1                        7472 non-null   float64
 2   2                        7359 non-null   float64
 3   3                        7556 non-null   float64
 4   4                        7495 non-null   float64
 5   brand                    7962 non-null   object 
 6   model                    7962 non-null   object 
 7   trim                     7951 non-null   object 
 8   body_type                7962 non-null   object 
 9   fuel_type                7962 non-null   object 
 10  exterior_color           7962 non-null   object 
 11  interior_color           7962 non-null   object 
 12  warranty                 7962 non-null   object 
 13  city                     7962 non-null   object 
 14  seller_type             

In [5]:
from sklearn.model_selection import train_test_split

train_set, valid_set = train_test_split(df, test_size=0.2, random_state=42)

train_set = add_outlier_column(train_set)
valid_set = add_outlier_column(valid_set)

print("Number of True values train:", train_set['is_outlier'].sum())
print("Number of True values valid:", valid_set['is_outlier'].sum())

for col in ['model']:
    means = train_set.groupby(col)['price'].mean()
    train_set[col + '_enc'] = train_set[col].map(means)
    valid_set[col + '_enc'] = valid_set[col].map(means)

train_set.drop(columns=['model'], inplace=True)
valid_set.drop(columns=['model'], inplace=True)


Number of True values train: 1465
Number of True values valid: 406


In [6]:
X_train = train_set.drop(columns=['price'])
y_train = train_set['price']

# y_train = np.log1p(y_train)

X_valid = valid_set.drop(columns=['price'])
y_valid = valid_set['price']

In [7]:
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

Numerical features: ['0', '1', '2', '3', '4', 'engine_capacity_cc_num', 'engine_capacity_cc_miss', 'horsepower_num', 'horsepower_miss', 'is_automatic', 'hp_per_cc', 'is_outlier', 'model_enc']
Categorical features: ['brand', 'trim', 'body_type', 'fuel_type', 'exterior_color', 'interior_color', 'warranty', 'city', 'seller_type']


In [8]:
correlation = train_set[numerical_features + ['price']].corr()

print(correlation['price'].sort_values(ascending=False))

price                      1.000000
model_enc                  0.695101
horsepower_num             0.409238
0                          0.273631
engine_capacity_cc_num     0.257073
engine_capacity_cc_miss    0.175946
hp_per_cc                  0.134919
1                          0.019409
is_outlier                 0.014194
is_automatic               0.003505
horsepower_miss           -0.046558
4                         -0.131088
3                         -0.242796
2                         -0.311485
Name: price, dtype: float64


In [9]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

X_train_transformed = pipeline.fit_transform(X_train)
X_valid_transformed = pipeline.transform(X_valid)


In [10]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

models = {
    "ExtraTrees": ExtraTreesRegressor(random_state=42),

    "SVR": SVR(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(verbosity=0, random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),

    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),

    "GradientBoosting": GradientBoostingRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "KNeighbors": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=1000),
    "LightGBM": LGBMRegressor()
}


In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


def train_and_evaluate(models, X_train, y_train, X_valid, y_valid):
    results = {}

    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        predictions = model.predict(X_valid)

        # predictions = np.expm1(predictions)

        mse = mean_squared_error(y_valid, predictions)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_valid, predictions)
        r2 = r2_score(y_valid, predictions)

        results[name] = {
            "MSE": mse,
            "RMSE": rmse,
            "MAE": mae,
            "R2": r2
        }
    return results


In [12]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(models["ExtraTrees"], X_train_transformed, y_train, cv=5,
                            scoring='neg_mean_squared_error')

rmse_scores = np.sqrt(-cv_scores)
print("Cross-validation scores (RMSE):")
print("Mean RMSE: {:.2f} (+/- {:.2f})".format(rmse_scores.mean(), rmse_scores.std() * 2))

Cross-validation scores (RMSE):
Mean RMSE: 102697.22 (+/- 19386.08)


In [13]:
results = train_and_evaluate(models, X_train_transformed, y_train, X_valid_transformed, y_valid)
results_df = pd.DataFrame(results).T.sort_values(by="RMSE")
print(results_df)

#                      MSE           RMSE          MAE        R2
# ExtraTrees        1.249172e+10  111766.347187   49114.416529  0.643662
# CatBoost          1.363584e+10  116772.586727   56559.222097  0.611025
# RandomForest      1.433247e+10  119718.292647   55151.764984  0.591153
# LightGBM          1.518073e+10  123210.113594   59571.078308  0.566955
# GradientBoosting  1.528405e+10  123628.661747   64333.711771  0.564008
# KNeighbors        1.669191e+10  129197.175477   62823.422724  0.523847
# Ridge             1.782411e+10  133506.982596   73239.950052  0.491550
# MLP               1.831009e+10  135314.787550   77891.181087  0.477687
# Lasso             1.840541e+10  135666.540440   73944.781059  0.474968
# LinearRegression  1.844263e+10  135803.642767   74052.291161  0.473906
# ElasticNet        1.915532e+10  138402.760991   80138.651567  0.453576
# AdaBoost          1.939229e+10  139256.199828   88143.301391  0.446816
# XGBoost           2.079412e+10  144201.665011   67901.914062  0.406828
# DecisionTree      2.609980e+10  161554.334249   70082.174513  0.255478
# SVR               3.999216e+10  199980.410485  117889.880474 -0.140815



Training ExtraTrees...
Training SVR...
Training RandomForest...
Training XGBoost...
Training CatBoost...
Training LinearRegression...
Training Ridge...
Training Lasso...


  model = cd_fast.sparse_enet_coordinate_descent(


Training ElasticNet...
Training DecisionTree...
Training GradientBoosting...
Training AdaBoost...
Training KNeighbors...
Training MLP...




Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001356 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1888
[LightGBM] [Info] Number of data points in the train set: 6369, number of used features: 149
[LightGBM] [Info] Start training from score 163310.384048
                           MSE           RMSE            MAE        R2
ExtraTrees        1.249172e+10  111766.347187   49114.416529  0.643662
CatBoost          1.363584e+10  116772.586727   56559.222097  0.611025
RandomForest      1.433247e+10  119718.292647   55151.764984  0.591153
LightGBM          1.518073e+10  123210.113594   59571.078308  0.566955
GradientBoosting  1.523523e+10  123431.054434   64319.007669  0.565401
KNeighbors        1.669191e+10  129197.175477   62823.422724  0.523847
Ridge             1.782411e+10  133506.982596   73239.950052  0.491550
Lasso             1.840541e+10  135666.540440   73944.781059  0.



In [14]:
models = {
    "ExtraTrees": ExtraTreesRegressor(random_state=42),
    "RandomForest": RandomForestRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42)
}

param_grids = {

    "ExtraTrees": {
        # 'pca__n_components': [5, 10, 15, 19],
        'model__n_estimators': [100, 300, 500, 800],
        'model__max_depth': [None, 10, 20, 30, 50],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__max_features': ['sqrt', 'log2', None]
    },
    "RandomForest": {
        # 'pca__n_components': [5, 10, 15, 19],
        'model__n_estimators': [100, 300, 500, 800],
        'model__max_depth': [None, 10, 20, 30, 50],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__max_features': ['sqrt', 'log2', None]
    },
    "CatBoost": {
        # 'pca__n_components': [5, 10, 15, 19],
        'model__iterations': [500, 1000],
        'model__depth': [4, 6, 8, 10],
        'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'model__l2_leaf_reg': [1, 3, 5, 7, 9],
        'model__bagging_temperature': [0, 0.5, 1]
    }
}


In [15]:
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV

preds = {}

pplines = {}
for name, model in models.items():
    print(f"\n GridSearch for {name}...")

    pipeline = Pipeline([
        # ('pca', PCA()),
        ('model', model)
    ])

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[name],
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=2,
        n_jobs=-1
    )

    grid_search.fit(X_train_transformed, y_train)

    print(f" Best params for {name}:")
    print(grid_search.best_params_)
    print(f" Best CV score (neg MSE): {-grid_search.best_score_:.4f}")

    best_pipeline = grid_search.best_estimator_

    

    y_pred = best_pipeline.predict(X_valid_transformed)

    rmse = mean_squared_error(y_valid, y_pred)
    r2 = r2_score(y_valid, y_pred)

    preds[name] = y_pred
    pplines[name] = best_pipeline


 GridSearch for ExtraTrees...
Fitting 5 folds for each of 540 candidates, totalling 2700 fits
 Best params for ExtraTrees:
{'model__max_depth': 50, 'model__max_features': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 500}
 Best CV score (neg MSE): 10482991312.9485


  print(f" Final 5-fold CV R2: {(-cv_scores.mean()) ** 0.5:.4f} ± {cv_scores.std() ** 0.5:.4f}")


 Final 5-fold CV R2: nan ± 0.2014

 GridSearch for RandomForest...
Fitting 5 folds for each of 540 candidates, totalling 2700 fits
 Best params for RandomForest:
{'model__max_depth': 30, 'model__max_features': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 300}
 Best CV score (neg MSE): 11491622800.2245


  print(f" Final 5-fold CV R2: {(-cv_scores.mean()) ** 0.5:.4f} ± {cv_scores.std() ** 0.5:.4f}")


 Final 5-fold CV R2: nan ± 0.1817

 GridSearch for CatBoost...
Fitting 5 folds for each of 480 candidates, totalling 2400 fits
 Best params for CatBoost:
{'model__bagging_temperature': 0, 'model__depth': 10, 'model__iterations': 1000, 'model__l2_leaf_reg': 1, 'model__learning_rate': 0.05}
 Best CV score (neg MSE): 10515390488.1741
 Final 5-fold CV R2: nan ± 0.1700


  print(f" Final 5-fold CV R2: {(-cv_scores.mean()) ** 0.5:.4f} ± {cv_scores.std() ** 0.5:.4f}")


In [18]:
for name, ppl in pplines.items():
    cv_scores = cross_val_score(
        ppl,
        X_train_transformed,
        y_train,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    print(f"Final {name}_model 5-fold CV RMSE: {(-cv_scores.mean()) ** 0.5:.4f} ± {cv_scores.std() ** 0.5:.4f}")


Final ExtraTrees_model 5-fold CV RMSE: 102386.4801 ± 45450.8830
Final RandomForest_model 5-fold CV RMSE: 107198.9869 ± 38655.6503
Final CatBoost_model 5-fold CV RMSE: 102544.5781 ± 40477.2928


In [20]:

for name, y_pred in preds.items():
    rmse = mean_squared_error(y_valid, y_pred)
    rmse = np.sqrt(rmse)
    mae = mean_absolute_error(y_valid, y_pred)
    r2 = r2_score(y_valid, y_pred)

    print(f"{name} on Validation:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R²:   {r2:.4f}")
    print()


ExtraTrees on Validation:
RMSE: 111685.0749
MAE: 48932.1205
R²:   0.6442

RandomForest on Validation:
RMSE: 119608.2546
MAE: 54980.5737
R²:   0.5919

CatBoost on Validation:
RMSE: 117166.9667
MAE: 54010.2685
R²:   0.6084

