In [1]:
import numpy as np
import pandas as pd

In [2]:
loaded_data = pd.read_csv('../data/topic21_v23_train.csv')

loaded_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7962 entries, 0 to 7961
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   0                   7501 non-null   float64
 1   1                   7472 non-null   float64
 2   2                   7359 non-null   float64
 3   3                   7556 non-null   float64
 4   4                   7495 non-null   float64
 5   brand               7962 non-null   object 
 6   model               7962 non-null   object 
 7   trim                7951 non-null   object 
 8   body_type           7962 non-null   object 
 9   fuel_type           7962 non-null   object 
 10  transmission_type   7962 non-null   object 
 11  engine_capacity_cc  6362 non-null   object 
 12  horsepower          7584 non-null   object 
 13  exterior_color      7962 non-null   object 
 14  interior_color      7962 non-null   object 
 15  warranty            7962 non-null   object 
 16  city  

In [3]:
def remove_outliers(df, threshold=3):
    df_copy = df.copy()
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    z_scores = np.abs((df[numeric_cols] - df[numeric_cols].mean()) / df[numeric_cols].std())
    outliers = (z_scores > threshold).any(axis=1)

    print(f"Number of outliers removed: {outliers.sum()}")

    return df_copy[~outliers]


In [4]:
df = loaded_data.copy()

def extract_range_mean(val):
    try:
        nums = [int(s) for s in val.replace('cc','').replace('HP','').split('-')]
        return np.mean(nums)
    except:
        return np.nan

df['engine_capacity'] = df['engine_capacity_cc'].apply(extract_range_mean)
df['horsepower_val'] = df['horsepower'].apply(extract_range_mean)

df.drop(columns=['engine_capacity_cc', 'horsepower'], inplace=True)

df['is_automatic'] = (df['transmission_type'] == 'Automatic Transmission').astype(int)
df = df.drop('transmission_type', axis=1)

df['seller_is_owner'] = (df['seller_type'] == 'Owner').astype(int)
df['seller_is_dealer'] = (df['seller_type'] == 'Dealer').astype(int)
df['seller_is_cpo']   = (df['seller_type'] == 'Dealership/Certified Pre-Owned').astype(int)
df = df.drop('seller_type', axis=1)


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7962 entries, 0 to 7961
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   0                 7501 non-null   float64
 1   1                 7472 non-null   float64
 2   2                 7359 non-null   float64
 3   3                 7556 non-null   float64
 4   4                 7495 non-null   float64
 5   brand             7962 non-null   object 
 6   model             7962 non-null   object 
 7   trim              7951 non-null   object 
 8   body_type         7962 non-null   object 
 9   fuel_type         7962 non-null   object 
 10  exterior_color    7962 non-null   object 
 11  interior_color    7962 non-null   object 
 12  warranty          7962 non-null   object 
 13  city              7962 non-null   object 
 14  price             7962 non-null   int64  
 15  engine_capacity   4530 non-null   float64
 16  horsepower_val    7021 non-null   float64


In [5]:
from sklearn.model_selection import train_test_split


train_set , valid_set = train_test_split(df, test_size=0.2, random_state=42)

# train_set = remove_outliers(train_set)


for col in ['brand', 'model', 'trim']:
    means = train_set.groupby(col)['price'].mean()
    train_set[col + '_enc'] = train_set[col].map(means)
    valid_set[col + '_enc'] = valid_set[col].map(means)

train_set.drop(columns=['brand', 'model', 'trim'], inplace=True)
valid_set.drop(columns=['brand', 'model', 'trim'], inplace=True)


In [6]:
X_train = train_set.drop(columns=['price'])
y_train = train_set['price']

# y_train = np.log1p(y_train)

X_valid = valid_set.drop(columns=['price'])
y_valid = valid_set['price']

In [7]:
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

Numerical features: ['0', '1', '2', '3', '4', 'engine_capacity', 'horsepower_val', 'is_automatic', 'seller_is_owner', 'seller_is_dealer', 'seller_is_cpo', 'brand_enc', 'model_enc', 'trim_enc']
Categorical features: ['body_type', 'fuel_type', 'exterior_color', 'interior_color', 'warranty', 'city']


In [8]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

X_train_transformed = pipeline.fit_transform(X_train)
X_valid_transformed = pipeline.transform(X_valid)


In [9]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

models = {
    "SVR": SVR(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "ExtraTrees": ExtraTreesRegressor(random_state=42),
    "XGBoost": XGBRegressor(verbosity=0, random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),

    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),

    "GradientBoosting": GradientBoostingRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "KNeighbors": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=1000),
    "LightGBM": LGBMRegressor()
}


In [10]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


def train_and_evaluate(models, X_train, y_train, X_valid, y_valid):
    results = {}

    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        predictions = model.predict(X_valid)

        # predictions = np.expm1(predictions)

        mse = mean_squared_error(y_valid, predictions)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_valid, predictions)
        r2 = r2_score(y_valid, predictions)

        results[name] = {
            "MSE": mse,
            "RMSE": rmse,
            "MAE": mae,
            "R2": r2
        }
    return results


In [11]:
results = train_and_evaluate(models, X_train_transformed, y_train, X_valid_transformed, y_valid)
results_df = pd.DataFrame(results).T.sort_values(by="RMSE")
print(results_df)

#                            MSE           RMSE            MAE        R2
# ExtraTrees        1.364939e+10  116830.603202   51545.506937  0.610638
# RandomForest      1.491750e+10  122137.233932   56506.986981  0.574464
# CatBoost          1.534907e+10  123891.356300   57695.248654  0.562153
# GradientBoosting  1.615095e+10  127086.385441   64424.381045  0.539279
# LightGBM          1.621853e+10  127351.995346   60649.685189  0.537351
# KNeighbors        1.710397e+10  130782.150957   64333.041055  0.512093
# Ridge             1.799964e+10  134162.731629   78556.794589  0.486543
# Lasso             1.800913e+10  134198.082115   78576.971162  0.486272
# LinearRegression  1.801033e+10  134202.585130   78580.566612  0.486238
# MLP               1.818794e+10  134862.688100   77399.885267  0.481171
# ElasticNet        1.862588e+10  136476.662202   78301.486216  0.468679
# AdaBoost          2.005669e+10  141621.658385   85959.299705  0.427864
# XGBoost           2.186608e+10  147871.837265   70018.210938  0.376249
# DecisionTree      2.844289e+10  168650.214328   76305.634652  0.188639
# SVR               3.998152e+10  199953.784643  117847.498633 -0.140511



Training SVR...
Training RandomForest...
Training ExtraTrees...
Training XGBoost...
Training CatBoost...
Training LinearRegression...
Training Ridge...
Training Lasso...


  model = cd_fast.sparse_enet_coordinate_descent(


Training ElasticNet...
Training DecisionTree...
Training GradientBoosting...
Training AdaBoost...
Training KNeighbors...
Training MLP...




Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1966
[LightGBM] [Info] Number of data points in the train set: 6369, number of used features: 60
[LightGBM] [Info] Start training from score 163310.384048
                           MSE           RMSE            MAE        R2
ExtraTrees        1.364939e+10  116830.603202   51545.506937  0.610638
RandomForest      1.491750e+10  122137.233932   56506.986981  0.574464
CatBoost          1.534907e+10  123891.356300   57695.248654  0.562153
GradientBoosting  1.614723e+10  127071.760763   64433.437327  0.539385
LightGBM          1.621853e+10  127351.995346   60649.685189  0.537351
KNeighbors        1.710397e+10  130782.150957   64333.041055  0.512093
Ridge             1.799964e+10  134162.731629   78556.794589  0.486543
Lasso             1.800913e+10  134198.082115   78576.971162  0.4



In [16]:
models = {
    # "RandomForest": RandomForestRegressor(random_state=42),
    "ExtraTrees": ExtraTreesRegressor(random_state=42),
    # "CatBoost": CatBoostRegressor(verbose=0, random_state=42)
}

param_grids = {
    # "RandomForest": {
    #     'pca__n_components': [5, 10, 15, 19],
    #     'model__n_estimators': [100, 300, 500, 800],
    #     'model__max_depth': [None, 10, 20, 30, 50],
    #     'model__min_samples_split': [2, 5, 10],
    #     'model__min_samples_leaf': [1, 2, 4],
    #     'model__max_features': ['sqrt', 'log2', None]
    # },
    "ExtraTrees": {
        'pca__n_components': [5, 10, 15, 19],
        'model__n_estimators': [100, 300, 500, 800],
        'model__max_depth': [None, 10, 20, 30, 50],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__max_features': ['sqrt', 'log2', None]
    },
    # "CatBoost": {
    #     'pca__n_components': [5, 10, 15, 19],
    #     'model__iterations': [500, 1000],
    #     'model__depth': [4, 6, 8, 10],
    #     'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    #     'model__l2_leaf_reg': [1, 3, 5, 7, 9],
    #     'model__bagging_temperature': [0, 0.5, 1]
    # }
}


In [17]:
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV

preds = {}

pplines = {}
for name, model in models.items():
    print(f"\n GridSearch for {name}...")

    pipeline = Pipeline([
        ('pca', PCA()),
        ('model', model)
    ])

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[name],
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=2,
        n_jobs=-1
    )

    grid_search.fit(X_train_transformed, y_train)

    print(f" Best params for {name}:")
    print(grid_search.best_params_)
    print(f" Best CV score (neg MSE): {-grid_search.best_score_:.4f}")

    best_pipeline = grid_search.best_estimator_

    cv_scores = cross_val_score(
        best_pipeline,
        X_train_transformed,
        y_train,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    print(f" Final 5-fold CV RMSE (log scale): {(-cv_scores.mean()) ** 0.5:.4f} ± {cv_scores.std() ** 0.5:.4f}")

    y_pred = best_pipeline.predict(X_valid_transformed)

    rmse = mean_squared_error(y_valid, y_pred)
    r2 = r2_score(y_valid, y_pred)

    preds[name] = y_pred
    pplines[name] = best_pipeline


 GridSearch for ExtraTrees...
Fitting 5 folds for each of 2160 candidates, totalling 10800 fits
 Best params for ExtraTrees:
{'model__max_depth': 30, 'model__max_features': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 800, 'pca__n_components': 19}
 Best CV score (neg MSE): 10710625775.7481
 Final 5-fold CV RMSE (log scale): 103492.1532 ± 38052.0522
ExtraTrees on Validation:
RMSE: 13985265222.6773
R²:   0.6011



In [18]:

for name, y_pred in preds.items():
    rmse = mean_squared_error(y_valid, y_pred)
    r2 = r2_score(y_valid, y_pred)

    print(f"{name} on Validation:")
    print(f"RMSE: {rmse:.4f}")
    print(f"R²:   {r2:.4f}")
    print()


ExtraTrees on Validation:
RMSE: 13985265222.6773
R²:   0.6011

