In [1]:
import numpy as np
import pandas as pd

In [2]:
loaded_data = pd.read_csv('../data/topic21_v23_train.csv')

loaded_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7962 entries, 0 to 7961
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   0                   7501 non-null   float64
 1   1                   7472 non-null   float64
 2   2                   7359 non-null   float64
 3   3                   7556 non-null   float64
 4   4                   7495 non-null   float64
 5   brand               7962 non-null   object 
 6   model               7962 non-null   object 
 7   trim                7951 non-null   object 
 8   body_type           7962 non-null   object 
 9   fuel_type           7962 non-null   object 
 10  transmission_type   7962 non-null   object 
 11  engine_capacity_cc  6362 non-null   object 
 12  horsepower          7584 non-null   object 
 13  exterior_color      7962 non-null   object 
 14  interior_color      7962 non-null   object 
 15  warranty            7962 non-null   object 
 16  city  

In [3]:
def remove_outliers(df, threshold=2.5):
    df_clean = df.copy()
    initial_rows = len(df)
    numeric_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns

    for col in numeric_cols:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

    removed_rows = initial_rows - len(df_clean)
    print(f"Removed {removed_rows} rows ({(removed_rows / initial_rows) * 100:.2f}% of data)")

    return df_clean


In [4]:
from sklearn.model_selection import train_test_split

loaded_data = loaded_data.dropna(subset=['0', '1', '2', '3', '4'])
# loaded_data = loaded_data.dropna()
loaded_data = remove_outliers(loaded_data)

loaded_data.info()

Removed 527 rows (9.07% of data)
<class 'pandas.core.frame.DataFrame'>
Index: 5284 entries, 0 to 7960
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   0                   5284 non-null   float64
 1   1                   5284 non-null   float64
 2   2                   5284 non-null   float64
 3   3                   5284 non-null   float64
 4   4                   5284 non-null   float64
 5   brand               5284 non-null   object 
 6   model               5284 non-null   object 
 7   trim                5278 non-null   object 
 8   body_type           5284 non-null   object 
 9   fuel_type           5284 non-null   object 
 10  transmission_type   5284 non-null   object 
 11  engine_capacity_cc  4287 non-null   object 
 12  horsepower          5025 non-null   object 
 13  exterior_color      5284 non-null   object 
 14  interior_color      5284 non-null   object 
 15  warranty            5284 no

In [5]:
df = loaded_data.copy()

def extract_range_mean(val):
    try:
        nums = [int(s) for s in val.replace('cc','').replace('HP','').split('-')]
        return np.mean(nums)
    except:
        return np.nan

df['engine_capacity'] = df['engine_capacity_cc'].apply(extract_range_mean)
df['horsepower_val'] = df['horsepower'].apply(extract_range_mean)
df.drop(columns=['engine_capacity_cc', 'horsepower'], inplace=True)

for col in ['brand', 'model', 'trim']:
    means = df.groupby(col)['price'].mean()
    df[col + '_enc'] = df[col].map(means)

df.drop(columns=['brand', 'model', 'trim'], inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5284 entries, 0 to 7960
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   0                  5284 non-null   float64
 1   1                  5284 non-null   float64
 2   2                  5284 non-null   float64
 3   3                  5284 non-null   float64
 4   4                  5284 non-null   float64
 5   body_type          5284 non-null   object 
 6   fuel_type          5284 non-null   object 
 7   transmission_type  5284 non-null   object 
 8   exterior_color     5284 non-null   object 
 9   interior_color     5284 non-null   object 
 10  warranty           5284 non-null   object 
 11  city               5284 non-null   object 
 12  seller_type        5284 non-null   object 
 13  price              5284 non-null   int64  
 14  engine_capacity    3172 non-null   float64
 15  horsepower_val     4645 non-null   float64
 16  brand_enc          5284 non-n

In [6]:
X = df.drop(columns=['price'])
y = df['price']

# X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train, X_valid, y_train, y_valid = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'X_train shape = {X_train.shape}')

X_train shape = (4227, 18)


In [7]:
# y_train = np.log1p(y_train)

In [8]:
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

Numerical features: ['0', '1', '2', '3', '4', 'engine_capacity', 'horsepower_val', 'brand_enc', 'model_enc', 'trim_enc']
Categorical features: ['body_type', 'fuel_type', 'transmission_type', 'exterior_color', 'interior_color', 'warranty', 'city', 'seller_type']


In [9]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

X_train_transformed = pipeline.fit_transform(X_train)
X_valid_transformed = pipeline.transform(X_valid)


In [10]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

models = {
    "SVR": SVR(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "ExtraTrees": ExtraTreesRegressor(random_state=42),
    "XGBoost": XGBRegressor(verbosity=0, random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),

    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(),

    "GradientBoosting": GradientBoostingRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "KNeighbors": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=1000),
    "LightGBM": LGBMRegressor()
}

In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


def train_and_evaluate(models, X_train, y_train, X_valid, y_valid):
    results = {}

    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        predictions = model.predict(X_valid)

        # predictions = np.expm1(predictions)

        mse = mean_squared_error(y_valid, predictions)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_valid, predictions)
        r2 = r2_score(y_valid, predictions)

        results[name] = {
            "MSE": mse,
            "RMSE": rmse,
            "MAE": mae,
            "R2": r2
        }
    return results


In [12]:
results = train_and_evaluate(models, X_train_transformed, y_train, X_valid_transformed, y_valid)
results_df = pd.DataFrame(results).T.sort_values(by="RMSE")
print(results_df)

#                            MSE           RMSE           MAE        R2
# ExtraTrees        2.300680e+09   47965.405562  27334.086064  0.832969
# CatBoost          2.332535e+09   48296.323356  28911.441108  0.830657
# LightGBM          2.544481e+09   50442.845133  29909.264855  0.815269
# RandomForest      2.643218e+09   51412.237630  29510.046310  0.808101
# XGBoost           3.004045e+09   54809.167116  32024.214844  0.781905
# GradientBoosting  3.060930e+09   55325.675034  33478.641035  0.777775
# KNeighbors        3.073613e+09   55440.173602  33711.935667  0.776854
# Ridge             4.233555e+09   65065.776787  44321.855722  0.692641
# Lasso             4.240731e+09   65120.895425  44350.276718  0.692120
# LinearRegression  4.241846e+09   65129.453553  44358.322408  0.692040
# MLP               4.477839e+09   66916.657159  45434.789582  0.674906
# ElasticNet        4.663008e+09   68286.222193  45774.959768  0.661463
# AdaBoost          5.208130e+09   72167.372223  50042.730693  0.621887
# DecisionTree      6.519399e+09   80742.795897  41289.352886  0.526688
# SVR               1.557468e+10  124798.543912  84310.212564 -0.130731


Training SVR...
Training RandomForest...
Training ExtraTrees...
Training XGBoost...
Training CatBoost...
Training LinearRegression...
Training Ridge...
Training Lasso...
Training ElasticNet...
Training DecisionTree...
Training GradientBoosting...
Training AdaBoost...
Training KNeighbors...
Training MLP...




Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000975 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1951
[LightGBM] [Info] Number of data points in the train set: 4227, number of used features: 59
[LightGBM] [Info] Start training from score 126314.462266
                           MSE           RMSE           MAE        R2
ExtraTrees        2.300680e+09   47965.405562  27334.086064  0.832969
CatBoost          2.332535e+09   48296.323356  28911.441108  0.830657
LightGBM          2.544481e+09   50442.845133  29909.264855  0.815269
RandomForest      2.643218e+09   51412.237630  29510.046310  0.808101
XGBoost           3.004045e+09   54809.167116  32024.214844  0.781905
GradientBoosting  3.060930e+09   55325.675034  33478.641035  0.777775
KNeighbors        3.073613e+09   55440.173602  33711.935667  0.776854
Ridge             4.233555e+09   65065.776787  44321.855722  0.692641
Las



In [13]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "ExtraTrees": ExtraTreesRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    "XGBoost": XGBRegressor(verbosity=0, random_state=42)
}

param_grids = {
    "RandomForest": {
        'pca__n_components': [5, 10, 15, 19],
        'model__n_estimators': [100, 300, 500, 800],
        'model__max_depth': [None, 10, 20, 30, 50],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__max_features': ['sqrt', 'log2', None]
    },
    "ExtraTrees": {
        'pca__n_components': [5, 10, 15, 19],
        'model__n_estimators': [100, 300, 500, 800],
        'model__max_depth': [None, 10, 20, 30, 50],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__max_features': ['sqrt', 'log2', None]
    },
    "CatBoost": {
        'pca__n_components': [5, 10, 15, 19],
        'model__iterations': [500, 1000],
        'model__depth': [4, 6, 8, 10],
        'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'model__l2_leaf_reg': [1, 3, 5, 7, 9],
        'model__bagging_temperature': [0, 0.5, 1]
    },
    "XGBoost": {
        'pca__n_components': [5, 10, 15, 19],
        'model__n_estimators': [100, 300, 500, 800],
        'model__max_depth': [3, 5, 7, 10],
        'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'model__subsample': [0.6, 0.8, 1.0],
        'model__colsample_bytree': [0.6, 0.8, 1.0],
        'model__gamma': [0, 0.1, 0.3, 0.5],
        'model__reg_alpha': [0, 0.1, 1],
        'model__reg_lambda': [1, 1.5, 2]
    }
}


In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV

preds = {}
for name, model in models.items():
    print(f"\n GridSearch for {name}...")

    pipeline = Pipeline([
        ('pca', PCA()),
        ('model', model)
    ])

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[name],
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=2,
        n_jobs=-1
    )

    grid_search.fit(X_train_transformed, y_train)

    print(f" Best params for {name}:")
    print(grid_search.best_params_)
    print(f" Best CV score (neg MSE): {-grid_search.best_score_:.4f}")

    best_pipeline = grid_search.best_estimator_

    cv_scores = cross_val_score(
        best_pipeline,
        X_train_transformed,
        y_train,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    print(f" Final 5-fold CV RMSE (log scale): {(-cv_scores.mean()) ** 0.5:.4f} ± {cv_scores.std() ** 0.5:.4f}")

    y_pred_log = best_pipeline.predict(X_valid_transformed)

    y_pred = np.expm1(y_pred_log)

    rmse = mean_squared_error(y_valid, y_pred)
    r2 = r2_score(y_valid, y_pred)

    print(f"{name} on Validation:")
    print(f"RMSE: {rmse:.4f}")
    print(f"R²:   {r2:.4f}")

    preds[name] = y_pred


 GridSearch for RandomForest...
Fitting 5 folds for each of 2160 candidates, totalling 10800 fits
