In [1]:
import numpy as np
import pandas as pd

In [2]:
loaded_data = pd.read_csv('../data/topic21_v23_train.csv')

loaded_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7962 entries, 0 to 7961
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   0                   7501 non-null   float64
 1   1                   7472 non-null   float64
 2   2                   7359 non-null   float64
 3   3                   7556 non-null   float64
 4   4                   7495 non-null   float64
 5   brand               7962 non-null   object 
 6   model               7962 non-null   object 
 7   trim                7951 non-null   object 
 8   body_type           7962 non-null   object 
 9   fuel_type           7962 non-null   object 
 10  transmission_type   7962 non-null   object 
 11  engine_capacity_cc  6362 non-null   object 
 12  horsepower          7584 non-null   object 
 13  exterior_color      7962 non-null   object 
 14  interior_color      7962 non-null   object 
 15  warranty            7962 non-null   object 
 16  city  

In [3]:
def remove_outliers(df, threshold=2.5):
    # Create a copy of the dataframe
    df_copy = df.copy()

    # Get numeric columns
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

    # Calculate z-scores for numeric columns
    z_scores = np.abs((df[numeric_cols] - df[numeric_cols].mean()) / df[numeric_cols].std())

    # Find rows where any column has z-score > threshold
    outliers = (z_scores > threshold).any(axis=1)

    # Print number of outliers removed
    print(f"Number of outliers removed: {outliers.sum()}")

    # Return dataframe without outliers
    return df_copy[~outliers]


In [4]:
from sklearn.model_selection import train_test_split

# loaded_data = loaded_data.dropna(subset=['0', '1', '2', '3', '4'])
# loaded_data = loaded_data.dropna()
loaded_data = remove_outliers(loaded_data)

loaded_data.info()

Number of outliers removed: 1028
<class 'pandas.core.frame.DataFrame'>
Index: 6934 entries, 0 to 7961
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   0                   6528 non-null   float64
 1   1                   6491 non-null   float64
 2   2                   6407 non-null   float64
 3   3                   6589 non-null   float64
 4   4                   6531 non-null   float64
 5   brand               6934 non-null   object 
 6   model               6934 non-null   object 
 7   trim                6924 non-null   object 
 8   body_type           6934 non-null   object 
 9   fuel_type           6934 non-null   object 
 10  transmission_type   6934 non-null   object 
 11  engine_capacity_cc  5672 non-null   object 
 12  horsepower          6596 non-null   object 
 13  exterior_color      6934 non-null   object 
 14  interior_color      6934 non-null   object 
 15  warranty            6934 no

In [5]:
df = loaded_data.copy()

def extract_range_mean(val):
    try:
        nums = [int(s) for s in val.replace('cc','').replace('HP','').split('-')]
        return np.mean(nums)
    except:
        return np.nan

df['engine_capacity'] = df['engine_capacity_cc'].apply(extract_range_mean)
df['horsepower_val'] = df['horsepower'].apply(extract_range_mean)
df.drop(columns=['engine_capacity_cc', 'horsepower'], inplace=True)

for col in ['brand', 'model', 'trim']:
    means = df.groupby(col)['price'].mean()
    df[col + '_enc'] = df[col].map(means)

df.drop(columns=['brand', 'model', 'trim'], inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6934 entries, 0 to 7961
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   0                  6528 non-null   float64
 1   1                  6491 non-null   float64
 2   2                  6407 non-null   float64
 3   3                  6589 non-null   float64
 4   4                  6531 non-null   float64
 5   body_type          6934 non-null   object 
 6   fuel_type          6934 non-null   object 
 7   transmission_type  6934 non-null   object 
 8   exterior_color     6934 non-null   object 
 9   interior_color     6934 non-null   object 
 10  warranty           6934 non-null   object 
 11  city               6934 non-null   object 
 12  seller_type        6934 non-null   object 
 13  price              6934 non-null   int64  
 14  engine_capacity    4264 non-null   float64
 15  horsepower_val     6133 non-null   float64
 16  brand_enc          6934 non-n

In [6]:
X = df.drop(columns=['price'])
y = df['price']

# X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train, X_valid, y_train, y_valid = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'X_train shape = {X_train.shape}')

X_train shape = (5547, 18)


In [7]:
# y_train = np.log1p(y_train)

In [8]:
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

Numerical features: ['0', '1', '2', '3', '4', 'engine_capacity', 'horsepower_val', 'brand_enc', 'model_enc', 'trim_enc']
Categorical features: ['body_type', 'fuel_type', 'transmission_type', 'exterior_color', 'interior_color', 'warranty', 'city', 'seller_type']


In [9]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

X_train_transformed = pipeline.fit_transform(X_train)
X_valid_transformed = pipeline.transform(X_valid)


In [10]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

models = {
    "SVR": SVR(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "ExtraTrees": ExtraTreesRegressor(random_state=42),
    "XGBoost": XGBRegressor(verbosity=0, random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    #
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(),

    "GradientBoosting": GradientBoostingRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "KNeighbors": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=1000),
    "LightGBM": LGBMRegressor()
}

In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


def train_and_evaluate(models, X_train, y_train, X_valid, y_valid):
    results = {}

    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        predictions = model.predict(X_valid)

        # predictions = np.expm1(predictions)

        mse = mean_squared_error(y_valid, predictions)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_valid, predictions)
        r2 = r2_score(y_valid, predictions)

        results[name] = {
            "MSE": mse,
            "RMSE": rmse,
            "MAE": mae,
            "R2": r2
        }
    return results


In [12]:
results = train_and_evaluate(models, X_train_transformed, y_train, X_valid_transformed, y_valid)
results_df = pd.DataFrame(results).T.sort_values(by="RMSE")
print(results_df)

#                            MSE           RMSE           MAE        R2
# ExtraTrees        2.300680e+09   47965.405562  27334.086064  0.832969
# CatBoost          2.332535e+09   48296.323356  28911.441108  0.830657
# LightGBM          2.544481e+09   50442.845133  29909.264855  0.815269
# RandomForest      2.643218e+09   51412.237630  29510.046310  0.808101
# XGBoost           3.004045e+09   54809.167116  32024.214844  0.781905
# GradientBoosting  3.060930e+09   55325.675034  33478.641035  0.777775
# KNeighbors        3.073613e+09   55440.173602  33711.935667  0.776854
# Ridge             4.233555e+09   65065.776787  44321.855722  0.692641
# Lasso             4.240731e+09   65120.895425  44350.276718  0.692120
# LinearRegression  4.241846e+09   65129.453553  44358.322408  0.692040
# MLP               4.477839e+09   66916.657159  45434.789582  0.674906
# ElasticNet        4.663008e+09   68286.222193  45774.959768  0.661463
# AdaBoost          5.208130e+09   72167.372223  50042.730693  0.621887
# DecisionTree      6.519399e+09   80742.795897  41289.352886  0.526688
# SVR               1.557468e+10  124798.543912  84310.212564 -0.130731


Training SVR...
Training RandomForest...
Training ExtraTrees...
Training XGBoost...
Training CatBoost...
Training LinearRegression...
Training Ridge...
Training Lasso...
Training ElasticNet...
Training DecisionTree...
Training GradientBoosting...
Training AdaBoost...
Training KNeighbors...
Training MLP...




Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001349 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1965
[LightGBM] [Info] Number of data points in the train set: 5547, number of used features: 60
[LightGBM] [Info] Start training from score 128315.807644
                           MSE           RMSE           MAE        R2
ExtraTrees        2.783556e+09   52759.420166  28862.946482  0.813568
CatBoost          2.824519e+09   53146.208274  30925.670095  0.810824
RandomForest      2.897725e+09   53830.522311  30835.098919  0.805921
XGBoost           3.039064e+09   55127.709185  32600.339844  0.796455
LightGBM          3.182929e+09   56417.451044  32776.262044  0.786819
KNeighbors        3.468204e+09   58891.461548  35729.918962  0.767712
GradientBoosting  3.605717e+09   60047.618684  35375.775391  0.758502
MLP      



In [13]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "ExtraTrees": ExtraTreesRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    "XGBoost": XGBRegressor(verbosity=0, random_state=42)
}

param_grids = {
    "RandomForest": {
        'pca__n_components': [5, 10, 15, 19],
        'model__n_estimators': [100, 300, 500, 800],
        'model__max_depth': [None, 10, 20, 30, 50],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__max_features': ['sqrt', 'log2', None]
    },
    "ExtraTrees": {
        'pca__n_components': [5, 10, 15, 19],
        'model__n_estimators': [100, 300, 500, 800],
        'model__max_depth': [None, 10, 20, 30, 50],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__max_features': ['sqrt', 'log2', None]
    },
    "CatBoost": {
        'pca__n_components': [5, 10, 15, 19],
        'model__iterations': [500, 1000],
        'model__depth': [4, 6, 8, 10],
        'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'model__l2_leaf_reg': [1, 3, 5, 7, 9],
        'model__bagging_temperature': [0, 0.5, 1]
    },
    "XGBoost": {
        'pca__n_components': [5, 10, 15, 19],
        'model__n_estimators': [100, 300, 500, 800],
        'model__max_depth': [3, 5, 7, 10],
        'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'model__subsample': [0.6, 0.8, 1.0],
        'model__colsample_bytree': [0.6, 0.8, 1.0],
        'model__gamma': [0, 0.1, 0.3, 0.5],
        'model__reg_alpha': [0, 0.1, 1],
        'model__reg_lambda': [1, 1.5, 2]
    }
}


In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV

preds = {}

pplines = {}
for name, model in models.items():
    print(f"\n GridSearch for {name}...")

    pipeline = Pipeline([
        ('pca', PCA()),
        ('model', model)
    ])

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[name],
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=2,
        n_jobs=-1
    )

    grid_search.fit(X_train_transformed, y_train)

    print(f" Best params for {name}:")
    print(grid_search.best_params_)
    print(f" Best CV score (neg MSE): {-grid_search.best_score_:.4f}")

    best_pipeline = grid_search.best_estimator_

    cv_scores = cross_val_score(
        best_pipeline,
        X_train_transformed,
        y_train,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    print(f" Final 5-fold CV RMSE (log scale): {(-cv_scores.mean()) ** 0.5:.4f} ± {cv_scores.std() ** 0.5:.4f}")

    y_pred = best_pipeline.predict(X_valid_transformed)

    preds[name] = y_pred
    pplines[name] = best_pipeline


 GridSearch for RandomForest...
Fitting 5 folds for each of 2160 candidates, totalling 10800 fits
 Best params for RandomForest:
{'model__max_depth': None, 'model__max_features': None, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 800, 'pca__n_components': 15}
 Best CV score (neg MSE): 3963025946.5291
 Final 5-fold CV RMSE (log scale): 62952.5690 ± 25490.6335

 GridSearch for ExtraTrees...
Fitting 5 folds for each of 2160 candidates, totalling 10800 fits
 Best params for ExtraTrees:
{'model__max_depth': 30, 'model__max_features': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 800, 'pca__n_components': 19}
 Best CV score (neg MSE): 3601071226.4137
 Final 5-fold CV RMSE (log scale): 60008.9262 ± 24688.9499

 GridSearch for CatBoost...
Fitting 5 folds for each of 1920 candidates, totalling 9600 fits


In [None]:

for name, y_pred in preds.items():
    rmse = mean_squared_error(y_valid, y_pred)
    r2 = r2_score(y_valid, y_pred)

    print(f"{name} on Validation:")
    print(f"RMSE: {rmse:.4f}")
    print(f"R²:   {r2:.4f}")
    print()
