Dataset: Used Car Price Prediction Dataset (https://www.kaggle.com/datasets/taeefnajib/used-car-price-prediction-dataset).

This notebook builds three simple models: XGBoost (xgb), LightGBM (lgbm) and CatBoost (cat). Hyperparameters are tuned for each of them, and optimized blending weights are found with Optuna. A cross-validated RMSE of 62058 is reached on the original dataset.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_theme(style="ticks", palette="pastel")
import matplotlib.pyplot as plt

import re
import sklearn
from sklearn import preprocessing
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler, RobustScaler
from numpy import percentile
from sklearn.decomposition import PCA

from sklearn.linear_model import HuberRegressor
from xgboost import XGBRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool

from sklearn.model_selection import KFold
from sklearn.base import clone
from sklearn.metrics import mean_squared_error
import gc
import warnings
warnings.filterwarnings("ignore")

import optuna
from optuna.visualization import plot_param_importances
from optuna.samplers import RandomSampler, TPESampler, CmaEsSampler
from optuna.pruners import HyperbandPruner
from functools import partial

In [None]:
pip install cmaes

In [None]:
# Number of folds used by every KFold split in this notebook.
n_fold = 5
# Seed shared by the CV splitter, the models and the Optuna samplers.
random_seed = 0

# 1. Import and Glance at the Data.

In [None]:
# Load the raw listing table; every later cell transforms this frame.
original = pd.read_csv(
    filepath_or_buffer="/kaggle/input/used-car-price-prediction-dataset/used_cars.csv",
)

The original dataset has 4009 samples. Only the "model_year" column is numerical; all the others are categorical. The columns "fuel_type", "accident" and "clean_title" have null values.

.info(), .describe(), .isnull().sum() and .unique() are frequently used throughout the notebook to check intermediate results.

# 2. Feature Engineering.

Convert the milage and price columns of the original dataset from formatted strings to numbers. New features: car age, milage_per_year, horsepower, engine_displacement, no_of_cylinder, transmission_speed and transmission_type. Drop the engine, transmission and model_year columns.

In [None]:
# Strip thousands separators and unit/currency markers, then cast to float.
# regex=False is explicit because '$' and '.' are regex metacharacters and the
# default of `regex` differs between pandas versions; these are literal removals.
original['milage'] = (
    original['milage']
    .str.replace(',', '', regex=False)
    .str.replace(' mi.', '', regex=False)
    .astype(float)
)
original['price'] = (
    original['price']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)

In [None]:
def engine_feat_extract(data):
    """Derive horsepower, displacement and cylinder count from the 'engine' text.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a free-text 'engine' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` without 'engine' and with three new string columns
        ('horsepower', 'engine_displacement', 'no_of_cylinder'). Rows where a
        pattern is not found (or where 'engine' is missing) hold NaN.
    """
    df = data.copy()

    # Horsepower is read from the raw text, before any normalisation.
    df['horsepower'] = df['engine'].str.extract(r'(\d+\.\d+)HP', expand=False)

    # Vectorised, literal replacements: unlike ``apply(lambda x: x.replace(...))``
    # these leave missing values as NaN instead of raising on a non-string.
    engine = (
        df['engine']
        .str.replace(' Litre', 'L', regex=False)
        .str.replace(' Liter', 'L', regex=False)
        .str.replace(' L', 'L', regex=False)
    )
    df['engine_displacement'] = engine.str.extract(r'(\d+\.\d+)L', expand=False)

    # Collapse 'V-6' style spellings so one pattern covers both forms.
    engine = engine.str.replace('V-', 'V', regex=False)
    cylinders = engine.str.extract(
        r'( \d+ | V\d+ | I\d+ | W\d+ | H\d+ |I\d+ |V\d+ |V\d+|I\d+)',
        expand=False,
    ).str.strip()
    # Drop the layout letter and keep only the digits.
    for layout_letter in ('V', 'I', 'H', 'W'):
        cylinders = cylinders.str.replace(layout_letter, '', regex=False)
    df['no_of_cylinder'] = cylinders

    return df.drop(columns='engine')

In [None]:
def transmission_feat_extract(data):
    """Derive gear count and a coarse gearbox type from the 'transmission' text.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a free-text 'transmission' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` without 'transmission' and with two new columns:
        'transmission_speed' (digits preceding '-Speed', as a string, NaN when
        absent) and 'transmission_type' ('AT', 'MT', 'AT/MT', 'Fixed Gear',
        NaN for bare '6-Speed'/'7-Speed' values, otherwise the normalised text).
    """
    df = data.copy()

    # Spelling normalisation. Order matters: 'Automatic' must be rewritten
    # before the shorter 'Auto', and 'Single-Speed' before ' Speed'.
    literal_rewrites = (
        ('Automatic', 'AT'), ('A/T', 'AT'), ('At', 'AT'),
        ('M/T', 'MT'), ('Mt', 'MT'), ('Manual', 'MT'),
        ('Auto', 'AT'),
        ('-Spd', '-Speed'), ('-SPEED', '-Speed'),
        ('Single-Speed', '1-Speed'), (' Speed', '-Speed'),
    )
    transmission = df['transmission']
    for old, new in literal_rewrites:
        transmission = transmission.str.replace(old, new, regex=False)

    df['transmission_speed'] = (
        transmission.str.extract(r'(\d+)-Speed', expand=False).str.strip()
    )

    # 'AT/MT' is applied last: every 'AT/MT' string also contains 'AT' and
    # 'MT', so labelling it first would let those two masks overwrite it.
    transmission_type = transmission
    for label in ('AT', 'MT', 'AT/MT'):
        has_label = transmission.str.contains(label, na=False, case=False)
        transmission_type = transmission_type.mask(has_label, label)

    transmission_type = transmission_type.str.replace(
        '1-Speed Fixed Gear', 'Fixed Gear', regex=False
    )
    # A bare gear count carries no type information; treat it as missing.
    for bare_speed in ('6-Speed', '7-Speed'):
        is_bare = transmission_type.str.contains(bare_speed, na=False, case=False)
        transmission_type = transmission_type.mask(is_bare, np.nan)
    df['transmission_type'] = transmission_type

    return df.drop(columns='transmission')

In [None]:
def feature_engineering(data, reference_year=2025):
    """Run all feature-engineering steps on the listing table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'engine', 'transmission', 'model_year', 'milage', 'brand',
        'model', 'ext_col' and 'int_col' columns. It is not modified.
    reference_year : int, default 2025
        Year that vehicle age is measured from (``age = reference_year - model_year``).

    Returns
    -------
    pandas.DataFrame
        Copy with engine/transmission features extracted, 'model_year' replaced
        by 'age', a 'milage_per_year' column added and the text columns
        lower-cased.
    """
    df = data.copy()

    df = engine_feat_extract(df)
    df = transmission_feat_extract(df)

    df['age'] = reference_year - df['model_year']
    df.drop(['model_year'], axis=1, inplace=True)

    # Use at least one year as the denominator so a vehicle from the reference
    # year (age 0) does not produce an infinite rate.
    df['milage_per_year'] = df['milage'] / df['age'].clip(lower=1)

    # Lower-case the text columns so spelling variants share one category.
    for text_col in ('brand', 'model', 'ext_col', 'int_col', 'transmission_type'):
        df[text_col] = df[text_col].str.lower()

    return df

In [None]:
# Replace the raw table with its feature-engineered version.
original = original.pipe(feature_engineering)

Now the dataset has 15 features.

# 3. Fill in Missing Values.

We need to deal with missing values in both numerical columns ('horsepower', 'engine_displacement', 'no_of_cylinder' and 'transmission_speed') and categorical columns ('fuel_type', 'accident', 'clean_title' and 'transmission_type').

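Missing numerical values are filled with the column mode. For the categorical columns with missing values, OrdinalEncoder() with encoded_missing_value set to -1 maps missing entries to -1; compared to SimpleImputer and KNNImputer, I think this makes more sense for this dataset. The remaining categorical variables are also ordinal-encoded into numbers.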

In [None]:
def fill_missing(data):
    """Impute the extracted numeric columns and ordinal-encode the categoricals.

    Works on a copy of ``data`` and returns it:
      * 'horsepower', 'engine_displacement', 'no_of_cylinder' and
        'transmission_speed' get their gaps filled with the column mode and
        are cast to float;
      * 'fuel_type', 'accident', 'clean_title' and 'transmission_type' are
        ordinal-encoded with missing entries mapped to -1;
      * 'brand', 'model', 'ext_col' and 'int_col' are ordinal-encoded with the
        encoder's default settings.
    """
    df = data.copy()

    numeric_like = ('horsepower', 'engine_displacement', 'no_of_cylinder', 'transmission_speed')
    for name in numeric_like:
        most_common = df[name].mode()[0]
        df[[name]] = df[[name]].fillna(most_common).astype(float)

    # Each entry: (columns, keyword arguments for that group's encoders).
    encoding_plan = (
        (('fuel_type', 'accident', 'clean_title', 'transmission_type'), {'encoded_missing_value': -1}),
        (('brand', 'model', 'ext_col', 'int_col'), {}),
    )
    for names, encoder_kwargs in encoding_plan:
        for name in names:
            # A fresh encoder per column, so categories are never shared.
            encoder = preprocessing.OrdinalEncoder(**encoder_kwargs)
            df[[name]] = encoder.fit_transform(df[[name]])

    return df

In [None]:
# Impute and encode; after this step every column is numeric.
original = original.pipe(fill_missing)

# 4. Build Models.

In [None]:
# Separate the regression target from the feature matrix.
y_original = original['price']
X_original = original.drop(columns=['price'])

In [None]:
# One scaler fitted on the full feature matrix; the CV loops below only call
# transform() on it, so validation folds contribute to the scaling statistics.
scaler = StandardScaler().fit(X_original)

Three models are built. Hyperparameters are tuned by Optuna for each of them.

* 4.1 XGB

In [None]:
def objective(trial):
    """Optuna objective for XGBoost: out-of-fold RMSE on the full dataset.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the sampled values for alpha, lambda, subsample,
        colsample_bytree, max_depth, min_child_weight and gamma.

    Returns
    -------
    float
        RMSE between ``y_original`` and the out-of-fold predictions collected
        over ``n_fold`` shuffled KFold splits.
    """
    params = {
        'eta': 0.02,
        'n_estimators': 10000,
        'alpha': trial.suggest_float('alpha', 0., 1.0),
        'lambda': trial.suggest_float('lambda', 1., 100.0),
        'subsample': trial.suggest_float('subsample', 0., 1.0),  # row-wise sampling
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0., 1.0),  # column-wise sampling
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_weight': trial.suggest_float("min_child_weight", 1., 50.),
        'gamma': trial.suggest_float('gamma', 0., 1.0),
        #'max_bin': trial.suggest_int('max_bin', 20, 400), # Gpu does not accept customized max_bin.
        'tree_method': 'gpu_hist',
        'eval_metric': 'rmse',
        'random_state': random_seed,
        'objective': 'reg:squarederror',
        'booster': 'gbtree',
        'grow_policy': 'lossguide',
        'verbosity': 1,
        'device': 'gpu',
        # Early stopping is configured on the estimator: recent XGBoost
        # releases no longer accept it as a fit() argument.
        'early_stopping_rounds': 100,
    }

    cv = KFold(n_fold, shuffle=True, random_state=random_seed)
    val_preds = np.zeros(len(X_original))

    model = XGBRegressor(**params)

    for train_idx, val_idx in cv.split(X_original, y_original):
        X_train_fold = scaler.transform(X_original.iloc[train_idx])
        X_val_fold = scaler.transform(X_original.iloc[val_idx])
        y_train_fold = y_original.iloc[train_idx]
        y_val_fold = y_original.iloc[val_idx]

        model.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], verbose=False)

        val_preds[val_idx] = model.predict(X_val_fold)
        gc.collect()

    # Scored once, after every fold has written its predictions. Taking the
    # square root of the MSE avoids the `squared=False` argument that newer
    # scikit-learn versions removed.
    return np.sqrt(mean_squared_error(y_original, val_preds))

In [None]:
study_name = "xgb"
sampler = TPESampler(multivariate=True, group=True, seed=random_seed)

# Settings that are fixed rather than searched. `study.best_params` contains
# only the values suggested through the trial, so these must be merged back in
# or a freshly tuned model would silently run with library defaults.
xgb_fixed_params = {
    'eta': 0.02,
    'n_estimators': 10000,
    'tree_method': 'gpu_hist',
    'eval_metric': 'rmse',
    'random_state': random_seed,
    'objective': 'reg:squarederror',
    'booster': 'gbtree',
    'grow_policy': 'lossguide',
    'verbosity': 1,
    'device': 'gpu',
}

# Flip to True to re-run the search; otherwise the stored result is used.
optimize = False
if optimize:
    study = optuna.create_study(study_name=study_name, sampler=sampler, direction="minimize", load_if_exists=True)
    study.optimize(objective, timeout=1800)

    print(f"best optimized rmse: {study.best_value:0.5f}") #72612
    print(f"best hyperparameters: {study.best_params}")
    xgb_params = {**xgb_fixed_params, **study.best_params}
else:
    # Tuned values from a previous search.
    xgb_params = {
        **xgb_fixed_params,
        'alpha': 0.4044903388091685,
        'lambda': 51.28639166297703,
        'subsample': 0.9595530034937119,
        'colsample_bytree': 0.5938659459702476,
        'max_depth': 13,
        'min_child_weight': 4.545108004764614,
        'gamma': 0.2646495961144566,
    }

* 4.2 LGBM

In [None]:
def objective(trial):
    """Optuna objective for LightGBM: out-of-fold RMSE on the full dataset.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the sampled values for num_leaves, min_child_samples,
        max_depth, reg_alpha, reg_lambda, colsample_bytree, subsample,
        min_gain_to_split and bin_construct_sample_cnt.

    Returns
    -------
    float
        RMSE between ``y_original`` and the out-of-fold predictions collected
        over ``n_fold`` shuffled KFold splits.
    """
    params = {
        'learning_rate': 0.01,
        'n_estimators': 2000,
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 13),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),  # column-wise sampling
        # NOTE(review): LightGBM's documentation says bagging needs a non-zero
        # subsample_freq (bagging_freq) as well; none is set here - confirm
        # that this row-sampling value actually takes effect.
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),  # row-wise sampling
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.1, 1.0),
        'bin_construct_sample_cnt': trial.suggest_int('bin_construct_sample_cnt', 20000, 300000),
        #'max_bin': trial.suggest_int('max_bin', 20, 400), # Gpu does not accept customized max_bin.
        'random_state': random_seed,
        'verbosity': -1,
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'device': 'gpu',
        'eval_metric': 'l2',
    }

    cv = KFold(n_fold, shuffle=True, random_state=random_seed)
    val_preds = np.zeros(len(X_original))

    model = LGBMRegressor(**params)

    for train_idx, val_idx in cv.split(X_original, y_original):
        X_train_fold = scaler.transform(X_original.iloc[train_idx])
        X_val_fold = scaler.transform(X_original.iloc[val_idx])
        y_train_fold = y_original.iloc[train_idx]
        y_val_fold = y_original.iloc[val_idx]

        model.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], eval_metric='l2', callbacks=[lgb.early_stopping(stopping_rounds=100)])

        val_preds[val_idx] = model.predict(X_val_fold)
        gc.collect()

    # Scored once, after every fold has written its predictions. Taking the
    # square root of the MSE avoids the `squared=False` argument that newer
    # scikit-learn versions removed.
    return np.sqrt(mean_squared_error(y_original, val_preds))

In [None]:
study_name = "lgbm"
sampler = TPESampler(multivariate=True, group=True, seed=random_seed)

# Settings that are fixed rather than searched. `study.best_params` contains
# only the values suggested through the trial, so these must be merged back in
# or a freshly tuned model would silently run with library defaults.
lgbm_fixed_params = {
    'learning_rate': 0.01,
    'n_estimators': 2000,
    'random_state': random_seed,
    'verbosity': -1,
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'device': 'gpu',
    'eval_metric': 'l2',
}

# Flip to True to re-run the search; otherwise the stored result is used.
optimize = False
if optimize:
    study = optuna.create_study(study_name=study_name, sampler=sampler, direction="minimize", load_if_exists=True)
    study.optimize(objective, timeout=1800)

    print(f"best optimized rmse: {study.best_value:0.5f}")
    print(f"best hyperparameters: {study.best_params}")
    lgbm_params = {**lgbm_fixed_params, **study.best_params}
else:
    # Tuned values from a previous search.
    lgbm_params = {
        **lgbm_fixed_params,
        'num_leaves': 124,
        'min_child_samples': 22,
        'max_depth': 12,
        'reg_alpha': 8.09210204580889e-09,
        'reg_lambda': 2.757213845367804e-06,
        'colsample_bytree': 0.8235143062544691,
        'subsample': 0.7315131225465594,
        'min_gain_to_split': 0.43989457605442905,
        'bin_construct_sample_cnt': 226499,
    }

* 4.3 CATB

In [None]:
def objective(trial):
    """Optuna objective for CatBoost: out-of-fold RMSE on the full dataset.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the sampled values for depth, l2_leaf_reg, random_strength,
        border_count and bagging_temperature.

    Returns
    -------
    float
        RMSE between ``y_original`` and the out-of-fold predictions collected
        over ``n_fold`` shuffled KFold splits.
    """
    params = {
        'learning_rate': 0.05,
        'iterations': 2000,
        'depth': trial.suggest_int('depth', 4, 16),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 100),
        #"rsm": trial.suggest_float("rsm", 0.5, 1.0),
        'random_strength': trial.suggest_float("random_strength", 0., 10.),
        'border_count': trial.suggest_int('border_count', 20, 500),
        'bagging_temperature': trial.suggest_float("bagging_temperature", 0., 1.),
        'random_state': random_seed,
        'grow_policy': 'SymmetricTree',
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'task_type': 'GPU',
        'logging_level': 'Silent',
    }

    cv = KFold(n_fold, shuffle=True, random_state=random_seed)
    val_preds = np.zeros(len(X_original))

    model = CatBoostRegressor(**params)

    for train_idx, val_idx in cv.split(X_original, y_original):
        X_train_fold = scaler.transform(X_original.iloc[train_idx])
        X_val_fold = scaler.transform(X_original.iloc[val_idx])
        y_train_fold = y_original.iloc[train_idx]
        y_val_fold = y_original.iloc[val_idx]

        model.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], early_stopping_rounds=100, verbose=False)

        val_preds[val_idx] = model.predict(X_val_fold)
        gc.collect()

    # Scored once, after every fold has written its predictions. Taking the
    # square root of the MSE avoids the `squared=False` argument that newer
    # scikit-learn versions removed.
    return np.sqrt(mean_squared_error(y_original, val_preds))

In [None]:
study_name = "catb"
sampler = TPESampler(multivariate=True, group=True, seed=random_seed)

# Settings that are fixed rather than searched. `study.best_params` contains
# only the values suggested through the trial, so these must be merged back in
# or a freshly tuned model would silently run with library defaults.
catb_fixed_params = {
    'learning_rate': 0.05,
    'iterations': 2000,
    'random_state': random_seed,
    'grow_policy': 'SymmetricTree',
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'task_type': 'GPU',
    'logging_level': 'Silent',
}

# Flip to True to re-run the search; otherwise the stored result is used.
optimize = False
if optimize:
    study = optuna.create_study(study_name=study_name, sampler=sampler, direction="minimize", load_if_exists=True)
    study.optimize(objective, timeout=1800)

    print(f"best optimized rmse: {study.best_value:0.5f}")
    print(f"best hyperparameters: {study.best_params}")
    catb_params = {**catb_fixed_params, **study.best_params}
else:
    # Tuned values from a previous search.
    catb_params = {
        **catb_fixed_params,
        'depth': 11,
        'l2_leaf_reg': 71.80374727086952,
        'random_strength': 6.027633760716439,
        'border_count': 282,
        'bagging_temperature': 0.4236547993389047,
    }

# 5. Cross Validation and Prediction Analysis.

Apply three hyperparameter-tuned models on the original dataset to get both the cv scores and predictions.

In [None]:
def validation(model):
    """Cross-validate ``model`` and collect its out-of-fold predictions.

    Parameters
    ----------
    model : XGBRegressor | LGBMRegressor | CatBoostRegressor
        Unfitted, configured estimator. It is cloned for every fold, so the
        instance passed in is never fitted.

    Returns
    -------
    tuple[float, numpy.ndarray]
        ``(rmse_full, val_preds)``: the RMSE of the out-of-fold predictions
        against ``y_original``, and the predictions themselves (one entry per
        row of ``X_original``).

    Raises
    ------
    TypeError
        If ``model`` is not one of the three supported estimator types.
    """
    cv = KFold(n_fold, shuffle=True, random_state=random_seed)
    val_preds = np.zeros(len(X_original))

    for train_idx, val_idx in cv.split(X_original, y_original):
        X_train_fold = scaler.transform(X_original.iloc[train_idx])
        X_val_fold = scaler.transform(X_original.iloc[val_idx])
        y_train_fold = y_original.iloc[train_idx]
        y_val_fold = y_original.iloc[val_idx]
        eval_set = [(X_val_fold, y_val_fold)]

        model_cloned = clone(model)

        # Each library takes its early-stopping configuration differently.
        if isinstance(model_cloned, XGBRegressor):
            # Set on the estimator: recent XGBoost releases no longer accept
            # early_stopping_rounds as a fit() argument.
            model_cloned.set_params(early_stopping_rounds=100)
            model_cloned.fit(X_train_fold, y_train_fold, eval_set=eval_set, verbose=False)
        elif isinstance(model_cloned, LGBMRegressor):
            model_cloned.fit(X_train_fold, y_train_fold, eval_set=eval_set, eval_metric='l2', callbacks=[lgb.early_stopping(stopping_rounds=100)])
        elif isinstance(model_cloned, CatBoostRegressor):
            model_cloned.fit(X_train_fold, y_train_fold, eval_set=eval_set, early_stopping_rounds=100, verbose=False)
        else:
            # Fail clearly instead of calling predict() on an unfitted clone.
            raise TypeError(f"unsupported model type: {type(model).__name__}")

        val_preds[val_idx] = model_cloned.predict(X_val_fold)

    # Scored once, after every fold has written its predictions. Taking the
    # square root of the MSE avoids the `squared=False` argument that newer
    # scikit-learn versions removed.
    rmse_full = np.sqrt(mean_squared_error(y_original, val_preds))

    return rmse_full, val_preds

In [None]:
# Out-of-fold score and predictions for each tuned model, on identical folds.
rmse_full_xgb, val_preds_xgb = validation(model=XGBRegressor(**xgb_params))

rmse_full_lgbm, val_preds_lgbm = validation(model=LGBMRegressor(**lgbm_params))

rmse_full_catb, val_preds_catb = validation(model=CatBoostRegressor(**catb_params))

In [None]:
# Side-by-side box plots: the true prices next to each model's
# out-of-fold predictions.
fig, axes = plt.subplots(1, 4)
panels = (
    ('y-original', y_original),
    ('val_preds_xgb', val_preds_xgb),
    ('val_preds_lgbm', val_preds_lgbm),
    ('val_preds_catb', val_preds_catb),
)
for ax, (title, values) in zip(axes, panels):
    sns.boxplot(data=values, ax=ax).set_title(title)
fig.tight_layout()
plt.show()

All three models underpredict vehicle prices for most training samples.

In [None]:
# XGBoost: out-of-fold predictions against the true prices.
plt.scatter(x=y_original, y=val_preds_xgb, alpha=0.5)
plt.xlabel('Actual Values')
plt.title('Actual vs Predicted Values')
plt.ylabel('Predicted Values')

In [None]:
# LightGBM: out-of-fold predictions against the true prices.
plt.scatter(x=y_original, y=val_preds_lgbm, alpha=0.5)
plt.xlabel('Actual Values')
plt.title('Actual vs Predicted Values')
plt.ylabel('Predicted Values')

In [None]:
# CatBoost: out-of-fold predictions against the true prices.
plt.scatter(x=y_original, y=val_preds_catb, alpha=0.5)
plt.xlabel('Actual Values')
plt.title('Actual vs Predicted Values')
plt.ylabel('Predicted Values')

# 6. Optuna Weights and Final Prediction.

Optuna is used to find the optimal weights for an ensemble of the three models. We look for the weights that minimize the RMSE of the blended out-of-fold predictions over the full training set.

In [None]:
class OptunaWeights:
    """Search blending weights for a set of model predictions with Optuna.

    For ``k`` prediction vectors, the first ``k - 1`` weights are sampled in
    [0, 1] and the last is set to ``1 - sum(others)``, so the weights always
    sum to 1 (the last one may therefore be negative).

    Parameters
    ----------
    random_state : int
        Seed for the CMA-ES sampler.
    n_trials : int, default 5000
        Number of Optuna trials run by :meth:`fit`.
    """

    def __init__(self, random_state, n_trials=5000):
        self.study = None    # optuna study, populated by fit()
        self.weights = None  # list of blending weights, populated by fit()
        self.random_state = random_state
        self.n_trials = n_trials

    def _objective(self, trial, y_true, y_preds):
        """Return the RMSE of the blend built from this trial's weights."""
        # Define the weights for the predictions from each model
        weights = [trial.suggest_float(f"weight{n}", 0, 1) for n in range(len(y_preds) - 1)]
        weights.append(1 - sum(weights))  # Ensure the sum of weights is 1

        # Calculate the weighted prediction
        weighted_pred = np.average(np.array(y_preds), axis=0, weights=weights)

        # sqrt(MSE) rather than `squared=False`, which newer scikit-learn
        # versions removed.
        return np.sqrt(mean_squared_error(y_true, weighted_pred))

    def fit(self, y_true, y_preds):
        """Run the weight search and store the best weights in ``self.weights``.

        Parameters
        ----------
        y_true : array-like
            Target values.
        y_preds : sequence of array-like
            One prediction vector per model, each aligned with ``y_true``.

        Returns
        -------
        OptunaWeights
            ``self``, so calls can be chained.
        """
        optuna.logging.set_verbosity(optuna.logging.ERROR)
        sampler = CmaEsSampler(seed=self.random_state)
        pruner = HyperbandPruner()
        self.study = optuna.create_study(sampler=sampler, pruner=pruner, study_name="OptunaWeights",
                                         direction='minimize')
        objective_partial = partial(self._objective, y_true=y_true, y_preds=y_preds)
        self.study.optimize(objective_partial, n_trials=self.n_trials, show_progress_bar=True)
        weights = [self.study.best_params[f"weight{n}"] for n in range(len(y_preds) - 1)]
        weights.append(1 - sum(weights))  # Ensure the sum of weights is 1
        self.weights = weights
        return self

    def predict(self, y_preds):
        """Blend ``y_preds`` with the stored weights.

        Parameters
        ----------
        y_preds : sequence of array-like
            One prediction vector per model, in the order used by :meth:`fit`.

        Returns
        -------
        numpy.ndarray
            Weighted average of the prediction vectors.

        Raises
        ------
        ValueError
            If no weights are available yet.
        """
        if self.weights is None:
            raise ValueError("OptunaWeights.predict() called before fit()")
        return np.average(np.array(y_preds), axis=0, weights=self.weights)

In [None]:
# Search the blending weights on the three models' out-of-fold predictions.
ow = OptunaWeights(random_state=random_seed)
ow.fit(y_true=y_original, y_preds=[val_preds_xgb, val_preds_lgbm, val_preds_catb])
weights = ow.weights
print(weights)

Final prediction:

In [None]:
# Blend of the out-of-fold predictions; the value is only displayed as the
# cell output, it is not stored in a variable.
(
    val_preds_xgb * weights[0]
    + val_preds_lgbm * weights[1]
    + val_preds_catb * weights[2]
)