
---

# **Used Car Price Regression Problem 🏦📊**

---


---

# **Imports 📦🔧**

---

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.base import clone
import re

import optuna
from optuna.samplers import TPESampler

from sklearn.model_selection import *
from sklearn.preprocessing import *

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import log_evaluation, early_stopping

from sklearn.metrics import *

pd.set_option('display.max_columns', None)
from IPython.display import clear_output
from tqdm import tqdm, trange
from tabulate import tabulate
import random
import time
import logging
from IPython.display import display
from IPython.display import display, HTML
from colorama import Fore
from datetime import datetime


---

# **Load and Basic Preprocessing Data 📥📊**

---

In [2]:
%%time

sample_sub = pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
train = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv')
Original = pd.read_csv('/kaggle/input/used-car-price-prediction-dataset/used_cars.csv')

Original[['milage', 'price']] = Original[['milage', 'price']].map(
    lambda x: int(''.join(re.findall(r'\d+', x))))

train = pd.concat([train, Original], ignore_index=True)

def update(df):
    
    t = 100
    
    df['accident'] = df['accident'].map({
        'None reported': 'not_reported',
        'At least 1 accident or damage reported': 'reported'
    })
    df['transmission'] = df['transmission'].str.replace('/', '').str.replace('-', '')
    df['transmission'] = df['transmission'].str.replace(' ', '_')
    
    cat_c = ['brand','model','fuel_type','engine','transmission','ext_col','int_col','accident','clean_title']
    re_ = ['model','engine','transmission','ext_col','int_col']
    
    for col in re_:
        df.loc[df[col].value_counts(dropna=False)[df[col]].values < t, col] = "noise"
        
    for col in cat_c:
        df[col] = df[col].fillna('missing')
        df[col] = df[col].astype('category')
        
    return df

train  = update(train)
test   = update(test)

CPU times: user 2.06 s, sys: 201 ms, total: 2.26 s
Wall time: 2.67 s


In [3]:
%%time

def feature(df):
    current_year = datetime.now().year

    df['Vehicle_Age'] = current_year - df['model_year']

    df['Mileage_per_Year'] = df['milage'] / df['Vehicle_Age']

    def extract_horsepower(engine):
        try:
            return float(engine.split('HP')[0])
        except:
            return None

    def extract_engine_size(engine):
        try:
            return float(engine.split(' ')[1].replace('L', ''))
        except:
            return None

    df['Horsepower'] = df['engine'].apply(extract_horsepower)
    df['Engine_Size'] = df['engine'].apply(extract_engine_size)
    df['Power_to_Weight_Ratio'] = df['Horsepower'] / df['Engine_Size']

    luxury_brands =  ['Mercedes-Benz', 'BMW', 'Audi', 'Porsche', 'Land', 
                    'Lexus', 'Jaguar', 'Bentley', 'Maserati', 'Lamborghini', 
                    'Rolls-Royce', 'Ferrari', 'McLaren', 'Aston', 'Maybach']
    df['Is_Luxury_Brand'] = df['brand'].apply(lambda x: 1 if x in luxury_brands else 0)

    df['Accident_Impact'] = df.apply(lambda x: 1 if x['accident'] == 1 and x['clean_title'] == 0 else 0, axis=1)
    
    return df

train = feature(train)
test = feature(test)

CPU times: user 2.56 s, sys: 80.4 ms, total: 2.64 s
Wall time: 2.64 s


In [4]:
train.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price,Vehicle_Age,Mileage_per_Year,Horsepower,Engine_Size,Power_to_Weight_Ratio,Is_Luxury_Brand,Accident_Impact
0,0.0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,AT,Yellow,Gray,not_reported,Yes,4200,17,12529.411765,172.0,1.6,107.5,0,0
1,1.0,Lincoln,noise,2002,143250,Gasoline,noise,AT,Silver,Beige,reported,Yes,4999,22,6511.363636,,,,0,0
2,2.0,Chevrolet,noise,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,AT,Blue,Gray,not_reported,Yes,13900,22,6215.045455,320.0,5.3,60.377358,0,0
3,3.0,Genesis,noise,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission_wDual_Shift_Mode,Black,Black,not_reported,Yes,45000,7,2785.714286,420.0,5.0,84.0,0,0
4,4.0,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7Speed_AT,Black,Beige,not_reported,Yes,97500,3,2462.666667,208.0,2.0,104.0,1,0



---

# **Basic Modeling 🧩📉**

---

In [18]:
%%time

X = train.drop(['price'], axis=1)
y = train['price']
callbacks = [early_stopping(stopping_rounds=100)]

SEED = 601
n_splits = 5

def Train_ML(model, model_name, test):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    oof_preds = np.zeros(X.shape[0])
    test_preds = np.zeros(test.shape[0])
    val_rmse_list = []
    train_rmse_list = []

    for fold_idx, (train_index, val_index) in tqdm(enumerate(kf.split(X)), desc=f"Model: {model_name}", total=n_splits):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model_clone = clone(model)
        model_clone.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=callbacks)

        val_preds = model_clone.predict(X_val)
        oof_preds[val_index] = val_preds
        val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
        val_rmse_list.append(val_rmse)

        train_preds = model_clone.predict(X_train)
        train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
        train_rmse_list.append(train_rmse)

        test_preds += model_clone.predict(test)
        clear_output(wait=True)

    mean_test_preds = test_preds / n_splits
    mean_val_rmse = np.mean(val_rmse_list)
    mean_train_rmse = np.mean(train_rmse_list)

    results = {
        'model_name': model_name,
        'mean_train_rmse': mean_train_rmse,
        'mean_val_rmse': mean_val_rmse,
        'per_fold_train_rmse': train_rmse_list,
        'per_fold_val_rmse': val_rmse_list
    }

    print(f"Model: {model_name}")
    print(f"Mean Train RMSE: {mean_train_rmse:.5f}")
    print(f"Mean Validation RMSE: {mean_val_rmse:.5f}\n")

    return oof_preds, mean_test_preds

CPU times: user 9.62 ms, sys: 5 µs, total: 9.62 ms
Wall time: 8.2 ms


In [19]:
%%time

Light2 = {'objective': 'regression','metric': 'rmse','num_boost_round': 10_000,'learning_rate': 0.023395755673174177, 'max_depth': 4, 'num_leaves': 159, 'min_child_weight': 6.64512679143092, 'min_split_gain': 1.6984507610468915e-07, 'subsample': 0.5598176343183838, 'colsample_bytree': 0.510945164298283, 'lambda_l1': 0.1368118399550561, 'lambda_l2': 4.590879971301159}

Light1 = LGBMRegressor(**Light2, random_state=SEED, verbose=-1)
of_p2, mpL1 = Train_ML(Light1,'LGB_Tunned_2',test)

Model: LGB_Tunned_2: 100%|██████████| 5/5 [01:03<00:00, 12.72s/it]

Model: LGB_Tunned_2
Mean Train RMSE: 70780.85663
Mean Validation RMSE: 72498.61291

CPU times: user 1min 3s, sys: 51 ms, total: 1min 3s
Wall time: 1min 3s






---

# **Submission Ensemble 📤✅**

---

In [71]:
Sub_V20 = pd.read_csv('/kaggle/input/used-car-submissions/Submission V20.csv')['price']
Sub_V22 = pd.read_csv('/kaggle/input/get-started-used-car-prices/Submission_E.csv')['price']
ensemble_mp = Sub_V22*0.9 + mpL1*0.1

In [20]:
sample_sub['price'] = mpL1
sample_sub.to_csv("Submission_KFold.csv", index=False)
sample_sub.head()

Unnamed: 0,id,price
0,188533,19161.136669
1,188534,78989.179375
2,188535,55133.687381
3,188536,27723.744951
4,188537,31022.480601
