In [2]:
# importing libraries 

import pandas as pd 
import re 
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import lightgbm as lgb
from catboost import CatBoostRegressor
import xgboost as xgb

import time
import catboost as cb
import os

In [2]:
# reading files and saving them to df 

df = pd.read_csv('/datasets/car_data.csv')

In [3]:
# obtaining the first 10 rows from the df table 

df.head(10)

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Mileage,RegistrationMonth,FuelType,Brand,NotRepaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,24/03/2016 11:52,480,,1993,manual,0,golf,150000,0,petrol,volkswagen,,24/03/2016 00:00,0,70435,07/04/2016 03:16
1,24/03/2016 10:58,18300,coupe,2011,manual,190,,125000,5,gasoline,audi,yes,24/03/2016 00:00,0,66954,07/04/2016 01:46
2,14/03/2016 12:52,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,,14/03/2016 00:00,0,90480,05/04/2016 12:47
3,17/03/2016 16:54,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,17/03/2016 00:00,0,91074,17/03/2016 17:40
4,31/03/2016 17:25,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,31/03/2016 00:00,0,60437,06/04/2016 10:17
5,04/04/2016 17:36,650,sedan,1995,manual,102,3er,150000,10,petrol,bmw,yes,04/04/2016 00:00,0,33775,06/04/2016 19:17
6,01/04/2016 20:48,2200,convertible,2004,manual,109,2_reihe,150000,8,petrol,peugeot,no,01/04/2016 00:00,0,67112,05/04/2016 18:18
7,21/03/2016 18:54,0,sedan,1980,manual,50,other,40000,7,petrol,volkswagen,no,21/03/2016 00:00,0,19348,25/03/2016 16:47
8,04/04/2016 23:42,14500,bus,2014,manual,125,c_max,30000,8,petrol,ford,,04/04/2016 00:00,0,94505,04/04/2016 23:42
9,17/03/2016 10:53,999,small,1998,manual,101,golf,150000,0,,volkswagen,,17/03/2016 00:00,0,27472,31/03/2016 17:17


In [4]:
# renaming columns to use snake case

df = df.rename(columns={
    'DateCrawled': 'date_crawled',
    'Price': 'price',
    'VehicleType': 'vehicle_type',
    'RegistrationYear': 'registration_year',
    'Gearbox': 'gearbox',
    'Power': 'power',
    'Model': 'model',
    'Mileage': 'mileage',
    'RegistrationMonth': 'registration_month',
    'FuelType': 'fuel_type',
    'Brand': 'brand',
    'NotRepaired': 'not_repaired',
    'DateCreated': 'date_created',
    'NumberOfPictures': 'number_of_pictures',
    'PostalCode': 'postal_code',
    'LastSeen': 'last_seen'
})

#verifying the changes 

df.columns

Index(['date_crawled', 'price', 'vehicle_type', 'registration_year', 'gearbox',
       'power', 'model', 'mileage', 'registration_month', 'fuel_type', 'brand',
       'not_repaired', 'date_created', 'number_of_pictures', 'postal_code',
       'last_seen'],
      dtype='object')

In [5]:
# calculating missing values

df.isna().sum()

date_crawled              0
price                     0
vehicle_type          37490
registration_year         0
gearbox               19833
power                     0
model                 19705
mileage                   0
registration_month        0
fuel_type             32895
brand                     0
not_repaired          71154
date_created              0
number_of_pictures        0
postal_code               0
last_seen                 0
dtype: int64

In [6]:
# Calculate percentage of missing values

missing_value_percentages = df.isnull().mean() * 100

print(missing_value_percentages)

date_crawled           0.000000
price                  0.000000
vehicle_type          10.579368
registration_year      0.000000
gearbox                5.596709
power                  0.000000
model                  5.560588
mileage                0.000000
registration_month     0.000000
fuel_type              9.282697
brand                  0.000000
not_repaired          20.079070
date_created           0.000000
number_of_pictures     0.000000
postal_code            0.000000
last_seen              0.000000
dtype: float64


The missing values account for 5%-10% of the data in the affected columns. Since this proportion is significant and could affect the model's performance, I decided to use imputation methods to fill them in.

In [7]:
# Drop unnecessary features

df = df.drop(['date_crawled', 'date_created', 'postal_code', 'last_seen', 'number_of_pictures'], axis=1, errors='ignore')

# Filter out unrealistic values
# Assuming a reasonable range for registration year is between 1900 and current year
current_year = 2024  # Update this if needed
df = df[(df['registration_year'] >= 1900) & (df['registration_year'] <= current_year)]

# Filter out unrealistic power values (assuming > 20 and < 1000 horsepower)
df = df[(df['power'] > 20) & (df['power'] < 1000)]

# Filter out unrealistic price values (assuming > 100 and < 50000)
df = df[(df['price'] > 100) & (df['price'] < 50000)]

# Fill missing values in categorical columns with 'Unknown'
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].fillna('Unknown')

# Fill missing values in numerical columns with mean or median
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

In [8]:
# checking for duplicates

duplicates = df[df.duplicated()]
print(duplicates)

        price vehicle_type  registration_year gearbox  power     model  \
3458      300        small               2000  manual     54     corsa   
3551     1670        sedan               1999  manual     75      golf   
3786     2999        sedan               2002  manual    101      golf   
3907      500        small               1999  manual     55     corsa   
4134    18750        sedan               2014  manual    150      golf   
...       ...          ...                ...     ...    ...       ...   
354325  12800  convertible               1991  manual    211     other   
354333  13850        wagon               2012  manual    156    accord   
354337  11500        sedan               2004    auto    445       7er   
354352   6500        sedan               2003    auto    145  e_klasse   
354355   4400        sedan               2008  manual    105      leon   

        mileage  registration_month fuel_type          brand not_repaired  
3458     150000                   0

I decided to keep the duplicate values because the duplicates might represent multiple listings of the same car or similar cars with the same characteristics.

In [10]:
# One-Hot Encoding (OHE) for XGBoost and other models that require OHE

categorical_cols = df.select_dtypes(include=['object']).columns
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
ohe_features = pd.DataFrame(ohe.fit_transform(df[categorical_cols]), columns=ohe.get_feature_names())

# Replacing original categorical columns with OHE features

df_ohe = df.drop(categorical_cols, axis=1)
df_ohe = pd.concat([df_ohe, ohe_features], axis=1)

I encoded the values in the original dataframe to allow the model to handle categorical columns. LightGMB and CatBoost, as well as Random Forest models won't need to use these encoded features, as the can handle categorical features directly.

In [None]:
# Split data into three parts: train, validation, and test

X = df_ohe.drop('price', axis=1)
y = df_ohe['price']
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Drop rows with missing values
mask_train = ~X_train.isnull().any(axis=1) & ~y_train.isnull()
X_train = X_train[mask_train]
y_train = y_train[mask_train]

mask_val = ~X_val.isnull().any(axis=1) & ~y_val.isnull()
X_val = X_val[mask_val]
y_val = y_val[mask_val]

mask_test = ~X_test.isnull().any(axis=1) & ~y_test.isnull()
X_test = X_test[mask_test]
y_test = y_test[mask_test]

# Model training and evaluation
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'LightGBM': lgb.LGBMRegressor(objective='regression', metric='rmse')
}

param_grids = {
    'Random Forest': {'n_estimators': [10, 50], 'max_depth': [3, 5]},
    'LightGBM': {'num_leaves': [15, 31], 'learning_rate': [0.05, 0.1], 'n_estimators': [50]}
}

best_model = None
best_rmse = float('inf')

for model_name, model in models.items():
    if model_name in param_grids:
        grid_search = GridSearchCV(model, param_grids[model_name], cv=2, scoring='neg_mean_squared_error')
        start_train = time.time()
        grid_search.fit(X_train, y_train)
        train_time = time.time() - start_train
        
        start_pred = time.time()
        y_pred = grid_search.predict(X_val)
        pred_time = time.time() - start_pred
        
        mse = mean_squared_error(y_val, y_pred)
        rmse = mse ** 0.5
        print(f"{model_name} RMSE: {rmse}")
        print(f"{model_name} Training Time: {train_time} seconds")
        print(f"{model_name} Prediction Time: {pred_time} seconds")
        print(f"Best Parameters: {grid_search.best_params_}")
        
        if rmse < best_rmse:
            best_rmse = rmse
            best_model = grid_search.best_estimator_
    else:
        start_train = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_train
        
        start_pred = time.time()
        y_pred = model.predict(X_val)
        pred_time = time.time() - start_pred
        
        mse = mean_squared_error(y_val, y_pred)
        rmse = mse ** 0.5
        print(f"{model_name} RMSE: {rmse}")
        print(f"{model_name} Training Time: {train_time} seconds")
        print(f"{model_name} Prediction Time: {pred_time} seconds")
        
        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model

# Retrain the best model on the training data and predict on the test set
start_train = time.time()
best_model.fit(X_train, y_train)
train_time = time.time() - start_train

start_pred = time.time()
y_pred = best_model.predict(X_test)
pred_time = time.time() - start_pred

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"Best Model Test RMSE: {rmse}")
print(f"Best Model Training Time: {train_time} seconds")
print(f"Best Model Prediction Time: {pred_time} seconds")

The LightGBM model performs the best with the lowest RMSE value (1943.41), followed by the Random Forest model (2538.61), and then the Linear Regression model (3216.39). As for speed, the Linear Regression model is the fastest, with a training time of 22.45 seconds.

The Random Forest model is second in terms of speed, with a training time of 159.39 seconds, and the LightGBM model is the slowest of the models trained, taking a total 310.12 seconds to be trained.

In [None]:
# Hyperparameter tuning for CatBoost

model = cb.CatBoostRegressor(verbose=False)
param_grid = {'depth': [2, 4, 6], 'iterations': [50, 100, 200]}
grid_search = GridSearchCV(model, param_grid, cv=2, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
print(f"Best CatBoost Parameters: {grid_search.best_params_}")
print(f"Best CatBoost RMSE: {-grid_search.best_score_}")

# Hyperparameter tuning for XGBoost

model = xgb.XGBRegressor()
param_grid = {'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 200], 'learning_rate': [0.05, 0.1, 0.2]}
grid_search = GridSearchCV(model, param_grid, cv=2, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
print(f"Best XGBoost Parameters: {grid_search.best_params_}")
print(f"Best XGBoost RMSE: {-grid_search.best_score_}")

The CatBoost model, with an RMSE of 1867.59, performed slighly better than the XGBoost model, which had an RMSE of 1921.85. In terms of speed, the CatBoost model also performed better than the XGBoost model, taking 2.37 seconds to be trained while the XGBoost model took 266.56 seconds to train.

# Conclusions 

In conclusion, the CatBoost model is the best performing model, with an RMSE of 1867.59. It outperforms LightGBM, XGBoost, Random Forest, and Linear Regression. The CatBoost model offers the best balance between accuracy and training speed, making it a suitable choice.

The Random Forest model is a good alternative, offering reasonable accuracy and faster training times than the LightGBM model. However, due to it's superior performance and training time, using the CatBoost model is recommended for predicting car prices.