Rusty Bargain used car sales service is developing an app to attract new customers. In that app, you can quickly find out the market value of your car. You have access to historical data: technical specifications, trim versions, and prices. You need to build the model to determine the value. 

Rusty Bargain is interested in:

- the quality of the prediction;
- the speed of the prediction;
- the time required for training

## Data preparation

In [1]:
import warnings
warnings.filterwarnings('ignore')

import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

import timeit
from functools import lru_cache

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.dummy import DummyRegressor
from catboost import CatBoostRegressor, Pool
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.linear_model import *
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import *


import random
random_state=42
random.seed(random_state)
np.random.seed(random_state)

In [2]:
#missing values
def missing_values(df):
    df_nulls = pd.concat([df.dtypes, df.isna().sum(), df.isna().sum()/len(df)], axis=1)
    df_nulls.columns = ["type", "count", "missing_ratio"]
    df_nulls = df_nulls[df_nulls["count"]>0]
    df_nulls.sort_values(by='missing_ratio', ascending=False)
    return df_nulls

def outlier(data):
    data_mean, data_std = np.mean(data), np.std(data)
    cut_off = data_std * 3
    lower, upper = data_mean - cut_off, data_mean + cut_off
    outliers = [x for x in data if x < lower or x > upper]
    outliers_removed = [x for x in data if x >= lower and x <= upper]
    return len(outliers)

def plot_roc(y_test, preds, ax=None, label='model'):
    with plt.style.context('seaborn-whitegrid'):
        if not ax: fig, ax = plt.sublots(1,1)
        fpr, tpr, thresholds = roc_curve(y_test, preds)
        ax.plot([0,1], [0,1], 'r--')
        ax.plot(fpr, tpr, lw=2, label=label)
        ax.legend(loc='lower right')
        ax.set_title(
            'ROC curve\n'
            f""" AP: {average_precision_score(
                    y_test, preds, pos_label=1):.2} | """
            f'AUC: {auc(fpr, tpr):.2}')
        ax.set_xlabel('False Positive Rate (FPR)')
        ax.set_ylabel('True Positive Rate (TPR)')
        ax.annotate(f'AUC: {auc(fpr, tpr):.2}', xy=(.43, .025))
        ax.legend()
        ax.grid()
        return ax

def plot_pr(y_test, preds, ax=None, label='model'):
    with plt.style.context('seaborn-whitegrid'):
        precision, recall, thresholds = precision_recall_curve(y_test, preds)
        if not ax: fig, ax = plt.subplots()
        ax.plot([0,1], [1,0], 'r--')
        ax.plot(recall, precision, lw=2, label=label)
        ax.legend()
        ax.set_title(
            'Precision-recall curve\n'
            f""" AP: {average_precision_score(
                y_test, preds, pos_label=1
            ):.2} | """
            f'AUC: {auc(recall, precision):.2}'
        )
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.set_xlim(-0.05, 1.05)
        ax.set_ylim(-0.05, 1.05)
        ax.legend()
        ax.grid()
        return ax
    
def des_full(df, target_name=""):
    data_describe = df.describe().T
    df_numeric = df._get_numeric_data()
    if target_name in df.columns:
        corr_with_target=df_numeric.drop(target_name, axis=1).apply(lambda x: x.corr(df_numeric[target_name]))
        data_describe['corr_with_target']=corr_with_target
    dtype_df = df_numeric.dtypes
    data_describe['dtypes'] = dtype_df
    data_null = df_numeric.isnull().sum()/len(df) * 100
    data_describe['Missing %'] = data_null
    Cardinality = df_numeric.apply(pd.Series.nunique)
    data_describe['Cardinality'] = Cardinality
    df_skew = df_numeric.skew(axis=0, skipna=True)
    data_describe['Skew'] = df_skew
    data_describe['outliers %']=[outlier(df_numeric[col])/len(df) * 100 for col in df_numeric.columns]
    data_describe['kurtosis']=df_numeric.kurtosis()
    return data_describe

def show_importances(df, features, target):
  X, y = df[features].values,df[target].values
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
  rfc = DecisionTreeRegressor().fit(X_train, y_train)
  y_pred = rfc.predict(X_test)
  df_importances = pd.DataFrame(((zip(features, rfc.feature_importances_)))).rename(columns={0:"feature",1:"coeff"}).sort_values(by="coeff", ascending = False )
  sns.barplot(data=feature_importances, x=df_importances["coeff"], y=df_importances["feature"])
  return df_importances

def display_classification_report(y_true, y_pred):
    display(pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).T)


In [3]:
df = pd.read_csv('/datasets/car_data.csv')

In [4]:
target='Price'
features = list(set(df.columns)-set(target))

In [5]:
df.head()

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Mileage,RegistrationMonth,FuelType,Brand,NotRepaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,24/03/2016 11:52,480,,1993,manual,0,golf,150000,0,petrol,volkswagen,,24/03/2016 00:00,0,70435,07/04/2016 03:16
1,24/03/2016 10:58,18300,coupe,2011,manual,190,,125000,5,gasoline,audi,yes,24/03/2016 00:00,0,66954,07/04/2016 01:46
2,14/03/2016 12:52,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,,14/03/2016 00:00,0,90480,05/04/2016 12:47
3,17/03/2016 16:54,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,17/03/2016 00:00,0,91074,17/03/2016 17:40
4,31/03/2016 17:25,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,31/03/2016 00:00,0,60437,06/04/2016 10:17


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   DateCrawled        354369 non-null  object
 1   Price              354369 non-null  int64 
 2   VehicleType        316879 non-null  object
 3   RegistrationYear   354369 non-null  int64 
 4   Gearbox            334536 non-null  object
 5   Power              354369 non-null  int64 
 6   Model              334664 non-null  object
 7   Mileage            354369 non-null  int64 
 8   RegistrationMonth  354369 non-null  int64 
 9   FuelType           321474 non-null  object
 10  Brand              354369 non-null  object
 11  NotRepaired        283215 non-null  object
 12  DateCreated        354369 non-null  object
 13  NumberOfPictures   354369 non-null  int64 
 14  PostalCode         354369 non-null  int64 
 15  LastSeen           354369 non-null  object
dtypes: int64(7), object(

In [7]:
#convert to datetime
df['DataCrawled'] = pd.to_datetime(df['DateCrawled'])
df['DataCreated'] = pd.to_datetime(df['DateCreated'])
df['LastSeen'] = pd.to_datetime(df['LastSeen'])

In [8]:
#let's impute
df['NotRepaired'] = df['NotRepaired'].fillna('yes')
df['NotReparied'] = (df['NotRepaired'] == 'yes').astype('int')

In [9]:
df['Gearbox'] = (df['Gearbox'] == 'auto').astype('int')

In [10]:
df.isna().sum()

DateCrawled              0
Price                    0
VehicleType          37490
RegistrationYear         0
Gearbox                  0
Power                    0
Model                19705
Mileage                  0
RegistrationMonth        0
FuelType             32895
Brand                    0
NotRepaired              0
DateCreated              0
NumberOfPictures         0
PostalCode               0
LastSeen                 0
DataCrawled              0
DataCreated              0
NotReparied              0
dtype: int64

In [11]:
def impute_value(in_df, features, target):
    encoders = dict()
    df = in_df.copy()
    for col in df[features].select_dtypes('object').columns:
        df.loc[df[col].isna(), col] = 'None'
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        encoders[col] = le
    for col in df[features].select_dtypes('datetime64').columns:
        df[f"{col}_hour"] = df[col].dt.hour
        df[f"{col}_month"] = df[col].dt.month
        df[f"{col}_day"] = df[col].dt.day
        del df[col]
    features=list(set(df.columns)-set([target]))
    train_df = df[~df[target].isna()]
    test_df = df[df[target].isna()]
    let = LabelEncoder()
    y_train = let.fit_transform(train_df[target])
    y_train = train_df[target].values
    X_train, X_test = train_df[features].values, test_df[features].values
    if len(X_test) == 0:
        return in_df
    model = DecisionTreeClassifier().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    df.loc[df[target].isna(), target] = y_pred
    in_df[target] = df[target]
    return in_df

In [None]:
for col in ["FuelType", "VehicleType", "Model"]:
    df=impute_value(df, features=list(set(df.columns)-set([col])-set([target])), target=col)

In [None]:
df.isna().sum()

In [None]:
des_full(df, target_name=target)

## Model training

In [None]:
def get_data(df, transform_data=True, apply_encoding=True):
    in_df = df.copy()
    target = "Price"
    features = list(set(in_df.columns) - set([target]))
    
    # Transform datetime columns
    if transform_data:
        for col in in_df[features].select_dtypes(include=['datetime64[ns]']).columns:
            in_df[f"{col}_hour"] = in_df[col].dt.hour
            in_df[f"{col}_month"] = in_df[col].dt.month
            in_df[f"{col}_day"] = in_df[col].dt.day
            del in_df[col]  # Remove original datetime column
    
    # Update feature list after transformation
    features = list(set(in_df.columns) - set([target]))

    # Remove constant features
    constant_features = [col for col in features if in_df[col].nunique() <= 1]
    in_df.drop(columns=constant_features, inplace=True)
    features = list(set(in_df.columns) - set([target]))

    encoders = dict()
    if apply_encoding:
        for col in in_df[features].select_dtypes('object').columns:
            lbl = LabelEncoder()
            in_df[col] = lbl.fit_transform(in_df[col].astype(str).fillna('missing_value'))
            encoders[col] = lbl

    # Ensure all categorical features are converted to numeric
    cat_features = list(in_df[features].select_dtypes('object').columns)
    in_df[cat_features] = in_df[cat_features].astype(str).fillna('NaN')
    in_df[cat_features] = in_df[cat_features].apply(LabelEncoder().fit_transform)
    
    return in_df[features].values, in_df[target].values, features, target, encoders, cat_features


In [None]:
rmse_func = lambda y_true, y_pred: mean_squared_error(y_true, y_pred, squared=False)
rmsle = make_scorer(rmse_func, greater_is_better=False)

In [None]:
log_met = {"models": ["catboost", "xgboost", "LGBM"], "rmse_init": [0.0]*3, "fit_time": [0.0]*3, "predict_time": [0.0]*3}
m_idx = {"catboost": 0, "xgboost": 1, "LGBM": 2}

"CatBoost can natively handle categorical data, which is a key feature of the algorithm. For other models, we'll use a simple LabelEncoder for convenience. However, in practice, it's important to carefully consider how categorical data is encoded, especially when the data's order is significant, such as with prices."

In [None]:
X, y, features, target, _, cat_features = get_data(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state)

train_ds = Pool(data=X_train, label=y_train, cat_features=cat_features, feature_names=features)
test_ds = Pool(data=X_test, label=y_test, cat_features=cat_features, feature_names=features)
full_ds = Pool(data=X, label=y, cat_features=cat_features, feature_names=features)

In [44]:
# Train initial CatBoost model
model = CatBoostRegressor(iterations=20, task_type="CPU", random_seed=random_state, loss_function='RMSE')

start_fit = time.time()
model.fit(train_ds, verbose=0)
fit_time = time.time() - start_fit

start_predict = time.time()
y_pred = model.predict(test_ds)
predict_time = time.time() - start_predict

rmse = rmse_func(y_test, y_pred)
print(f"CatBoost RMSE: {rmse}")
print(f"CatBoost Fit time: {fit_time} seconds, Predict time: {predict_time} seconds")

CatBoost RMSE: 2015.1266237324533
CatBoost Fit time: 2.477708101272583 seconds, Predict time: 0.005808115005493164 seconds


In [45]:
log_met["rmse_init"][m_idx["catboost"]] = rmse
log_met["fit_time"][m_idx["catboost"]] = fit_time
log_met["predict_time"][m_idx["catboost"]] = predict_time

In [46]:
# Grid search for CatBoost
param_grid = {
    'learning_rate': [0.03, 0.1],
    'depth': [6, 10],
    'l2_leaf_reg': [3, 5, 7, 9]
}

model = CatBoostRegressor(iterations=20, loss_function='RMSE', task_type="CPU", random_seed=random_state)
grid_search_result = model.grid_search(param_grid, 
                                       train_ds,
                                       verbose=0,
                                       partition_random_seed=random_state,
                                       search_by_train_test_split=True,
                                       train_size=0.9)

# Extracting the best parameters and results
cv_data = pd.DataFrame(grid_search_result["cv_results"])
best_value = cv_data['test-RMSE-mean'].min()
best_iter = cv_data['test-RMSE-mean'].idxmin()

print('Best validation RMSE score : {:.4f}±{:.4f} on step {}'.format(
    best_value,
    cv_data['test-RMSE-std'][best_iter],
    best_iter))


0:	learn: 6164.9575944	test: 6116.4025696	best: 6116.4025696 (0)	total: 113ms	remaining: 2.15s
1:	learn: 6016.7853636	test: 5968.0901319	best: 5968.0901319 (1)	total: 209ms	remaining: 1.88s
2:	learn: 5873.5708465	test: 5824.2542823	best: 5824.2542823 (2)	total: 305ms	remaining: 1.73s
3:	learn: 5736.1446797	test: 5686.8466099	best: 5686.8466099 (3)	total: 410ms	remaining: 1.64s
4:	learn: 5603.5312444	test: 5554.9544509	best: 5554.9544509 (4)	total: 513ms	remaining: 1.54s
5:	learn: 5474.5716537	test: 5425.8219143	best: 5425.8219143 (5)	total: 607ms	remaining: 1.42s
6:	learn: 5350.4911938	test: 5301.6301622	best: 5301.6301622 (6)	total: 705ms	remaining: 1.31s
7:	learn: 5230.0738046	test: 5181.0107126	best: 5181.0107126 (7)	total: 802ms	remaining: 1.2s
8:	learn: 5114.7713375	test: 5065.5719369	best: 5065.5719369 (8)	total: 894ms	remaining: 1.09s
9:	learn: 5003.8433654	test: 4954.5650339	best: 4954.5650339 (9)	total: 993ms	remaining: 993ms
10:	learn: 4897.2582568	test: 4847.9213455	best: 48

5:	learn: 5474.5254350	test: 5425.9133760	best: 5425.9133760 (5)	total: 612ms	remaining: 1.43s
6:	learn: 5351.2828726	test: 5302.6375900	best: 5302.6375900 (6)	total: 721ms	remaining: 1.34s
7:	learn: 5230.8024139	test: 5181.9574752	best: 5181.9574752 (7)	total: 819ms	remaining: 1.23s
8:	learn: 5115.4001324	test: 5066.4228751	best: 5066.4228751 (8)	total: 912ms	remaining: 1.11s
9:	learn: 5004.7480489	test: 4955.7006056	best: 4955.7006056 (9)	total: 1.01s	remaining: 1.01s
10:	learn: 4898.4061457	test: 4849.3015470	best: 4849.3015470 (10)	total: 1.11s	remaining: 912ms
11:	learn: 4795.8896012	test: 4746.5778067	best: 4746.5778067 (11)	total: 1.22s	remaining: 814ms
12:	learn: 4695.7072543	test: 4646.2202523	best: 4646.2202523 (12)	total: 1.32s	remaining: 712ms
13:	learn: 4599.4169144	test: 4550.7926462	best: 4550.7926462 (13)	total: 1.42s	remaining: 607ms
14:	learn: 4506.7580822	test: 4457.9546983	best: 4457.9546983 (14)	total: 1.53s	remaining: 509ms
15:	learn: 4417.7113732	test: 4368.92123

9:	learn: 4958.9431413	test: 4911.7968131	best: 4911.7968131 (9)	total: 3.34s	remaining: 3.34s
10:	learn: 4847.9686239	test: 4801.1614582	best: 4801.1614582 (10)	total: 3.67s	remaining: 3.01s
11:	learn: 4739.9973919	test: 4693.4396735	best: 4693.4396735 (11)	total: 4.01s	remaining: 2.67s
12:	learn: 4634.8918006	test: 4588.8113378	best: 4588.8113378 (12)	total: 4.34s	remaining: 2.33s
13:	learn: 4533.5514500	test: 4487.9493538	best: 4487.9493538 (13)	total: 4.67s	remaining: 2s
14:	learn: 4436.2174318	test: 4390.8370063	best: 4390.8370063 (14)	total: 5s	remaining: 1.67s
15:	learn: 4343.2172602	test: 4298.0495443	best: 4298.0495443 (15)	total: 5.33s	remaining: 1.33s
16:	learn: 4251.5527417	test: 4206.7295448	best: 4206.7295448 (16)	total: 5.66s	remaining: 999ms
17:	learn: 4164.6175025	test: 4120.1229666	best: 4120.1229666 (17)	total: 6s	remaining: 667ms
18:	learn: 4079.5213518	test: 4035.3703658	best: 4035.3703658 (18)	total: 6.34s	remaining: 334ms
19:	learn: 3998.0635510	test: 3954.206878

14:	learn: 4441.5592715	test: 4396.0930258	best: 4396.0930258 (14)	total: 5s	remaining: 1.67s
15:	learn: 4348.0991594	test: 4302.9853796	best: 4302.9853796 (15)	total: 5.33s	remaining: 1.33s
16:	learn: 4256.8261015	test: 4212.0286885	best: 4212.0286885 (16)	total: 5.67s	remaining: 1s
17:	learn: 4170.6221923	test: 4126.1321450	best: 4126.1321450 (17)	total: 5.99s	remaining: 666ms
18:	learn: 4085.6007858	test: 4041.4419163	best: 4041.4419163 (18)	total: 6.33s	remaining: 333ms
19:	learn: 4004.8170647	test: 3960.8867076	best: 3960.8867076 (19)	total: 6.66s	remaining: 0us

bestTest = 3960.886708
bestIteration = 19

0:	learn: 5804.0948603	test: 5754.7651164	best: 5754.7651164 (0)	total: 330ms	remaining: 6.26s
1:	learn: 5339.0589627	test: 5290.9295410	best: 5290.9295410 (1)	total: 664ms	remaining: 5.98s
2:	learn: 4925.8966431	test: 4878.1047752	best: 4878.1047752 (2)	total: 998ms	remaining: 5.65s
3:	learn: 4557.7416983	test: 4511.0021144	best: 4511.0021144 (3)	total: 1.34s	remaining: 5.36s
4:

18:	learn: 2301.0454501	test: 2345.7263519	best: 2345.7263519 (18)	total: 4.8s	remaining: 253ms
19:	learn: 2261.2819411	test: 2306.5406966	best: 2306.5406966 (19)	total: 5.05s	remaining: 0us

bestTest = 2306.540697
bestIteration = 19

Training on fold [1/3]
0:	learn: 5789.8603009	test: 5772.0886178	best: 5772.0886178 (0)	total: 259ms	remaining: 4.92s
1:	learn: 5331.5755873	test: 5314.5033186	best: 5314.5033186 (1)	total: 516ms	remaining: 4.64s
2:	learn: 4919.2356948	test: 4903.1632505	best: 4903.1632505 (2)	total: 770ms	remaining: 4.37s
3:	learn: 4554.9768335	test: 4540.3342139	best: 4540.3342139 (3)	total: 1.03s	remaining: 4.12s
4:	learn: 4231.9570745	test: 4218.2005177	best: 4218.2005177 (4)	total: 1.3s	remaining: 3.91s
5:	learn: 3946.4676025	test: 3933.4693605	best: 3933.4693605 (5)	total: 1.55s	remaining: 3.63s
6:	learn: 3696.9702313	test: 3684.7175157	best: 3684.7175157 (6)	total: 1.81s	remaining: 3.37s
7:	learn: 3478.4570516	test: 3466.1140618	best: 3466.1140618 (7)	total: 2.07s	

In [47]:
# Train final CatBoost model with best parameters
model = CatBoostRegressor(iterations=20, loss_function='RMSE', task_type="CPU", random_seed=random_state, **grid_search_result["params"])
start_fit = time.time()
model.fit(train_ds, verbose=1, eval_set=[(X_test, y_test)], use_best_model=True)
fit_time = time.time() - start_fit

start_predict = time.time()
y_pred = model.predict(test_ds)
predict_time = time.time() - start_predict

rmse = rmse_func(y_test, y_pred)
print(f"CatBoost (tuned) RMSE: {rmse}")
print(f"CatBoost (tuned) Fit time: {fit_time} seconds, Predict time: {predict_time} seconds")


0:	learn: 4204.1274364	test: 4228.7330592	best: 4228.7330592 (0)	total: 369ms	remaining: 7.01s
1:	learn: 3936.3128497	test: 3958.4504005	best: 3958.4504005 (1)	total: 732ms	remaining: 6.59s
2:	learn: 3703.9167584	test: 3725.4549272	best: 3725.4549272 (2)	total: 1.11s	remaining: 6.31s
3:	learn: 3494.4131843	test: 3515.8132387	best: 3515.8132387 (3)	total: 1.48s	remaining: 5.92s
4:	learn: 3308.8290352	test: 3329.9062019	best: 3329.9062019 (4)	total: 1.84s	remaining: 5.54s
5:	learn: 3146.9306704	test: 3168.3611120	best: 3168.3611120 (5)	total: 2.22s	remaining: 5.17s
6:	learn: 3004.3502339	test: 3026.5960795	best: 3026.5960795 (6)	total: 2.58s	remaining: 4.8s
7:	learn: 2886.5476515	test: 2910.0927432	best: 2910.0927432 (7)	total: 2.95s	remaining: 4.43s
8:	learn: 2778.1632453	test: 2803.6869421	best: 2803.6869421 (8)	total: 3.31s	remaining: 4.05s
9:	learn: 2681.5480698	test: 2706.9961713	best: 2706.9961713 (9)	total: 3.69s	remaining: 3.69s
10:	learn: 2601.8664313	test: 2627.6754376	best: 26

In [48]:
# XGBoost model - Initial training
model = XGBRegressor(tree_method='hist', random_state=random_state, objective='reg:squarederror')

start_fit = time.time()
model.fit(X_train, y_train)
fit_time = time.time() - start_fit

start_predict = time.time()
y_pred = model.predict(X_test)
predict_time = time.time() - start_predict

rmse = rmse_func(y_test, y_pred)
print(f"XGBoost RMSE: {rmse}")
print(f"XGBoost Fit time: {fit_time} seconds, Predict time: {predict_time} seconds")

XGBoost RMSE: 1771.195953679433
XGBoost Fit time: 9.86487078666687 seconds, Predict time: 0.173109769821167 seconds


In [49]:
log_met["rmse_init"][m_idx["xgboost"]] = rmse
log_met["fit_time"][m_idx["xgboost"]] = fit_time
log_met["predict_time"][m_idx["xgboost"]] = predict_time

In [50]:
# Grid search for XGBoost
param_grid = {
    'learning_rate': [0.03, 0.1],
    'max_depth': [4, 6, 10],
    'objective': ['reg:squarederror']
}

grid = GridSearchCV(model, param_grid, cv=5, n_jobs=1, verbose=False, scoring=rmsle)

start_fit = time.time()
grid.fit(X_train, y_train)
fit_time = time.time() - start_fit

print(grid.best_params_)
print(f"XGBoost Grid Search Fit time: {fit_time} seconds")

{'learning_rate': 0.1, 'max_depth': 10, 'objective': 'reg:squarederror'}
XGBoost Grid Search Fit time: 410.87535333633423 seconds


In [51]:
# LGBM model
model = LGBMRegressor(objective="RMSE", random_state=random_state, verbose=1, force_col_wise=True)

start_fit = time.time()
model.fit(X_train, y_train)
fit_time = time.time() - start_fit

start_predict = time.time()
y_pred = model.predict(X_test)
predict_time = time.time() - start_predict

rmse = rmse_func(y_test, y_pred)
print(f"LGBM RMSE: {rmse}")
print(f"LGBM Fit time: {fit_time} seconds, Predict time: {predict_time} seconds")

[LightGBM] [Info] Total Bins 1431
[LightGBM] [Info] Number of data points in the train set: 318932, number of used features: 22
[LightGBM] [Info] Start training from score 4414.279718
LGBM RMSE: 1852.2844852981566
LGBM Fit time: 9.06836748123169 seconds, Predict time: 0.395052433013916 seconds


In [52]:
log_met["rmse_init"][m_idx["LGBM"]] = rmse
log_met["fit_time"][m_idx["LGBM"]] = fit_time
log_met["predict_time"][m_idx["LGBM"]] = predict_time

In [53]:
# Grid search for LGBM
param_grid = {
    'learning_rate': [0.03, 0.1],
    'max_depth': [4, 6, 10]
}

grid = GridSearchCV(model, param_grid, cv=5, n_jobs=1, verbose=False)

start_fit = time.time()
grid.fit(X_train, y_train)
fit_time = time.time() - start_fit

print(grid.best_params_)
print(f"LGBM Grid Search Fit time: {fit_time} seconds")


[LightGBM] [Info] Total Bins 1431
[LightGBM] [Info] Number of data points in the train set: 255145, number of used features: 22
[LightGBM] [Info] Start training from score 4411.595312
[LightGBM] [Info] Total Bins 1428
[LightGBM] [Info] Number of data points in the train set: 255145, number of used features: 22
[LightGBM] [Info] Start training from score 4418.562084
[LightGBM] [Info] Total Bins 1425
[LightGBM] [Info] Number of data points in the train set: 255146, number of used features: 22
[LightGBM] [Info] Start training from score 4414.532801
[LightGBM] [Info] Total Bins 1428
[LightGBM] [Info] Number of data points in the train set: 255146, number of used features: 22
[LightGBM] [Info] Start training from score 4414.395366
[LightGBM] [Info] Total Bins 1429
[LightGBM] [Info] Number of data points in the train set: 255146, number of used features: 22
[LightGBM] [Info] Start training from score 4412.313033
[LightGBM] [Info] Total Bins 1431
[LightGBM] [Info] Number of data points in the

[LightGBM] [Info] Total Bins 1425
[LightGBM] [Info] Number of data points in the train set: 255146, number of used features: 22
[LightGBM] [Info] Start training from score 4414.532801
[LightGBM] [Info] Total Bins 1428
[LightGBM] [Info] Number of data points in the train set: 255146, number of used features: 22
[LightGBM] [Info] Start training from score 4414.395366
[LightGBM] [Info] Total Bins 1429
[LightGBM] [Info] Number of data points in the train set: 255146, number of used features: 22
[LightGBM] [Info] Start training from score 4412.313033
[LightGBM] [Info] Total Bins 1431
[LightGBM] [Info] Number of data points in the train set: 255145, number of used features: 22
[LightGBM] [Info] Start training from score 4411.595312
[LightGBM] [Info] Total Bins 1428
[LightGBM] [Info] Number of data points in the train set: 255145, number of used features: 22
[LightGBM] [Info] Start training from score 4418.562084
[LightGBM] [Info] Total Bins 1425
[LightGBM] [Info] Number of data points in the

[LightGBM] [Info] Total Bins 1428
[LightGBM] [Info] Number of data points in the train set: 255145, number of used features: 22
[LightGBM] [Info] Start training from score 4418.562084
[LightGBM] [Info] Total Bins 1425
[LightGBM] [Info] Number of data points in the train set: 255146, number of used features: 22
[LightGBM] [Info] Start training from score 4414.532801
[LightGBM] [Info] Total Bins 1428
[LightGBM] [Info] Number of data points in the train set: 255146, number of used features: 22
[LightGBM] [Info] Start training from score 4414.395366
[LightGBM] [Info] Total Bins 1429
[LightGBM] [Info] Number of data points in the train set: 255146, number of used features: 22
[LightGBM] [Info] Start training from score 4412.313033
[LightGBM] [Info] Total Bins 1431
[LightGBM] [Info] Number of data points in the train set: 318932, number of used features: 22
[LightGBM] [Info] Start training from score 4414.279718
{'learning_rate': 0.1, 'max_depth': 10}
LGBM Grid Search Fit time: 246.921232938

In [54]:
# Train final LGBM model with best parameters
model = LGBMRegressor(objective="RMSE", random_state=random_state, verbose=0, **grid.best_params_,force_col_wise=True)

start_fit = time.time()
model.fit(X_train, y_train)
fit_time = time.time() - start_fit

start_predict = time.time()
y_pred = model.predict(X_test)
predict_time = time.time() - start_predict

rmse = rmse_func(y_test, y_pred)
print(f"LGBM (tuned) RMSE: {rmse}")
print(f"LGBM (tuned) Fit time: {fit_time} seconds, Predict time: {predict_time} seconds")

LGBM (tuned) RMSE: 1854.528975904095
LGBM (tuned) Fit time: 8.787270069122314 seconds, Predict time: 0.3143935203552246 seconds


In [55]:
log_met["rmse_init"][m_idx["LGBM"]] = rmse
log_met["fit_time"][m_idx["LGBM"]] = fit_time
log_met["predict_time"][m_idx["LGBM"]] = predict_time

## Model analysis

In [56]:
pd.DataFrame(log_met)

Unnamed: 0,models,rmse_init,fit_time,predict_time
0,catboost,2015.126624,2.477708,0.005808
1,xgboost,1771.195954,9.864871,0.17311
2,LGBM,1854.528976,8.78727,0.314394


In summary, XGBoost provided the best predictive performance, while CatBoost offered a quick training process and fast predictions, making it suitable for applications where speed is critical. LightGBM presented a balanced option between the two, with reasonable accuracy and training speed.

- [x]  Jupyter Notebook is open
- [ ]  Code is error free
- [ ]  The cells with the code have been arranged in order of execution
- [ ]  The data has been downloaded and prepared
- [ ]  The models have been trained
- [ ]  The analysis of speed and quality of the models has been performed