# **1, Import necessary libraries**

In [None]:
# Attach Google Drive to the Colab runtime; the search-result CSVs written
# later in this notebook go to paths under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import sklearn
import numpy
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# **2, Import the dataset**

In [2]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer

In [3]:
# Import the dataset
# Raw string literal: the Windows path contains backslash sequences (\P, \C, \D,
# \F, \R) that are not valid escapes in a normal string and raise a
# DeprecationWarning/SyntaxWarning on recent Python versions. The resulting
# path value is unchanged.
# NOTE(review): this is a local Windows path, while other cells write to a
# mounted Google Drive — confirm which environment the notebook targets.
dataset = pd.read_csv(r"D:\Project\Car-Evaluation\Dataset\Final\Remove-null-car_name-and-fill-null.csv")

# Split features and target column
features = ['origin', 'car_model', 'mileage', 'exterior_color', 'interior_color', 'num_of_doors',
            'seating_capacity', 'engine', 'engine_capacity', 'transmission', 'drive_type',
            'fuel_consumption', 'brand', 'grade', 'year_of_manufacture']
target = 'price_in_billion'
X = dataset[features]  # explicit column list (positional alternative: dataset.iloc[:, :-1])
y = dataset[target]    # single target column as a Series

In [4]:
class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    """Label-encode several DataFrame columns, one ``LabelEncoder`` per column.

    Parameters
    ----------
    columns : sequence of str or None, default=None
        Names of the columns to encode. When None, every column of the frame
        passed to ``fit`` is encoded.

    Attributes
    ----------
    encoders : dict
        Maps column name -> fitted ``LabelEncoder``. Empty until ``fit`` runs.
    columns_ : list
        The column names actually encoded, resolved during ``fit``.
    """

    def __init__(self, columns=None):
        # Only store the constructor argument here. Building the encoders in
        # __init__ crashed for the default columns=None and left them stale
        # after set_params(columns=...).
        self.columns = columns
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit one fresh LabelEncoder per target column of ``X``; returns self."""
        self.columns_ = list(X.columns) if self.columns is None else list(self.columns)
        self.encoders = {col: LabelEncoder().fit(X[col]) for col in self.columns_}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each target column replaced by its integer codes.

        The per-column ``LabelEncoder.transform`` call is applied unchanged, so
        its own errors (e.g. for values not seen during ``fit``) propagate.
        """
        if not hasattr(self, "columns_"):
            # Local import: only needed on this error path.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This MultiColumnLabelEncoder instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        X_copy = X.copy()  # never mutate the caller's frame
        for col in self.columns_:
            X_copy[col] = self.encoders[col].transform(X_copy[col])
        return X_copy

In [5]:
categorical_columns = ['origin', 'car_model', 'exterior_color', 'interior_color',
            'engine', 'transmission', 'drive_type','brand', 'grade']
# Create a ColumnTransformer with the custom MultiColumnLabelEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('label', MultiColumnLabelEncoder(columns=categorical_columns), categorical_columns)
    ],
    remainder='passthrough'  # This will keep the other columns unchanged
)

# Create a Pipeline with the preprocessor
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Transform the untouched feature frame instead of `X` itself. Overwriting `X`
# with its own transform made this cell non-idempotent: on a second run `X` was
# already the encoded output and no longer carried the named columns.
# NOTE(review): the encoders are fitted on the full dataset, before the
# train/test split that follows — confirm this is intended.
X = pipeline.fit_transform(dataset[features])

In [6]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the partition
# identical across runs. The categorical columns were already encoded by the
# pipeline above, so no separate per-column LabelEncoder pass is done here.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# **3, Training the model**

# a, Decision Tree

In [28]:
# Import necessary libraries
from sklearn.tree import DecisionTreeRegressor

In [18]:
# Initialize the model
dt_model = DecisionTreeRegressor(random_state=42)

# Calculate candidate alphas from the cost-complexity pruning path
path = dt_model.cost_complexity_pruning_path(X_train, y_train)
# Floating-point error in the pruning path can produce slightly negative or
# repeated alphas. `ccp_alpha` must be non-negative, so clip at zero and drop
# duplicates; otherwise those candidates fail or are evaluated twice.
alpha_set = np.unique(np.clip(path['ccp_alphas'], 0.0, None))
print(alpha_set)
# Candidate params for GridSearchCV
# The grid has 4 * 5 * len(alpha_set) candidates, each fitted on 5 folds.
search_space = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_leaf': [3, 7, 10, 15, 20],
    'ccp_alpha': alpha_set
}

# Calculate best hyperparams using GridSearchCV
# Both metrics are recorded; the final refit selects on R².
dt_gs = GridSearchCV(estimator=dt_model,
                     param_grid=search_space,
                     scoring=['r2', 'neg_mean_squared_error'],
                     refit='r2',
                     cv=5,
                     verbose=2,
                     n_jobs=-1)
dt_gs.fit(X_train, y_train)
dt_best_params = dt_gs.best_params_

# Save all hyperparams in csv file
df = pd.DataFrame(dt_gs.cv_results_)
df.to_csv('/content/drive/MyDrive/CarPricePrediction/DecisionTreeGSResults.csv')

KeyboardInterrupt: 

In [35]:
# Final decision tree, fitted with the best hyperparameters found by the grid
# search. They are hard-coded so this cell does not require re-running the search.
# To take them from a fresh search instead:
#   dt_model = DecisionTreeRegressor(**dt_best_params, random_state=42)
dt_final_params = dict(
    max_depth=20,
    min_samples_leaf=3,
    ccp_alpha=1.0085633759065575e-06,
    random_state=42,
)
dt_model = DecisionTreeRegressor(**dt_final_params)
dt_model.fit(X_train, y_train)

# Persist the fitted tree; the returned file list is the cell's output.
joblib.dump(dt_model, "dt_model.joblib")

['dt_model.joblib']

In [30]:
# Score the fitted decision tree on the training split, then on the test split.
# After the loop, y_pred / r2 / mse hold the test-set values.
for banner, X_split, y_split in (
    ("---------- TRAIN SET ----------", X_train, y_train),
    ("---------- TEST SET ----------", X_test, y_test),
):
    y_pred = dt_model.predict(X_split)
    r2 = r2_score(y_split, y_pred)
    mse = mean_squared_error(y_split, y_pred)
    print(banner)
    print(f"Mean Squared Error: {mse}")
    print(f"R² score: {r2}")

---------- TRAIN SET ----------
Mean Squared Error: 0.004315907333075696
R² score: 0.9683383023812454
---------- TEST SET ----------
Mean Squared Error: 0.01099843394747257
R² score: 0.9220932035410249


# b, Random Forest

In [31]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Base forest whose hyperparameters are searched below
rf_model = RandomForestRegressor(random_state=42)

# Search grid: 8 * 3 * 3 * 3 * 4 * 1 = 864 candidates, each fitted on 5 folds
search_space = dict(
    n_estimators=[10, 50, 100, 200, 300, 500, 700, 1000],
    max_depth=[None, 10, 20],
    min_samples_leaf=[5, 10, 15],
    max_features=[3, 7, None],
    max_samples=[0.2, 0.4, 0.6, 0.8],
    bootstrap=[True],
)

# Exhaustive search; both metrics are recorded and the final refit selects on R²
rf_gs = GridSearchCV(
    rf_model,
    search_space,
    scoring=['r2', 'neg_mean_squared_error'],
    refit='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
rf_gs.fit(X_train, y_train)
rf_best_params = rf_gs.best_params_

# Keep the full per-candidate results table on Drive
df = pd.DataFrame(rf_gs.cv_results_)
df.to_csv('/content/drive/MyDrive/CarPricePrediction/RandomForestGSResults.csv')

Fitting 5 folds for each of 756 candidates, totalling 3780 fits


KeyboardInterrupt: 

In [32]:
# Final random forest, fitted with the best hyperparameters found by the grid
# search (hard-coded so the search need not be re-run). oob_score=True makes
# the out-of-bag score and predictions available after fitting.
# To take the values from a fresh search instead:
#   rf_model = RandomForestRegressor(**rf_best_params, oob_score=True, random_state=42)
rf_final_params = dict(
    n_estimators=500,
    max_depth=20,
    min_samples_leaf=5,
    max_features=7,
    max_samples=0.8,
    bootstrap=True,
    oob_score=True,
    random_state=42,
)
rf_model = RandomForestRegressor(**rf_final_params)
rf_model.fit(X_train, y_train)

In [34]:
# Persist the fitted random forest next to the notebook
joblib.dump(value=rf_model, filename="rf_model.joblib")

['rf_model.joblib']

In [33]:
# Training-split metrics, plus the forest's out-of-bag score and OOB MSE
y_pred = rf_model.predict(X_train)
r2 = r2_score(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
oob_score = rf_model.oob_score_
oob_mse = mean_squared_error(y_train, rf_model.oob_prediction_)
train_report = [
    "---------- TRAIN SET ----------",
    f"Mean Squared Error: {mse}",
    f"R² score: {r2}",
    f"OOB Score: {oob_score}",
    f"OOB Error (MSE): {oob_mse}",
]
print("\n".join(train_report))

# Test-split metrics (y_pred / r2 / mse are overwritten with test values)
y_pred = rf_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
test_report = [
    "---------- TEST SET ----------",
    f"Mean Squared Error: {mse}",
    f"R² score: {r2}",
]
print("\n".join(test_report))

---------- TRAIN SET ----------
Mean Squared Error: 0.006010030970879942
R² score: 0.9559101323095962
OOB Score: 0.9349947076235714
OOB Error (MSE): 0.008861079447931164
---------- TEST SET ----------
Mean Squared Error: 0.008981576028098018
R² score: 0.936379504678242


# c, AdaBoost

In [7]:
# Import necessary libraries
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

In [8]:
# Base AdaBoost model: depth-7 regression trees as weak learners
ab_model = AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=7), random_state=42)

# Search grid: 8 * 9 * 3 = 216 candidates, each fitted on 5 folds
search_space = dict(
    n_estimators=[10, 50, 100, 200, 300, 500, 700, 1000],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9, 1.0],
    loss=['linear', 'square', 'exponential'],
)

# Exhaustive search; both metrics are recorded and the final refit selects on R²
ab_gs = GridSearchCV(
    ab_model,
    search_space,
    scoring=['r2', 'neg_mean_squared_error'],
    refit='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
ab_gs.fit(X_train, y_train)
ab_best_params = ab_gs.best_params_

# Keep the full per-candidate results table on Drive
df = pd.DataFrame(ab_gs.cv_results_)
df.to_csv('/content/drive/MyDrive/CarPricePrediction/AdaBoostGSResults.csv')

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


KeyboardInterrupt: 

In [9]:
# Final AdaBoost model, fitted with the best hyperparameters found by the grid
# search (hard-coded so the search need not be re-run).
# To take the values from a fresh search instead:
#   ab_model = AdaBoostRegressor(**ab_best_params,
#                                estimator=DecisionTreeRegressor(max_depth=7),
#                                random_state=42)
# NOTE(review): the weak learner here uses max_depth=10, whereas the grid search
# was run with max_depth=7 — confirm which depth is intended.
ab_weak_learner = DecisionTreeRegressor(max_depth=10)
ab_final_params = dict(
    n_estimators=100,
    learning_rate=0.3,
    loss='exponential',
    estimator=ab_weak_learner,
    random_state=42,
)
ab_model = AdaBoostRegressor(**ab_final_params)
ab_model.fit(X_train, y_train)

In [10]:
# Score the fitted AdaBoost model on the training split, then on the test split.
# After the loop, y_pred / r2 / mse hold the test-set values.
for banner, X_split, y_split in (
    ("---------- TRAIN SET ----------", X_train, y_train),
    ("---------- TEST SET ----------", X_test, y_test),
):
    y_pred = ab_model.predict(X_split)
    r2 = r2_score(y_split, y_pred)
    mse = mean_squared_error(y_split, y_pred)
    print(banner)
    print(f"Mean Squared Error: {mse}")
    print(f"R² score: {r2}")

---------- TRAIN SET ----------
Mean Squared Error: 0.004121885600679936
R² score: 0.9697616548650911
---------- TEST SET ----------
Mean Squared Error: 0.007996033380140521
R² score: 0.9433605413278924


In [12]:
# Persist the fitted AdaBoost model next to the notebook
joblib.dump(value=ab_model, filename="AdaBoost_model.joblib")

['AdaBoost_model.joblib']

# d, Gradient Boost

In [13]:
# Import necessary libraries
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Base gradient-boosting model whose hyperparameters are searched below
gb_model = GradientBoostingRegressor(random_state=42)

# Search grid: 7 * 4 * 4 * 4 = 448 candidates, each fitted on 5 folds
search_space = dict(
    n_estimators=[50, 100, 200, 300, 500, 700, 1000],
    max_depth=[3, 5, 7, 10],
    min_samples_split=[2, 5, 10, 20],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
)

# Exhaustive search; both metrics are recorded and the final refit selects on R²
gb_gs = GridSearchCV(
    gb_model,
    search_space,
    scoring=['r2', 'neg_mean_squared_error'],
    refit='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
gb_gs.fit(X_train, y_train)
gb_best_params = gb_gs.best_params_

# Keep the full per-candidate results table on Drive
df = pd.DataFrame(gb_gs.cv_results_)
df.to_csv('/content/drive/MyDrive/CarPricePrediction/GradientBoostGSResults.csv')

Fitting 5 folds for each of 448 candidates, totalling 2240 fits


KeyboardInterrupt: 

In [14]:
# Final gradient-boosting model, fitted with the best hyperparameters found by
# the grid search (hard-coded so the search need not be re-run).
# To take the values from a fresh search instead:
#   gb_model = GradientBoostingRegressor(**gb_best_params, random_state=42)
gb_final_params = dict(
    n_estimators=1000,
    max_depth=7,
    min_samples_split=20,
    learning_rate=0.05,
    random_state=42,
)
gb_model = GradientBoostingRegressor(**gb_final_params)
gb_model.fit(X_train, y_train)

In [15]:
# Score the fitted gradient-boosting model on the training split, then on the
# test split. After the loop, y_pred / r2 / mse hold the test-set values.
for banner, X_split, y_split in (
    ("---------- TRAIN SET ----------", X_train, y_train),
    ("---------- TEST SET ----------", X_test, y_test),
):
    y_pred = gb_model.predict(X_split)
    r2 = r2_score(y_split, y_pred)
    mse = mean_squared_error(y_split, y_pred)
    print(banner)
    print(f"Mean Squared Error: {mse}")
    print(f"R² score: {r2}")

---------- TRAIN SET ----------
Mean Squared Error: 0.00199706994441336
R² score: 0.9853494016845681
---------- TEST SET ----------
Mean Squared Error: 0.0058492218722845345
R² score: 0.9585673614967543


In [16]:
# Persist the fitted gradient-boosting model next to the notebook
joblib.dump(value=gb_model, filename="gb_model.joblib")

['gb_model.joblib']

# e, Extreme Gradient Boost

In [18]:
# Import necessary libraries
import time
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Initialize the model
xgb_model = XGBRegressor(random_state=42)

# Candidate params for RandomizedSearchCV
search_space = {
    'n_estimators': [50, 100, 200, 300, 500, 700, 1000],
    'max_depth': [3, 5, 7, 10],
    'eta': [0.01, 0.05, 0.1, 0.2, 0.3],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1],
    'gamma': [0, 0.1, 0.5, 1, 5],
    'reg_lambda': [0, 0.1, 1, 10, 100]
}

# Number of candidates sampled in the short pilot run used for timing.
# Kept in one place so the search size and the divisor below cannot drift apart.
pilot_n_iter = 20

# Total available time (in seconds) for the full search
total_time_available = 10 * 60 * 60  # 10 hours

# Number of distinct combinations in the search space; sampling more
# candidates than this is pointless because every value list is finite.
grid_size = 1
for candidate_values in search_space.values():
    grid_size *= len(candidate_values)

# Pilot search: a small run whose wall-clock time calibrates the full search
xgb_rs = RandomizedSearchCV(estimator=xgb_model,
                            param_distributions=search_space,
                            n_iter=pilot_n_iter,
                            scoring=['r2', 'neg_mean_squared_error'],
                            refit='r2',
                            cv=5,
                            n_jobs=-1,
                            verbose=2,
                            random_state=42)
# perf_counter is a monotonic clock intended for measuring elapsed intervals
start_time = time.perf_counter()
xgb_rs.fit(X_train, y_train)
end_time = time.perf_counter()

# Average wall-clock time per sampled candidate (all CV folds plus overhead)
time_per_iteration = (end_time - start_time) / pilot_n_iter
print(f"Time per iteration: {time_per_iteration:.2f} seconds")

# Estimate how many candidates fit in the budget. Guard against a zero
# duration (division by zero) and keep the result within [1, grid_size] so
# n_iter is always a valid positive integer.
estimated_n_iter = int(total_time_available // max(time_per_iteration, 1e-9))
estimated_n_iter = max(1, min(estimated_n_iter, grid_size))
print(f"Estimated number of iterations: {estimated_n_iter}")

# Run RandomizedSearchCV with the estimated number of iterations
xgb_rs = RandomizedSearchCV(estimator=xgb_model,
                            param_distributions=search_space,
                            n_iter=int(estimated_n_iter),
                            scoring=['r2', 'neg_mean_squared_error'],
                            refit='r2',
                            cv=5,
                            n_jobs=-1,
                            verbose=2,
                            random_state=42)
start_time = time.perf_counter()
xgb_rs.fit(X_train, y_train)
end_time = time.perf_counter()
xgb_best_params = xgb_rs.best_params_

# Record how long the full search took (written to the working directory)
txt = f"n_iter = {int(estimated_n_iter)} running for {round((end_time-start_time)/3600, 2)} hours"
file = "Running time.txt"
with open(file, 'w') as f:
    f.write(txt)

# Save all hyperparams in csv file
df = pd.DataFrame(xgb_rs.cv_results_)
df.to_csv('/content/drive/MyDrive/CarPricePrediction/XGBoostRSResults.csv')

Fitting 5 folds for each of 20 candidates, totalling 100 fits


KeyboardInterrupt: 

In [19]:
# Final XGBoost model, fitted with the best hyperparameters found by the
# randomized search (hard-coded so the search need not be re-run).
# To take the values from a fresh search instead:
#   xgb_model = XGBRegressor(**xgb_best_params, random_state=42)
xgb_final_params = dict(
    n_estimators=1000,
    max_depth=7,
    eta=0.05,
    subsample=0.8,
    colsample_bytree=0.6,
    gamma=0,
    reg_lambda=1,
    random_state=42,
)
xgb_model = XGBRegressor(**xgb_final_params)
xgb_model.fit(X_train, y_train)

In [20]:
# Score the fitted XGBoost model on the training split, then on the test split.
# After the loop, y_pred / r2 / mse hold the test-set values.
for banner, X_split, y_split in (
    ("---------- TRAIN SET ----------", X_train, y_train),
    ("---------- TEST SET ----------", X_test, y_test),
):
    y_pred = xgb_model.predict(X_split)
    r2 = r2_score(y_split, y_pred)
    mse = mean_squared_error(y_split, y_pred)
    print(banner)
    print(f"Mean Squared Error: {mse}")
    print(f"R² score: {r2}")

---------- TRAIN SET ----------
Mean Squared Error: 0.0018239183408379578
R² score: 0.9866196499293792
---------- TEST SET ----------
Mean Squared Error: 0.005383300183764726
R² score: 0.9618676919189483


In [21]:
# Persist the fitted XGBoost model next to the notebook
joblib.dump(value=xgb_model, filename="XGBoost_model.joblib")

['XGBoost_model.joblib']

In [35]:
# Record the library versions the saved .joblib models were produced with
for version_label, module in (
    ("scikit-learn version:", sklearn),
    ("numpy version:", numpy),
    ("joblib version:", joblib),
):
    print(version_label, module.__version__)

scikit-learn version: 1.3.1
numpy version: 1.24.3
joblib version: 1.3.2
