# Regression Models

We’ll build a regression model using the **XGBoost** regressor (`XGBRegressor`), tuning its hyperparameters with Bayesian search (`BayesSearchCV` from scikit-optimize).


In [1]:
! pip show scipy

Name: scipy
Version: 1.13.1
Summary: Fundamental algorithms for scientific computing in Python
Home-page: https://scipy.org/
Author: 
Author-email: 
License: Copyright (c) 2001-2002 Enthought, Inc. 2003-2024, SciPy Developers.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following
   disclaimer in the documentation and/or other materials provided
   with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived
   from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRI

! pip install --upgrade scipy

! pip install --upgrade scikit-optimize

## Step 1: Load and Prepare the Data

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from datetime import datetime
from datetime import timedelta
import time

In [4]:
import psutil
import os, io
import sys
import platform
import sysconfig
import pandas   as     pd
import numpy    as     np
import sklearn
import time
from   tabulate import tabulate
from   platform import python_version
pd.set_option('display.max_rows', None)
import warnings
warnings.filterwarnings("ignore")

In [5]:
# Report the scikit-learn release and the Python interpreter used by this kernel
print(sklearn.__version__, sys.version, sep="\n")

1.6.1
3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]


In [6]:
from     sklearn.model_selection  import     train_test_split
from     xgboost                  import     XGBRegressor
from     sklearn.preprocessing     import    StandardScaler
import   pandas                    as        pd
import   numpy                     as        np
from     sklearn.metrics           import    mean_squared_error
from     sklearn.metrics           import    explained_variance_score
import   time

In [7]:
!pip install scikit-optimize



In [8]:
def tune_params_bayes(X_train, y_train, X_test, y_test, paramgrid, model):
    """Tune ``model`` with scikit-optimize's ``BayesSearchCV`` and print a report.

    Runs a 10-iteration search with 5-fold cross-validation scored by
    ``neg_mean_squared_error``, then prints the best parameters, the best
    cross-validated score, the parameters of the best estimator and the
    wall-clock time spent fitting the search.

    Args:
        X_train: Training features passed to ``BayesSearchCV.fit``.
        y_train: Training target passed to ``BayesSearchCV.fit``.
        X_test: Not used; kept so existing call sites keep working.
        y_test: Not used; kept so existing call sites keep working.
        paramgrid: Search space, forwarded as ``search_spaces``.
        model: Estimator to tune, forwarded as ``estimator``.

    Returns:
        The fitted ``BayesSearchCV`` object (earlier versions returned ``None``),
        so callers can reuse ``best_estimator_`` without searching again.
    """
    # Function-scope import, as in the original cell: skopt is only needed here.
    from skopt import BayesSearchCV

    bayes_search = BayesSearchCV(
        estimator=model,
        search_spaces=paramgrid,
        n_iter=10,                          # Number of iterations
        cv=5,                               # Cross-validation folds
        scoring='neg_mean_squared_error',   # Scoring metric
        random_state=42,
        n_jobs=-1,                          # Use all available cores
    )

    # Fit exactly once, and time only the fit. The search used to be fitted
    # twice, with the timer started after the first fit had already finished.
    start_time = time.time()
    bayes_search.fit(X_train, y_train)
    end_time = time.time()

    # Best parameters and score
    print("Best Parameters:", bayes_search.best_params_)
    print("Best Score:", bayes_search.best_score_)
    model_parameters = bayes_search.best_estimator_.get_params()
    print("XGBoost Regression model parameters\n")
    print(model_parameters)
    print("BayesianSearch Best Params:", bayes_search.best_estimator_)

    bayes_time = end_time - start_time  # elapsed time in seconds
    hours, rest = divmod(bayes_time, 3600)
    minutes, seconds = divmod(rest, 60)
    # The hours value is now labelled; it used to be printed directly before " Min: ".
    print("\033[1;31mTotal Execution time",
          "Hours: " + str(hours) + " Min: " + str(minutes) + " Seconds: " + str(round(seconds, 6))
          + "\033[0m\033[0m\033[0m\033[0m")

    return bayes_search


## Extract feature importances

In [9]:
# '''
# Feature Importance: The feature_importances_ attribute provides the importance scores for each feature.
# Visualization: A horizontal bar chart is plotted using matplotlib to display the importance of each feature.
# '''
# def  feature_plot(model, model_txt, X, features):
#     import  matplotlib.pyplot  as  plt
#     importances              =   model.feature_importances_
#     indices                  =   np.argsort(importances)

#     plt.title('Feature Importances')
#     plt.barh(range(len(indices)), importances[indices], color='b', align='center')
#     plt.yticks(range(len(indices)), [features[i] for i in indices])
#     plt.xlabel('Relative Importance')
#     plt.show()

#     print(get_feature_importances_df(model,features,importances))
#     '''
#     indices                    =   np.argsort(feature_importances)
#     print(indices)
#     ##
#     plt.figure(1)
#     ttlex    =  "Feature Importance in Decision Tree Regression - " + model_txt
#     plt.title(ttlex)
#     plt.barh(range(len(indices)), feature_importances[indices], color='b', align='center')
#     # plt.yticks(range(len(indices)), features[indices])
#     plt.xlabel('Relative Importance')
#     '''

In [10]:
# import pandas as pd # Make sure pandas is imported

# def get_feature_importances_df(model, features, importances):
#     """
#     Retrieves feature importances from a model and returns them as a sorted DataFrame.

#     Args:
#         model: A trained machine learning model with a .feature_importances_ attribute.
#         features (list): A list of feature names corresponding to the model's input.

#     Returns:
#         pd.DataFrame: A DataFrame with 'Feature' and 'Importance' columns,
#                       sorted by importance in descending order.
#     """
#     if not hasattr(model, 'feature_importances_'):
#         print("Warning: Model does not have a 'feature_importances_' attribute.")
#         return pd.DataFrame(columns=['Feature', 'Importance'])

#     # importances = model.feature_importances_

#     # Create a DataFrame
#     feature_importance_df = pd.DataFrame({
#         'Feature': features,
#         'Importance': importances
#     })

#     # Sort the DataFrame by importance in descending order
#     feature_importance_df = feature_importance_df.sort_values(
#         by='Importance', ascending=False
#     ).reset_index(drop=True)

#     return feature_importance_df

In [11]:
# Wall-clock reference point; a later cell subtracts it to get the total run time
Models_start_time = time.time()

In [12]:
from sklearn.model_selection import train_test_split

# Read only the first 2000 rows of the training CSV and preview them transposed
data = pd.read_csv('train_dataset_used_for_model_scaled.csv', nrows=2000)
print(data.head().T)

# Target is the 'price' column; every other column is a feature
y = data['price']
X = data.drop(columns='price')

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

                           0           1           2           3           4
brand_EDA           8.000000    8.000000    2.000000    8.000000    7.000000
model_EDA          10.000000   10.000000   10.000000   10.000000   10.000000
model_year_EDA     10.000000   10.000000   10.000000    3.000000    7.000000
transmission_EDA    0.000000    0.000000    0.000000    9.000000    6.000000
HP                172.000000  252.000000  320.000000  420.000000  208.000000
Litres              1.600000    3.900000    5.300000    5.000000    2.000000
Cylinders           4.000000    8.000000    8.000000    8.000000    4.000000
cylinder_layout     2.000000    2.000000    2.000000    2.000000    2.000000
fuel_type           3.000000    3.000000    2.000000    3.000000    3.000000
ext_col_EDA         7.000000    9.000000    1.000000    0.000000    0.000000
int_col_EDA         6.000000    0.000000    6.000000    1.000000    0.000000
color_category      0.000000    1.000000    2.000000    1.000000    1.000000

In [13]:
# Candidate values for each hyperparameter explored during tuning
param_grid = dict(
    max_depth=[5, 6, 7, 8],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    n_estimators=[100, 200, 300, 500],
    subsample=[0.6, 0.8, 0.9, 1.0],
)

In [14]:
# Base XGBoost regressor with a fixed seed
xgb = XGBRegressor(random_state=42)

In [16]:
# Seconds elapsed since Models_start_time, broken down into hours / minutes / seconds
models_end_time = time.time()
time_taken = models_end_time - Models_start_time
hours, rest = divmod(time_taken, 3600)
minutes, seconds = divmod(rest, 60)

In [17]:
# Summarise the runtime environment and the total elapsed time as a grid table.
# NOTE(review): this rebinds `data`, which held the training DataFrame earlier in the
# notebook; the name is kept in case later cells rely on it.
# The hours value is now labelled; it used to be printed directly before " Min: ".
data = [["This application was processed using the PC with the below configuration", ":"],
        ["sklearn version", sklearn.__version__],
        ["Python version", python_version()],
        ["OS Name", os.name],
        ["System Platform", sys.platform],
        ["OS on which the Python interpreter is currently running", platform.system()],
        ['Platform Architecture',  str(platform.architecture())],
        ["\033[1;31mTotal RAM", str(round(psutil.virtual_memory().total/(1024 ** 3),0)) + ":GB" + "\033[0m\033[0m\033[0m\033[0m"],
        ["\033[1;31mTotal Execution time",  "Hours: " + str(hours) + " Min: " + str(minutes) + " Seconds: " + str(round(seconds,6)) + "\033[0m\033[0m\033[0m\033[0m"]
       ]

headers = ["", "Values"]

print(tabulate(data, headers = headers, tablefmt = "grid"))

+--------------------------------------------------------------------------+--------------------------------+
|                                                                          | Values                         |
| This application was processed using the PC with the below configuration | :                              |
+--------------------------------------------------------------------------+--------------------------------+
| sklearn version                                                          | 1.6.1                          |
+--------------------------------------------------------------------------+--------------------------------+
| Python version                                                           | 3.12.7                         |
+--------------------------------------------------------------------------+--------------------------------+
| OS Name                                                                  | posix                          |
+---------