# Hyper Parameter Tuning and Best Model Selection

In [19]:
# Libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning Algorithims
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Data Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [20]:
# tips dataset
df = sns.load_dataset('diamonds').sample(3000, random_state=42)
df.head(10)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1388,0.24,Ideal,G,VVS1,62.1,56.0,559,3.97,4.0,2.47
50052,0.58,Very Good,F,VVS2,60.0,57.0,2201,5.44,5.42,3.26
41645,0.4,Ideal,E,VVS2,62.1,55.0,1238,4.76,4.74,2.95
42377,0.43,Premium,E,VVS2,60.8,57.0,1304,4.92,4.89,2.98
17244,1.55,Ideal,E,SI2,62.3,55.0,6901,7.44,7.37,4.61
1608,1.0,Fair,E,SI2,55.4,62.0,3011,6.63,6.59,3.66
46398,0.51,Ideal,F,VS1,60.2,56.0,1765,5.22,5.24,3.15
45493,0.52,Ideal,D,VS2,62.0,56.0,1679,5.17,5.19,3.21
49385,0.62,Premium,E,VS2,60.0,59.0,2102,5.58,5.56,3.34
10460,1.14,Ideal,H,SI1,60.3,57.0,4789,6.79,6.85,4.11


In [21]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3000 entries, 1388 to 49495
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    3000 non-null   float64 
 1   cut      3000 non-null   category
 2   color    3000 non-null   category
 3   clarity  3000 non-null   category
 4   depth    3000 non-null   float64 
 5   table    3000 non-null   float64 
 6   price    3000 non-null   int64   
 7   x        3000 non-null   float64 
 8   y        3000 non-null   float64 
 9   z        3000 non-null   float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 197.2 KB


In [23]:
for i in df.columns:
    if df[i].dtype == 'object' or df[i].dtype == 'category':
        le = LabelEncoder()
        df[i] = le.fit_transform(df[i])

In [24]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1388,0.24,2,3,6,62.1,56.0,559,3.97,4.0,2.47
50052,0.58,4,2,7,60.0,57.0,2201,5.44,5.42,3.26
41645,0.4,2,1,7,62.1,55.0,1238,4.76,4.74,2.95
42377,0.43,3,1,7,60.8,57.0,1304,4.92,4.89,2.98
17244,1.55,2,1,3,62.3,55.0,6901,7.44,7.37,4.61


# Regression Tasks

In [25]:
X = df.drop('cut', axis=1)
y = df['cut']

In [26]:
models = {'Linear Regression': LinearRegression(),
          'Support Vector Regression': SVR(),
          'Decision Tree': DecisionTreeRegressor(),
          'Random Forest': RandomForestRegressor(),
          'KNeighbors': KNeighborsRegressor(),
          'Gradient Boosting': GradientBoostingRegressor(),
          'XGBoost': XGBRegressor()}

In [27]:
%%time

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# train and predict each model with evaluation metrics as well making a for loop
model_scores = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, metric))
    
# print the performing metrics 
# print(name, 'MSE:', mean_squared_error(y_test, y_pred))
# print(name, 'R2:', r2_score(y_test, y_pred))
# print(name, 'MAE:', mean_absolute_error(y_test, y_pred))
# print('\n')

# selection the best model for all above models with evaluation metrics sort 
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)

for model in sorted_models:
    print('mean_absolute_error for', f"{model[0]} is {model[1]: .2f}")


mean_absolute_error for Random Forest is  0.56
mean_absolute_error for Gradient Boosting is  0.57
mean_absolute_error for XGBoost is  0.59
mean_absolute_error for Decision Tree is  0.62
mean_absolute_error for Linear Regression is  0.84
mean_absolute_error for Support Vector Regression is  0.86
mean_absolute_error for KNeighbors is  0.90
CPU times: total: 3.91 s
Wall time: 3.67 s


In [28]:
%%time

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# train and predict each model with evaluation metrics as well making a for loop
model_scores = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metric = r2_score(y_test, y_pred)
    model_scores.append((name, metric))
    
# print the performing metrics 
# print(name, 'MSE:', mean_squared_error(y_test, y_pred))
# print(name, 'R2:', r2_score(y_test, y_pred))
# print(name, 'MAE:', mean_absolute_error(y_test, y_pred))
# print('\n')

# selection the best model for all above models with evaluation metrics sort 
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=True)

for model in sorted_models:
    print('R_squared score for', f"{model[0]} is {model[1]: .2f}")

R_squared score for Random Forest is  0.47
R_squared score for Gradient Boosting is  0.44
R_squared score for XGBoost is  0.40
R_squared score for Decision Tree is  0.09
R_squared score for Linear Regression is -0.03
R_squared score for Support Vector Regression is -0.04
R_squared score for KNeighbors is -0.10
CPU times: total: 3.97 s
Wall time: 3.39 s


In [29]:
%%time

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# train and predict each model with evaluation metrics as well making a for loop
model_scores = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metric = mean_squared_error(y_test, y_pred)
    model_scores.append((name, metric))
    
# print the performing metrics 
# print(name, 'MSE:', mean_squared_error(y_test, y_pred))
# print(name, 'R2:', r2_score(y_test, y_pred))
# print(name, 'MAE:', mean_absolute_error(y_test, y_pred))
# print('\n')

# selection the best model for all above models with evaluation metrics sort 
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)

for model in sorted_models:
    print('mean squared error for', f"{model[0]} is {model[1]: .2f}")

mean squared error for Gradient Boosting is  0.58
mean squared error for Random Forest is  0.64
mean squared error for XGBoost is  0.69
mean squared error for Linear Regression is  1.00
mean squared error for Decision Tree is  1.09
mean squared error for Support Vector Regression is  1.14
mean squared error for KNeighbors is  1.17
CPU times: total: 4.47 s
Wall time: 4.29 s


# Hyperparameter Tuning

In [30]:
%%time

# create dictionary of list of models to evaluate performance of each model with hyperparameter tuning

models = {
    'Linear Regression': (LinearRegression(), {}),
    'Support Vector Regression': (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
    'Decision Tree': (DecisionTreeRegressor(), {'max_depth': [None, 5, 10, 15, 20]}),
    'Random Forest': (RandomForestRegressor(), {'n_estimators': [100, 200, 300]}),
    'KNeighbors': (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
    'Gradient Boosting': (GradientBoostingRegressor(), {'n_estimators': [100, 200, 300]}),
    'XGBoost': (XGBRegressor(), {'n_estimators': [100, 200, 300]})
}

# train and predict each model with evaluation metrics as well making a for loop

model_scores = []
for name, (model, param) in models.items():
    # pipeline
    pipeline = GridSearchCV(model, param, cv=5)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # print the MSE, R2 and MAE of all Algorithms
    print(name, 'MSE:', mean_squared_error(y_test, y_pred))
    print(name, 'R2:', r2_score(y_test, y_pred))
    print(name, 'MAE:', mean_absolute_error(y_test, y_pred))
    print('\n')

Linear Regression MSE: 1.004941386633767
Linear Regression R2: 0.05780610928274976
Linear Regression MAE: 0.8311071792561413


Support Vector Regression MSE: 1.1395101682854263
Support Vector Regression R2: -0.0683603374770585
Support Vector Regression MAE: 0.8385853995359588


Decision Tree MSE: 0.6552632375655295
Decision Tree R2: 0.38565071808427454
Decision Tree MAE: 0.552205341018078


Random Forest MSE: 0.6352854074074075
Random Forest R2: 0.4043811532868892
Random Forest MAE: 0.5537444444444445


KNeighbors MSE: 1.0673869393882531
KNeighbors R2: -0.0007404080468029761
KNeighbors MAE: 0.8606621004566208


Gradient Boosting MSE: 0.5748084458480414
Gradient Boosting R2: 0.46108199620992274
Gradient Boosting MAE: 0.5451275067336894


XGBoost MSE: 0.6943801665014412
XGBoost R2: 0.348976194858551
XGBoost MAE: 0.5884465757097739


CPU times: total: 1min 39s
Wall time: 1min 27s
