# Selecting the Best Model with the Best Hyperparameters

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# train test split the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV

# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Load seaborn's built-in 'tips' example dataset; it is used for the regression experiments below.
df = sns.load_dataset('tips')

In [3]:
# Preview the first rows to see the available columns and the kind of values they hold.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# List the column names; 'tip' is used as the regression target in the next cell.
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

## Regression Tasks

In [5]:
# Separate the predictors (every column except 'tip') from the target ('tip').
X = df.drop('tip', axis=1)
y = df['tip']

# Replace each categorical column with integer codes.
# A single encoder object is re-fitted on every column in turn, so after the
# loop `le` holds the mapping of the last column only ('time').
le = LabelEncoder()
for column in ['sex', 'smoker', 'day', 'time']:
    X[column] = le.fit_transform(X[column])


In [6]:
%%time
# split the data into train and test data with 80% training dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a dictionaries of list of models to evaluate performance
models = { 
          'LinearRegression' : LinearRegression(),
          'SVR' : SVR(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'RandomForestRegressor' : RandomForestRegressor(),
          'KNeighborsRegressor' : KNeighborsRegressor(),
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'XGBRegressor' : XGBRegressor()          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = model.predict(X_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, metric))
    
    # # print the performing metric
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'R2: ', r2_score(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print('\n')
# selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('Mean Absolute error for', f"{model[0]} is {model[1]: .2f}") 

Mean Absolute error for SVR is  0.57
Mean Absolute error for LinearRegression is  0.67
Mean Absolute error for XGBRegressor is  0.67
Mean Absolute error for GradientBoostingRegressor is  0.72
Mean Absolute error for KNeighborsRegressor is  0.73
Mean Absolute error for RandomForestRegressor is  0.78
Mean Absolute error for DecisionTreeRegressor is  0.88
CPU times: total: 2.31 s
Wall time: 1.06 s


In [7]:
%%time
# split the data into train and test data with 80% training dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a dictionaries of list of models to evaluate performance
models = { 
          'LinearRegression' : LinearRegression(),
          'SVR' : SVR(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'RandomForestRegressor' : RandomForestRegressor(),
          'KNeighborsRegressor' : KNeighborsRegressor(),
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'XGBRegressor' : XGBRegressor()          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = model.predict(X_test)
    metric = r2_score(y_test, y_pred)
    model_scores.append((name, metric))
    
    # # print the performing metric
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'R2: ', r2_score(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print('\n')
# selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=True)
for model in sorted_models:
    print('R_squared Score', f"{model[0]} is {model[1]: .2f}") 

R_squared Score SVR is  0.57
R_squared Score LinearRegression is  0.44
R_squared Score XGBRegressor is  0.41
R_squared Score GradientBoostingRegressor is  0.36
R_squared Score KNeighborsRegressor is  0.33
R_squared Score RandomForestRegressor is  0.22
R_squared Score DecisionTreeRegressor is -0.10
CPU times: total: 2.08 s
Wall time: 874 ms


In [8]:
%%time
# split the data into train and test data with 80% training dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a dictionaries of list of models to evaluate performance
models = { 
          'LinearRegression' : LinearRegression(),
          'SVR' : SVR(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'RandomForestRegressor' : RandomForestRegressor(),
          'KNeighborsRegressor' : KNeighborsRegressor(),
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'XGBRegressor' : XGBRegressor()          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = model.predict(X_test)
    metric = mean_squared_error(y_test, y_pred)
    model_scores.append((name, metric))
    
    # # print the performing metric
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'R2: ', r2_score(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print('\n')
# selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('Mean Squared error for', f"{model[0]} is {model[1]: .2f}") 

Mean Squared error for SVR is  0.54
Mean Squared error for LinearRegression is  0.69
Mean Squared error for XGBRegressor is  0.74
Mean Squared error for GradientBoostingRegressor is  0.80
Mean Squared error for KNeighborsRegressor is  0.84
Mean Squared error for RandomForestRegressor is  0.95
Mean Squared error for DecisionTreeRegressor is  1.22
CPU times: total: 2.5 s
Wall time: 1.11 s


## **Assignment:** Using the Diamonds dataset, find the best model according to each of the metrics used above.

In [9]:
# Load seaborn's built-in 'diamonds' example dataset for the assignment above.
diamonds = sns.load_dataset('diamonds')

---

# Hyperparameter tuning:


Training LinearRegression...


NameError: name 'X_train' is not defined

In [None]:
# Candidate regressors, each paired with the hyperparameter grid to search.
# Estimators that accept a random_state are seeded so the search is reproducible.
models = {
    'LinearRegression': (LinearRegression(), {}),
    'SVR': (SVR(), {
        'kernel': ['rbf', 'poly', 'sigmoid'],
        'C': [0.1, 1, 10],
        'gamma': [1, 0.1, 0.01],
        'epsilon': [0.1, 0.01, 0.001],
    }),
    'DecisionTreeRegressor': (DecisionTreeRegressor(random_state=42), {
        'max_depth': [None, 5, 10],
        'splitter': ['best', 'random'],
    }),
    'RandomForestRegressor': (RandomForestRegressor(random_state=42), {
        'n_estimators': [10, 100, 1000],
        'max_depth': [None, 5, 10],
    }),
    'KNeighborsRegressor': (KNeighborsRegressor(), {
        'n_neighbors': np.arange(3, 100, 2),
        'weights': ['uniform', 'distance'],
    }),
    'GradientBoostingRegressor': (GradientBoostingRegressor(random_state=42), {
        # 'ls' and 'lad' are the old spellings of 'squared_error' and
        # 'absolute_error'; they were removed in scikit-learn 1.2.
        'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'n_estimators': [10, 100, 1000],
    }),
    'XGBRegressor': (XGBRegressor(random_state=42), {
        'n_estimators': [10, 100, 1000],
        'learning_rate': [0.1, 0.01, 0.001],
    }),
}

# Tune every model with a grid search, then evaluate the tuned model on the test split.
for name, (model, params) in models.items():
    # Exhaustive 5-fold cross-validated search over this model's grid.
    # Despite the variable name, this object is a GridSearchCV, not a Pipeline.
    pipeline = GridSearchCV(model, params, cv=5)

    # Run the search on the training split only.
    pipeline.fit(X_train, y_train)

    # Predict with the best parameter combination found by the search.
    y_pred = pipeline.predict(X_test)

    # Report the test-set metrics of the tuned model.
    print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    print(name, 'R2: ', r2_score(y_test, y_pred))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    print('\n')

LinearRegression MSE:  0.6948129686287711
LinearRegression R2:  0.4441368826121931
LinearRegression MAE:  0.6703807496461158




# Assignment: How do you get the best parameters of each model? Write the code for it inside the for loop. How do you then get the best model out of it?

## Solution

In [None]:
# Write your Code here

##############################################################################################################

---


In [7]:
import warnings
import time
import pandas as pd
import numpy as np
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Silence convergence warnings and generic user warnings for the rest of the session
for ignored_category in (ConvergenceWarning, UserWarning):
    warnings.filterwarnings('ignore', category=ignored_category)

# To run on real data, load it here instead of generating the dummy data below:
# df = pd.read_csv('your_data.csv')
# X = df.drop('target_column', axis=1)
# y = df['target_column']

# Demonstration data: 100 rows, 5 uniform-random features and a uniform-random target
np.random.seed(42)
X = pd.DataFrame(
    np.random.rand(100, 5),
    columns=[f'feature_{i}' for i in range(5)],
)
y = pd.Series(np.random.rand(100), name='target')

# Keep 20% of the rows aside for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Everything from here on is included in the reported execution time
start_time = time.time()

# Each entry maps a display name to (estimator, hyperparameter grid)
models = {
    'LinearRegression': (LinearRegression(), {}),
    'SVR': (
        SVR(),
        dict(kernel=['rbf', 'linear'], C=[0.1, 1, 10], gamma=['scale', 'auto']),
    ),
    'DecisionTree': (
        DecisionTreeRegressor(random_state=42),
        dict(max_depth=[None, 5, 10, 15], min_samples_split=[2, 5, 10]),
    ),
    'RandomForest': (
        RandomForestRegressor(random_state=42),
        dict(n_estimators=[50, 100], max_depth=[None, 5, 10]),
    ),
    'KNN': (
        KNeighborsRegressor(),
        dict(n_neighbors=[3, 5, 7, 9], weights=['uniform', 'distance']),
    ),
    'GradientBoosting': (
        GradientBoostingRegressor(random_state=42),
        dict(n_estimators=[50, 100], learning_rate=[0.01, 0.1]),
    ),
    'XGBoost': (
        XGBRegressor(random_state=42),
        dict(n_estimators=[50, 100], max_depth=[3, 5, 7], learning_rate=[0.01, 0.1]),
    ),
}

# One summary row per model is collected here
results = []

for name, (model, params) in models.items():
    print(f"\nTraining {name}...")

    # 5-fold grid search, selecting by (negated) mean squared error, using all CPU cores
    grid = GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)

    # Evaluate the winning configuration on the held-out test split
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    mse, r2, mae = (
        mean_squared_error(y_test, y_pred),
        r2_score(y_test, y_pred),
        mean_absolute_error(y_test, y_pred),
    )

    # Keep the numbers for the final comparison table
    results.append({
        'Model': name,
        'Best Params': grid.best_params_,
        'MSE': mse,
        'R2': r2,
        'MAE': mae,
    })

    # Per-model report
    print(f"{name} Best Parameters: {grid.best_params_}")
    for label, value in (('MSE', mse), ('R2', r2), ('MAE', mae)):
        print(f"{name} {label}: {value:.4f}")

# Comparison table, best R2 first
results_df = pd.DataFrame(results)
print("\nFinal Results:")
print(results_df.sort_values(by='R2', ascending=False))

# Time spent on the searches and evaluation
elapsed = time.time() - start_time
print(f"\nTotal execution time: {elapsed:.2f} seconds")


Training LinearRegression...
LinearRegression Best Parameters: {}
LinearRegression MSE: 0.1239
LinearRegression R2: -0.6208
LinearRegression MAE: 0.3165

Training SVR...
SVR Best Parameters: {'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}
SVR MSE: 0.1190
SVR R2: -0.5567
SVR MAE: 0.3045

Training DecisionTree...
DecisionTree Best Parameters: {'max_depth': 5, 'min_samples_split': 10}
DecisionTree MSE: 0.1536
DecisionTree R2: -1.0086
DecisionTree MAE: 0.3276

Training RandomForest...
RandomForest Best Parameters: {'max_depth': 10, 'n_estimators': 100}
RandomForest MSE: 0.1501
RandomForest R2: -0.9629
RandomForest MAE: 0.3441

Training KNN...
KNN Best Parameters: {'n_neighbors': 9, 'weights': 'uniform'}
KNN MSE: 0.1104
KNN R2: -0.4437
KNN MAE: 0.2885

Training GradientBoosting...
GradientBoosting Best Parameters: {'learning_rate': 0.01, 'n_estimators': 100}
GradientBoosting MSE: 0.1228
GradientBoosting R2: -0.6054
GradientBoosting MAE: 0.3037

Training XGBoost...
XGBoost Best Parameters: {'l

# **Add preprocessor inside the pipeline**

## Assignment: Find the errors

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

# Preprocessor setup
# Standardizes the columns at positions 0 and 1 and passes every other column
# through unchanged.
# NOTE(review): this cell does not define X_train / X_test / y_train / y_test;
# it uses whatever an earlier cell left in the kernel — presumably the already
# scaled dummy arrays from the previous solution cell. Confirm before
# interpreting the scores.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric_scaling', StandardScaler(), [0, 1])  # Scale first two columns
    ], 
    remainder='passthrough'
)

# Updated models dictionary with valid parameters
# Each entry maps a display name to (estimator, hyperparameter grid).
# Grid keys carry the 'model__' prefix because the estimator is placed in the
# pipeline step named 'model' below.
models = { 
    # Empty grid: only the default configuration is evaluated
    'LinearRegression': (LinearRegression(), {}),
    'SVR': (SVR(), {
        'model__kernel': ['rbf', 'linear'],  # Removed problematic kernels
        'model__C': [0.1, 1, 10],
        'model__gamma': ['scale', 'auto'],  # Better gamma options
        'model__epsilon': [0.1, 0.01]
    }),
    'DecisionTreeRegressor': (DecisionTreeRegressor(random_state=42), {
        'model__max_depth': [None, 3, 5, 7],  # More conservative depths
        'model__min_samples_split': [2, 5, 10]
    }),
    'RandomForestRegressor': (RandomForestRegressor(random_state=42), {
        'model__n_estimators': [50, 100],  # Reduced options
        'model__max_depth': [None, 5, 7]
    }),
    'KNeighborsRegressor': (KNeighborsRegressor(), {
        'model__n_neighbors': list(range(3, 20, 2)),  # Smaller range
        'model__weights': ['uniform', 'distance']
    }),
    'GradientBoostingRegressor': (GradientBoostingRegressor(random_state=42), {
        'model__loss': ['squared_error', 'absolute_error', 'huber'],  # Valid losses
        'model__n_estimators': [50, 100],
        'model__learning_rate': [0.1, 0.01]
    }),
    'XGBRegressor': (XGBRegressor(random_state=42), {
        'model__n_estimators': [50, 100],
        'model__learning_rate': [0.1, 0.01],
        'model__max_depth': [3, 5]
    })          
}

# Training loop with error handling
# For every model: build preprocessor + model pipeline, grid-search it with
# 5-fold CV, then report test-set metrics of the best configuration.
for name, (model, params) in models.items():
    print(f"\n=== Training {name} ===")
    
    try:
        # Preprocessing lives inside the pipeline, so it is part of what
        # GridSearchCV fits on each cross-validation split.
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        
        # Candidates are ranked by negated MSE (higher is better); all CPU cores are used.
        grid_search = GridSearchCV(
            pipeline, 
            param_grid=params,
            cv=5,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            error_score='raise'  # Will raise errors immediately
        )
        
        grid_search.fit(X_train, y_train)
        # Predictions come from the search object after fitting
        y_pred = grid_search.predict(X_test)
        
        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"MSE: {mse:.4f}")
        print(f"R2: {r2:.4f}") 
        print(f"MAE: {mae:.4f}")
        
        # Check for negative R2
        if r2 < 0:
            print("Warning: Negative R2 score indicates model performs worse than horizontal line")
            
    except Exception as e:
        # Any exception raised above (including fit errors surfaced by
        # error_score='raise') is reported here and the loop moves on to the
        # next model instead of stopping the cell.
        print(f"Error training {name}: {str(e)}")
        continue


=== Training LinearRegression ===
Best parameters: {}
MSE: 0.1239
R2: -0.6208
MAE: 0.3165

=== Training SVR ===
Best parameters: {'model__C': 0.1, 'model__epsilon': 0.1, 'model__gamma': 'scale', 'model__kernel': 'rbf'}
MSE: 0.1190
R2: -0.5567
MAE: 0.3045

=== Training DecisionTreeRegressor ===
Best parameters: {'model__max_depth': 3, 'model__min_samples_split': 10}
MSE: 0.1659
R2: -1.1702
MAE: 0.3558

=== Training RandomForestRegressor ===
Best parameters: {'model__max_depth': None, 'model__n_estimators': 100}
MSE: 0.1493
R2: -0.9531
MAE: 0.3424

=== Training KNeighborsRegressor ===
Best parameters: {'model__n_neighbors': 15, 'model__weights': 'uniform'}
MSE: 0.1042
R2: -0.3634
MAE: 0.2833

=== Training GradientBoostingRegressor ===
Best parameters: {'model__learning_rate': 0.01, 'model__loss': 'huber', 'model__n_estimators': 100}
MSE: 0.1249
R2: -0.6338
MAE: 0.3075

=== Training XGBRegressor ===
Best parameters: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimato

# Classifiers:

In [16]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Don't show warnings (note: this silences every warning category for the rest of the session)
import warnings
warnings.filterwarnings('ignore')

# Load the Iris dataset: feature matrix and integer class labels
iris = load_iris()
X = iris.data
y = iris.target

# Create a dictionary of classifiers to evaluate, all with default hyperparameters.
# The tree-based classifiers are seeded so that re-running the cell reproduces
# the same accuracies.
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier()
}

# Perform k-fold cross-validation and calculate the mean accuracy.
# The same shuffled, seeded 5-fold splitter is used for every classifier so
# that all of them are scored on identical folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for name, classifier in classifiers.items():
    # One score per fold, using the classifier's default scorer
    scores = cross_val_score(classifier, X, y, cv=kfold)
    accuracy = np.mean(scores)
    print("Classifier:", name)
    print("Mean Accuracy:", accuracy)
    print()

Classifier: Logistic Regression
Mean Accuracy: 0.9733333333333334

Classifier: Decision Tree
Mean Accuracy: 0.9533333333333335

Classifier: Random Forest
Mean Accuracy: 0.9600000000000002

Classifier: SVM
Mean Accuracy: 0.9666666666666668

Classifier: KNN
Mean Accuracy: 0.9733333333333334



# **Main Assignment:**

## Write the complete code to select the best regressor and the best classifier for the given dataset, called diamonds `(if you have a high-end machine, you can use the whole dataset; otherwise use the sample dataset provided in the link)`, or use the Tips dataset for the regression task and the Iris dataset for the classification task.

## You have to choose all possible models with their best or possible hyperparameters and compare them with each other and select the best model for the given dataset.

## Your code should be complete and properly explained: each and every step should be commented so that a layperson can follow it.

## Your code should also save the best model in a pickle file.

## In the last snippet of the code, you should also write the code to load the pickle file and use it for prediction.

## Submit your assignment to the discord inbox. (Do not share the link of your notebook, just upload the notebook in the discord inbox). Do not share the notebook in public channels on our discord server.


# **Deadline for Submission:**

## `29th December before 09:30 pm Pakistan time. (No late submission will be accepted).`
