#  House Prices Model with Hyperparameter Tuning and Pipelines


Finding the optimal hyperparameters is a challenging task. Traditionally, this involves trying out various combinations manually to determine which parameters yield the best results. However, a more systematic approach involves creating a grid of hyperparameters and testing all possible combinations. This method is aptly named GridSearch. Thankfully, Scikit-learn provides a built-in solution for this called GridSearchCV.

GridSearchCV simplifies the process by taking a dictionary that outlines the parameters to be tested. Each parameter is defined with a set of values to explore.

We tune the hyperparameters of several different regressors and pick the best-performing one for our model.

See https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html


## Import the basic libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time

In [2]:
import mlflow



## Import the sklearn libraries that will be used

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,KFold

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.pipeline import Pipeline

## Import the models

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
# from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet

## Import the metrics

In [5]:
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, mean_absolute_error

In [6]:
import warnings

# Ignore all warnings
# NOTE: this installs a blanket "ignore" filter, so warnings of every category
# (e.g. deprecation or convergence warnings) are hidden from here on.
warnings.filterwarnings("ignore")

## Steps:
Set up MLflow for experiment tracking

Use Optuna for hyperparameter tuning

Log the results and best parameters in MLflow

## MLflow

In [7]:
# Add the MLflow tracking URL to the environment variable.
# The address is a local server on port 5000; it has to be running already
# (see the `mlflow ui` note in the next cell), otherwise MLflow API requests
# to it are refused.
import os
os.environ["MLFLOW_TRACKING_URI"] = "http://127.0.0.1:5000"

In [8]:
# mlflow ui

In [12]:
import optuna

In [14]:
# Set MLflow experiment
# Runs started below are recorded under this experiment name. The call makes
# a request to the tracking server, so it fails when the server is unreachable.
mlflow.set_experiment("RandomForestRegressor_Hyperparameter_Tuning")

def objective(trial):
    """Fit one random forest for an Optuna trial and return its MSE.

    The trial proposes four integer hyperparameters; a
    ``RandomForestRegressor`` with those settings is fitted on the
    module-level ``X_train`` / ``y_train`` and scored on ``X_test`` /
    ``y_test``. Parameters, the MSE and the fitted model are logged to a
    nested MLflow run.

    Returns the mean squared error, which the study minimizes.
    """
    # Search space: hyperparameter name -> inclusive (low, high) integer range.
    search_space = {
        "n_estimators": (50, 300),
        "max_depth": (2, 30),
        "min_samples_split": (2, 10),
        "min_samples_leaf": (1, 10),
    }
    hyperparams = {
        name: trial.suggest_int(name, low, high)
        for name, (low, high) in search_space.items()
    }

    with mlflow.start_run(nested=True):
        # Train the candidate forest with a fixed seed.
        forest = RandomForestRegressor(random_state=42, **hyperparams)
        forest.fit(X_train, y_train)

        # Score it on the held-out split.
        # NOTE(review): the same split is used to pick hyperparameters here.
        trial_mse = mean_squared_error(y_test, forest.predict(X_test))

        # Record what was tried and how it scored.
        for name, value in hyperparams.items():
            mlflow.log_param(name, value)
        mlflow.log_metric("mse", trial_mse)

        # Store the fitted model with the run.
        mlflow.sklearn.log_model(forest, "random_forest_regressor_model")

        return trial_mse

# Run 20 Optuna trials; the study looks for the lowest MSE.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

# Pull out the winning configuration and its score.
best_params, best_mse = study.best_params, study.best_value

# Record the winner in its own MLflow run.
with mlflow.start_run():
    mlflow.log_params(best_params)
    mlflow.log_metric("best_mse", best_mse)

print("Best Hyperparameters:", best_params)
print("Best MSE:", best_mse)

MlflowException: API request to http://127.0.0.1:5000/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='127.0.0.1', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=RandomForestRegressor_Hyperparameter_Tuning (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x12c089650>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [None]:
 # Set up an active experiment for the current running code.

# EXPERIMENT_NAME = "mlflow_autotracking"
# mlflow.set_experiment("House_pricing_experiement")
# experiment = mlflow.get_experiment_by_name("House_pricing_experiement")
# print("experiment_id:", experiment.experiment_id)

In [None]:
# Enable MLflow autologging for scikit-learn, the only modelling library used
# in this notebook (the PyTorch flavor does not cover sklearn estimators).
mlflow.sklearn.autolog()

# Load the dataset

In [None]:
# Read the training CSV from the local Datasets folder.
# The matching test-set load is left commented out.
housesales_train = pd.read_csv("./Datasets/House_sales_train.csv")
# housesales_test = pd.read_csv("./Datasets/House_sales_test.csv")


In [None]:
# housesales_test.shape

In [None]:
# Work on a copy so the frame as loaded from disk stays untouched.
HouseSales = housesales_train.copy()

In [None]:
# Descriptive statistics for the whole dataset
HouseSales.describe()

In [None]:
# Preview the first rows of the working copy.
HouseSales.head()

In [None]:
# Drop the 'Id' column so it is not carried into the feature set.
HouseSales = HouseSales.drop(columns=['Id'])

In [None]:
# List the remaining column names.
HouseSales.columns

In [None]:
# Find all columns that contain at least one null entry
columns_with_null = HouseSales.columns[HouseSales.isnull().any()]
columns_with_null

In [None]:
# Descriptive statistics for the SalePrice column
HouseSales['SalePrice'].describe()

In [None]:
# Get the number of unique values for each feature, largest count first
unique_vals =  HouseSales.nunique().sort_values(ascending=False)

# Optional: show every row instead of pandas' truncated display
# unique_vals = pd.DataFrame(unique_vals)
# pd.set_option('display.max_rows', None)  # Display all rows
print(unique_vals)

In [None]:
# Keep only the columns with a numeric dtype (despite the name, this is a
# DataFrame of those columns, not a list of labels).
numeric_columns = HouseSales.select_dtypes(include='number')
# numeric_columns.info()

In [None]:
# Show the labels of the numeric columns.
numeric_columns.columns

In [None]:
# numeric_columns.info()

In [None]:
# Calculate mean of each numeric column
# (only needed by the mean-imputation alternative below; the zero fill that
# is actually applied does not read it)
means = numeric_columns.mean()

# Alternative: fill null values with the column means
# updatednumerical_features = numeric_columns.fillna(means)

# Fill null values with 0
updatednumerical_features = numeric_columns.fillna(0)

## Histograms

In [None]:
def plot_hist_graphs(df):
    """Draw one 30-bin histogram per column of *df* on a grid four plots wide.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted; each column is passed to ``Axes.hist``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when *df*
        has no columns.
    """
    n_cols = len(df.columns)
    if n_cols == 0:
        # A grid with zero rows cannot be created, so there is nothing to plot.
        return

    # Ceiling division: enough rows of four to hold every column.
    n_rows = (n_cols + 3) // 4
    _, axes = plt.subplots(n_rows, 4, figsize=(24, 20))
    axes = axes.flatten()

    for ax, column_name in zip(axes, df.columns):
        ax.hist(df[column_name], bins=30, color='green', edgecolor='black')
        ax.set_title(f'Histogram of {column_name}')

    # Hide the grid cells left over in the last row instead of showing
    # empty axes.
    for ax in axes[n_cols:]:
        ax.set_visible(False)

    # Adjust spacing between subplots
    plt.subplots_adjust(hspace=1.5, wspace=0.2)
    plt.show()

In [None]:
# Plot the histogram of all the numeric features
# plot_hist_graphs(updatednumerical_features)


In [None]:
# Take the object-dtype (categorical) columns from the same working copy the
# numeric features were selected from, so both halves come from one frame.
non_numeric_features = HouseSales.select_dtypes(include=['object'])
# Missing categories become an explicit 'nil' level rather than NaN.
non_numeric_features = non_numeric_features.fillna('nil')
# non_numeric_features.info()

## Data Engineering

## Feature Encoding : Categorical, ordinal


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [None]:
# One-hot encode the categorical columns. Dense output returned as a pandas
# DataFrame keeps labelled columns; unknown categories are ignored at
# transform time.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe = ohe.set_output(transform='pandas')
ohetransform = ohe.fit_transform(non_numeric_features)

In [None]:
# Combine the numeric and non numeric datasets column-wise (axis=1);
# pandas aligns the two frames on their row index.
HouseSalesEncoded = pd.concat([updatednumerical_features, ohetransform], axis=1)
HouseSalesEncoded.shape

## Define the inputs and outputs

In [None]:
# Features are every encoded column except the target; the target is SalePrice.
target_mask = HouseSalesEncoded.columns.isin(['SalePrice'])
X = HouseSalesEncoded.loc[:, ~target_mask]
y = HouseSalesEncoded['SalePrice']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameter tuning with HalvingRandomSearchCV (successive-halving random search)

In [None]:
# Define base models as (step name, estimator) pairs. The step name is the
# prefix used by the hyperparameter grids (e.g. 'rf__max_depth').
base_models = [
    ('SVR', SVR()),
    ('Dtree', DecisionTreeRegressor(random_state=42)),
    # ('xgb', XGBRegressor()),
    # Seeded like the decision tree so repeated runs give the same forest.
    ('rf', RandomForestRegressor(random_state=42)),
    ('EN', ElasticNet(max_iter=10000)),
]

In [None]:
# Define a grid of hyperparameters to search.
# There must be exactly one grid per entry of base_models, in the same order,
# because the two lists are paired positionally with zip(). Keys use the
# '<step name>__<parameter>' form expected by sklearn pipelines.
grid_params = [
    {
        'SVR__C': [0.1, 1, 10, 100, 1000],
        'SVR__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'SVR__kernel': ['rbf', 'linear'],
    },
    # Grid for the 'Dtree' step; without it every later model was paired with
    # the wrong grid.
    {
        'Dtree__max_depth': [5, 10, 20, None],
        'Dtree__min_samples_split': [2, 5, 10],
        'Dtree__min_samples_leaf': [1, 2, 4],
    },
    # {
    # 'xgb__n_estimators': [10, 10, 10],
    # 'xgb__max_depth': [3, 5, 7]
    # },
    {
        'rf__n_estimators': [50, 100, 200],
        'rf__max_depth': [5, 10, 20],
        'rf__min_samples_split': [2, 5, 10],
        'rf__min_samples_leaf': [1, 2, 4],
        'rf__bootstrap': [True, False],
        'rf__warm_start': [True, False],
    },
    {
        'EN__alpha': [0.1, 1, 10, 0.01],
        'EN__l1_ratio': np.arange(0.40, 1.00, 0.10),
        'EN__tol': [0.0001, 0.001],
    },
]

In [None]:
# Create pipelines for the models
# Each pipeline standardizes the features and then applies one
# (name, estimator) step taken from base_models.
pipelines = []
# NOTE: zip() stops at the shorter of base_models / grid_params, so a model
# without a matching grid gets no pipeline. `params` is not used inside the
# loop, but it stays bound at module level afterwards and code further down
# the notebook may reference it, so the loop target is kept as is.
for model, params in zip(base_models, grid_params):
    pipelines.append(Pipeline([
        ('scaler', StandardScaler()),
        model
    ]))

In [None]:
# cv=3

In [None]:
# Generator pairing each pipeline with its parameter grid
def pipeline_generator():
    """Yield ``(pipeline, params)`` tuples from the module-level lists.

    ``pipelines`` and ``grid_params`` are paired by position; iteration
    stops at the end of the shorter list.
    """
    yield from zip(pipelines, grid_params)

In [None]:
def HalvingRandomSearchPipeline1(pipeline, params,X_train, y_train):
    """Tune every registered pipeline with successive-halving random search.

    For each ``(pipeline, grid)`` pair produced by ``pipeline_generator()``,
    a ``HalvingRandomSearchCV`` is fitted with shuffled 3-fold
    cross-validation, scored by negative mean absolute error. The best
    parameters, best score and refitted best estimator of every search are
    logged to one MLflow run.

    Parameters
    ----------
    pipeline, params
        Kept for backward compatibility and NOT used: the candidates always
        come from ``pipeline_generator()``.
    X_train, y_train
        Training features and target handed to ``fit``; the target is
        flattened to one dimension first.

    Returns
    -------
    tuple
        ``(best_models, best_params_list, training_time)``: the refitted best
        estimator per pipeline, the matching best-parameter dicts (same
        order), and a dict mapping each regressor step name to the seconds
        its own search took.
    """
    # Define the number of folds for cross-validation
    n_splits = 3
    # Shuffled, seeded folds so every pipeline is tuned on the same splits.
    cv = KFold(n_splits=n_splits, random_state=42, shuffle=True)

    # np.ravel accepts both Series and arrays; Series.ravel() is deprecated.
    y_flat = np.ravel(y_train)

    best_models, best_params_list = [], []
    training_time = {}

    with mlflow.start_run():
        # Distinct loop names: the function's own parameters are not shadowed.
        for candidate, candidate_params in pipeline_generator():
            # The regressor is the last step in the pipeline
            regressor_name = candidate.steps[-1][0]

            random_search = HalvingRandomSearchCV(
                estimator=candidate,
                param_distributions=candidate_params,
                factor=3,
                refit=True,
                scoring='neg_mean_absolute_error',
                cv=cv,
                n_jobs=-1,
                random_state=21,
                # verbose=3
            )

            # Time each search on its own rather than cumulatively from the
            # start of the function.
            search_start = time()
            random_search.fit(X_train, y_flat)
            training_time[regressor_name] = time() - search_start

            best_models.append(random_search.best_estimator_)
            best_params_list.append(random_search.best_params_)

            # Log best parameters and score. Parameter keys already carry the
            # step-name prefix; the metric and the model artifact get it too,
            # so results of different regressors do not overwrite each other
            # within the shared run.
            mlflow.log_params(random_search.best_params_)
            mlflow.log_metric(f"{regressor_name}_best_score", random_search.best_score_)
            mlflow.sklearn.log_model(random_search.best_estimator_, f"{regressor_name}_model")

    return best_models, best_params_list, training_time

In [None]:
# Tune every pipeline and collect, per regressor, the best estimator, its best
# parameters and the search time. The full list of grids is passed explicitly
# instead of the `params` variable left over from the pipeline-building loop.
best_models, best_params_list,training_time = HalvingRandomSearchPipeline1(pipelines, grid_params,X_train, y_train)

In [None]:
# Display the tuned pipelines returned by the search.
best_models

In [None]:
# Display the best-parameter dicts returned by the search.
best_params_list

In [None]:
import pprint

In [None]:
# Print the best parameters
print("Best Parameters")
# The very small width makes pprint wrap the output onto separate lines.
pprint.pprint(best_params_list, width=2)

## Show the order of model performance

In [None]:
# Print each tuned pipeline's position in best_models together with its final
# step (the regressor), followed by a blank line.
for idx, model in enumerate(best_models):
    print (idx,model.steps[-1][1])
    print()

In [None]:
# Render every tuned pipeline, in the order they appear in best_models.
for tuned_pipeline in best_models:
    display(tuned_pipeline)

In [None]:
def rank_models(best_models, X_test,y_test):
    """Rank fitted pipelines by mean absolute error, best (lowest) first.

    Each pipeline in *best_models* predicts on *X_test* and is scored against
    *y_test*. Returns a list of ``(regressor_name, mae)`` tuples sorted by
    ascending MAE, where the name is that of the pipeline's last step.
    """
    def _score(fitted):
        # Predict first, then label the score with the final step's name.
        mae = mean_absolute_error(y_test, fitted.predict(X_test))
        return fitted.steps[-1][0], mae

    scored = [_score(fitted) for fitted in best_models]
    scored.sort(key=lambda pair: pair[1])
    return scored

## Predictions

In [None]:
# Make predictions with the best model. best_models is in definition order,
# not performance order, so rank the tuned pipelines by MAE first and use the
# top-ranked one.
best_name = rank_models(best_models, X_test, y_test)[0][0]
best_model = next(m for m in best_models if m.steps[-1][0] == best_name)
y_pred = best_model.predict(X_test)

In [None]:
# Wrap the predictions in a DataFrame so pandas summaries can be used on them.
y_df= pd.DataFrame(y_pred)

In [None]:
# Descriptive stats for the predicted SalePrice values
y_df.describe()