In [10]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing
# import dagshub

In [11]:
# dagshub.init(repo_owner='eryash15', repo_name='mlflow-mlops', mlflow=True)

In [12]:
# Load the California housing dataset through scikit-learn.
housing = fetch_california_housing()

# Features as a labelled DataFrame (one column per feature name);
# the target is kept exactly as the dataset object provides it.
data = pd.DataFrame(data=housing.data, columns=housing.feature_names)
target = housing.target

# Quick sanity check: first five feature rows, then the target values.
print(data.head())
print(target)

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  
[4.526 3.585 3.521 ... 0.923 0.847 0.894]


### Train/Test Split, Model Hyperparameter Tuning, MLflow Experiments

In [13]:
from urllib.parse import urlparse
## Independent and Dependent Features
X = data
y = target
## train test split: 80% train / 20% test.
## The seed is fixed so that re-running the notebook yields the same split,
## which keeps the MSE logged to MLflow comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [14]:
## HPO tuning using GridSearchCV

def hyperparameter_tuning(X_train, y_train, param_grid, cv=3,
                          scoring="neg_mean_squared_error", random_state=None):
    """Grid-search a RandomForestRegressor over ``param_grid``.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed unchanged to
        ``GridSearchCV.fit``.
    param_grid : dict
        Mapping of RandomForestRegressor parameter names to the list of
        candidate values to try.
    cv : int, default 3
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : str, default "neg_mean_squared_error"
        Metric used to rank the candidates.
    random_state : int or None, default None
        Seed for the forest. ``None`` keeps the previous unseeded
        behaviour; pass an integer for repeatable searches.

    Returns
    -------
    GridSearchCV
        The fitted search object (callers read ``best_estimator_`` and
        ``best_params_`` from it).
    """
    rf = RandomForestRegressor(random_state=random_state)
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,      # -1: let joblib run the fits in parallel
        verbose=10,
        scoring=scoring,
    )
    grid_search.fit(X_train, y_train)
    return grid_search

In [15]:
from mlflow.models import infer_signature
# Input/output schema to store alongside the logged model.
signature = infer_signature(X_train, y_train)

# Define HPO grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# Select the tracking server *before* opening the run, so the run, its
# params/metrics and the model artifact all land in the same store even on a
# freshly started kernel.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

with mlflow.start_run() as run:
    print("Run ID:", run.info.run_id)

    ## perform hyperparameter tuning
    gridsearch = hyperparameter_tuning(X_train, y_train, param_grid)
    best_model = gridsearch.best_estimator_

    # Evaluate the best model on the held-out split
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    ## log best parameters and metrics
    # One call; keys are "best_<param>" for every entry of the grid
    # (best_n_estimators, best_max_depth, best_min_samples_split,
    # best_min_samples_leaf).
    mlflow.log_params(
        {f"best_{name}": value for name, value in gridsearch.best_params_.items()}
    )
    mlflow.log_metric("mse", mse)

    # The signature is attached in every case; the model is additionally
    # registered only when the tracking URI is not a plain file store.
    log_model_kwargs = {"signature": signature}
    if tracking_url_type_store != 'file':
        log_model_kwargs["registered_model_name"] = "Best Randomforest Model"
    mlflow.sklearn.log_model(best_model, "model", **log_model_kwargs)

Run ID: 85e4d1c235f4444f89711f34ddc6c5cf
Fitting 3 folds for each of 24 candidates, totalling 72 fits


Successfully registered model 'Best Randomforest Model'.
2025/03/17 01:03:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best Randomforest Model, version 1
Created version '1' of model 'Best Randomforest Model'.


🏃 View run polite-panda-584 at: http://127.0.0.1:5000/#/experiments/0/runs/85e4d1c235f4444f89711f34ddc6c5cf
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0
