### House Price Prediction with MLflow

Outcomes of this project: <br>

-> Run hyperparameter tuning while training a model <br>
-> Log every hyperparameter and metric in the MLflow UI <br>
-> Compare the results of the various runs in the MLflow UI <br>
-> Choose the best run and register it as a model

In [2]:
import pandas as pd 
import mlflow 
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV # using grid search cv for hyper parameter tuning 
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing # calling the data set 

# Load the California housing dataset that ships with scikit-learn.
housing = fetch_california_housing()
# Print the raw object to inspect its fields (data, target, feature_names, DESCR, ...).
print(housing)


{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]], shape=(20640, 8)), 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894], shape=(20640,)), 'frame': None, 'target_names': ['MedHouseVal'], 'feature_names': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'], 'DESCR': '.. _california_housing_dataset

In [3]:
## preparing the dataset

# Build the feature frame and attach the target as a trailing "Price" column
# in a single expression, so re-running the cell always starts from the raw data.
df = pd.DataFrame(data=housing.data, columns=housing.feature_names).assign(
    Price=housing.target
)
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [4]:
df.isna().any()  # per column: True if at least one value is missing

MedInc        False
HouseAge      False
AveRooms      False
AveBedrms     False
Population    False
AveOccup      False
Latitude      False
Longitude     False
Price         False
dtype: bool

### Train/Test Split, Model Hyperparameter Tuning, MLflow Experiments

In [5]:
from urllib.parse import urlparse # will come in handy for mlflow
### Independent and dependent features

# Features: every column except the target.
x = df.drop(columns=["Price"])
# Target: the "Price" column.
y = df["Price"]

print(x)
print(y)

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Longitude  
0        -122.23  
1

In [6]:
# hyperparameter tuning using GridSearchCV

def hyperparameter_tuning(x_train, y_train, param_grid, random_state=None):
    """Grid-search a RandomForestRegressor over ``param_grid`` and return the fitted search.

    Parameters
    ----------
    x_train : training features passed to ``GridSearchCV.fit``.
    y_train : training targets passed to ``GridSearchCV.fit``.
    param_grid : dict mapping RandomForestRegressor parameter names to the
        lists of values to try.
    random_state : optional seed forwarded to the RandomForestRegressor.
        The default (``None``) keeps the forest unseeded, as before; pass an
        int to make the fitted trees repeatable between runs.

    Returns
    -------
    The fitted ``GridSearchCV`` object. Candidates are scored with
    ``neg_mean_squared_error`` over 3 folds, using all available cores.
    """
    forest = RandomForestRegressor(random_state=random_state)
    grid_search = GridSearchCV(
        estimator=forest,
        param_grid=param_grid,
        cv=3,
        n_jobs=-1,
        verbose=1,
        scoring="neg_mean_squared_error",
    )
    grid_search.fit(x_train, y_train)
    return grid_search

In [7]:
## splitting data into training and test sets

from mlflow.models import infer_signature  # derives the model input/output schema from example data

# Seed the split so the train/test partition is the same on every execution.
# (The forest itself is still unseeded here, so metrics can vary slightly.)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

signature = infer_signature(x_train, y_train)

## define the hyper parameter grid

param_grid = {
    'n_estimators': [100, 200],   # number of trees in the forest
    'max_depth': [5, 10, None],   # maximum depth of each tree; None lets trees grow without a depth limit (higher risk of overfitting)
    'min_samples_split': [2, 5],  # minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2],   # minimum number of samples required at a leaf node
}

### tracking url
# Point MLflow at the tracking server BEFORE opening the run, so the run is
# created on that server rather than on whatever URI was active earlier.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
# The URI scheme is "file" for a local file store; anything else is treated as
# a remote tracking server below.
tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

## Start the MLFlow experiments

# All logging happens inside the `with` block so that the parameters, the metric
# and the model are attached to this one run, which is closed on exit.
with mlflow.start_run():
    ## perform hyper parameter tuning
    grid_search = hyperparameter_tuning(x_train, y_train, param_grid)

    # get the best model
    best_model = grid_search.best_estimator_

    # evaluate the best model on the held-out test set
    y_pred = best_model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)

    ## log best parameters and metrics
    # Produces best_n_estimators, best_max_depth, best_min_samples_split
    # and best_min_samples_leaf.
    mlflow.log_params({f"best_{name}": value for name, value in grid_search.best_params_.items()})
    mlflow.log_metric("mse", mse)  # logging the error metric

    # Log the model with its signature in both cases; additionally register it
    # when the tracking store is not a local file store.
    if tracking_url_type_store != 'file':
        mlflow.sklearn.log_model(
            best_model,
            "model",
            registered_model_name="Best Random Forest Model",
            signature=signature,
        )
    else:
        mlflow.sklearn.log_model(best_model, "model", signature=signature)

print(f"Best Hyperparameters: {grid_search.best_params_}" )
print(f"Mean Squared Error: {mse}")

Fitting 3 folds for each of 24 candidates, totalling 72 fits


Successfully registered model 'Best Random Forest Model'.
2025/10/21 15:36:24 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best Random Forest Model, version 1


Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Mean Squared Error: 0.23172976605288542


Created version '1' of model 'Best Random Forest Model'.
