In [None]:
!python --version

In [5]:
import os
import pickle
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.feature_extraction import DictVectorizer

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

import mlflow
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

In [6]:
# Location of the MLflow tracking server; both parts can be overridden through
# environment variables.
TRACKING_SERVER_HOST = os.getenv("TRACKING_SERVER_HOST", "localhost")
TRACKING_SERVER_PORT = os.getenv("TRACKING_SERVER_PORT", "5000")
# Name of the MLflow experiment that collects the hyper-parameter search runs.
EXPERIMENT_NAME = os.getenv("EXPERIMENT_NAME", "house-price-prediction")
# Name under which the best model is registered. It is read from its own
# environment variable: reading EXPERIMENT_NAME here would make the registered
# model silently take the experiment's name whenever that variable is set.
MODEL_REGISTRY_NAME = os.getenv("MODEL_REGISTRY_NAME", "house-price-prediction-model")
# Relative path to the CSV file with the raw rental listings.
DATA_PATH = "../data/House_Rent_Dataset.csv"

In [7]:
def load_data(filename, max_rent=3000000, max_bathrooms=10):
    """Load the rental listings, engineer the floor features and drop outliers.

    The raw ``Floor`` column holds text such as ``"1 out of 3"``,
    ``"Ground out of 2"``, ``"Upper Basement out of 4"`` or a single token such
    as ``"Ground"``. It is turned into two integer columns:

    * ``Floor``: the first whitespace-separated token, with ``Ground`` -> 0,
      ``Lower`` -> -1 and ``Upper`` -> that row's ``Total Floors`` value.
    * ``Total Floors``: the last token, with ``Ground`` -> 1.

    Parameters
    ----------
    filename : str or file-like
        Passed straight to ``pandas.read_csv`` (comma separated).
    max_rent : int, default 3000000
        Rows with ``Rent`` greater than or equal to this value are dropped.
    max_bathrooms : int, default 10
        Rows with ``Bathroom`` greater than or equal to this value are dropped.

    Returns
    -------
    pandas.DataFrame
        The listings with integer ``Floor`` / ``Total Floors`` columns and
        without the outlier rows. The original row index is kept.

    Raises
    ------
    ValueError
        If a floor token cannot be converted to an integer.
    """
    df = pd.read_csv(filename, sep=',')

    # Feature engineering of the column 'Floor'.
    # Everything is computed on fresh Series and assigned back afterwards: an
    # in-place replace on ``df['Floor']`` is silently lost when pandas
    # copy-on-write is active, and a positional ``df.at[i, ...]`` loop only
    # works with a default RangeIndex.
    tokens = df['Floor'].str.split()
    total_floors = tokens.str[-1].replace({'Ground': '1'}).astype(int)
    floor = tokens.str[0].replace({'Ground': '0', 'Lower': '-1'})
    # 'Upper' rows take the row's total number of floors as their floor value.
    floor = floor.where(floor != 'Upper', total_floors.astype(str)).astype(int)

    df['Total Floors'] = total_floors
    df['Floor'] = floor

    # Remove outliers
    keep = (
        (df['Rent'] < max_rent)
        & (df['Bathroom'] < max_bathrooms)
        & ~df['Area Type'].str.contains("Built Area")
        & ~df['Point of Contact'].str.contains("Contact Builder")
    )

    return df[keep]

In [8]:
def split_data(df):
    """Split the listings into stratified training and validation frames.

    20% of the rows go to the validation frame. The shuffle is seeded with 42
    and stratified on the combination of the ``City``, ``Furnishing Status``,
    ``Area Type`` and ``Point of Contact`` columns.

    Returns
    -------
    tuple
        ``(df_train, df_val)``.
    """
    strata = df[['City', 'Furnishing Status', 'Area Type', 'Point of Contact']]
    train_part, val_part = train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=strata,
    )
    return train_part, val_part

In [9]:
def prepare_data(df_train, df_val):
    """Vectorise the feature columns and extract the ``Rent`` target.

    A dense ``DictVectorizer`` is fitted on the training rows only and then
    applied to the validation rows.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val, dv)`` where ``dv`` is the fitted
        ``DictVectorizer``.
    """
    target = 'Rent'
    features = ['BHK', 'Size', 'Area Type', 'City', 'Furnishing Status',
                'Tenant Preferred', 'Bathroom', 'Floor', 'Total Floors', 'Point of Contact']
    columns = ['BHK', 'Rent', 'Size', 'Area Type', 'City', 'Furnishing Status',
               'Tenant Preferred', 'Bathroom', 'Floor', 'Total Floors', 'Point of Contact']

    # Narrow both frames to the columns that are actually used.
    train_frame = df_train[columns]
    val_frame = df_val[columns]

    def as_records(frame):
        # One {column: value} dict per row, as expected by DictVectorizer.
        return frame[features].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(as_records(train_frame))
    X_val = dv.transform(as_records(val_frame))

    return X_train, X_val, train_frame[target].values, val_frame[target].values, dv

In [None]:
def train_model_xgboost_search(train, valid, y_val, dv, sc, max_evals=200):
    """Run a hyperopt TPE search over XGBoost hyper-parameters, logged to MLflow.

    Every trial trains a booster inside its own MLflow run, logs the validation
    ``rmse`` and ``r2_score``, and attaches the pickled ``(dv, sc)``
    preprocessor under the ``preprocessor`` artifact path.

    Parameters
    ----------
    train, valid : xgboost.DMatrix
        Training and validation matrices passed to ``xgb.train``.
    y_val : array-like
        True target values for ``valid``, used to compute the metrics.
    dv, sc : object
        Fitted preprocessing objects; pickled together as a ``(dv, sc)`` tuple.
    max_evals : int, default 200
        Number of hyperopt trials to run.

    Returns
    -------
    None
    """
    mlflow.xgboost.autolog()

    # The preprocessor is the same for every trial, so it is written once here
    # and only attached inside each run. The target directory is created first
    # because ``open`` does not create missing directories.
    preprocessor_path = "preprocessor/preprocessor.b"
    os.makedirs(os.path.dirname(preprocessor_path), exist_ok=True)
    with open(preprocessor_path, "wb") as f_out:
        pickle.dump((dv, sc), f_out)

    def objective(params):
        with mlflow.start_run():
            mlflow.set_tag("model", "xgboost")
            mlflow.log_param("train_data", DATA_PATH)
            booster = xgb.train(
                params=params,
                dtrain=train,
                num_boost_round=100,
                evals=[(valid, 'validation')],
                early_stopping_rounds=50
            )
            y_pred = booster.predict(valid)
            # RMSE via an explicit square root: the ``squared=False`` keyword
            # of mean_squared_error is deprecated in recent scikit-learn.
            rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
            r2score = r2_score(y_val, y_pred)
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("r2_score", r2score)

            mlflow.log_artifact(preprocessor_path, artifact_path="preprocessor")

        return {'loss': rmse, 'status': STATUS_OK}

    search_space = {
        'max_depth': scope.int(hp.uniform('max_depth', 1, 60)),
        # hp.loguniform takes its bounds in log space, so the intended
        # [0.01, 0.4] range has to be passed as logarithms; raw 0.01 / 0.4
        # would sample learning rates of roughly 1.01-1.49.
        'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.4)),
        'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
        'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
        'min_child_weight': hp.loguniform('min_child_weight', -1, 9),
        'objective': 'reg:squarederror'
    }

    rstate = np.random.default_rng(42)  # for reproducible results
    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=Trials(),
        rstate=rstate
    )
    return

In [None]:
def train_model_rf_search(X_train, X_val, y_train, y_val, dv, sc, max_evals=200):
    """Run a hyperopt TPE search over RandomForestRegressor hyper-parameters.

    Every trial fits a forest inside its own MLflow run, logs the validation
    ``rmse`` and ``r2_score``, and attaches the pickled ``(dv, sc)``
    preprocessor under the ``preprocessor`` artifact path.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and targets passed to ``RandomForestRegressor.fit``.
    X_val, y_val : array-like
        Validation features and targets used to compute the metrics.
    dv, sc : object
        Fitted preprocessing objects; pickled together as a ``(dv, sc)`` tuple.
    max_evals : int, default 200
        Number of hyperopt trials to run.

    Returns
    -------
    None
    """
    mlflow.sklearn.autolog()

    # The preprocessor is the same for every trial, so it is written once here
    # and only attached inside each run. The target directory is created first
    # because ``open`` does not create missing directories.
    preprocessor_path = "preprocessor/preprocessor.b"
    os.makedirs(os.path.dirname(preprocessor_path), exist_ok=True)
    with open(preprocessor_path, "wb") as f_out:
        pickle.dump((dv, sc), f_out)

    def objective(params):
        with mlflow.start_run():
            mlflow.set_tag("model", "rf")
            mlflow.log_param("train_data", DATA_PATH)
            rf_model = RandomForestRegressor(**params)
            rf_model.fit(X_train, y_train)
            y_pred = rf_model.predict(X_val)
            # RMSE via an explicit square root: the ``squared=False`` keyword
            # of mean_squared_error is deprecated in recent scikit-learn.
            rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
            r2score = r2_score(y_val, y_pred)
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("r2_score", r2score)

            mlflow.log_artifact(preprocessor_path, artifact_path="preprocessor")

        return {'loss': rmse, 'status': STATUS_OK}

    search_space = {
        'n_estimators' : scope.int(hp.uniform('n_estimators',10,150)),
        'max_depth' : scope.int(hp.uniform('max_depth',1,40)),
        'min_samples_leaf' : scope.int(hp.uniform('min_samples_leaf',1,10)),
        'min_samples_split' : scope.int(hp.uniform('min_samples_split',2,10)),
        'random_state' : 42
    }

    rstate = np.random.default_rng(42)  # for reproducible results
    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=Trials(),
        rstate=rstate
    )
    return

In [4]:
def register_best_model():
    """Register the lowest-RMSE run's model and promote it to Production.

    Looks up the experiment named ``EXPERIMENT_NAME``, picks the active run
    with the smallest ``rmse`` metric, registers its ``model`` artifact under
    ``MODEL_REGISTRY_NAME``, transitions the new version to the "Production"
    stage (archiving existing versions) and updates the version and
    registered-model descriptions.

    Raises
    ------
    ValueError
        If the experiment does not exist or contains no active runs.
    """
    client = MlflowClient()
    experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is None:
        raise ValueError(f"MLflow experiment '{EXPERIMENT_NAME}' does not exist")

    runs = client.search_runs(
        # search_runs is documented to take a list of experiment IDs.
        experiment_ids=[experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.rmse ASC"]
    )
    if not runs:
        raise ValueError(f"MLflow experiment '{EXPERIMENT_NAME}' has no active runs to register")
    best_run = runs[0]

    # register the best model
    run_id = best_run.info.run_id
    model_uri = f"runs:/{run_id}/model"
    model_rmse = best_run.data.metrics['rmse']
    model_details = mlflow.register_model(model_uri=model_uri, name=MODEL_REGISTRY_NAME)

    date = datetime.today().date()

    # transition of our best model in "Production"
    # NOTE(review): model stages appear to be deprecated in newer MLflow
    # releases — confirm against the installed version.
    client.transition_model_version_stage(
        name=model_details.name,
        version=model_details.version,
        stage="Production",
        archive_existing_versions=True
    )

    client.update_model_version(
        name=model_details.name,
        version=model_details.version,
        description=f"The model version {model_details.version} was transitioned to Production on {date}"
    )

    client.update_registered_model(
      name=model_details.name,
      description=f"Current model version in production: {model_details.version}, rmse: {model_rmse}"
    )

In [None]:
# Point MLflow at the tracking server built from the host/port constants and
# select the experiment used for the runs started below.
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_experiment(EXPERIMENT_NAME)

In [None]:
# Load and clean the listings, split them into train/validation frames and
# vectorise the features.
df = load_data(DATA_PATH)
df_train, df_val = split_data(df)
X_train, X_val, y_train, y_val, dv = prepare_data(df_train, df_val)
# Standardise the features: the scaler is fitted on the training matrix only
# and then applied to the validation matrix.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)

In [None]:
# Wrap the scaled feature matrices and their targets in DMatrix objects for
# the XGBoost search.
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [None]:
# Run both hyper-parameter searches; each trial is logged as its own MLflow run.
train_model_xgboost_search(train, valid, y_val, dv, sc)
train_model_rf_search(X_train, X_val, y_train, y_val, dv, sc)

In [None]:
# Register the run with the lowest logged rmse and move it to the Production stage.
register_best_model()

#### Interacting with the Model Registry

In [None]:
# Client bound explicitly to the tracking server built from the host/port constants.
client = MlflowClient(tracking_uri=f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [None]:
# Look up the experiment and fetch its single best active run, i.e. the one
# with the lowest logged rmse.
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
best_run = client.search_runs(
    # search_runs is documented to take a list of experiment IDs.
    experiment_ids=[experiment.experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.rmse ASC"]
)[0]

In [None]:
# ID of the best run, used below to address its artifacts.
run_id = best_run.info.run_id

In [None]:
# Download the best run's pickled model artifact into the current directory.
# NOTE(review): 'model/model.pkl' presumably only exists when the best run
# logged a scikit-learn model — confirm what an XGBoost run stores.
# NOTE(review): MlflowClient.download_artifacts appears to be removed in
# MLflow 2.x (mlflow.artifacts.download_artifacts) — confirm installed version.
client.download_artifacts(run_id=run_id, path='model/model.pkl', dst_path='.')

In [None]:
# Show the experiments known to the tracking server.
# NOTE(review): list_experiments appears to be removed in MLflow 2.x in favour
# of search_experiments — confirm against the installed version.
client.list_experiments()

In [None]:
# Show the models present in the model registry.
# NOTE(review): list_registered_models appears to be removed in MLflow 2.x in
# favour of search_registered_models — confirm against the installed version.
client.list_registered_models()

In [10]:
# Registry URI for the model; the "latest" suffix presumably resolves to the
# newest registered version — confirm against the MLflow model URI docs.
model_uri = f"models:/{MODEL_REGISTRY_NAME}/latest"

In [None]:
# Load the registered model through MLflow's generic pyfunc interface (PyFuncModel).
loaded_model = mlflow.pyfunc.load_model(model_uri)

In [None]:
# Example listing whose rent is predicted below; the keys match the feature
# columns vectorised in prepare_data.
house_data = {
    'BHK': 3,
    'Size' : 1000,
    'Area Type' : 'Carpet Area',
    'City' : 'Mumbai',
    'Furnishing Status' : 'Furnished',
    'Tenant Preferred' : 'Bachelors/Family',
    'Bathroom' : 2,
    'Floor' : 5,
    'Total Floors' : 15,
    'Point of Contact' : 'Contact Agent'
}

In [None]:
# Reload the (DictVectorizer, StandardScaler) pair written during training.
# NOTE: pickle.load executes arbitrary code from the file; only load a
# preprocessor file produced by this notebook / a trusted source.
with open("preprocessor/preprocessor.b", "rb") as f_in:
    dv, sc = pickle.load(f_in)

In [None]:
# Apply the same preprocessing order as in training (vectorise, then scale)
# before asking the model for a prediction.
X = dv.transform(house_data)
X = sc.transform(X)
price_prediction = loaded_model.predict(X)

In [12]:
# Builtin int() replaces np.int, which was only an alias for it and is no
# longer available in recent NumPy releases.
print(f"The predicted price for renting this house is {int(price_prediction[0])}")