In [None]:
import os
from dotenv import load_dotenv
from hydra import compose, initialize
import pandas as pd
from hdb_resale import data, model, sql

import mlflow
from mlflow.models import infer_signature

In [None]:
# Load variables from a .env file into the process environment
load_dotenv()

# PostgreSQL connection settings.
# os.getenv returns None for any variable that is not set.
POSTGRESQL_DASH_USER = os.getenv("POSTGRESQL_DASH_USER")
POSTGRESQL_DASH_PASSWORD = os.getenv("POSTGRESQL_DASH_PASSWORD")
POSTGRESQL_DASH_DATABASE = os.getenv("POSTGRESQL_DASH_DATABASE")
POSTGRESQL_HOST = os.getenv("POSTGRESQL_HOST")
POSTGRESQL_PORT = os.getenv("POSTGRESQL_PORT")

# MLflow tracking server credentials.
# NOTE(review): these two names are not referenced by the other cells visible
# here; presumably MLflow reads the credentials straight from the environment
# once load_dotenv() has run - confirm.
MLFLOW_TRACKING_USERNAME = os.getenv("MLFLOW_TRACKING_USERNAME")
MLFLOW_TRACKING_PASSWORD = os.getenv("MLFLOW_TRACKING_PASSWORD")

In [None]:
# Name and location of the Hydra config to load
config_name = "hdb_resale_config"
config_directory = "../../airflow_prd/dags/conf"

# Compose the config inside a Hydra initialisation context
with initialize(config_path=config_directory, version_base=None):
    cfg = compose(config_name=config_name)

In [None]:
# Connection settings read from the environment in an earlier cell
db_settings = {
    "postgresql_dash_user": POSTGRESQL_DASH_USER,
    "postgresql_dash_password": POSTGRESQL_DASH_PASSWORD,
    "postgresql_dash_database": POSTGRESQL_DASH_DATABASE,
    "postgresql_host": POSTGRESQL_HOST,
    "postgresql_port": POSTGRESQL_PORT,
}
engine, metadata = sql.setup_database(**db_settings)

# Fetch the feature matrix X and target y used for training
X, y = data.get_training_data(metadata=metadata, engine=engine, cfg=cfg)


In [None]:
# Setup ML model

# Model hyperparameters (the same dict is logged to MLflow further down)
params = dict(random_state=cfg.model.random_state)

# Build the ensemble regressor from those hyperparameters
resale_model = model.MultiTreeEnsembleRegressor(**params)

# Run model training; the joblib path is handed to train_model as out_dir
model.train_model(
    X=X,
    y=y,
    model=resale_model,
    out_dir="../../airflow_prd/models/multreeens_model.joblib",
)

In [None]:
# Seed shared by the cross-validation and diagnostic-plot steps, kept in one
# place so the two calls cannot drift apart.
# NOTE(review): this differs from cfg.model.random_state used for the model
# itself - confirm whether a separate evaluation seed is intentional.
EVAL_RANDOM_STATE = 6

# Get error metrics from cross validations
cv_res = model.get_cv_error(
    X=X, y=y, model=resale_model, random_state=EVAL_RANDOM_STATE
)
# Optional export (define `out_dir` first - it is not set anywhere above):
# cv_res.to_csv(f"{out_dir}/cv_metric_results.csv", index=False)

# Get actual fitted model sizes
# For performance considerations
size = model.get_model_size(model=resale_model)
# Optional export (requires `out_dir`):
# size.to_csv(f"{out_dir}/model_sizes.csv", index=False)

# Get various diagnostic plots - both pre and post predictions
# NOTE Slowest section in terms of computation
diag_fig = model.get_diag_plot(
    X=X, y=y, model=resale_model, random_state=EVAL_RANDOM_STATE
)
# Optional export (requires `out_dir`):
# diag_fig.savefig(f"{out_dir}/model_diagnostics.png")

In [None]:
# Point the MLflow client at the tracking server used for logging
mlflow.set_tracking_uri("https://cheeyeelim.com/mlflow/")

# Experiment-level tags; the database URL is rendered with its password hidden
data_source = engine.url.render_as_string(hide_password=True)
tags = {
    "description": "Model that predicts the price of HDB resale flats",
    "data_source": data_source,
}

# Select the MLflow experiment (created if it does not exist yet),
# then attach the tags to it
mlflow.set_experiment("HDB Resale Price")
mlflow.set_experiment_tags(tags)

In [None]:
model_name = "hdb-resale-price"

# Start an MLflow run; everything logged inside the block is attached to it
with mlflow.start_run():
    # Infer the model signature from the training inputs and the model's
    # predictions on those same inputs
    signature = infer_signature(X, resale_model.predict(X))

    # Log the model.
    # NOTE Passing registered_model_name also registers the logged model in
    # the MLflow Model Registry under `model_name`. The next cell relies on
    # this when it loads "models:/<model_name>/latest". Drop the argument if
    # registration should instead happen as a separate, later step.
    model_info = mlflow.sklearn.log_model(
        sk_model=resale_model,
        artifact_path="hdb_resale",
        signature=signature,
        input_example=X.head(20),
        registered_model_name=model_name,
    )

    # Log the hyperparameters
    mlflow.log_params(params)

    # Log the loss metrics
    # NOTE log_metrics accepts a dictionary of metrics; cv_res.mean() reduces
    # the cross-validation results to a single value per metric
    mlflow.log_metrics(cv_res.mean().to_dict())

    # Log the diagnostic plots
    mlflow.log_figure(diag_fig, "model_diagnostics.png")

In [None]:
# Fetch the latest registered version of the model from the MLflow registry
model_uri = f"models:/{model_name}/latest"
loaded_model = mlflow.sklearn.load_model(model_uri=model_uri)

# Smoke test: predict for the first training row and display the result
sample_row = X.head(1)
predictions = loaded_model.predict(sample_row)

predictions