# AirBnb NY Listing Price Prediction: Register & Serve Models

Useful Pyfunc and Serving API resources:
*  MLflow tutorial on serving multiple models with pyfunc: [link](https://mlflow.org/docs/latest/traditional-ml/serving-multiple-models-with-pyfunc/notebooks/MME_Tutorial.html)
*  Databricks docs on custom models in Model Serving: https://docs.databricks.com/en/machine-learning/model-serving/custom-models.html
*  Databricks docs on querying (scoring) custom model endpoints: https://docs.databricks.com/en/machine-learning/model-serving/score-custom-model-endpoints.html
*  Example notebook showing how to serve and query a custom model: https://docs.databricks.com/en/_extras/notebooks/source/machine-learning/deploy-mlflow-pyfunc-model-serving.html


In [0]:
import mlflow
from mlflow.models import infer_signature
import pandas as pd
import random
import numpy
import json
import os

from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import EndpointCoreConfigInput, ServedModelInput, ServedModelInputWorkloadSize

In [0]:
### Define Global Variables
# Catalog and schema are read from the environment of the notebook/cluster.
catalog_ = os.getenv('CATALOG_NAME')
schema_ = os.getenv('SCHEMA_NAME')

# Fail early with a readable message: os.getenv returns None for an unset
# variable, and concatenating None into the SQL strings below would otherwise
# raise an opaque TypeError.
missing_env_ = [
    env_name_
    for env_name_, env_value_ in (('CATALOG_NAME', catalog_), ('SCHEMA_NAME', schema_))
    if not env_value_
]
if missing_env_:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_)
    )

# Make the catalog/schema the session defaults so unqualified table names resolve there.
spark.sql("USE CATALOG "+catalog_)
spark.sql("USE SCHEMA "+schema_)

SEED_ = 155  # seed passed to randomSplit further down
mlflow.set_registry_uri("databricks-uc")
experiment_name_ = 'Airbnb_NY_Tuning'
Train_tbl_Name = 'airbnb_ny_gold_data'
# Fully qualified (catalog.schema.name) registered model names and the versions to combine.
xgb_model_name = f"{catalog_}.{schema_}.airbnb_ny_XGB"
xgb_model_ver = 2
rf_model_name = f"{catalog_}.{schema_}.airbnb_ny_RF"
rf_model_ver = 2

### Register Final Models on UC

In [0]:
# Register the XGB model logged by the run below under its registry name
XGB_run_id = '56cf865581eb4b729614b529b88c1a9b'
xgb_run_model_uri_ = f"runs:/{XGB_run_id}/model"
mlflow.register_model(model_uri=xgb_run_model_uri_, name=xgb_model_name)

In [0]:
# Register the RF model logged by the run below under its registry name
RF_run_id = '0621a09a2af24659b6f4d20d31796980'
rf_run_model_uri_ = f"runs:/{RF_run_id}/model"
mlflow.register_model(model_uri=rf_run_model_uri_, name=rf_model_name)

### Pyfunc: Create a custom class defining the predict method

In [0]:
class CombiningModels(mlflow.pyfunc.PythonModel):
    """Pyfunc model whose prediction is a weighted sum of other MLflow models.

    Each entry of ``uri_models`` is loaded with ``mlflow.pyfunc.load_model`` and
    its prediction is multiplied by the weight at the same position in
    ``weights``; the weighted predictions are then added together.
    """

    def __init__(self, uri_models, weights):
        """Store the model URIs and their weights.

        Args:
            uri_models: sequence of MLflow model URIs, one per model to combine.
            weights: sequence of numbers (or numeric strings) of the same
                length as ``uri_models``; each is converted with ``float``.

        Raises:
            ValueError: if the number of URIs and weights differ.
        """
        self.uri_models = list(uri_models)
        self.weights = [float(w) for w in weights]
        if len(self.uri_models) != len(self.weights):
            raise ValueError(
                "uri_models and weights must have the same length (got %d and %d)"
                % (len(self.uri_models), len(self.weights))
            )
        self.models = []

    def load_models(self):
        """Load every model in ``self.uri_models`` into ``self.models``.

        The list is rebuilt from scratch, so calling this method more than
        once does not append duplicate models.
        """
        self.models = [mlflow.pyfunc.load_model(uri) for uri in self.uri_models]

    def predict(self, context, input_data):  # context required as part of the API input
        """Return the weighted sum of the underlying models' predictions.

        Args:
            context: pyfunc context supplied by the caller; not used here.
            input_data: data forwarded unchanged to each model's ``predict``.
                Per the original author's note, the serving endpoint receives
                ``json.dumps({'dataframe_split': df.to_dict(orient='split')})``
                and hands it to this method already converted to a DataFrame.

        Returns:
            The sum over models of ``prediction * weight``; its type follows
            whatever the underlying models return.
        """
        # Without loaded models the sum below would silently be 0.0, so load on demand.
        if not self.models:
            self.load_models()

        final_pred = .0
        for i, (model_, weight_) in enumerate(zip(self.models, self.weights)):
            pred_ = model_.predict(input_data)
            # Format element by element: applying "%.2f" to the whole result
            # fails as soon as more than one row is scored.
            pred_txt_ = ", ".join("%.2f" % v for v in numpy.ravel(pred_))
            print("Model %s prediction: %s" % (str(i), pred_txt_))
            final_pred = final_pred + pred_ * weight_
        return final_pred

### Test the Pyfunc Model

In [0]:
## Read the gold table and split it 85/15 into train and test sets
gold_data = spark.sql(f"SELECT * from {Train_tbl_Name}")
spark_train_, spark_test_ = gold_data.randomSplit([.85, .15], seed=SEED_)

# Convert both Spark splits to pandas for the local steps that follow
train_df = spark_train_.toPandas()
test_df = spark_test_.toPandas()

display(train_df.head(2))

In [0]:
## Pick a random datapoint from test set
# randrange excludes its upper bound. randint(0, len(test_df)) includes it and
# could therefore return len(test_df), one past the last valid row position.
rnd_ = random.randrange(len(test_df))

# One-row DataFrame: the columns from position 1 up to (excluding) the last two
xTest = test_df.iloc[[rnd_], 1:-2]
# Positional lookup, so the target comes from the same row as xTest
yTest = test_df['price'].iloc[rnd_]

print("Random point picked:")
display(xTest)

# This is what the API input will look like
xTest_dict = {'dataframe_split': xTest.to_dict(orient='split')}
xTest_api = json.dumps(xTest_dict)
print(xTest_api)

In [0]:
## Prepare class parameters and input data
# Weights are positional: 0.3 goes to the RF model, 0.7 to the XGB model
weights_ = [0.3, 0.7]
uri_models_ = [
    f"models:/{name_}/{ver_}"
    for name_, ver_ in (
        (rf_model_name, rf_model_ver),
        (xgb_model_name, xgb_model_ver),
    )
]
input_data = xTest_api
input_data_df = xTest.copy()

## Build the combined model and load the registered models it wraps
myCustomModel = CombiningModels(uri_models=uri_models_, weights=weights_)
myCustomModel.load_models()

In [0]:
# Perform predictions

# Raw prediction from registered models
RF_loaded_model_ = mlflow.pyfunc.load_model(uri_models_[0])
XGB_loaded_model_ = mlflow.pyfunc.load_model(uri_models_[1])
# xTest holds a single row, so take the first (only) predicted value explicitly
# instead of relying on implicit conversion of a one-element array to a float.
rf_pred_ = float(numpy.ravel(RF_loaded_model_.predict(xTest))[0])
xgb_pred_ = float(numpy.ravel(XGB_loaded_model_.predict(xTest))[0])
print("Original RF model prediction: %.3f" % rf_pred_)
print("Original XGB model prediction: %.3f" % xgb_pred_)

# Custom predictions from Pyfunc, inferring signature
print("\nGenerating prediction via pyfunc:")
customPred = myCustomModel.predict('', xTest)
print("Final pyfunc custom prediction : %.3f" % float(numpy.ravel(customPred)[0]))
# The signature is inferred from the unmodified prediction object
signature_ = infer_signature(xTest, customPred)

# True price value
print("\nTrue price for random point: %.3f" % yTest)

### Register the pyfunc model

In [0]:
## Resolve the MLflow experiment, creating it when it does not exist yet
experiment_path = f'/Users/gabriele.albini@databricks.com/{experiment_name_}'
experiment = mlflow.get_experiment_by_name(experiment_path)

experiment_id = (
    mlflow.create_experiment(name=experiment_path)
    if experiment is None
    else experiment.experiment_id
)

# Turn sklearn autologging off before logging the pyfunc model
mlflow.sklearn.autolog(disable=True)

In [0]:
# Log the pyfunc model in a new run and register it under catalog.schema.name
model_name_ = 'airbnb_ny_pyfunc'
registered_name_ = f"{catalog_}.{schema_}.{model_name_}"
# NOTE(review): no xgboost entry here although one of the combined models is
# named XGB -- confirm the serving environment can load that model.
pyfunc_requirements_ = ["pandas", "numpy", "mlflow==2.11.1", "scikit-learn==1.4.1.post1"]

with mlflow.start_run(experiment_id=experiment_id, run_name="Pyfunc_Model") as run:
    mlflow.pyfunc.log_model(
        model_name_,
        python_model=myCustomModel,
        pip_requirements=pyfunc_requirements_,
        signature=signature_,
        registered_model_name=registered_name_,
    )

### Serve the pyfunc model

In [0]:
## Define variables for the serving endpoint
host = "https://e2-demo-field-eng.cloud.databricks.com/"
serving_endpoint_name = "airbnb_ny_pred"
# host ends with "/", so strip it before joining to avoid "//ml/endpoints" in the URL
serving_endpoint_url = f"{host.rstrip('/')}/ml/endpoints/{serving_endpoint_name}"
# Fully qualified name of the registered pyfunc model and the version to serve
model_path = f"{catalog_}.{schema_}.{model_name_}"
latest_model_ver = 1

In [0]:
## Build the endpoint configuration, then create or update the endpoint
w = WorkspaceClient()

# The single served model: the registered pyfunc model at the chosen version
served_model_ = ServedModelInput(
    model_name=model_path,
    model_version=latest_model_ver,
    workload_size=ServedModelInputWorkloadSize.SMALL,
    scale_to_zero_enabled=True,
    environment_vars={},
)
endpoint_config = EndpointCoreConfigInput(
    name=serving_endpoint_name,
    served_models=[served_model_],
)

# An endpoint exists if any listed endpoint carries the target name
endpoint_exists = any(
    ep.name == serving_endpoint_name for ep in w.serving_endpoints.list()
)

# Update in place when the endpoint is already there, otherwise create it
if endpoint_exists:
    print("Updating the serving endpoint:  %s" % serving_endpoint_url)
    w.serving_endpoints.update_config_and_wait(
        served_models=endpoint_config.served_models,
        name=serving_endpoint_name,
    )
else:
    print("Creating a new serving endpoint: %s" % serving_endpoint_url)
    w.serving_endpoints.create_and_wait(
        name=serving_endpoint_name,
        config=endpoint_config,
    )