# Airbnb NY Listing Price Prediction: Best ML Models

In [0]:
%pip install mlflow
%pip install scikit-learn==1.4.1.post1

In [0]:
#dbutils.library.restartPython()

In [0]:
import pandas as pd
import numpy as np
import mlflow

from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

### Split into Train & Test

In [0]:
# Unity Catalog namespace used by every table/model reference in this notebook.
catalog_ = "price_prediction"
schema_ = "ny_listing"

# Make the namespace the session default so unqualified table names resolve to it.
spark.sql(f"USE CATALOG {catalog_}")
spark.sql(f"USE SCHEMA {schema_}")

In [0]:
# Seed reused for the Spark split below and for the estimators' random_state.
SEED_ = 334

# Read the gold table and split it 85% / 15% with Spark before moving to pandas.
gold_data = spark.sql("SELECT * from gold_data")
train_sdf, test_sdf = gold_data.randomSplit([.85, .15], seed=SEED_)

# Both splits are collected to the driver as pandas DataFrames.
train_df, test_df = train_sdf.toPandas(), test_sdf.toPandas()

display(train_df.head(2))
# display(test_df.head(2))  # uncomment to preview the test split as well

In [0]:
# Convert both splits into NumPy arrays.
# Features: every column except the first and the last (positional slice).
# NOTE(review): presumably the first column is an identifier and the last one is
# `price` — confirm against the gold_data schema, otherwise the target could leak
# into the feature matrix.
xTrain, xTest = (frame.iloc[:, 1:-1].to_numpy() for frame in (train_df, test_df))

# Target: the `price` column, selected by name.
yTrain, yTest = (frame['price'].to_numpy() for frame in (train_df, test_df))

### Train Best Models

In [0]:
### MLflow settings shared by the training cells

# Workspace path of the MLflow experiment that receives the runs.
_mlflow_exp = '/Users/gabriele.albini@databricks.com/ModelServing_mlflow/NY_Price_Listings'

# Turn on MLflow automatic logging for the supported libraries.
mlflow.autolog()

# Run IDs of the best 2-3 models from the tuning phase; their logged parameters
# are read back to rebuild the estimators.
xgb_run_ids = [
  '293bac7a7022458bb317005bbe1d0d56',
  '4c2cf5101b474bd0aaf5d3d75d1a37d8',
  '2f22ee0611c546bfbb206268e63ecfe4',
]

# Metric name -> scikit-learn scoring function called as metric(y_true, y_pred).
metrics_to_log = dict(
  mape=mean_absolute_percentage_error,
  r2=r2_score,
  mse=mean_squared_error,
)

In [0]:
# Function to Create a model with parameters from an MLFlow runID
def _parse_optional_int(raw):
  """Convert an MLflow string parameter to int; empty or 'None' values become None."""
  if raw is None or raw in ('', 'None'):
    return None
  return int(raw)


def _parse_max_features(raw):
  """Convert a logged `max_features` string back to None, an int, a float or a keyword."""
  if raw is None or raw in ('', 'None'):
    return None
  # Try the numeric forms first; names such as 'sqrt' / 'log2' fail both casts
  # and are passed through unchanged.
  for cast in (int, float):
    try:
      return cast(raw)
    except (TypeError, ValueError):
      continue
  return str(raw)


def create_model(model_type, run_id):
  """Build an unfitted estimator from the parameters logged on an MLflow run.

  MLflow returns every logged parameter as a string, so values are converted
  back to the types the estimators expect. In particular a logged `None`
  arrives as the string 'None' and is mapped back to a real None instead of
  being fed to int().

  Args:
    model_type: 'GradientBoostingRegressor' builds a GradientBoostingRegressor;
      any other value builds a RandomForestRegressor.
    run_id: ID of the MLflow run whose `max_depth`, `max_features`,
      `n_estimators` (and, for gradient boosting, `learning_rate`) are read.

  Returns:
    The configured, unfitted estimator with random_state set to SEED_.

  Raises:
    KeyError: if the run has not logged one of the required parameters.
    ValueError: if a logged numeric parameter cannot be parsed.
  """
  client = mlflow.tracking.MlflowClient()
  parameters = client.get_run(run_id).data.params

  if model_type == 'GradientBoostingRegressor':
    # Read first so a missing learning_rate fails before the other parameters are parsed.
    learning_rate = float(parameters['learning_rate'])

  # Hyper-parameters common to both estimator families.
  shared_kwargs = dict(
    max_depth=_parse_optional_int(parameters['max_depth']),
    max_features=_parse_max_features(parameters['max_features']),
    n_estimators=_parse_optional_int(parameters['n_estimators']),
    random_state=SEED_)

  if model_type == 'GradientBoostingRegressor':
    return GradientBoostingRegressor(learning_rate=learning_rate, **shared_kwargs)
  return RandomForestRegressor(**shared_kwargs)

# Function to Train the model, logging within a running parent mlflow run
def train_test_model(model, xTrain, yTrain, xTest, yTest, metrics):
  """Fit `model`, score it on both splits and log everything to a nested MLflow run.

  Must be called while a parent MLflow run is active, because the run opened
  here uses nested=True. Note that a later cell of this notebook redefines
  `train_test_model` with a different signature; re-run this cell before
  calling this version again.

  Args:
    model: unfitted estimator exposing fit(X, y) and predict(X).
    xTrain, yTrain: training features and target.
    xTest, yTest: held-out features and target.
    metrics: mapping of metric name -> callable(y_true, y_pred).

  Returns:
    (fitted model, dict of metric values keyed 'train<name>' / 'test<name>',
    ID of the nested MLflow run).
  """
  # A handful of rows is enough to document the input schema; storing the whole
  # test matrix as the example only bloats the model artifact.
  input_example_rows = 5

  with mlflow.start_run(nested=True) as run:
    trained_ = model.fit(xTrain, yTrain)
    pred_yTrain = trained_.predict(xTrain)
    pred_yTest = trained_.predict(xTest)

    ## ml flow logging
    signature_ = mlflow.models.infer_signature(model_input=xTrain[:10], model_output=pred_yTrain[:10])
    # The logged input is the gold_data table itself, not the in-memory split
    # received as xTrain.
    dataset_train = mlflow.data.load_delta(table_name=f"{catalog_}.{schema_}.gold_data")
    mlflow.log_input(dataset_train, context="training") # Allows to build UC Lineage

    performances_ = {}
    for m_name, _metric in metrics.items():
      performances_['train'+m_name] = _metric(yTrain, pred_yTrain)
      performances_['test'+m_name] = _metric(yTest, pred_yTest)
    # Persist the requested metrics on the run; previously they were only returned.
    mlflow.log_metrics(performances_)

    mlflow.sklearn.log_model(
      sk_model=trained_,
      artifact_path="model",
      signature=signature_,
      input_example=xTest[:input_example_rows])
    run_id = run.info.run_id

  return trained_, performances_, run_id

In [0]:
### Start XGB Best Run
# One parent run named "XGB_best"; each tuned configuration is retrained in its
# own nested child run. The estimator is scikit-learn's GradientBoostingRegressor.
xgb_experiment_id = mlflow.set_experiment(_mlflow_exp).experiment_id

with mlflow.start_run(experiment_id=xgb_experiment_id, run_name="XGB_best") as run:

  # child run id -> (fitted model, metric values)
  xgb_results = {}
  for _id in xgb_run_ids:
    # Rebuild the estimator from the parameters logged on the tuning run.
    xgb_model = create_model('GradientBoostingRegressor', _id)
    xgb_best, xgb_performances, xgb_run_id = train_test_model(
      xgb_model, xTrain, yTrain, xTest, yTest, metrics_to_log)
    xgb_results[xgb_run_id] = (xgb_best, xgb_performances)

print(xgb_results)

### Best Model Train (old)

In [0]:
# Workspace path of the MLflow experiment used by the cells of this (old) section.
_mlflow_exp = '/Users/gabriele.albini@databricks.com/ModelServing_mlflow/NY_Price_Listings'

# Metric name -> scikit-learn scoring function called as metric(y_true, y_pred).
metrics_to_log = dict(
  mape=mean_absolute_percentage_error,
  r2=r2_score,
  mse=mean_squared_error,
)

In [0]:
## Function to train and test a model
# Turn on MLflow automatic logging before the training helper defined below is used.
mlflow.autolog()

def train_test_model(model, xTrain, yTrain, xTest, yTest, metrics, mlflow_experiment, mlflow_run_name):
  """Fit `model`, score it on both splits and log everything to a new MLflow run.

  This definition replaces the earlier six-argument `train_test_model` of this
  notebook once the cell is executed.

  Args:
    model: unfitted estimator exposing fit(X, y) and predict(X).
    xTrain, yTrain: training features and target.
    xTest, yTest: held-out features and target.
    metrics: mapping of metric name -> callable(y_true, y_pred).
    mlflow_experiment: object whose `experiment_id` selects the target experiment.
    mlflow_run_name: name given to the MLflow run.

  Returns:
    (fitted model, dict of metric values keyed 'train<name>' / 'test<name>',
    ID of the MLflow run).
  """
  # A handful of rows is enough to document the input schema; storing the whole
  # test matrix as the example only bloats the model artifact.
  input_example_rows = 5

  with mlflow.start_run(experiment_id=mlflow_experiment.experiment_id, run_name=mlflow_run_name) as run:
    trained_ = model.fit(xTrain, yTrain)
    pred_yTrain = trained_.predict(xTrain)
    pred_yTest = trained_.predict(xTest)

    ## ml flow logging
    signature_ = mlflow.models.infer_signature(model_input=xTrain[:10], model_output=pred_yTrain[:10])
    # The logged input is the gold_data table itself, not the in-memory split
    # received as xTrain.
    dataset_train = mlflow.data.load_delta(table_name=f"{catalog_}.{schema_}.gold_data")
    mlflow.log_input(dataset_train, context="training") # Allows to build UC Lineage

    performances_ = {}
    for m_name, _metric in metrics.items():
      performances_['train'+m_name] = _metric(yTrain, pred_yTrain)
      performances_['test'+m_name] = _metric(yTest, pred_yTest)
    # Persist the requested metrics on the run; previously they were only returned.
    mlflow.log_metrics(performances_)

    mlflow.sklearn.log_model(
      sk_model=trained_,
      artifact_path="model",
      signature=signature_,
      input_example=xTest[:input_example_rows])
    run_id = run.info.run_id

  return trained_, performances_, run_id

In [0]:
## Train and test
# Select the experiment once; the same handle is reused by the XGB cell.
experiment_ = mlflow.set_experiment(_mlflow_exp)

### RF
# Random forest configured with fixed hyper-parameters and the shared seed.
rf_estimator = RandomForestRegressor(
  max_depth=50,
  max_features=None,
  n_estimators=131,
  random_state=SEED_,
)

RF_best, RF_performances, RF_run_id = train_test_model(
  rf_estimator, xTrain, yTrain, xTest, yTest, metrics_to_log, experiment_, "RF_best")

In [0]:
### XGB
# Gradient boosting (scikit-learn's GradientBoostingRegressor) configured with
# fixed hyper-parameters and the shared seed.
gbr_estimator = GradientBoostingRegressor(
  learning_rate=0.9,
  max_depth=50,
  max_features=None,
  n_estimators=844,
  random_state=SEED_,
)

XGB_best, XGB_performances, XGB_run_id = train_test_model(
  gbr_estimator, xTrain, yTrain, xTest, yTest, metrics_to_log, experiment_, "XGB_best")

### Register Final Models on UC

In [0]:
# Register RF Model
# Use the Unity Catalog registry, then register the model artifact logged under
# the RF run as <catalog>.<schema>.<model_name>.
mlflow.set_registry_uri("databricks-uc")

catalog, schema = catalog_, schema_
model_name = "NY_Price_Listings_RF"

mlflow.register_model(
    model_uri=f"runs:/{RF_run_id}/model",
    name=".".join([catalog, schema, model_name]),
)

In [0]:
# Register XGB Model
# Use the Unity Catalog registry, then register the model artifact logged under
# the XGB run as <catalog>.<schema>.<model_name>.
mlflow.set_registry_uri("databricks-uc")

catalog, schema = catalog_, schema_
model_name = "NY_Price_Listings_XGB"

mlflow.register_model(
    model_uri=f"runs:/{XGB_run_id}/model",
    name=".".join([catalog, schema, model_name]),
)