# Airbnb NY Listing Price Prediction: Best ML Models

In [0]:
%pip install mlflow
%pip install scikit-learn==1.4.1.post1
# NOTE(review): scikit-learn is pinned but mlflow is not — consider pinning mlflow
# too so that re-running the notebook later resolves the same versions.

In [0]:
#dbutils.library.restartPython()

In [0]:
import pandas as pd
import numpy as np
import mlflow

from sklearn.metrics import make_scorer, mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

### Split into Train & Test

In [0]:
# Catalog and schema that hold the `gold_data` table; later cells also use
# these two names to build fully-qualified table and model names.
catalog_ = "price_prediction"
schema_ = "ny_listing"

# Make them the session defaults so unqualified table names resolve there.
spark.sql(f"USE CATALOG {catalog_}")
spark.sql(f"USE SCHEMA {schema_}")

In [0]:
# Seed shared by the train/test split and by the estimators' `random_state`.
SEED_ = 155

gold_data = spark.sql("SELECT * from gold_data")

# 85% / 15% random split done in Spark, then each part is collected to pandas.
# NOTE(review): a fixed seed only gives a repeatable split if the underlying
# DataFrame's partitioning/order is stable between runs — confirm for gold_data.
train_df, test_df = (
    part.toPandas()
    for part in gold_data.randomSplit([0.85, 0.15], seed=SEED_)
)

display(train_df.head(2))

In [0]:
# Build NumPy inputs for scikit-learn.
# Features are selected by position: every column except the first and the last.
# The target is selected by name: the `price` column.
# NOTE(review): presumably the first column is an identifier and `price` is the
# last column — confirm against the gold_data schema, otherwise the target
# would be included among the features.
xTrain, yTrain = train_df.iloc[:, 1:-1].to_numpy(), train_df['price'].to_numpy()
xTest, yTest = test_df.iloc[:, 1:-1].to_numpy(), test_df['price'].to_numpy()

### Best Model Training

In [0]:
# Workspace path of the MLflow experiment that receives the training runs
# (handed to `mlflow.set_experiment` in the training cell below).
# NOTE(review): this is hard-coded to one user's personal folder — consider a
# shared or configurable location so other users can run the notebook.
_mlflow_exp = '/Users/gabriele.albini@databricks.com/ModelServing_mlflow/NY_Price_Listings'

In [0]:
## Function to train and test a model
def train_test_model(model, xTrain, yTrain, xTest, yTest, metric, mlflow_experiment, mlflow_run_name):
  """Fit `model`, score it on the test split and log everything to one MLflow run.

  Args:
    model: Unfitted estimator exposing `fit` and `predict` (scikit-learn style).
    xTrain: Training feature matrix.
    yTrain: Training targets.
    xTest: Test feature matrix, used for scoring and for the logged input example.
    yTest: Test targets.
    metric: Callable invoked as `metric(yTest, predictions)`; its result is
      logged as "OBJECTIVE_METRIC".
    mlflow_experiment: Object with an `experiment_id` attribute (for example
      the value returned by `mlflow.set_experiment`).
    mlflow_run_name: Name given to the MLflow run.

  Returns:
    Tuple `(fitted_model, performance, run_id)`.

  Note:
    Reads the notebook-level globals `catalog_` and `schema_` to locate the
    `gold_data` table that is logged as the run's training dataset.
  """
  # The model is logged explicitly below with a signature and input example,
  # so autolog's own model logging is switched off; otherwise the same model
  # would be logged twice under the same "model" artifact path.
  mlflow.sklearn.autolog(log_models=False)
  with mlflow.start_run(experiment_id=mlflow_experiment.experiment_id, run_name=mlflow_run_name) as run:
    trained_ = model.fit(xTrain, yTrain)
    pred_yTest = trained_.predict(xTest)
    performance = metric(yTest, pred_yTest)
    ## ml flow logging
    # Only a small sample is needed to infer the input/output schema, so
    # predict on that sample instead of the whole training set.
    signature_sample = xTrain[:10]
    signature_ = mlflow.models.infer_signature(
      model_input=signature_sample,
      model_output=trained_.predict(signature_sample),
    )
    dataset_train = mlflow.data.load_delta(table_name=f"{catalog_}.{schema_}.gold_data")
    mlflow.log_input(dataset_train, context="training") # Allows to build UC Lineage
    mlflow.log_metric("OBJECTIVE_METRIC", performance)
    # Log a handful of rows as the input example, not the entire test set,
    # to keep the model artifact small.
    mlflow.sklearn.log_model(
      sk_model=trained_,
      artifact_path="model",
      signature=signature_,
      input_example=xTest[:5],
    )
    run_id = run.info.run_id

  return trained_, performance, run_id

In [0]:
## Train and test
# Select the MLflow experiment once; the next training cell reuses `experiment_`.
experiment_ = mlflow.set_experiment(_mlflow_exp)

### RF
# Random Forest with fixed hyper-parameters, logged as run "RF_best".
RF_best, RF_best_performance, RF_run_id = train_test_model(
  model=RandomForestRegressor(
    n_estimators=131,
    max_depth=50,
    max_features=None,
    random_state=SEED_,
  ),
  xTrain=xTrain,
  yTrain=yTrain,
  xTest=xTest,
  yTest=yTest,
  metric=mean_absolute_percentage_error,
  mlflow_experiment=experiment_,
  mlflow_run_name="RF_best",
)
print("Random Forest, Mean absolute percentage error on test set: %.4f" % RF_best_performance)

In [0]:
### XGB
# NOTE: despite the "XGB" naming, this trains scikit-learn's
# GradientBoostingRegressor, not the XGBoost library. The variable names and
# the run name "XGB_best" are kept because later cells reference them.
XGB_best, XGB_best_performance, XGB_run_id = train_test_model(
  model=GradientBoostingRegressor(
    learning_rate=0.9,
    n_estimators=844,
    max_depth=50,
    max_features=None,
    random_state=SEED_,
  ),
  xTrain=xTrain,
  yTrain=yTrain,
  xTest=xTest,
  yTest=yTest,
  metric=mean_absolute_percentage_error,
  mlflow_experiment=experiment_,
  mlflow_run_name="XGB_best",
)
# Report the estimator that was actually trained.
print("Gradient Boosting Regressor, Mean absolute percentage error on test set: %.4f" % XGB_best_performance)

In [0]:
# Register the model logged by the Random Forest run under
# <catalog>.<schema>.NY_Price_Listings_RF, using the "databricks-uc" registry.
mlflow.set_registry_uri("databricks-uc")
catalog, schema, model_name = catalog_, schema_, "NY_Price_Listings_RF"
mlflow.register_model(
    name=".".join((catalog, schema, model_name)),
    model_uri=f"runs:/{RF_run_id}/model",
)

In [0]:
# Register the model logged by the "XGB_best" run under
# <catalog>.<schema>.NY_Price_Listings_XGB, using the "databricks-uc" registry.
mlflow.set_registry_uri("databricks-uc")
catalog, schema, model_name = catalog_, schema_, "NY_Price_Listings_XGB"
mlflow.register_model(
    name=".".join((catalog, schema, model_name)),
    model_uri=f"runs:/{XGB_run_id}/model",
)