In [1]:
import mlflow
from databricks.connect import DatabricksSession
from airbnb_listing.config import Config, Tags, config
from airbnb_listing.models.basic_model import BasicModel

In [2]:
# Track experiments in the Databricks workspace and use Unity Catalog
# ("databricks-uc") as the model registry.
mlflow.set_tracking_uri("databricks")
mlflow.set_registry_uri("databricks-uc")

In [3]:
# Get the active Databricks Connect session, creating one if none exists.
spark = DatabricksSession.builder.getOrCreate()

In [4]:
# Unvalidated tag values describing the code version behind this run
tags_dict = {
    "git_sha": "abcd12345",
    "branch": "week2",
}
# Pass the raw values through the Tags model to get validated tags
tags = Tags(**tags_dict)
# Display one field to confirm the object was built
tags.git_sha

'abcd12345'

In [5]:
# Build the model wrapper from the loaded project config, the run tags and
# the Spark session (the config object itself is passed, not a path).
basic_model = BasicModel(config=config, tags=tags, spark=spark)

In [6]:
# Load the data from the silver layer (per the method's log output).
basic_model.load_data()


2025-02-16 21:52:35,049 - airbnb_listing - INFO - 🔄 Loading data from silver layer...
INFO:airbnb_listing:🔄 Loading data from silver layer...
INFO:airbnb_listing:🔄 Loading data from silver layer...
2025-02-16 21:52:37,265 - airbnb_listing - INFO - ✅ Data loaded successfully.
INFO:airbnb_listing:✅ Data loaded successfully.


In [7]:
# Define the train pipeline; the model itself is fitted afterwards by train().
basic_model.prepare_features()


2025-02-16 21:52:37,274 - airbnb_listing - INFO - 🔄 Defining train pipeline...
INFO:airbnb_listing:🔄 Defining train pipeline...
2025-02-16 21:52:37,279 - airbnb_listing - INFO - ✅ Train pipeline defined.
INFO:airbnb_listing:✅ Train pipeline defined.
INFO:airbnb_listing:🔄 Defining train pipeline...
2025-02-16 21:52:37,279 - airbnb_listing - INFO - ✅ Train pipeline defined.
INFO:airbnb_listing:✅ Train pipeline defined.


In [8]:
# Train the model, then log it to MLflow (the cell output links to the
# resulting MLflow run and experiment).
basic_model.train()
basic_model.log_model()

2025-02-16 21:52:37,290 - airbnb_listing - INFO - 🚂 Training the model...
INFO:airbnb_listing:🚂 Training the model...


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 27277, number of used features: 123
[LightGBM] [Info] Start training from score 4.739467


2025-02-16 21:52:46,107 - airbnb_listing - INFO - 📊 Mean Squared Error: 0.18983959383516283
INFO:airbnb_listing:📊 Mean Squared Error: 0.18983959383516283
2025-02-16 21:52:46,108 - airbnb_listing - INFO - 📊 Mean Absolute Error: 0.31339085484799456
INFO:airbnb_listing:📊 Mean Absolute Error: 0.31339085484799456
INFO:airbnb_listing:📊 Mean Squared Error: 0.18983959383516283
2025-02-16 21:52:46,108 - airbnb_listing - INFO - 📊 Mean Absolute Error: 0.31339085484799456
INFO:airbnb_listing:📊 Mean Absolute Error: 0.31339085484799456
2025/02/16 21:52:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run vaunted-lamb-595 at: https://adb-2972378804555913.13.azuredatabricks.net/ml/experiments/2094072416575048/runs/bf8986fd6474461dab8edafa8788d53a.
2025/02/16 21:52:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://adb-2972378804555913.13.azuredatabricks.net/ml/experiments/2094072416575048.


In [9]:
# Get the id of the most recent run on this branch.
# NOTE(review): the experiment path is hard-coded to one user's workspace
# folder; consider moving it into the project config so the notebook runs
# for other users.
experiment_name = "/Users/henryhfung4_gmail.com#ext#@henryhfung4gmail.onmicrosoft.com/airbnb_listing_price_basic"

# Filter on the same branch tag that was used to build `tags`, ask explicitly
# for the newest run first, and fetch only that single row.
matching_runs = mlflow.search_runs(
    experiment_names=[experiment_name],
    filter_string=f"tags.branch='{tags.branch}'",
    order_by=["attributes.start_time DESC"],
    max_results=1,
)

# search_runs returns an empty DataFrame when nothing matches; fail with a
# clear message instead of an opaque index lookup error.
if matching_runs.empty:
    raise RuntimeError(
        f"No MLflow run tagged branch='{tags.branch}' found in experiment {experiment_name!r}"
    )

run_id = matching_runs["run_id"].iloc[0]
run_id

'bf8986fd6474461dab8edafa8788d53a'

In [10]:
# Rebuild the logged scikit-learn model from the artifacts of the run
# identified by `run_id`.
model_uri = f"runs:/{run_id}/lightgbm-pipeline-model"
model = mlflow.sklearn.load_model(model_uri)

In [11]:
# Fetch the dataset attached to the current experiment run and preview
# its first five rows.
current_run_dataset = basic_model.retrieve_current_run_dataset()
display(current_run_dataset.limit(5))

2025-02-16 21:52:55,886 - airbnb_listing - INFO - ✅ Dataset source loaded.
INFO:airbnb_listing:✅ Dataset source loaded.


Unnamed: 0,id,is_manhattan,neighbourhood,room_type,minimum_nights,latitude,longitude,estimated_listed_months,availability_365,number_of_reviews,calculated_host_listings_count,last_review,log_price,update_timestamp_utc
0,18570075,False,Clinton Hill,Entire home/apt,2.0,40.68908,-73.96429,26.130653,361.0,52.0,2.0,2019-06-09,5.857933,2025-02-12 04:05:47.873
1,26717498,False,Gowanus,Entire home/apt,9.0,40.66873,-73.99263,,0.0,0.0,1.0,NaT,4.025352,2025-02-12 04:05:47.873
2,18523182,False,Bedford-Stuyvesant,Private room,1.0,40.68473,-73.94799,26.153846,0.0,17.0,1.0,2019-05-26,4.077537,2025-02-12 04:05:47.873
3,12303877,True,Hell's Kitchen,Private room,3.0,40.76446,-73.99175,,0.0,0.0,1.0,NaT,4.394449,2025-02-12 04:05:47.873
4,31115149,False,Bedford-Stuyvesant,Private room,2.0,40.69032,-73.92437,,10.0,0.0,1.0,NaT,4.094345,2025-02-12 04:05:47.873


In [12]:
# Fetch the current run's metadata; the displayed result is a pair of
# dicts: (metrics, params).
basic_model.retrieve_current_run_metadata()

2025-02-16 21:52:57,628 - airbnb_listing - INFO - ✅ Dataset metadata loaded.
INFO:airbnb_listing:✅ Dataset metadata loaded.


({'mae': 0.31339085484799456, 'mse': 0.18983959383516283},
 {'learning_rate': '0.01',
  'max_depth': '7',
  'model_type': 'LGBMRegressor with preprocessing',
  'n_estimators': '400',
  'num_leaves': '70'})

In [13]:
# Register the model in the Unity Catalog Model Registry.
# If the registered model already exists, a new version of it is created.
basic_model.register_model()

2025-02-16 21:52:57,637 - airbnb_listing - INFO - 📦 Registering model in Unity Catalog...
INFO:airbnb_listing:📦 Registering model in Unity Catalog...
INFO:airbnb_listing:📦 Registering model in Unity Catalog...
Registered model 'dev.airbnb_listing_ml_assets.airbnb_listing_price_model_basic' already exists. Creating a new version of this model...
Created version '3' of model 'dev.airbnb_listing_ml_assets.airbnb_listing_price_model_basic'.
2025-02-16 21:53:01,598 - airbnb_listing - INFO - ✅ Model registered as version 3
INFO:airbnb_listing:✅ Model registered as version 3


In [14]:
# Read the held-out test table from the dev catalog's silver schema; it is
# the input for inference with the registered model.
test_table_name = (
    f"{config.general.DEV_CATALOG}.{config.general.SILVER_SCHEMA}.airbnb_listing_price_test"
)
test_set = spark.table(test_table_name)

In [15]:
# Take a ten-row feature sample (target column removed) and bring it to
# pandas for local inference.
# NOTE(review): limit() without an explicit ordering does not guarantee
# which rows come back, so the sample may differ between runs.
X_test = (
    test_set
    .drop(config.model.TARGET)
    .limit(10)
    .toPandas()
)
X_test

Unnamed: 0,id,is_manhattan,neighbourhood,room_type,minimum_nights,latitude,longitude,estimated_listed_months,availability_365,number_of_reviews,calculated_host_listings_count,last_review,update_timestamp_utc
0,23075778,True,East Village,Entire home/apt,2.0,40.72629,-73.98417,16.806723,2.0,20.0,1.0,2019-06-16,2025-02-12 04:05:57.704
1,29513000,True,Gramercy,Entire home/apt,5.0,40.7367,-73.98985,8.333333,145.0,28.0,1.0,2019-06-19,2025-02-12 04:05:57.704
2,1886240,True,SoHo,Entire home/apt,1.0,40.72617,-74.00141,,0.0,0.0,1.0,NaT,2025-02-12 04:05:57.704
3,19970764,True,Lower East Side,Entire home/apt,1.0,40.72337,-73.99057,4.761905,352.0,3.0,3.0,2019-05-05,2025-02-12 04:05:57.704
4,30388011,True,Theater District,Entire home/apt,14.0,40.75988,-73.98568,,341.0,0.0,232.0,NaT,2025-02-12 04:05:57.704
5,18190894,False,Bushwick,Private room,1.0,40.69745,-73.93038,27.027027,89.0,20.0,2.0,2019-06-29,2025-02-12 04:05:57.704
6,549873,True,East Village,Entire home/apt,2.0,40.72564,-73.98252,55.555556,0.0,5.0,1.0,2015-05-25,2025-02-12 04:05:57.704
7,28387362,False,Elmhurst,Private room,2.0,40.72946,-73.88054,10.0,324.0,4.0,2.0,2018-10-01,2025-02-12 04:05:57.704
8,23797329,False,Greenpoint,Shared room,14.0,40.72104,-73.93985,15.789474,365.0,3.0,10.0,2018-08-01,2025-02-12 04:05:57.704
9,2134052,True,Chelsea,Entire home/apt,3.0,40.74193,-74.0017,63.888889,0.0,23.0,1.0,2015-12-06,2025-02-12 04:05:57.704


In [16]:
# Predict with the model registered under the 'latest-model' alias.
# NOTE: despite the `_df` suffix, the displayed result is a NumPy array,
# not a DataFrame.
predictions_df = basic_model.load_latest_model_and_predict(X_test)

2025-02-16 21:53:02,486 - airbnb_listing - INFO - 🔄 Loading model from MLflow alias 'latest-model'...
INFO:airbnb_listing:🔄 Loading model from MLflow alias 'latest-model'...
INFO:airbnb_listing:🔄 Loading model from MLflow alias 'latest-model'...
2025-02-16 21:53:03,539 - airbnb_listing - INFO - ✅ Model successfully loaded.
INFO:airbnb_listing:✅ Model successfully loaded.


In [17]:
# Show the predictions. TARGET was dropped from X_test and `log_price` is the
# column missing from it, so these are presumably log-price values — confirm.
predictions_df

array([5.14894246, 5.59164465, 5.51052038, 5.71519618, 5.55721253,
       4.21121894, 5.17172854, 4.13208999, 3.56902393, 5.31133469])