In [1]:
import mlflow
from pyspark.sql import SparkSession

from src.airline_reviews.models.basic_model import BasicModel
from src.config import ProjectConfig, Tags

In [2]:
# Shared setup for the rest of the notebook: project configuration read from
# the YAML one directory up, the active Spark session, and the tags passed to
# BasicModel.
# NOTE(review): git_sha and branch are hard-coded literals; they must be
# updated by hand whenever the checkout changes.
config = ProjectConfig.from_yaml(config_path="../project_config.yml")
spark = SparkSession.builder.getOrCreate()
tags = Tags(git_sha="9e66454", branch="efehan_week2")

In [3]:
# Point MLflow tracking at "databricks" and the model registry at
# "databricks-uc". Per the error text recorded under cell 6, the plain
# "databricks" tracking URI uses the 'DEFAULT' authentication profile (or the
# DATABRICKS_HOST / DATABRICKS_TOKEN environment variables); use
# "databricks://{profile}" to select a different profile.
mlflow.set_tracking_uri("databricks")
mlflow.set_registry_uri("databricks-uc")

In [4]:
# Build the model wrapper from the config, tags and Spark session created in
# cell 2; every later cell drives the workflow through this object.
basic_model = BasicModel(config=config, tags=tags, spark=spark)

In [5]:
# Load the data from the Databricks tables, then define the preprocessing
# pipeline (both steps are confirmed by the log lines recorded below).
# prepare_features() is called after load_data(); keep this order.
basic_model.load_data()
basic_model.prepare_features()

[32m2025-02-15 15:52:10.580[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mload_data[0m:[36m43[0m - [1m🔄 Loading data from Databricks tables...[0m
[32m2025-02-15 15:52:15.539[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mload_data[0m:[36m53[0m - [1m✅ Data successfully loaded.[0m
[32m2025-02-15 15:52:15.539[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mprepare_features[0m:[36m63[0m - [1m🔄 Defining preprocessing pipeline...[0m
[32m2025-02-15 15:52:15.539[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mprepare_features[0m:[36m71[0m - [1m✅ Preprocessing pipeline defined.[0m


In [6]:
# Train the model, then log it (presumably to the MLflow tracking server
# configured in cell 3 — confirm in BasicModel.log_model).
# NOTE(review): the saved output reports "Number of positive: 0, number of
# negative: 20000", i.e. the training target contains a single class. Check the
# label encoding / target column before trusting this model.
# NOTE(review): the saved output ends with an MlflowException about Databricks
# credentials, so this cell did not complete. Configure authentication and
# re-run before executing the cells below.
basic_model.train()
basic_model.log_model()

[32m2025-02-15 15:52:15.543[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mtrain[0m:[36m77[0m - [1m🚀 Starting training...[0m


[LightGBM] [Info] Number of positive: 0, number of negative: 20000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000155 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 52
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000000 -> initscore=-34.538776
[LightGBM] [Info] Start training from score -34.538776


MlflowException: Reading databricks credential configuration failed with MLflow tracking URI 'databricks', Please ensure that you installed 'databricks-sdk' library, set correct tracking URI and set up databricks authentication configuration correctly. The available tracking URI can be either 'databricks' (using 'DEFAULT' authentication profile) or 'databricks://{profile}'. To set up databricks authentication configuration, you can set environmental variables DATABRICKS_HOST + DATABRICKS_TOKEN, or set environmental variables DATABRICKS_HOST + DATABRICKS_CLIENT_ID + DATABRICKS_CLIENT_SECRET, or you can edit '~/.databrickscfg' file to set host + token or host + client_id + client_secret for specific profile section, or you can log in by command 'databricks auth login' which configures an authentication profile in '~/.databrickscfg' with auth_type of 'databricks-cli'.
For details of these authentication types, please refer to document 'https://docs.databricks.com/en/dev-tools/auth/index.html#unified-auth'.

In [7]:
# Find the most recent run logged from this branch and load its model.
#
# mlflow.search_runs returns a pandas DataFrame. We request only the newest
# matching run (explicit ordering, one row) and fail with a clear message when
# nothing matches, instead of the opaque `KeyError: 0` that `.run_id[0]` raises
# on an empty frame (e.g. when the train/log cell above did not complete).
# The branch in the filter must match the `branch` tag set in cell 2.
branch_runs = mlflow.search_runs(
    experiment_names=["/Shared/airline-reviews-basic"],
    filter_string="tags.branch='efehan_week2'",
    order_by=["start_time DESC"],
    max_results=1,
)
if branch_runs.empty:
    raise RuntimeError(
        "No MLflow run found in '/Shared/airline-reviews-basic' with "
        "tags.branch='efehan_week2'; run the train/log cell successfully first."
    )

# Positional access: independent of whatever index labels the frame carries.
run_id = branch_runs["run_id"].iloc[0]

model = mlflow.sklearn.load_model(f"runs:/{run_id}/lightgbm-pipeline-model")

In [None]:
# Presumably fetches the dataset recorded with the current MLflow run — the
# implementation is not visible here; confirm in BasicModel.
# Not yet executed in the saved notebook (no execution count).
basic_model.retrieve_current_run_dataset()

In [None]:
# Presumably fetches metadata (e.g. params/metrics) of the current MLflow run —
# the implementation is not visible here; confirm in BasicModel.
# Not yet executed in the saved notebook (no execution count).
basic_model.retrieve_current_run_metadata()

In [None]:
# Register the logged model; presumably this targets the "databricks-uc"
# registry URI set in cell 3 — confirm in BasicModel.register_model.
# Not yet executed in the saved notebook (no execution count).
basic_model.register_model()

In [None]:
# Smoke-test the latest model: take 10 rows of the test table, strip the target
# column, convert to pandas and run them through the model.
test_table_name = f"{config.catalog_name}.{config.schema_name}.test_set"
test_set = spark.table(test_table_name).limit(10)

features_only = test_set.drop(config.target)
X_test = features_only.toPandas()

predictions_df = basic_model.load_latest_model_and_predict(X_test)