In [1]:
import mlflow
from pyspark.sql import SparkSession

from src.config import ProjectConfig, Tags
from src.airline_reviews.models.basic_model import BasicModel

In [2]:
# Project settings come from the YAML file referenced relative to the working directory.
config = ProjectConfig.from_yaml(config_path="../project_config.yml")

# Reuse the active Spark session if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Revision and branch identifiers passed to the model wrapper.
# NOTE(review): both values are hard-coded; update them when the commit or branch changes.
tags = Tags(git_sha="9e66454", branch="efehan_week2")

In [3]:
# Display the loaded configuration so the parsed values can be checked before training.
config

ProjectConfig(num_features=['seat_comfort', 'cabin_service', 'food_bev', 'entertainment', 'ground_service', 'value_for_money'], cat_features=['traveller_type', 'cabin'], target='recommended', data_path='/Volumes/mlops_dev/efedanis/data/airline_reviews.csv', catalog_name='mlops_dev', schema_name='efedanis', experiment_name_basic='/Shared/airline-reviews-basic', parameters={'learning_rate': 0.01, 'n_estimators': 1000, 'max_depth': 6})

In [4]:
# Build the model wrapper from the project config, the run tags and the Spark session.
basic_model = BasicModel(config=config, tags=tags, spark=spark)

In [5]:
# Load the data, then set up preprocessing; the two calls are meant to run in this order.
# NOTE(review): per the recorded log, load_data reads from Databricks tables and
# prepare_features only defines the preprocessing pipeline -- confirm against BasicModel.
basic_model.load_data()
basic_model.prepare_features()

[32m2025-02-15 14:20:53.786[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mload_data[0m:[36m46[0m - [1m🔄 Loading data from Databricks tables...[0m
[32m2025-02-15 14:20:57.871[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mload_data[0m:[36m56[0m - [1m✅ Data successfully loaded.[0m
[32m2025-02-15 14:20:57.872[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mprepare_features[0m:[36m66[0m - [1m🔄 Defining preprocessing pipeline...[0m
[32m2025-02-15 14:20:57.872[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mprepare_features[0m:[36m74[0m - [1m✅ Preprocessing pipeline defined.[0m


In [6]:
# Fit the model, then log it; the recorded log shows log_model also reporting metrics.
# NOTE(review): the recorded output below reports "Number of positive: 0, number of
# negative: 20000", i.e. LightGBM saw a single class in the training labels, and every
# reported metric is exactly 1.0. This looks like a degenerate fit -- verify the target
# encoding and the train/test split before trusting or registering this model.
# NOTE(review): the log labels the values "Mean Squared Error" / "Mean Absolute Error",
# while the run metadata shown later lists accuracy / f1_score / precision -- the log
# labels and the stored metric names appear inconsistent; confirm in log_model.
basic_model.train()
basic_model.log_model()

[32m2025-02-15 14:20:57.883[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mtrain[0m:[36m80[0m - [1m🚀 Starting training...[0m


[LightGBM] [Info] Number of positive: 0, number of negative: 20000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000189 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 52
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000000 -> initscore=-34.538776
[LightGBM] [Info] Start training from score -34.538776


[32m2025-02-15 14:21:06.170[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mlog_model[0m:[36m99[0m - [1m📊 Mean Squared Error: 1.0[0m
[32m2025-02-15 14:21:06.173[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mlog_model[0m:[36m100[0m - [1m📊 Mean Absolute Error: 1.0[0m
[32m2025-02-15 14:21:06.173[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mlog_model[0m:[36m101[0m - [1m📊 F1 Score: 1.0[0m


In [12]:
# Look up the most recent run logged for this branch in the project's experiment.
# The experiment name and branch are taken from `config` / `tags` (defined in the
# setup cell) instead of being repeated as literals, so they cannot drift apart.
matching_runs = mlflow.search_runs(
    experiment_names=[config.experiment_name_basic],
    filter_string=f"tags.branch='{tags.branch}'",
    # Make "newest first" explicit rather than relying on the default ordering.
    order_by=["attributes.start_time DESC"],
    # Only the newest run is needed; avoid fetching the full history.
    max_results=1,
)

# Fail with a readable message instead of an opaque KeyError when nothing matches.
if matching_runs.empty:
    raise LookupError(
        f"No MLflow run found in experiment '{config.experiment_name_basic}' "
        f"with tags.branch='{tags.branch}'"
    )

# Positional access: does not depend on the DataFrame index labels.
run_id = matching_runs["run_id"].iloc[0]

# Load the pipeline artifact that was logged under this run.
model = mlflow.sklearn.load_model(f"runs:/{run_id}/lightgbm-pipeline-model")

In [13]:
# Fetch the dataset associated with the current run; as the last expression of the
# cell, the returned value is rendered by the notebook.
# NOTE(review): the recorded output shows a table of review rows -- the exact return
# type is defined in BasicModel; confirm there.
basic_model.retrieve_current_run_dataset()


[32m2025-02-15 14:21:40.794[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mretrieve_current_run_dataset[0m:[36m150[0m - [1m✅ Dataset source loaded.[0m


Unnamed: 0,seat_comfort,cabin_service,food_bev,entertainment,ground_service,value_for_money,overall,airline,traveller_type,cabin,recommended,unique_id,update_timestamp_utc
0,,,,,,1,1,Emirates,Family Leisure,Economy Class,0,3384,2025-02-15 09:25:05.414
1,,,,,,1,1,Turkish Airlines,Business,Economy Class,0,327,2025-02-15 09:25:05.414
2,,,,,,1,1,Turkish Airlines,Family Leisure,Economy Class,0,533,2025-02-15 09:25:05.414
3,,,,,,1,1,Turkish Airlines,Family Leisure,Economy Class,0,760,2025-02-15 09:25:05.414
4,,,,,,1,1,Turkish Airlines,Solo Leisure,Economy Class,0,394,2025-02-15 09:25:05.414
5,,,,,,1,2,Emirates,Solo Leisure,Economy Class,0,3039,2025-02-15 09:25:05.414
6,,,,,1.0,1,1,Emirates,Couple Leisure,Economy Class,0,2996,2025-02-15 09:25:05.414
7,,,,,1.0,1,1,Emirates,Solo Leisure,Economy Class,0,3054,2025-02-15 09:25:05.414
8,,,,,1.0,1,1,Emirates,Solo Leisure,Economy Class,0,3508,2025-02-15 09:25:05.414
9,,,,,1.0,1,1,Qatar Airways,Business,Economy Class,0,1801,2025-02-15 09:25:05.414


In [14]:
# Show the metadata of the current run. The recorded output is a pair of dicts:
# metrics first (accuracy, f1_score, precision), then parameters.
basic_model.retrieve_current_run_metadata()


[32m2025-02-15 14:21:43.793[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mretrieve_current_run_metadata[0m:[36m160[0m - [1m✅ Dataset metadata loaded.[0m


({'accuracy': 1.0, 'f1_score': 1.0, 'precision': 1.0},
 {'learning_rate': '0.01',
  'max_depth': '6',
  'model_type': 'LightGBM with preprocessing',
  'n_estimators': '1000'})

In [15]:
# Register the logged model. Per the recorded log this targets "UC" (presumably Unity
# Catalog -- confirm) and creates a new version when the registered model already exists.
basic_model.register_model()

[32m2025-02-15 14:21:43.802[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mregister_model[0m:[36m126[0m - [1m🔄 Registering the model in UC...[0m
Registered model 'mlops_dev.efedanis.airline_reviews_basic' already exists. Creating a new version of this model...
Created version '3' of model 'mlops_dev.efedanis.airline_reviews_basic'.
[32m2025-02-15 14:21:44.336[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mregister_model[0m:[36m132[0m - [1m✅ Model registered as version 3.[0m


In [16]:
# Fully qualified name of the held-out table: <catalog>.<schema>.test_set
test_table_name = f"{config.catalog_name}.{config.schema_name}.test_set"

# Take a small sample of the test table for a smoke-test prediction.
# NOTE(review): limit() without an ordering does not guarantee which rows are returned.
test_set = spark.table(test_table_name).limit(10)

# Drop the target column and convert to pandas before handing the rows to the model.
X_test = test_set.drop(config.target).toPandas()

# NOTE(review): the recorded log says the model is loaded from the MLflow alias
# 'production' -- confirm that alias points at the intended version.
predictions_df = basic_model.load_latest_model_and_predict(X_test)

[32m2025-02-15 14:21:46.611[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mload_latest_model_and_predict[0m:[36m171[0m - [1m🔄 Loading model from MLflow alias 'production'...[0m
[32m2025-02-15 14:21:47.312[0m | [1mINFO    [0m | [36msrc.airline_reviews.models.basic_model[0m:[36mload_latest_model_and_predict[0m:[36m176[0m - [1m✅ Model successfully loaded.[0m
