In [1]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
import mlflow
import time
from datetime import datetime, timezone

# import warnings
# warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
import sys
sys.path.append("/Users/gabriel/Documents/Git/End-to-end MLOps for Time Series")
from utils import load_config
from dotenv import dotenv_values

env_vars = dotenv_values("../.env")
config = load_config("../config/development/pipeline.yaml")

# Read the Data

In [3]:
from lgbm_hpo import load_data

X_train, y_train = load_data("../data/processed/consumption_train.csv")
X_test, y_test = load_data("../data/processed/consumption_test.csv")

X shape : (757248, 20)
y shape : (757248,)
X shape : (186732, 20)
y shape : (186732,)


# Setting up the MLFlow experiment

From a terminal, run:
```
mlflow server --host 127.0.0.1 --port 5000
```

In [4]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [7]:
# Set the current active MLflow experiment
experiment_name = f"Enefit DataV1 {config['model_name']} HpoConfigV{config['hpo_config_version']}"
try:
    mlflow.set_experiment(experiment_name=experiment_name)
except mlflow.exceptions.MlflowException:
    print("MLflow server not running. Please start the server using this command:")
    print("mlflow server --host 127.0.0.1 --port 5000")

MLflow server not running. Please start the server using this command:
mlflow server --host 127.0.0.1 --port 5000


In [6]:
experiment = mlflow.get_experiment_by_name(experiment_name)
print(f"ID for experiment '{experiment_name}': {experiment.experiment_id}")

ID for experiment 'Enefit DataV1 lightgbm HpoConfigV1': 215806994480999889


# Log the study's best model

In [7]:
from lgbm_hpo import get_study
config['studies_dir'] = "../optuna_studies"
study = get_study(config)

loading sampler from ../optuna_studies/datav1_lightgbm_config1_sampler.pkl


[I 2025-04-24 17:55:27,030] Using an existing study with name 'datav1_lightgbm_config1' instead of creating a new one.


In [None]:
# run metadata
utc_datetime = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SUTC")
run_name = f"TrialNumber{study.best_trial.number}-{utc_datetime}"

# training
params = study.best_params
model = LGBMRegressor(**params, random_state=config['random_state'])
start = time.time()
model.fit(X_train, y_train)
end = time.time()

# metrics
y_fit = model.predict(X_train)
y_pred = model.predict(X_test)
train_mae = mean_absolute_error(y_train, y_fit)
test_mae = mean_absolute_error(y_test, y_pred)
training_duration = end - start
metrics = {"train_mae": train_mae, "test_mae": test_mae, "training_duration": training_duration}

with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params(params)

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use
    mlflow.lightgbm.log_model(
        lgb_model=model, input_example=X_train.iloc[:1], artifact_path=config['model_name']
    )

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3189
[LightGBM] [Info] Number of data points in the train set: 757248, number of used features: 19
[LightGBM] [Info] Start training from score 463.770859


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/04/24 17:55:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run TrialNumber35 at: http://127.0.0.1:5000/#/experiments/215806994480999889/runs/dc0e887215074f5f86d2765ced1e8094.
2025/04/24 17:55:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/215806994480999889.


