In [None]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
import mlflow
import optuna
import os
import sys
# Make the project-level modules (`utils`, `preprocessing`) importable.
# The location can be overridden with the PROJECT_ROOT environment variable so the
# notebook is not tied to a single machine; the original path stays the default.
project_root = os.environ.get(
    "PROJECT_ROOT", "/Users/gabriel/Documents/Git/End-to-end MLOps for Time Series"
)
# Guard against piling up duplicate entries when the cell is re-run.
if project_root not in sys.path:
    sys.path.append(project_root)
from utils import train_test_split
from preprocessing.preprocessing import *
import time
from datetime import datetime, timezone

# import warnings
# warnings.filterwarnings("ignore", category=UserWarning)

# Global variables

In [2]:
# --- Experiment identity -------------------------------------------------
model_name = "lightgbm"
preprocessing_version = 2  # selects the `preprocessing_<version>` function
config_version = 1  # part of the Optuna study name and of the MLflow run name

# --- Forecasting setup ---------------------------------------------------
forecast_horizon = 48
n_lags = 48

# --- Locations (relative to the notebook's working directory) ------------
config_dir_path = "../config"
study_path = "../optuna_studies"

# --- Reproducibility -----------------------------------------------------
random_state = 0

In [3]:
# forecast_horizon = 24
# n_lags = 48
# test_window = 24*60 # in hours
# preprocessing_version = 1 # preprocessing version
# study_version = 1 # Optuna study to take hyperparameters from
# random_state = 0

# Read the Data

In [4]:
# Load the consumption records and convert the `datetime` column from text to
# pandas datetimes in a single chained expression.
df = pd.read_csv("../data/consumption.csv").assign(
    datetime=lambda frame: pd.to_datetime(frame["datetime"])
)
df.head()

Unnamed: 0,datetime,prediction_unit_id,consumption
0,2021-09-01,0,96.59
1,2021-09-01,1,17.314
2,2021-09-01,2,656.859
3,2021-09-01,3,59.0
4,2021-09-01,4,501.76


In [5]:
# Dimensions of the loaded frame as (rows, columns), shown as the cell output.
df.shape

(1009176, 3)

# Train/Test split

In [6]:
# Hold out the last 60 days of each unit as the test set; the window is
# expressed as 60 days * 24 steps per day.
test_window_hours = 60 * 24
df_train, df_test = train_test_split(df, test_window=test_window_hours)

In [7]:
# Sanity checks on the split: the row counts of both parts must add up to the
# full frame, and both parts must keep the same number of columns as the input.
assert len(df_train) + len(df_test) == len(df)
assert df_train.shape[1] == df.shape[1] and df_test.shape[1] == df.shape[1]

In [8]:
# Fraction of all rows that ended up in the test split.
test_size = len(df_test) / len(df)
print(f"test set : {round(test_size*100, 2)}% of the data set")

test set : 9.85% of the data set


# Preprocessing

In [9]:
print(f"Using preprocessing version {preprocessing_version}")
# Resolve the versioned preprocessing function by name from the notebook's
# global namespace (populated by the star import of the preprocessing module).
preprocessing_fn_name = f"preprocessing_{preprocessing_version}"
preprocessing = globals()[preprocessing_fn_name]

Using preprocessing version 2


In [10]:
# Turn the training split into a feature matrix and a target vector, then
# report both shapes before previewing the first feature rows.
X_train, y_train = preprocessing(df_train)
print(
    f"X_train shape : {X_train.shape}",
    f"y_train shape : {y_train.shape}",
    sep="\n",
)
X_train.head()

  df[feat_name] = feat_vals[restore_idxs]
  df[feat_name] = feat_vals[restore_idxs]


X_train shape : (854079, 99)
y_train shape : (854079,)


Unnamed: 0,lag48,lag49,lag50,lag51,lag52,lag53,lag54,lag55,lag56,lag57,...,rolling_mean_lag68_window_size24,expanding_mean_lag69,rolling_mean_lag69_window_size24,expanding_mean_lag70,rolling_mean_lag70_window_size24,expanding_mean_lag71,rolling_mean_lag71_window_size24,month,dayofweek,hour
5795,120.54,134.986,150.412,152.763,136.13,121.033,80.621,43.428,46.84,43.671,...,82.505417,82.612111,81.8605,81.756654,81.308,81.1456,80.502083,9,5,23
5856,107.129,120.54,134.986,150.412,152.763,136.13,121.033,80.621,43.428,46.84,...,83.241833,83.355714,82.505417,82.612111,81.8605,81.756654,81.308,9,6,0
5917,81.92,107.129,120.54,134.986,150.412,152.763,136.13,121.033,80.621,43.428,...,83.893958,84.131655,83.241833,83.355714,82.505417,82.612111,81.8605,9,6,1
5978,96.193,81.92,107.129,120.54,134.986,150.412,152.763,136.13,121.033,80.621,...,84.539375,84.841667,83.893958,84.131655,83.241833,83.355714,82.505417,9,6,2
6039,94.536,96.193,81.92,107.129,120.54,134.986,150.412,152.763,136.13,121.033,...,84.552333,85.716806,84.539375,84.841667,83.893958,84.131655,83.241833,9,6,3


In [11]:
# Apply the same preprocessing to the test split and report the resulting
# shapes before previewing the first feature rows.
X_test, y_test = preprocessing(df_test)
print(
    f"X_test shape : {X_test.shape}",
    f"y_test shape : {y_test.shape}",
    sep="\n",
)
X_test.head()

X_test shape : (84735, 99)
y_test shape : (84735,)


  df[feat_name] = feat_vals[restore_idxs]
  df[feat_name] = feat_vals[restore_idxs]


Unnamed: 0,lag48,lag49,lag50,lag51,lag52,lag53,lag54,lag55,lag56,lag57,...,rolling_mean_lag68_window_size24,expanding_mean_lag69,rolling_mean_lag69_window_size24,expanding_mean_lag70,rolling_mean_lag70_window_size24,expanding_mean_lag71,rolling_mean_lag71_window_size24,month,dayofweek,hour
920263,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,246.753,193.661,160.944,...,722.741458,747.325111,723.395542,740.683269,723.535208,734.97888,725.296292,4,2,23
920328,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,246.753,193.661,...,723.69675,753.981429,722.741458,747.325111,723.395542,740.683269,723.535208,4,3,0
920393,999.628,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,246.753,...,724.803458,761.656966,723.69675,753.981429,722.741458,747.325111,723.395542,4,3,1
920458,1001.917,999.628,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,...,726.249667,769.130867,724.803458,761.656966,723.69675,753.981429,722.741458,4,3,2
920523,1014.902,1001.917,999.628,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,...,731.313625,778.516258,726.249667,769.130867,724.803458,761.656966,723.69675,4,3,3


# Setting up the MLflow experiment

Before running the cells below, start a local MLflow tracking server from a terminal:
```
mlflow server --host 127.0.0.1 --port 5000
```

In [12]:
# Point the MLflow client at the tracking server. The MLFLOW_TRACKING_URI
# environment variable takes precedence so the notebook can log to another
# server without being edited; the local server remains the default.
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000")
)

In [15]:
# Activate the MLflow experiment for this model / preprocessing combination.
# The call's return value is the cell output.
experiment_name = f"Enefit {model_name} Preprocessing {preprocessing_version}"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/882091402080986748', creation_time=1740711820379, experiment_id='882091402080986748', last_update_time=1740711820379, lifecycle_stage='active', name='Enefit lightgbm Preprocessing 2', tags={}>

In [16]:
# Look the experiment up again by name to report its ID.
experiment = mlflow.get_experiment_by_name(name=experiment_name)
print(f"ID for experiment '{experiment_name}': {experiment.experiment_id}")

ID for experiment 'Enefit lightgbm Preprocessing 2': 882091402080986748


# Log the study's best model

In [17]:
# The study is identified by model, preprocessing version and config version,
# and is stored in a SQLite file of the same name under `study_path`.
study_name = (
    f"{model_name}_preprocessing{preprocessing_version}_config{config_version}"
)
storage_path = "sqlite:///{}/{}.db".format(study_path, study_name)
print(f"Study name : {study_name}")
# This notebook only reads an already-tuned study, so load it instead of
# calling create_study(load_if_exists=True): a wrong name or path would
# otherwise silently create an empty study and only fail later when
# `study.best_params` is read. load_study fails right here instead.
study = optuna.load_study(study_name=study_name, storage=storage_path)

Study name : lightgbm_preprocessing2_config1


[I 2025-03-12 11:20:00,420] Using an existing study with name 'lightgbm_preprocessing2_config1' instead of creating a new one.


In [21]:
# run metadata: a UTC timestamp keeps run names distinct across executions
utc_datetime = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SUTC")
run_name = f"{model_name}-config{config_version}-{utc_datetime}"

# training: fit on the training set with the study's best hyperparameters
params = study.best_params
model = LGBMRegressor(**params, random_state=random_state)
# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments (unlike time.time()).
start = time.perf_counter()
model.fit(X_train, y_train)
end = time.perf_counter()

# metrics: MAE on both splits plus the fit duration in seconds
y_fit = model.predict(X_train)
y_pred = model.predict(X_test)
train_mae = mean_absolute_error(y_train, y_fit)
test_mae = mean_absolute_error(y_test, y_pred)
training_duration = end - start
metrics = {"train_mae": train_mae, "test_mae": test_mae, "training_duration": training_duration}

with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit, including the seed that was
    # passed to the model, so the run can be reproduced from its logged params
    mlflow.log_params({**params, "random_state": random_state})

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use
    mlflow.lightgbm.log_model(
        lgb_model=model, input_example=X_train.iloc[:1], artifact_path=model_name
    )



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019388 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24524
[LightGBM] [Info] Number of data points in the train set: 854079, number of used features: 99
[LightGBM] [Info] Start training from score 485.532565




🏃 View run lightgbm-config1-20250228T030405UTC at: http://127.0.0.1:5000/#/experiments/882091402080986748/runs/963a54cea853490a9b21a465e6e99f94
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/882091402080986748
