In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
import mlflow
import optuna
import os
import sys
sys.path.append(os.path.realpath("../../modules"))
sys.path.append(os.path.realpath("../../preprocessing"))
from utils import train_test_split
from preprocessing import *
import time
from datetime import datetime, UTC

# import warnings
# warnings.filterwarnings("ignore", category=UserWarning)

  from .autonotebook import tqdm as notebook_tqdm


# Global variables

In [2]:
# --- Notebook configuration ---
# NOTE(review): forecast_horizon and n_lags are not read by any other cell
# visible in this notebook; presumably kept for reference -- confirm.
forecast_horizon = 24
n_lags = 48

# Size of the hold-out window handed to train_test_split, in hours.
test_window = 60 * 24

# Selects the preprocessing_<N> function and is part of the MLflow experiment name.
preprocessing_version = 1

# Selects the Optuna study ("study_<N>") whose best hyperparameters are reused.
study_version = 1

# Seed passed to every LGBMRegressor built in this notebook.
random_state = 0

# Import Data

In [3]:
# Load the consumption data, keeping only the three columns needed and
# renaming them to unique_id / ds / y.
column_map = {"prediction_unit_id": "unique_id", "datetime": "ds", "target": "y"}
df = (
    pd.read_csv("../../data/consumption.csv", usecols=list(column_map))
    .loc[:, list(column_map)]  # usecols does not fix column order; enforce it here
    .rename(columns=column_map)
    .assign(ds=lambda frame: pd.to_datetime(frame["ds"]))  # parse timestamps
)
df.head()

Unnamed: 0,unique_id,ds,y
0,0,2021-09-01,96.59
1,1,2021-09-01,17.314
2,2,2021-09-01,656.859
3,3,2021-09-01,59.0
4,4,2021-09-01,501.76


In [4]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(1009176, 3)

# Train/Test split

In [5]:
# Split with the project-local helper imported from utils (not scikit-learn's).
# NOTE(review): presumably the last `test_window` hours form the test set -- confirm in utils.
df_train, df_test = train_test_split(df, test_window=test_window)

In [6]:
# Row counts of the train and test frames should add up to the full frame.
df.shape[0] == df_train.shape[0] + df_test.shape[0]

True

In [7]:
# Train and test frames should have the same number of columns as the full frame.
df.shape[1] == df_train.shape[1] == df_test.shape[1]

True

In [8]:
# Fraction of all rows that ended up in the test set, reported as a percentage.
n_test_rows, n_total_rows = len(df_test), len(df)
test_size = n_test_rows / n_total_rows
print(f"test set : {round(test_size*100, 2)}% of the data set")

test set : 9.85% of the data set


# Preprocessing

In [9]:
# Pick the preprocessing function for the configured version by name.
# The preprocessing_<N> candidates are expected in the notebook namespace
# through the wildcard import from the preprocessing module.
preprocessing = globals()[f"preprocessing_{preprocessing_version}"]

In [11]:
# Turn the training frame into a feature matrix and a target vector,
# then show their shapes and a preview of the features.
X_train, y_train = preprocessing(df_train)
print(X_train.shape, y_train.shape)
X_train.head()

(904917, 53) (904917,)


Unnamed: 0,lag24,lag25,lag26,lag27,lag28,lag29,lag30,lag31,lag32,lag33,...,lag67,lag68,lag69,lag70,lag71,rolling_mean_lag1_window_size24,rolling_mean_lag24_window_size24,month,dayofweek,hour
4331,120.54,134.986,150.412,152.763,136.13,121.033,80.621,43.428,46.84,43.671,...,88.184,87.955,91.594,77.691,96.59,90.123375,87.297042,9,4,23
4392,107.129,120.54,134.986,150.412,152.763,136.13,121.033,80.621,43.428,46.84,...,89.781,88.184,87.955,91.594,77.691,90.93125,87.203833,9,5,0
4453,81.92,107.129,120.54,134.986,150.412,152.763,136.13,121.033,80.621,43.428,...,96.481,89.781,88.184,87.955,91.594,91.548458,86.574125,9,5,1
4514,96.193,81.92,107.129,120.54,134.986,150.412,152.763,136.13,121.033,80.621,...,94.592,96.481,89.781,88.184,87.955,92.886708,86.21325,9,5,2
4575,94.536,96.193,81.92,107.129,120.54,134.986,150.412,152.763,136.13,121.033,...,77.308,94.592,96.481,89.781,88.184,93.257917,85.842542,9,5,3


In [14]:
# Apply the same preprocessing function to the test frame on its own,
# then show the resulting shapes and a preview of the features.
X_test, y_test = preprocessing(df_test)
print(X_test.shape, y_test.shape)
X_test.head()

(94461, 53) (94461,)


Unnamed: 0,lag24,lag25,lag26,lag27,lag28,lag29,lag30,lag31,lag32,lag33,...,lag67,lag68,lag69,lag70,lag71,rolling_mean_lag1_window_size24,rolling_mean_lag24_window_size24,month,dayofweek,hour
918702,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,246.753,193.661,160.944,...,953.645,949.4,923.365,925.559,967.361,721.631208,699.523333,4,1,23
918768,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,246.753,193.661,...,959.313,953.645,949.4,923.365,925.559,717.695458,703.838542,4,2,0
918833,999.628,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,246.753,...,1025.369,959.313,953.645,949.4,923.365,712.335875,708.685833,4,2,1
918898,1001.917,999.628,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,...,921.18,1025.369,959.313,953.645,949.4,708.313417,712.0985,4,2,2
918963,1014.902,1001.917,999.628,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,...,710.08,921.18,1025.369,959.313,953.645,705.044333,715.481833,4,2,3


# Train a vanilla LightGBM

In [15]:
# Baseline: LightGBM with default hyperparameters (only the seed is set).
model = LGBMRegressor(random_state=random_state)

# Time the fit with a monotonic, high-resolution clock. time.time() follows the
# wall clock and can jump if the system time is adjusted while fitting.
start = time.perf_counter()
model.fit(X_train, y_train)
end = time.perf_counter()

# metrics
y_fit = model.predict(X_train)  # predictions on the training data
y_pred = model.predict(X_test)  # predictions on the test data
train_mae = mean_absolute_error(y_train, y_fit)
test_mae = mean_absolute_error(y_test, y_pred)
training_duration = end - start  # seconds
metrics = {"train_mae": train_mae, "test_mae": test_mae, "training_duration": training_duration}
metrics

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.139845 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12794
[LightGBM] [Info] Number of data points in the train set: 904917, number of used features: 53
[LightGBM] [Info] Start training from score 468.673058


{'train_mae': 50.607146141734816,
 'test_mae': 64.93105969874263,
 'training_duration': 5.41973090171814}

# Setting up the MLFlow experiment

In [13]:
# Point MLflow at the tracking server. A MLFLOW_TRACKING_URI set in the
# environment takes precedence; otherwise use the local server as before.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000"))

In [15]:
# Set the current active MLflow experiment; its name carries the preprocessing version.
# The Experiment returned by set_experiment is displayed as the cell output.
experiment_name = f"Enefit Preprocessing {preprocessing_version}"
mlflow.set_experiment(experiment_name=experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/139855071362737171', creation_time=1724822394070, experiment_id='139855071362737171', last_update_time=1724822394070, lifecycle_stage='active', name='Enefit Preprocessing 1', tags={}>

In [None]:
# Fetch the experiment back by name and display its id.
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment.experiment_id

'139855071362737171'

# Log the study's best model

In [None]:
# Name of the Optuna study to read, derived from the configured study version.
study_name = "study_{}".format(study_version) # will also be used for the run_name
study_name

'study_1'

In [None]:
# Open the existing Optuna study from its SQLite storage.
# load_study raises when the study is missing, rather than silently creating
# an empty study (as create_study(load_if_exists=True) does) whose best_params
# lookup would only fail later.
study_path = "../../data/optuna-studies"
storage_name = "sqlite:///{}/{}.db".format(study_path, study_name)
study = optuna.load_study(study_name=study_name, storage=storage_name)

[I 2024-08-28 07:48:21,276] Using an existing study with name 'study_1' instead of creating a new one.


In [None]:
# run metadata: a UTC timestamp keeps run names unique across re-executions
utc_datetime = datetime.now(UTC).strftime("%Y%m%dT%H%M%SUTC")
run_name = f"best_model_{study_name}_{utc_datetime}"
artifact_path = "lgbm"

# training: refit on the training set with the study's best hyperparameters
params = study.best_params
model = LGBMRegressor(**params, random_state=random_state)
# Monotonic clock for the duration; time.time() can jump with system clock changes.
start = time.perf_counter()
model.fit(X_train, y_train)
end = time.perf_counter()

# metrics
y_fit = model.predict(X_train)  # predictions on the training data
y_pred = model.predict(X_test)  # predictions on the test data
train_mae = mean_absolute_error(y_train, y_fit)
test_mae = mean_absolute_error(y_test, y_pred)
training_duration = end - start  # seconds
metrics = {"train_mae": train_mae, "test_mae": test_mae, "training_duration": training_duration}

with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit, including the seed, so the
    # tracked run records the full model configuration.
    mlflow.log_params({**params, "random_state": random_state})

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use; a single training
    # row is attached as the input example.
    mlflow.lightgbm.log_model(
        lgb_model=model, input_example=X_train.iloc[:1], artifact_path=artifact_path
    )



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.122679 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12794
[LightGBM] [Info] Number of data points in the train set: 904917, number of used features: 53
[LightGBM] [Info] Start training from score 468.645991






Downloading artifacts: 100%|██████████| 7/7 [00:02<00:00,  3.15it/s]




2024/08/28 08:30:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run best_model_study_1_20240828T062931UTC at: http://localhost:5000/#/experiments/139855071362737171/runs/b0d5225429a04863b236129f56183c3f.
2024/08/28 08:30:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/139855071362737171.
