<a href="https://colab.research.google.com/github/gabriel1628/End-to-end-MLOps-for-Time-Series/blob/main/lgbm_hpo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Python version

In [103]:
import sys
sys.version

'3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]'

# If run in Google Colab

In [104]:
COLAB = True # if notebook is run in Google Colab

In [105]:
if COLAB:
    !pip install boto3 mlforecast optuna lightgbm GPUtil -q

    import requests

    utils = requests.get("https://raw.githubusercontent.com/gabriel1628/End-to-end-MLOps-for-Time-Series/main/utils.py")
    open('utils.py', 'wb').write(utils.content)

    !mkdir -p preprocessing
    preprocessing = requests.get("https://raw.githubusercontent.com/gabriel1628/End-to-end-MLOps-for-Time-Series/main/preprocessing/preprocessing.py")
    open('preprocessing/preprocessing.py', 'wb').write(preprocessing.content)

# Import libraries

In [106]:
import numpy as np
import pandas as pd

from lightgbm import LGBMRegressor

from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_absolute_error

import logging
import optuna

import boto3

import GPUtil
import os
import sys
from pathlib import Path
import yaml
import time
from natsort import natsorted

from utils import train_test_split
from preprocessing.preprocessing import *

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Set device

In [107]:
device="gpu" if GPUtil.getAvailable() else "cpu"
device

'cpu'

In [108]:
# import subprocess

# try:
#     subprocess.check_output('nvidia-smi')
#     device = "cuda"
# except Exception: # this command not being found can raise quite a few different errors depending on the configuration
#     device = "cpu"

# Global variables

In [109]:
forecast_horizon = 48
n_lags = 48
model_name = "lightgbm"
preprocessing_version = 2  # preprocessing version
config_version = None
config_dir_path = "./configuration_files"
study_path = "./optuna_studies"

# Downloading data and Optuna studies from S3

In [110]:
if COLAB: # If you are on Colab, you can use secret environment variables (click on the key in the left panel)
    from google.colab import userdata
    s3 = boto3.client(
        's3',
        aws_access_key_id=userdata.get("ACCESS_KEY"),
        aws_secret_access_key=userdata.get("SECRET_KEY"),
    )
else:
    s3 = boto3.client("s3") # use credentials in the ~/.aws folder

print("list objects in the enefit-competition bucket to check connection :\n")
response = s3.list_objects(
    Bucket='enefit-competition',
    MaxKeys=5,
)
for content in response["Contents"]:
    print(content["Key"])

list objects in the enefit-competition bucket to check connection :

configuration_files/lightgbm/config_1.yaml
configuration_files/lightgbm/config_2.yaml
configuration_files/lightgbm/config_3.yaml
data/consumption.csv
optuna-studies/lightgbm-preprocessing2-config2.db


In [111]:
def download_s3_folder(bucket_name, s3_folder, local_dir):
    """function to download objects from an S3 bucket located in the s3_folder directory"""
    # Ensure the local directory exists
    if not os.path.exists(local_dir):
        os.makedirs(local_dir)

    # List objects within the specified S3 folder
    paginator = s3.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=bucket_name, Prefix=s3_folder)

    for page in pages:
        if "Contents" in page:
            for obj in page["Contents"]:
                # Get the path of the object key
                s3_key = obj["Key"]
                if s3_key == s3_folder:
                    continue
                # Remove the prefix from the key to get the relative file path
                relative_path = os.path.relpath(s3_key, s3_folder)

                # Create the full local path
                local_file_path = os.path.join(local_dir, relative_path)

                # Create local directory if not exists
                local_file_dir = os.path.dirname(local_file_path)
                if not os.path.exists(local_file_dir):
                    os.makedirs(local_file_dir)

                # Download the file
                print(f"Downloading s3://{bucket_name}/{s3_key} to {local_file_path}...")
                s3.download_file(bucket_name, s3_key, local_file_path)

In [112]:
bucket_name = "enefit-competition"
s3_folders = ["data/", "optuna_studies/", "configuration_files/"]  # These are the "folders" in the S3 bucket
local_dirs = ["./data/", "./optuna_studies/", "./configuration_files/"]  # Local directory to save files

for s3_folder, local_dir in zip(s3_folders, local_dirs):
    download_s3_folder(bucket_name, s3_folder, local_dir)

Downloading s3://enefit-competition/data/consumption.csv to ./data/consumption.csv...
Downloading s3://enefit-competition/optuna_studies/lightgbm_preprocessing2_config1.db to ./optuna_studies/lightgbm_preprocessing2_config1.db...
Downloading s3://enefit-competition/optuna_studies/lightgbm_preprocessing2_config3.db to ./optuna_studies/lightgbm_preprocessing2_config3.db...
Downloading s3://enefit-competition/configuration_files/lightgbm/config_1.yaml to ./configuration_files/lightgbm/config_1.yaml...
Downloading s3://enefit-competition/configuration_files/lightgbm/config_2.yaml to ./configuration_files/lightgbm/config_2.yaml...
Downloading s3://enefit-competition/configuration_files/lightgbm/config_3.yaml to ./configuration_files/lightgbm/config_3.yaml...


# Read the data

In [113]:
df = pd.read_csv("./data/consumption.csv")
df["datetime"] = pd.to_datetime(df["datetime"])
df.head()

Unnamed: 0,datetime,prediction_unit_id,consumption
0,2021-09-01,0,96.59
1,2021-09-01,1,17.314
2,2021-09-01,2,656.859
3,2021-09-01,3,59.0
4,2021-09-01,4,501.76


In [114]:
df.shape

(1009176, 3)

# Train/Test split

In [115]:
# taking the last 60 days of each unit for test
df_train, df_test = train_test_split(df, test_window=24 * 60)

In [116]:
assert df.shape[0] == df_train.shape[0] + df_test.shape[0]
assert df.shape[1] == df_train.shape[1] == df_test.shape[1]

In [117]:
test_size = df_test.shape[0] / (df.shape[0])
print(f"test set : {round(test_size*100, 2)}% of the data set")

test set : 9.85% of the data set


# Preprocessing

In [118]:
# get preprocessing function from preprocessing/preprocessing.py
preprocessing = vars()[f"preprocessing_{preprocessing_version}"]

In [119]:
X_train, y_train = preprocessing(df_train)
print(f"X_train shape : {X_train.shape}")
print(f"y_train shape : {y_train.shape}")
X_train.head()

  df[feat_name] = feat_vals[restore_idxs]
  df[feat_name] = feat_vals[restore_idxs]


X_train shape : (854079, 99)
y_train shape : (854079,)


Unnamed: 0,lag48,lag49,lag50,lag51,lag52,lag53,lag54,lag55,lag56,lag57,...,rolling_mean_lag68_window_size24,expanding_mean_lag69,rolling_mean_lag69_window_size24,expanding_mean_lag70,rolling_mean_lag70_window_size24,expanding_mean_lag71,rolling_mean_lag71_window_size24,month,dayofweek,hour
5795,120.54,134.986,150.412,152.763,136.13,121.033,80.621,43.428,46.84,43.671,...,82.505417,82.612111,81.8605,81.756654,81.308,81.1456,80.502083,9,5,23
5856,107.129,120.54,134.986,150.412,152.763,136.13,121.033,80.621,43.428,46.84,...,83.241833,83.355714,82.505417,82.612111,81.8605,81.756654,81.308,9,6,0
5917,81.92,107.129,120.54,134.986,150.412,152.763,136.13,121.033,80.621,43.428,...,83.893958,84.131655,83.241833,83.355714,82.505417,82.612111,81.8605,9,6,1
5978,96.193,81.92,107.129,120.54,134.986,150.412,152.763,136.13,121.033,80.621,...,84.539375,84.841667,83.893958,84.131655,83.241833,83.355714,82.505417,9,6,2
6039,94.536,96.193,81.92,107.129,120.54,134.986,150.412,152.763,136.13,121.033,...,84.552333,85.716806,84.539375,84.841667,83.893958,84.131655,83.241833,9,6,3


In [120]:
X_test, y_test = preprocessing(df_test)
print(f"X_test shape : {X_test.shape}")
print(f"y_test shape : {y_test.shape}")
X_test.head()

X_test shape : (84735, 99)
y_test shape : (84735,)


  df[feat_name] = feat_vals[restore_idxs]
  df[feat_name] = feat_vals[restore_idxs]


Unnamed: 0,lag48,lag49,lag50,lag51,lag52,lag53,lag54,lag55,lag56,lag57,...,rolling_mean_lag68_window_size24,expanding_mean_lag69,rolling_mean_lag69_window_size24,expanding_mean_lag70,rolling_mean_lag70_window_size24,expanding_mean_lag71,rolling_mean_lag71_window_size24,month,dayofweek,hour
920263,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,246.753,193.661,160.944,...,722.741458,747.325111,723.395542,740.683269,723.535208,734.97888,725.296292,4,2,23
920328,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,246.753,193.661,...,723.69675,753.981429,722.741458,747.325111,723.395542,740.683269,723.535208,4,3,0
920393,999.628,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,246.753,...,724.803458,761.656966,723.69675,753.981429,722.741458,747.325111,723.395542,4,3,1
920458,1001.917,999.628,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,...,726.249667,769.130867,724.803458,761.656966,723.69675,753.981429,722.741458,4,3,2
920523,1014.902,1001.917,999.628,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,...,731.313625,778.516258,726.249667,769.130867,724.803458,761.656966,723.69675,4,3,3


# HPO with Optuna

## Experiment configurations

In [121]:
experiment_config = {
    "int_params": [
        {
            "name": "num_trees",
            "low": 3,
            "high": 10,
            "log": False,
        },
        {
            "name": "max_depth",
            "low": 3,
            "high": 10,
        },
        {
            "name": "num_leaves",
            "low": 5,
            "high": 25,
        },
    ],
    "float_params": [
        {
            "name": "learning_rate",
            "low": 0.001,
            "high": 0.1,
            "log": True,
        },
        {
            "name": "feature_fraction",
            "low": 0.1,
            "high": 0.75,
        }
    ],
    "objective_values": "mean",
}

In [122]:
# Create config files directory if does not exist
path = Path(config_dir_path, model_name)
try:
    path.mkdir(parents=True)
except:
    pass

In [123]:
# Check if the config has already been tested. If not, create a new config file
config_files_path = Path(config_dir_path, model_name)
list_config_files = list(config_files_path.glob("*.yaml"))
n_config_files = len(list_config_files)
if n_config_files == 0:
    with open(Path(config_files_path, "config_1.yaml"), "w") as file:
        yaml.dump(experiment_config, file)

elif config_version:
    config_file = Path(config_files_path, f"config_{config_version}.yaml")

else:
    config_version = 1
    for config_file in natsorted(list_config_files):
        with open(config_file, "rb") as file:
            config = yaml.safe_load(file)
        if experiment_config == config:
            break
        config_version += 1
        if config_version > n_config_files:
            config_file = Path(config_files_path, f"config_{config_version}.yaml")
            with open(config_file, "w") as file:
                yaml.dump(experiment_config, file)

print(f"using {config_file}")

with open(config_file, "rb") as file:
    config = yaml.safe_load(file)

using configuration_files/lightgbm/config_3.yaml


In [124]:
# upload configuration files to S3
list_config_files = list(config_files_path.glob("*.yaml"))
for file_path in list_config_files:
    s3.upload_file(
        file_path,
        bucket_name,
        str(file_path),
    )

## Objective function

In [125]:
def objective(trial):
    # Define hyperparameters
    study_params = {
        "verbosity": -1,
        "random_state": 0,
        "device": device,
    }
    for int_param in config["int_params"]:
        study_params[int_param["name"]] = trial.suggest_int(**int_param)
    for float_param in config["float_params"]:
        study_params[float_param["name"]] = trial.suggest_float(**float_param)

    # Evaluate model using cross-validation
    tscv = TimeSeriesSplit(n_splits=5)
    model = LGBMRegressor(**study_params)
    cv_errors = cross_val_score(
        model, X_train, y_train, scoring="neg_mean_absolute_error", cv=tscv
    )

    # Log CV results
    cv_errors = -cv_errors
    for i in range(len(cv_errors)):
        trial.set_user_attr(f"error_split_{i+1}", cv_errors[i])
    # trial.set_user_attr("cv_errors", list(cv_errors))
    trial.set_user_attr("cv_errors_std", cv_errors.std())

    # Log train MAE
    model.fit(X_train, y_train)
    y_fit = model.predict(X_train)
    train_mae = mean_absolute_error(y_train, y_fit)
    trial.set_user_attr("train_mae", train_mae)

    # Log test MAE
    y_pred = model.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_pred)
    trial.set_user_attr("test_mae", test_mae)

    return cv_errors.mean()

## Create and run study

In [126]:
experiment_name = (
    f"{model_name}_preprocessing{preprocessing_version}_config{config_version}"
)
storage_path = "sqlite:///{}/{}.db".format(study_path, experiment_name)
print(f"experiment name : {experiment_name}")

experiment name : lightgbm_preprocessing2_config3


In [127]:
# Add stream handler of stdout to show the messages
logger = optuna.logging.get_logger("optuna")
if logger.hasHandlers():
    logger.handlers.clear()
logger.addHandler(logging.StreamHandler(sys.stdout))

# restored_sampler = pickle.load(open("{}-study-sampler.pkl".format(study_name), "rb"))
sampler = optuna.samplers.TPESampler(
    seed=0
)
# For a list a available samplers : https://optuna.readthedocs.io/en/stable/reference/samplers/index.html

In [128]:
# Initialize the Optuna study
study = optuna.create_study(
    study_name=experiment_name,
    storage=storage_path,
    load_if_exists=True,
    directions=["minimize"],
    sampler=sampler,
    # pruner=pruner,
)

Using an existing study with name 'lightgbm_preprocessing2_config3' instead of creating a new one.


In [129]:
len(study.trials)

3

In [130]:
# !git clone --recursive https://github.com/microsoft/LightGBM

In [131]:
# cd LightGBM

In [132]:
# !cmake -B build -S . -DUSE_GPU=ON
# # if you have installed NVIDIA CUDA to a customized location, you should specify paths to OpenCL headers and library like the following:
# # !cmake -B build -S . -DUSE_GPU=ON -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/
# !cmake --build build -j4

In [133]:
# Execute the hyperparameter optimization trials
%%time

checkpoint = time.time()
for i in range(10):
    study.optimize(objective, n_trials=1)
    if time.time() - checkpoint > 300:
        checkpoint = time.time()
        print("Upload trials database to S3...")
        s3.upload_file(
            experiment_path,
            bucket_name,
            str(experiment_path),
        )

print("Final upload of the trials database to S3...")
s3.upload_file(
    experiment_path,
    bucket_name,
    str(experiment_path),
)

Trial 3 finished with value: 562.0289973165827 and parameters: {'num_trees': 7, 'max_depth': 8, 'num_leaves': 17, 'learning_rate': 0.012296071107325713, 'feature_fraction': 0.3753756195702881}. Best is trial 1 with value: 326.0017935863218.
Trial 4 finished with value: 326.0017935863218 and parameters: {'num_trees': 8, 'max_depth': 6, 'num_leaves': 23, 'learning_rate': 0.08459126528049378, 'feature_fraction': 0.34923698723675556}. Best is trial 1 with value: 326.0017935863218.
Trial 5 finished with value: 341.7653447494029 and parameters: {'num_trees': 9, 'max_depth': 7, 'num_leaves': 16, 'learning_rate': 0.07098936257405905, 'feature_fraction': 0.14617343782862652}. Best is trial 1 with value: 326.0017935863218.
Trial 6 finished with value: 554.26156702858 and parameters: {'num_trees': 3, 'max_depth': 3, 'num_leaves': 22, 'learning_rate': 0.0360009119291161, 'feature_fraction': 0.6655078963604325}. Best is trial 1 with value: 326.0017935863218.
Trial 7 finished with value: 439.7075803

## Experimental history

In [135]:
study = optuna.create_study(
    study_name=experiment_name, storage=storage_path, load_if_exists=True
)
print("number of trials in the study :", len(study.trials))
trials_df = study.trials_dataframe().drop(
    columns=["datetime_start", "datetime_complete"]
)
print("best studies :")
trials_df.sort_values(by="value").head(10)

Using an existing study with name 'lightgbm_preprocessing2_config3' instead of creating a new one.
number of trials in the study : 13
best studies :


Unnamed: 0,number,value,duration,params_feature_fraction,params_learning_rate,params_max_depth,params_num_leaves,params_num_trees,user_attrs_cv_errors_std,user_attrs_error_split_1,user_attrs_error_split_2,user_attrs_error_split_3,user_attrs_error_split_4,user_attrs_error_split_5,user_attrs_test_mae,user_attrs_train_mae,state
1,1,326.001794,0 days 00:00:46.746205,0.349237,0.084591,6,23,8,97.59545,466.871761,274.5128,234.993587,418.5898,235.04102,276.26882,304.298886,COMPLETE
4,4,326.001794,0 days 00:00:48.830408,0.349237,0.084591,6,23,8,97.59545,466.871761,274.5128,234.993587,418.5898,235.04102,276.26882,304.298886,COMPLETE
2,2,341.765345,0 days 00:00:44.079774,0.146173,0.070989,7,16,9,101.498828,487.241754,287.002712,246.416068,439.596183,248.570007,286.724679,319.667296,COMPLETE
5,5,341.765345,0 days 00:00:45.127688,0.146173,0.070989,7,16,9,101.498828,487.241754,287.002712,246.416068,439.596183,248.570007,286.724679,319.667296,COMPLETE
11,11,348.584717,0 days 00:00:47.921888,0.508833,0.098443,5,21,6,104.233124,503.753031,292.249076,253.211518,441.643638,252.066323,291.9019,326.556195,COMPLETE
7,7,439.70758,0 days 00:00:45.703819,0.176878,0.036396,9,14,10,132.84484,636.74234,364.197159,321.709089,559.978213,315.911102,370.280522,409.28083,COMPLETE
6,6,554.261567,0 days 00:00:45.970803,0.665508,0.036001,3,22,3,169.147411,815.076909,454.635578,410.574571,694.347774,396.673003,472.443605,515.9298,COMPLETE
0,0,562.028997,0 days 00:00:44.813456,0.375376,0.012296,8,17,7,169.512766,833.366835,464.588755,419.1039,687.338406,405.747091,484.895453,526.451593,COMPLETE
3,3,562.028997,0 days 00:01:01.945884,0.375376,0.012296,8,17,7,169.512766,833.366835,464.588755,419.1039,687.338406,405.747091,484.895453,526.451593,COMPLETE
8,8,562.080908,0 days 00:00:47.677996,0.36953,0.011059,4,24,8,170.572181,831.306609,463.0047,418.045952,694.090378,403.956898,482.856869,525.094222,COMPLETE
