<a href="https://colab.research.google.com/github/gabriel1628/End-to-end-MLOps-for-Time-Series/blob/main/lgbm_hpo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Python version

In [None]:
import sys

# Display the interpreter version string for this kernel.
sys.version

'3.10.11 (main, Aug  8 2024, 11:04:12) [Clang 15.0.0 (clang-1500.3.9.4)]'

# If run in Google Colab

In [None]:
COLAB = False  # set to True when the notebook is run in Google Colab

In [None]:
if COLAB:
    %pip install boto3 mlforecast optuna lightgbm GPUtil -q

In [None]:
# Download project files from GitHub when running on Colab.
# Placeholder: this cell currently does nothing.
if COLAB:
    # TODO: just git clone the repo and move files in the working directory
    pass

In [None]:
# TODO: Setup training on GPU if GPU available
# !git clone --recursive https://github.com/microsoft/LightGBM

In [156]:
# cd LightGBM

In [157]:
# !cmake -B build -S . -DUSE_GPU=ON
# # if you have installed NVIDIA CUDA to a customized location, you should specify paths to OpenCL headers and library like the following:
# # !cmake -B build -S . -DUSE_GPU=ON -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/
# !cmake --build build -j4

# Import libraries

In [None]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

# TODO: set the metric to use in config file
from sklearn.metrics import mean_absolute_error
import logging
import optuna
import boto3
import GPUtil
from pathlib import Path
from natsort import natsorted
from dotenv import dotenv_values
import os
import pickle
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Set device

In [None]:
# A truthy result from GPUtil.getAvailable() selects the GPU; otherwise fall back to CPU.
if GPUtil.getAvailable():
    device = "gpu"
else:
    device = "cpu"
print(f"device set to {device}")

device set to cpu


In [160]:
# import subprocess

# try:
#     subprocess.check_output('nvidia-smi')
#     device = "cuda"
# except Exception: # this command not being found can raise quite a few different errors depending on the configuration
#     device = "cpu"

# Configurations

In [None]:
import sys

# Make the repository root importable so that `utils` resolves on any machine.
# This notebook reaches project files through "../" (config, data, studies),
# so the parent of the working directory is used as the root instead of a
# hard-coded, user-specific absolute path.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.append(project_root)
from utils import load_config, download_s3_dir
import yaml

In [None]:
# Key/value pairs read from the local .env file; the AWS credentials are taken from here outside Colab.
env_vars = dotenv_values("../.env")
# Project settings read further down: model_name, hpo_config_version, random_state, n_trials.
config = load_config("../config/development/config.yaml")

# Downloading data and Optuna studies from S3

In [None]:
# Resolve the AWS credentials for the current platform, then build one S3 client.
if COLAB:
    # If you are on Colab, you can use secret environment variables
    # (click on the key in the left panel).
    from google.colab import userdata

    s3_credentials = {
        "aws_access_key_id": userdata.get("ACCESS_KEY"),
        "aws_secret_access_key": userdata.get("SECRET_KEY"),
    }
else:
    # Locally, the credentials come from the values loaded from the .env file.
    s3_credentials = {
        "aws_access_key_id": env_vars["AWS_ACCESS_KEY_ID"],
        "aws_secret_access_key": env_vars["AWS_SECRET_ACCESS_KEY"],
    }

s3 = boto3.client("s3", **s3_credentials)

# print("list objects in the enefit-competition bucket to check connection :\n")
# response = s3.list_objects(
#     Bucket='enefit-competition',
#     MaxKeys=5,
# )
# for content in response["Contents"]:
#     print(content["Key"])

In [None]:
# On Colab, fetch the "data" prefix of the project bucket into ./data.
# NOTE(review): files land in ./data while the loading cells below read from
# ../data/processed — confirm the intended working directory on Colab.
if COLAB:
    s3_bucket = userdata.get("E2E_MLOPS_BUCKET")
    download_s3_dir(s3_bucket, "data", "./data")

# Load the data

In [165]:
# Read the processed training set, then separate the "target" column from the features.
df_train = pd.read_csv("../data/processed/consumption_train.csv")
y_train = df_train["target"]
X_train = df_train.drop(columns="target")
# One shape per line: full frame, features, target.
print(df_train.shape, X_train.shape, y_train.shape, sep="\n")

(757248, 21)
(757248, 20)
(757248,)


In [166]:
# Read the processed test set, then separate the "target" column from the features.
df_test = pd.read_csv("../data/processed/consumption_test.csv")
y_test = df_test["target"]
X_test = df_test.drop(columns="target")
# One shape per line: full frame, features, target.
print(df_test.shape, X_test.shape, y_test.shape, sep="\n")

(186732, 21)
(186732, 20)
(186732,)


# HPO with Optuna

## Experiment configurations

In [None]:
# Locate the search-space file for the configured model and HPO config version,
# then parse it with the safe YAML loader.
environment = "development"
config_file_path = (
    Path("../config")
    / environment
    / f"{config['model_name']}_hpo"
    / f"config_{config['hpo_config_version']}.yaml"
)
print(f"using {config_file_path} for HPO")
with open(config_file_path, "rb") as file:
    hpo_config = yaml.safe_load(file)

using ../config/development/lightgbm_hpo/config_1.yaml for HPO


In [None]:
from pprint import pprint

# Pretty-print the loaded search space so the sampled ranges are visible in the notebook.
pprint(hpo_config)

{'float_params': [{'high': 0.1,
                   'log': True,
                   'low': 0.001,
                   'name': 'learning_rate'},
                  {'high': 0.75, 'low': 0.1, 'name': 'feature_fraction'}],
 'int_params': [{'high': 10, 'log': False, 'low': 3, 'name': 'num_trees'},
                {'high': 10, 'low': 3, 'name': 'max_depth'},
                {'high': 25, 'low': 5, 'name': 'num_leaves'}],
 'objective_values': 'mean'}


## Objective function

In [169]:
def objective(trial, X_train, y_train, hpo_config, random_state, device):
    """Score one Optuna trial with time-series cross-validation.

    Hyperparameters are sampled from the search space described by
    ``hpo_config`` (integer parameters first, then float parameters). An
    ``LGBMRegressor`` built from them is evaluated with a 5-split
    ``TimeSeriesSplit``. The per-split mean absolute errors and their
    standard deviation are stored as user attributes on the trial.

    Args:
        trial: Optuna trial used to sample values and to store user attributes.
        X_train: Training features passed to ``cross_val_score``.
        y_train: Training target passed to ``cross_val_score``.
        hpo_config: Mapping with ``"int_params"`` and ``"float_params"``
            lists. Each entry is forwarded as keyword arguments to
            ``trial.suggest_int`` / ``trial.suggest_float`` and must contain
            a ``"name"`` key.
        random_state: Value forwarded to the model as ``random_state``.
        device: Value forwarded to the model as ``device``.

    Returns:
        The mean of the per-split mean absolute errors.
    """
    # Fixed settings first; sampled values are added on top, integers before floats.
    model_params = dict(verbosity=-1, random_state=random_state, device=device)
    model_params.update(
        (spec["name"], trial.suggest_int(**spec))
        for spec in hpo_config["int_params"]
    )
    model_params.update(
        (spec["name"], trial.suggest_float(**spec))
        for spec in hpo_config["float_params"]
    )

    regressor = LGBMRegressor(**model_params)
    splitter = TimeSeriesSplit(n_splits=5)
    neg_mae_per_split = cross_val_score(
        regressor,
        X_train,
        y_train,
        scoring="neg_mean_absolute_error",
        cv=splitter,
    )

    # The scorer returns negated errors; flip the sign to get plain MAE values.
    mae_per_split = -neg_mae_per_split
    for split_number, split_error in enumerate(mae_per_split, start=1):
        trial.set_user_attr(f"error_split_{split_number}", split_error)
    trial.set_user_attr("cv_errors_std", mae_per_split.std())

    return mae_per_split.mean()

## Create and run study

In [172]:
# Add stream handler of stdout to show the messages
logger = optuna.logging.get_logger("optuna")
# Clear the handlers already attached to this logger before adding ours —
# presumably so that re-running the cell does not duplicate every log line.
if logger.hasHandlers():
    logger.handlers.clear()
logger.addHandler(logging.StreamHandler(sys.stdout))

In [None]:
# TODO: setup data version control then replace 'datav1' with appropriate data info
# The study name encodes the data version, model name and HPO config version;
# the SQLite file that stores the study is named after it.
study_name = f"datav1_{config['model_name']}_config{config['hpo_config_version']}"
study_path = f"../optuna_studies/{study_name}.db"
storage_path = f"sqlite:///{study_path}"
print(f"Study path : {study_path}")

Study path : ../optuna_studies/datav1_lightgbm_config1.db


In [174]:
# Set sampler
# For a list of available samplers : https://optuna.readthedocs.io/en/stable/reference/samplers/index.html
# Reuse the sampler pickled for this study when one exists, otherwise start
# from a freshly seeded TPE sampler. `sampler_loaded` records which path was taken.
sampler_name = f"{study_name}_sampler.pkl"
if sampler_name in os.listdir("../optuna_studies"):
    sampler_loaded = True
    print("loading sampler")
    # NOTE: unpickling can execute arbitrary code; only load sampler files
    # that were written by this project.
    # The context manager closes the file handle, which the bare
    # pickle.load(open(...)) call left open.
    with open(f"../optuna_studies/{sampler_name}", "rb") as sampler_file:
        sampler = pickle.load(sampler_file)
else:
    sampler_loaded = False
    print("no sampler saved for the study, creating a new one")
    sampler = optuna.samplers.TPESampler(seed=0)

loading sampler


In [175]:
# Initialize the Optuna study
# With load_if_exists=True an existing study of the same name in the storage
# is reused instead of raising; the objective value is minimized.
study = optuna.create_study(
    study_name=study_name,
    storage=storage_path,
    load_if_exists=True,
    directions=["minimize"],
    sampler=sampler,
    # pruner=pruner,
)
# Pickle the sampler only when it was newly created in this session.
# NOTE(review): this snapshot is written before any trial has run, so
# reloading it later restores the sampler's initial random state.
if not sampler_loaded:
    print("saving sampler")
    with open(f"../optuna_studies/{study_name}_sampler.pkl", "wb") as file:
        pickle.dump(study.sampler, file)

Using an existing study with name 'datav1_lightgbm_config1' instead of creating a new one.


In [176]:
# Number of trials currently stored in the study.
len(study.trials)

27

In [178]:
%%time
study.optimize(
    lambda trial: objective(
        trial,
        X_train,
        y_train,
        hpo_config,
        config["random_state"],
        device,
    ),
    n_trials=config["n_trials"],
)

Trial 28 finished with value: 334.943091437233 and parameters: {'num_trees': 10, 'max_depth': 8, 'num_leaves': 19, 'learning_rate': 0.055059679897938266, 'feature_fraction': 0.2991197279057422}. Best is trial 12 with value: 278.7178225690444.
Trial 29 finished with value: 316.4536860439521 and parameters: {'num_trees': 6, 'max_depth': 7, 'num_leaves': 23, 'learning_rate': 0.09887926482264432, 'feature_fraction': 0.7460474914658897}. Best is trial 12 with value: 278.7178225690444.
Trial 30 finished with value: 505.3767389290104 and parameters: {'num_trees': 9, 'max_depth': 4, 'num_leaves': 12, 'learning_rate': 0.008270700835239831, 'feature_fraction': 0.21620237040100462}. Best is trial 12 with value: 278.7178225690444.
CPU times: user 35.2 s, sys: 2.3 s, total: 37.5 s
Wall time: 6.76 s


# Visualize the optimization history

In [None]:
# Plot the objective value of each trial in the study.
fig = optuna.visualization.plot_optimization_history(study, target_name="value")

# Fixed figure size for display in the notebook.
fig.update_layout(autosize=True, width=800, height=600)
fig.show()

## Experimental history

In [180]:
# Re-open the stored study and list the ten trials with the lowest objective value.
study = optuna.create_study(
    study_name=study_name,
    storage=storage_path,
    load_if_exists=True,
)
print("number of trials in the study :", len(study.trials))
# The start/completion timestamps are dropped to keep the table narrow.
trials_df = study.trials_dataframe()
trials_df = trials_df.drop(columns=["datetime_start", "datetime_complete"])
print("best studies :")
trials_df.sort_values(by="value").head(10)

Using an existing study with name 'datav1_lightgbm_config1' instead of creating a new one.
number of trials in the study : 31
best studies :


Unnamed: 0,number,value,duration,params_feature_fraction,params_learning_rate,params_max_depth,params_num_leaves,params_num_trees,user_attrs_cv_errors_std,user_attrs_error_split_1,user_attrs_error_split_2,user_attrs_error_split_3,user_attrs_error_split_4,user_attrs_error_split_5,state
12,12,278.717823,0 days 00:00:03.798330,0.314475,0.098443,5.0,25.0,8.0,73.941599,405.627489,242.308044,212.41568,317.689122,215.548777,COMPLETE
22,22,279.086695,0 days 00:00:03.697283,0.338305,0.098714,6.0,23.0,8.0,73.187975,402.302628,242.146965,212.252616,322.185593,216.545673,COMPLETE
24,24,293.301392,0 days 00:00:03.439837,0.22197,0.090901,6.0,24.0,8.0,78.998159,428.727067,256.331912,224.060161,334.70991,222.677908,COMPLETE
2,2,306.117927,0 days 00:00:03.576550,0.349237,0.084591,6.0,23.0,8.0,81.086923,448.319878,264.874165,236.122785,343.429998,237.842807,COMPLETE
7,7,306.117927,0 days 00:00:03.416266,0.349237,0.084591,6.0,23.0,8.0,81.086923,448.319878,264.874165,236.122785,343.429998,237.842807,COMPLETE
29,29,316.453686,0 days 00:00:02.217235,0.746047,0.098879,7.0,23.0,6.0,25.803431,324.826578,297.096087,290.431568,307.367608,362.546589,COMPLETE
3,3,321.404805,0 days 00:00:03.534894,0.146173,0.070989,7.0,16.0,9.0,85.739122,471.255182,282.280264,249.030162,360.524252,243.934167,COMPLETE
8,8,321.404805,0 days 00:00:03.442026,0.146173,0.070989,7.0,16.0,9.0,85.739122,471.255182,282.280264,249.030162,360.524252,243.934167,COMPLETE
28,28,334.943091,0 days 00:00:02.356720,0.29912,0.05506,8.0,19.0,10.0,26.408694,342.045493,317.898623,308.454299,323.3183,382.998742,COMPLETE
15,15,367.754752,0 days 00:00:03.483267,0.267745,0.0938,4.0,25.0,5.0,101.512338,546.17618,319.425761,285.674737,412.538628,274.958456,COMPLETE
