<a href="https://colab.research.google.com/github/gabriel1628/End-to-end-MLOps-for-Time-Series/blob/main/lgbm_hpo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Python version

In [1]:
# Record the interpreter version used for this run (shown as the cell output).
import sys
sys.version

'3.10.11 (main, Aug  8 2024, 11:04:12) [Clang 15.0.0 (clang-1500.3.9.4)]'

# If run in Google Colab

In [2]:
COLAB = False # set to True when the notebook is run in Google Colab (enables the Colab-only cells below)

In [3]:
if COLAB:
    !pip install boto3 mlforecast optuna lightgbm GPUtil -q

In [4]:
# download files from github
# NOTE(review): placeholder only — nothing is downloaded yet when COLAB is True.
if COLAB:
    #TODO: just git clone the repo and move files in the working directory
    pass

In [5]:
#TODO: Setup training on GPU if GPU available
# !git clone --recursive https://github.com/microsoft/LightGBM

In [6]:
# cd LightGBM

In [7]:
# !cmake -B build -S . -DUSE_GPU=ON
# # if you have installed NVIDIA CUDA to a customized location, you should specify paths to OpenCL headers and library like the following:
# # !cmake -B build -S . -DUSE_GPU=ON -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/
# !cmake --build build -j4

# Import libraries

In [8]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
#TODO: set the metric to use in config file
from sklearn.metrics import mean_absolute_error
import logging
import optuna
import boto3
import GPUtil
from pathlib import Path
from natsort import natsorted
from dotenv import dotenv_values
import os
import pickle
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Set device

In [9]:
# Use the GPU only when GPUtil reports at least one available device.
if GPUtil.getAvailable():
    device = "gpu"
else:
    device = "cpu"
print("device set to " + device)

device set to cpu


In [10]:
# import subprocess

# try:
#     subprocess.check_output('nvidia-smi')
#     device = "cuda"
# except Exception: # this command not being found can raise quite a few different errors depending on the configuration
#     device = "cpu"

# Global variables

In [11]:
import sys
# Make the project-level `utils` module importable without a machine-specific absolute path.
# Assumes the notebook is run from a direct sub-directory of the repository, which is what
# the other "../" paths used in this notebook (config, data, .env) already rely on.
_project_root = str(Path("..").resolve())
if _project_root not in sys.path:  # keep re-runs of this cell from appending duplicates
    sys.path.append(_project_root)
from utils import load_config, download_s3_dir
import yaml

In [12]:
# Key/value pairs read from the .env file one level up (the AWS credentials are taken from it below).
env_vars = dotenv_values("../.env")
# Project settings; this notebook reads keys such as "model_name", "s3_bucket", "n_splits",
# "scoring", "n_trials", "random_state" and "hpo_config_version" from it.
config = load_config("../config/config.yaml")

# Downloading data and Optuna studies from S3

In [13]:
# Pick the AWS credentials for the current environment, then build the S3 client once.
# On Colab they come from the secret environment variables (click on the key in the left panel);
# otherwise they come from the values read from the .env file.
if COLAB:
    from google.colab import userdata
    aws_key_id = userdata.get("ACCESS_KEY")
    aws_secret = userdata.get("SECRET_KEY")
else:
    aws_key_id = env_vars["AWS_ACCESS_KEY_ID"]
    aws_secret = env_vars["AWS_SECRET_ACCESS_KEY"]

s3 = boto3.client(
    "s3",
    aws_access_key_id=aws_key_id,
    aws_secret_access_key=aws_secret,
)

# Optional connection check (kept disabled):
# print("list objects in the enefit-competition bucket to check connection :\n")
# response = s3.list_objects(
#     Bucket='enefit-competition',
#     MaxKeys=5,
# )
# for content in response["Contents"]:
#     print(content["Key"])

In [14]:
# if COLAB:
#     config["s3_bucket"] = ...
# Download each configured S3 directory into its paired local directory.
# Nothing is downloaded when no bucket is configured.
bucket_name = config["s3_bucket"]
if bucket_name:
    dir_pairs = zip(config["s3_dirs"], config["local_dirs"])
    for remote_dir, target_dir in dir_pairs:
        download_s3_dir(bucket_name, remote_dir, target_dir)

# Load the data

In [15]:
# Training split: "target" is the label, every other column is a feature.
df_train = pd.read_csv("../data/processed/consumption_train_processed.csv")
y_train = df_train["target"]
X_train = df_train.drop(columns=["target"])
print(df_train.shape, X_train.shape, y_train.shape, sep="\n")

(756114, 36)
(756114, 35)
(756114,)


In [16]:
# Test split: "target" is the label, every other column is a feature.
df_test = pd.read_csv("../data/processed/consumption_test_processed.csv")
y_test = df_test["target"]
X_test = df_test.drop(columns=["target"])
print(df_test.shape, X_test.shape, y_test.shape, sep="\n")

(185598, 36)
(185598, 35)
(185598,)


# HPO with Optuna

## Experiment configurations

In [17]:
# Search space for the study. Each entry is passed as keyword arguments to
# `trial.suggest_int` / `trial.suggest_float` in the objective function.
# The content is compared with the stored YAML config files, so changing any value
# here defines a new HPO configuration.
hpo_notebook_config = {
    "int_params": [
        {"name": "num_trees", "low": 3, "high": 10, "log": False},
        {"name": "max_depth", "low": 3, "high": 10},
        {"name": "num_leaves", "low": 5, "high": 25},
    ],
    "float_params": [
        {"name": "learning_rate", "low": 0.001, "high": 0.1, "log": True},
        {"name": "feature_fraction", "low": 0.1, "high": 0.75},
    ],
    "objective_values": "mean",
}

In [18]:
# Create the directory holding this model's HPO config files if it does not exist yet.
# (The previous version called `.mkdir` on an undefined name and swallowed the resulting
# NameError with a bare `except`, so the directory was never created.)
hpo_config_path = Path("../config", f"{config['model_name']}_hpo")
hpo_config_path.mkdir(parents=True, exist_ok=True)

In [19]:
# Resolve which HPO config file (config_<N>.yaml) this run uses, creating one when needed.
# After this cell:
#   - hpo_config_version : the N of the selected file (used in the study name)
#   - config_file_path   : path of the selected file
#   - hpo_config         : content of that file (the search space actually used)
list_config_files = list(hpo_config_path.glob("*.yaml"))
n_config_files = len(list_config_files)

if n_config_files == 0:
    # First run: store the notebook config as version 1.
    hpo_config_version = 1
    config_name = "config_1.yaml"
    print(f"no config file in {hpo_config_path}, creating {hpo_config_path}/{config_name}.")
    config_file_path = Path(hpo_config_path, config_name)
    with open(config_file_path, "w") as file:
        yaml.dump(hpo_notebook_config, file)

elif config["hpo_config_version"]:
    # An explicit version was requested in the project config: use that file as is.
    hpo_config_version = config["hpo_config_version"]
    config_name = f"config_{config['hpo_config_version']}.yaml"
    print(f"config version {config['hpo_config_version']} given, using {hpo_config_path}/{config_name}.")
    config_file_path = Path(hpo_config_path, config_name)
    if not config_file_path.exists():
        raise FileNotFoundError(
            f"config version {hpo_config_version} requested but {config_file_path} does not exist."
        )

else:
    print(f"""{hpo_config_path} not empty but no config version given, looping over existing files to check if the content matches 
          with the config set in the notebook. If no match, creating a new file.\n""")
    # Take the version from the file name rather than from the position in the listing,
    # so gaps in the numbering cannot map a file to the wrong version or make a
    # "new" file overwrite an existing one.
    versioned_files = {}
    for candidate in list_config_files:
        suffix = candidate.stem.rpartition("_")[2]
        if suffix.isdigit():
            versioned_files[int(suffix)] = candidate

    for hpo_config_version, config_file_path in sorted(versioned_files.items()):
        with open(config_file_path, "r") as file:
            stored_config = yaml.safe_load(file)
        if stored_config == hpo_notebook_config:
            print(f"The content of {config_file_path} matches.")
            break
    else:
        # No stored config matches: save the notebook config under the next free version.
        hpo_config_version = max(versioned_files, default=0) + 1
        config_file_path = Path(hpo_config_path, f"config_{hpo_config_version}.yaml")
        print(f"No match found, creating {config_file_path}.")
        with open(config_file_path, "w") as file:
            yaml.dump(hpo_notebook_config, file)


with open(config_file_path, "r") as file:
    hpo_config = yaml.safe_load(file)

# The objective reads `hpo_config` (the file), not `hpo_notebook_config`; make a mismatch visible.
if hpo_config != hpo_notebook_config:
    print(f"WARNING: {config_file_path} differs from the config set in the notebook; the file content is used.")

config version 1 given, using ../config/lightgbm_hpo/config_1.yaml.


In [20]:
# Display the HPO configuration loaded from the selected YAML file.
from pprint import pprint
pprint(hpo_config)

{'float_params': [{'high': 0.1,
                   'log': True,
                   'low': 0.001,
                   'name': 'learning_rate'},
                  {'high': 0.75, 'low': 0.1, 'name': 'feature_fraction'}],
 'int_params': [{'high': 10, 'log': False, 'low': 3, 'name': 'num_trees'},
                {'high': 10, 'low': 3, 'name': 'max_depth'},
                {'high': 25, 'low': 5, 'name': 'num_leaves'}],
 'objective_values': 'mean'}


## Objective function

In [21]:
def objective(trial):
    """Optuna objective: mean cross-validated error of a LightGBM regressor.

    Hyperparameters are sampled from the `int_params` and `float_params` entries of
    `hpo_config`. The model is evaluated on `X_train` / `y_train` with a
    `TimeSeriesSplit` of `config["n_splits"]` folds, scored with `config["scoring"]`.

    The scores are negated before use, which assumes `config["scoring"]` is a
    negated-error scorer (e.g. a "neg_*" metric) — TODO confirm in the config file.

    Side effects: stores each per-split error (`error_split_<k>`, 1-based) and their
    standard deviation (`cv_errors_std`) as user attributes on the trial.

    Returns the mean error over the splits.
    """
    # Fixed settings first, then the sampled ones (integers before floats).
    params = {
        "verbosity": -1,
        "random_state": config["random_state"],
        "device": device,
    }
    params.update(
        {spec["name"]: trial.suggest_int(**spec) for spec in hpo_config["int_params"]}
    )
    params.update(
        {spec["name"]: trial.suggest_float(**spec) for spec in hpo_config["float_params"]}
    )

    # Cross-validate with time-ordered splits.
    splitter = TimeSeriesSplit(n_splits=config["n_splits"])
    scores = cross_val_score(
        LGBMRegressor(**params),
        X_train,
        y_train,
        scoring=config["scoring"],
        cv=splitter,
    )

    # Flip the sign of the scores to get errors, then log them on the trial.
    errors = -scores
    for split_number, split_error in enumerate(errors, start=1):
        trial.set_user_attr(f"error_split_{split_number}", split_error)
    trial.set_user_attr("cv_errors_std", errors.std())

    return errors.mean()

## Create and run study

In [22]:
# Add stream handler of stdout to show the messages
# Handlers already attached are cleared first, presumably so that re-running this cell
# does not print every Optuna message several times.
logger = optuna.logging.get_logger("optuna")
if logger.hasHandlers():
    logger.handlers.clear()
logger.addHandler(logging.StreamHandler(sys.stdout))

In [23]:
#TODO: setup data version control then replace 'datav1' with appropriate data info
# The study name encodes the data version, the model and the HPO config version;
# the SQLite file backing the study is named after it.
study_name = f"datav1_{config['model_name']}_config{hpo_config_version}"
study_path = f"../optuna_studies/{study_name}.db"
storage_path = f"sqlite:///{study_path}"
print("Study path :", study_path)

Study path : ../optuna_studies/datav1_lightgbm_config1.db


In [24]:
# Set sampler
# For a list of available samplers : https://optuna.readthedocs.io/en/stable/reference/samplers/index.html
sampler_name = f"{study_name}_sampler.pkl"
studies_dir = Path("../optuna_studies")
# The SQLite storage of the study lives in this directory too; create it so the
# notebook also runs when nothing has been downloaded yet.
studies_dir.mkdir(parents=True, exist_ok=True)
sampler_file = studies_dir / sampler_name
if sampler_file.is_file():
    sampler_loaded = True
    print("loading sampler")
    # NOTE: unpickling can execute arbitrary code — only load sampler files from a trusted source.
    with open(sampler_file, "rb") as file:  # context manager closes the file handle
        sampler = pickle.load(file)
else:
    sampler_loaded = False
    print("no sampler saved for the study, creating a new one")
    sampler = optuna.samplers.TPESampler(seed=0)

loading sampler


In [25]:
# Initialize the Optuna study
study = optuna.create_study(
    study_name=study_name,
    storage=storage_path,
    load_if_exists=True,
    directions=["minimize"],
    sampler=sampler,
    # pruner=pruner,
)
if not sampler_loaded:
    print("saving sampler")
    with open(f"../optuna_studies/{study_name}_sampler.pkl", "wb") as file:
        pickle.dump(study.sampler, file)

Using an existing study with name 'datav1_lightgbm_config1' instead of creating a new one.


In [26]:
# Number of trials stored in the study at this point.
len(study.trials)

21

In [27]:
%%time
study.optimize(objective, n_trials=config["n_trials"])

Trial 21 finished with value: 484.86594742465996 and parameters: {'num_trees': 8, 'max_depth': 7, 'num_leaves': 23, 'learning_rate': 0.023409232476941308, 'feature_fraction': 0.3320026334987302}. Best is trial 12 with value: 278.7178225690444.
Trial 22 finished with value: 279.0866950994272 and parameters: {'num_trees': 8, 'max_depth': 6, 'num_leaves': 23, 'learning_rate': 0.09871353738442702, 'feature_fraction': 0.33830491602492235}. Best is trial 12 with value: 278.7178225690444.
Trial 23 finished with value: 371.9548322058276 and parameters: {'num_trees': 9, 'max_depth': 5, 'num_leaves': 21, 'learning_rate': 0.051246801820596016, 'feature_fraction': 0.4363242922079621}. Best is trial 12 with value: 278.7178225690444.
Trial 24 finished with value: 293.30139163162306 and parameters: {'num_trees': 8, 'max_depth': 6, 'num_leaves': 24, 'learning_rate': 0.09090060486468222, 'feature_fraction': 0.22196970134533567}. Best is trial 12 with value: 278.7178225690444.
Trial 25 finished with val

# Visualize the optimization history

In [28]:
# Objective value of each trial in the study, drawn at a fixed 800x600 size.
fig = optuna.visualization.plot_optimization_history(study, target_name="value")
fig.update_layout(autosize=True, width=800, height=600)
fig.show()

## Experimental history

In [29]:
# Reload the study from storage and list its ten best trials (lowest objective value).
# `load_study` is used instead of `create_study(load_if_exists=True)` so that a wrong
# study name or storage path raises an error instead of silently creating an empty study.
study = optuna.load_study(study_name=study_name, storage=storage_path)
print("number of trials in the study :", len(study.trials))
trials_df = study.trials_dataframe().drop(
    columns=["datetime_start", "datetime_complete"]
)
print("best studies :")
trials_df.sort_values(by="value").head(10)

Using an existing study with name 'datav1_lightgbm_config1' instead of creating a new one.
number of trials in the study : 26
best studies :


Unnamed: 0,number,value,duration,params_feature_fraction,params_learning_rate,params_max_depth,params_num_leaves,params_num_trees,user_attrs_cv_errors_std,user_attrs_error_split_1,user_attrs_error_split_2,user_attrs_error_split_3,user_attrs_error_split_4,user_attrs_error_split_5,state
12,12,278.717823,0 days 00:00:03.798330,0.314475,0.098443,5,25,8,73.941599,405.627489,242.308044,212.41568,317.689122,215.548777,COMPLETE
22,22,279.086695,0 days 00:00:03.697283,0.338305,0.098714,6,23,8,73.187975,402.302628,242.146965,212.252616,322.185593,216.545673,COMPLETE
24,24,293.301392,0 days 00:00:03.439837,0.22197,0.090901,6,24,8,78.998159,428.727067,256.331912,224.060161,334.70991,222.677908,COMPLETE
2,2,306.117927,0 days 00:00:03.576550,0.349237,0.084591,6,23,8,81.086923,448.319878,264.874165,236.122785,343.429998,237.842807,COMPLETE
7,7,306.117927,0 days 00:00:03.416266,0.349237,0.084591,6,23,8,81.086923,448.319878,264.874165,236.122785,343.429998,237.842807,COMPLETE
8,8,321.404805,0 days 00:00:03.442026,0.146173,0.070989,7,16,9,85.739122,471.255182,282.280264,249.030162,360.524252,243.934167,COMPLETE
3,3,321.404805,0 days 00:00:03.534894,0.146173,0.070989,7,16,9,85.739122,471.255182,282.280264,249.030162,360.524252,243.934167,COMPLETE
15,15,367.754752,0 days 00:00:03.483267,0.267745,0.0938,4,25,5,101.512338,546.17618,319.425761,285.674737,412.538628,274.958456,COMPLETE
23,23,371.954832,0 days 00:00:03.528858,0.436324,0.051247,5,21,9,103.039047,553.038749,320.639569,289.764809,418.070889,278.260144,COMPLETE
25,25,402.229854,0 days 00:00:03.510400,0.223203,0.053908,5,25,7,111.796831,599.232183,345.514889,312.308199,451.585232,302.508768,COMPLETE
