In [1]:
!pip install lightgbm optuna mlforecast

Collecting lightgbm
  Downloading lightgbm-4.5.0.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m88.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting mlforecast
  Downloading mlforecast-0.13.4-py3-none-any.whl.metadata (12 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting coreforecast>=0.0.11 (from mlforecast)
  Downloading coreforecast-0.0.12-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting utilsforecast>=0.1.9 (from mlforecast)
  Downlo

In [2]:
import numpy as np
import pandas as pd

from lightgbm import LGBMRegressor

from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_absolute_error

import logging
import optuna

# import mlflow

import boto3

import os
import sys
import yaml
import time

# Import custom modules
sys.path.append(os.path.realpath("../../modules"))
from utils import train_test_split

sys.path.append(os.path.realpath("../../preprocessing"))
from preprocessing import *

import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# Global variables

In [3]:
forecast_horizon = 24
n_lags = 48
model_name = "lightgbm"
feature_set_version = 2  # preprocessing version
study_path = "../optuna_studies"

# Import data from S3

In [4]:
def download_s3_folder(bucket_name, s3_folder, local_dir):
    # Initialize S3 client
    s3 = boto3.client("s3")

    # Ensure the local directory exists
    if not os.path.exists(local_dir):
        os.makedirs(local_dir)

    # List objects within the specified S3 folder
    paginator = s3.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=bucket_name, Prefix=s3_folder)

    for page in pages:
        if "Contents" in page:
            for obj in page["Contents"]:
                # Get the path of the object key
                s3_key = obj["Key"]
                if s3_key == s3_folder:
                    continue
                # Remove the prefix from the key to get the relative file path
                relative_path = os.path.relpath(s3_key, s3_folder)

                # Create the full local path
                local_file_path = os.path.join(local_dir, relative_path)

                # Create local directory if not exists
                local_file_dir = os.path.dirname(local_file_path)
                if not os.path.exists(local_file_dir):
                    os.makedirs(local_file_dir)

                # Download the file
                print(f"Downloading {s3_key} to {local_file_path}...")
                s3.download_file(bucket_name, s3_key, local_file_path)

In [5]:
# Retrieve the list of existing buckets
s3 = boto3.client("s3")
response = s3.list_buckets()

# Output the bucket names
print("Existing buckets:")
for bucket in response["Buckets"]:
    print(f'  {bucket["Name"]}')

Existing buckets:
  bs-ekratest
  data-boris
  elasticbeanstalk-eu-west-1-058264089030
  projet-cobalt
  projet-d2ian
  projet-tirecs
  raawn-project
  rag-m-data
  ragoud-pdf
  sagemaker-eu-west-1-058264089030
  sagemaker-studio-lbkxz4v2xkk
  sagemaker-studio-pdxyk96esz
  tirecs-melissarohart-tests


In [6]:
bucket_name = "projet-cobalt"
s3_folders = ["data/", "optuna_studies/"]  # These are the "folders" in the S3 bucket
local_dirs = ["../../data/", "../optuna_studies/"]  # Local directory to save files

for s3_folder, local_dir in zip(s3_folders, local_dirs):
    download_s3_folder(bucket_name, s3_folder, local_dir)

Downloading data/consumption.csv to ../../data/consumption.csv...
Downloading data/consumption_no_nan.csv to ../../data/consumption_no_nan.csv...
Downloading data/data_json.json to ../../data/data_json.json...
Downloading data/data_json_mlforecast.json to ../../data/data_json_mlforecast.json...
Downloading data/data_json_small.json to ../../data/data_json_small.json...
Downloading data/data_json_test.json to ../../data/data_json_test.json...
Downloading data/df_hat.csv to ../../data/df_hat.csv...
Downloading data/production.csv to ../../data/production.csv...


# Import Data

In [7]:
df = pd.read_csv(
    "../../data/consumption.csv", usecols=["prediction_unit_id", "datetime", "target"]
)[["prediction_unit_id", "datetime", "target"]].rename(
    columns={"prediction_unit_id": "unique_id", "datetime": "ds", "target": "y"}
)
df["ds"] = pd.to_datetime(df["ds"])
df.head()

Unnamed: 0,unique_id,ds,y
0,0,2021-09-01,96.59
1,1,2021-09-01,17.314
2,2,2021-09-01,656.859
3,3,2021-09-01,59.0
4,4,2021-09-01,501.76


In [8]:
df.shape

(1009176, 3)

# Train/Test split

In [9]:
# taking the last 60 days of each unit for test
df_train, df_test = train_test_split(df, test_window=24 * 60)

In [10]:
df.shape[0] == df_train.shape[0] + df_test.shape[0]

True

In [11]:
df.shape[1] == df_train.shape[1] == df_test.shape[1]

True

In [12]:
test_size = df_test.shape[0] / (df.shape[0])
print(f"test set : {round(test_size*100, 2)}% of the data set")

test set : 9.85% of the data set


# Preprocessing

In [13]:
preprocessing = vars()[f"preprocessing_{feature_set_version}"]

In [14]:
X_train, y_train = preprocessing(df_train)
print(X_train.shape, y_train.shape)
X_train.head()

(855591, 99) (855591,)


  df[feat_name] = feat_vals[restore_idxs]
  df[feat_name] = feat_vals[restore_idxs]


Unnamed: 0,lag24,lag25,lag26,lag27,lag28,lag29,lag30,lag31,lag32,lag33,...,rolling_mean_lag44_window_size24,expanding_mean_lag45,rolling_mean_lag45_window_size24,expanding_mean_lag46,rolling_mean_lag46_window_size24,expanding_mean_lag47,rolling_mean_lag47_window_size24,month,dayofweek,hour
4331,120.54,134.986,150.412,152.763,136.13,121.033,80.621,43.428,46.84,43.671,...,82.505417,82.612111,81.8605,81.756654,81.308,81.1456,80.502083,9,4,23
4392,107.129,120.54,134.986,150.412,152.763,136.13,121.033,80.621,43.428,46.84,...,83.241833,83.355714,82.505417,82.612111,81.8605,81.756654,81.308,9,5,0
4453,81.92,107.129,120.54,134.986,150.412,152.763,136.13,121.033,80.621,43.428,...,83.893958,84.131655,83.241833,83.355714,82.505417,82.612111,81.8605,9,5,1
4514,96.193,81.92,107.129,120.54,134.986,150.412,152.763,136.13,121.033,80.621,...,84.539375,84.841667,83.893958,84.131655,83.241833,83.355714,82.505417,9,5,2
4575,94.536,96.193,81.92,107.129,120.54,134.986,150.412,152.763,136.13,121.033,...,84.552333,85.716806,84.539375,84.841667,83.893958,84.131655,83.241833,9,5,3


In [15]:
X_test, y_test = preprocessing(df_test)
print(X_test.shape, y_test.shape)
X_test.head()

(86247, 99) (86247,)


  df[feat_name] = feat_vals[restore_idxs]
  df[feat_name] = feat_vals[restore_idxs]


Unnamed: 0,lag24,lag25,lag26,lag27,lag28,lag29,lag30,lag31,lag32,lag33,...,rolling_mean_lag44_window_size24,expanding_mean_lag45,rolling_mean_lag45_window_size24,expanding_mean_lag46,rolling_mean_lag46_window_size24,expanding_mean_lag47,rolling_mean_lag47_window_size24,month,dayofweek,hour
918702,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,246.753,193.661,160.944,...,722.741458,747.325111,723.395542,740.683269,723.535208,734.97888,725.296292,4,1,23
918768,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,246.753,193.661,...,723.69675,753.981429,722.741458,747.325111,723.395542,740.683269,723.535208,4,2,0
918833,999.628,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,246.753,...,724.803458,761.656966,723.69675,753.981429,722.741458,747.325111,723.395542,4,2,1
918898,1001.917,999.628,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,382.365,...,726.249667,769.130867,724.803458,761.656966,723.69675,753.981429,722.741458,4,2,2
918963,1014.902,1001.917,999.628,1055.621,1057.285,1107.518,1177.874,1114.973,860.687,624.847,...,731.313625,778.516258,726.249667,769.130867,724.803458,761.656966,723.69675,4,2,3


# HPO with Optuna

## Define objective function

In [16]:
# # Define an objective function to be minimized
# def objective(trial):
#   # Define hyperparameters
#   study_params = {
#       "verbosity": -1,
#       "random_state": 0,
#       "num_trees": trial.suggest_int("num_trees", 10, 500, log=False),
#       "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.1, log=True),
#       "max_depth": trial.suggest_int("max_depth", 10, 100),
#       "num_leaves": trial.suggest_int("num_leaves", 10, 100),
#       # "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 500, log=True
#       "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
#       "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 500, log=True),
#   }

#   # Train model with cross-validation
#   tscv = TimeSeriesSplit(n_splits=5)
#   model = LGBMRegressor(**study_params)
#   errors = cross_val_score(model, X_train, y_train, scoring="neg_mean_absolute_error", cv=tscv)
#   errors = -errors

#   return errors.mean()

In [17]:
# import inspect

# # Get the code of the study and save it
# source_code = inspect.getsource(objective)
# function_name = f"{model_name}-objective{objective_function_version}.py"
# with open(f"{study_path}/objective_functions/{function_name}", "w") as file:
#   file.write(source_code)

# with open(f"{study_path}/objective_functions/{function_name}", "rb") as file:
#   s3.upload_fileobj(file, bucket_name, f"optuna-studies/objective_functions/{function_name}")

In [18]:
experiment_config = {
    "objective_params": {
        "num_trees": {
            "min": 100,
            "max": 1000,
            "log": True,
        },
        "learning_rate": {
            "min": 0.0001,
            "max": 0.1,
            "log": True,
        },
        "max_depth": {
            "min": 3,
            "max": 8,
        },
        "num_leaves": {
            "min": 5,
            "max": 25,
        },
        "feature_fraction": {
            "min": 0.1,
            "max": 0.75,
        },
    },
    "objective_values": "mean",
}

In [19]:
# os.mkdir(f"../configuration_files/")
# os.mkdir(f"../configuration_files/lightgbm_experiments")

In [20]:
config_version = 1
config_files_path = f"../configuration_files/{model_name}_experiments"
list_config_files = os.listdir(config_files_path)
list_config_files = [file for file in list_config_files if file.split(".")[1] == "yaml"]
n_config_files = len(list_config_files)
if n_config_files == 0:
    with open(f"{config_files_path}/config1.yaml", "w") as file:
        yaml.dump(experiment_config, file)
else:
    for config_file in list_config_files:
        with open(f"{config_files_path}/{config_file}", "rb") as file:
            config = yaml.safe_load(file)
        if experiment_config == config:
            break
        config_version += 1
        if config_version > n_config_files:
            with open(f"{config_files_path}/config{config_version}.yaml", "w") as file:
                yaml.dump(experiment_config, file)

In [29]:
with open(f"{config_files_path}/config{config_version}.yaml", "rb") as file:
    config = yaml.safe_load(file)

In [39]:
# upload configuration files to S3
for file in os.listdir(config_files_path):
    s3.upload_file(
        f"{config_files_path}/{file}",
        bucket_name,
        "/".join(config_files_path.split("/")[1:]) + "/" + file,
    )

In [40]:
# list_a = [1, 2, 3]
# list_b = [1, 2, 3, 4, 5]

# # Check if all elements of list_a are in list_b
# result = all(elem in list_b for elem in list_a)
# print(result)

In [41]:
def objective(trial):
    # Define hyperparameters
    study_params = {
        "verbosity": -1,
        "random_state": 0,
    }
    objective_params = config["objective_params"]
    if "num_trees" in config["objective_params"].keys():
        study_params["num_trees"] = trial.suggest_int(
            name="num_trees",
            low=objective_params["num_trees"]["min"],
            high=objective_params["num_trees"]["max"],
            log=objective_params["num_trees"]["log"],
        )
    if "learning_rate" in config["objective_params"].keys():
        study_params["learning_rate"] = trial.suggest_float(
            name="learning_rate",
            low=objective_params["learning_rate"]["min"],
            high=objective_params["learning_rate"]["max"],
            log=objective_params["learning_rate"]["log"],
        )
    if "max_depth" in config["objective_params"].keys():
        study_params["max_depth"] = trial.suggest_int(
            name="max_depth",
            low=objective_params["max_depth"]["min"],
            high=objective_params["max_depth"]["max"],
        )
    if "num_leaves" in config["objective_params"].keys():
        study_params["num_leaves"] = trial.suggest_int(
            name="num_leaves",
            low=objective_params["num_leaves"]["min"],
            high=objective_params["num_leaves"]["max"],
        )
    if "feauture_fraction" in config["objective_params"].keys():
        study_params["feature_fraction"] = trial.suggest_float(
            name="feature_fraction",
            low=objective_params["feature_fraction"]["min"],
            high=objective_params["feature_fraction"]["max"],
        )
    if "min_data_in_leaf" in config["objective_params"].keys():
        study_params["min_data_in_leaf"] = trial.suggest_float(
            name="min_data_in_leaf",
            low=objective_params["min_data_in_leaf"]["min"],
            high=objective_params["min_data_in_leaf"]["max"],
            log=objective_params["min_data_in_leaf"]["log"],
        )

    # Train model with cross-validation
    tscv = TimeSeriesSplit(n_splits=5)
    model = LGBMRegressor(**study_params)
    cv_errors = cross_val_score(
        model, X_train, y_train, scoring="neg_mean_absolute_error", cv=tscv
    )

    # Log CV results
    cv_errors = -cv_errors
    for i in range(len(cv_errors)):
        trial.set_user_attr(f"error_split_{i+1}", cv_errors[i])
    # trial.set_user_attr("cv_errors", list(cv_errors))
    trial.set_user_attr("cv_errors_std", cv_errors.std())

    # Log train MAE
    model.fit(X_train, y_train)
    y_fit = model.predict(X_train)
    train_mae = mean_absolute_error(y_train, y_fit)
    trial.set_user_attr("train_mae", train_mae)

    # Log test MAE
    y_pred = model.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_pred)
    trial.set_user_attr("test_mae", test_mae)

    return cv_errors.mean()

## Create and run study

In [42]:
experiment_name = (
    f"{model_name}-preprocessing{feature_set_version}-config{config_version}"
)
storage_name = "sqlite:///{}/{}.db".format(study_path, experiment_name)
print(f"experiment name : {experiment_name}")

experiment name : lightgbm-preprocessing2-config2


In [43]:
# Add stream handler of stdout to show the messages
logger = optuna.logging.get_logger("optuna")
if logger.hasHandlers():
    logger.handlers.clear()
logger.addHandler(logging.StreamHandler(sys.stdout))

# restored_sampler = pickle.load(open("{}-study-sampler.pkl".format(study_name), "rb"))
sampler = optuna.samplers.TPESampler(
    seed=0
)  # RandomSampler, GridSampler, TPESampler, CmaEsSampler, NSGAIISampler, QMCSampler, GPSampler, BoTorchSampler, BruteForceSampler

In [44]:
# Initialize the Optuna study
study = optuna.create_study(
    study_name=experiment_name,
    storage=storage_name,
    load_if_exists=True,
    directions=["minimize"],
    # sampler=sampler,
    # pruner=pruner,
)

Using an existing study with name 'lightgbm-preprocessing2-config2' instead of creating a new one.


In [45]:
len(study.trials)

232

In [None]:
# Execute the hyperparameter optimization trials.
for i in range(1000):
    study.optimize(objective, n_trials=1)
    s3.upload_file(
        f"{study_path}/{experiment_name}.db",
        bucket_name,
        f"optuna-studies/{experiment_name}.db",
    )

Trial 232 finished with value: 62.10438272246294 and parameters: {'num_trees': 755, 'learning_rate': 0.08735690927417673, 'max_depth': 7, 'num_leaves': 11}. Best is trial 207 with value: 61.69501638989667.
Trial 233 finished with value: 62.54260283418894 and parameters: {'num_trees': 773, 'learning_rate': 0.07350959807959037, 'max_depth': 7, 'num_leaves': 12}. Best is trial 207 with value: 61.69501638989667.
Trial 234 finished with value: 62.12367501697656 and parameters: {'num_trees': 824, 'learning_rate': 0.08461857200043713, 'max_depth': 7, 'num_leaves': 11}. Best is trial 207 with value: 61.69501638989667.
Trial 235 finished with value: 61.86109566371069 and parameters: {'num_trees': 800, 'learning_rate': 0.08434767497954963, 'max_depth': 7, 'num_leaves': 11}. Best is trial 207 with value: 61.69501638989667.
Trial 236 finished with value: 62.42070212283306 and parameters: {'num_trees': 737, 'learning_rate': 0.0871994178987373, 'max_depth': 7, 'num_leaves': 11}. Best is trial 207 wi

In [30]:
print("best trial number :", study.best_trial.number)
print("best params :", study.best_params)
print("best error :", study.best_value)

best trial number : 400
best params : {'num_trees': 93, 'learning_rate': 0.09965941125937848, 'max_depth': 9, 'num_leaves': 35}
best error : 63.19364006608713


## Experimental history

In [31]:
study = optuna.create_study(
    study_name=experiment_name, storage=storage_name, load_if_exists=True
)
print("number of trials in the study :", len(study.trials))
trials_df = study.trials_dataframe().drop(
    columns=["datetime_start", "datetime_complete"]
)
trials_df.sort_values(by="value").head()

Using an existing study with name 'lightgbm-preprocessing2-config1' instead of creating a new one.
number of trials in the study : 691


Unnamed: 0,number,value,duration,params_learning_rate,params_max_depth,params_num_leaves,params_num_trees,user_attrs_cv_errors_std,user_attrs_error_split_1,user_attrs_error_split_2,user_attrs_error_split_3,user_attrs_error_split_4,user_attrs_error_split_5,user_attrs_test_mae,user_attrs_train_mae,state
400,400,63.19364,0 days 00:00:15.705318,0.099659,9,35,93,38.742354,53.388416,57.152069,35.108136,138.161619,32.157961,73.871434,55.813876,COMPLETE
301,301,63.213407,0 days 00:00:15.849273,0.072456,9,32,100,37.942814,53.909184,57.993,35.362319,136.415355,32.387178,74.998595,57.314667,COMPLETE
296,296,63.215329,0 days 00:00:16.442244,0.087321,9,49,96,39.351986,53.197649,56.591022,35.165786,139.474059,31.64813,73.422721,54.379721,COMPLETE
132,132,63.219725,0 days 00:00:15.500934,0.09971,9,35,95,38.808659,53.380351,57.049509,35.512525,138.334141,31.8221,73.83719,55.568589,COMPLETE
628,628,63.239345,0 days 00:00:16.001369,0.086211,9,36,100,38.632593,53.273483,57.820008,34.992164,137.894456,32.216613,73.6644,55.962821,COMPLETE
