
# Importações

In [8]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas
import cudf as pd
import random
import tensorflow as tf
import shap

from keras import Sequential
from keras.src.layers import Input, LSTM, Dense
from pyESN import ESN
from shap.plots import colors
from sklearn.ensemble import RandomForestRegressor as RandomForest
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from xgboost import XGBRegressor

SEED = 100


def reset_seed(rnd_seed=SEED):
    """Seed the Python, NumPy and TensorFlow random generators.

    The ``PYTHONHASHSEED`` environment variable is always set to the fixed
    string ``'0'``, independently of ``rnd_seed``.

    Args:
        rnd_seed: Seed value handed to each random generator.
    """
    os.environ.update(PYTHONHASHSEED='0')
    # Same seed for every library so that runs are comparable.
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(rnd_seed)


def rrmse(actual, predicted) -> float:
    """Return the relative RMSE: RMSE(actual, predicted) divided by mean(actual).

    The result is a ratio (not a percentage) and is not rounded. No guard is
    applied for a zero mean of ``actual``.
    """
    return root_mean_squared_error(actual, predicted) / np.mean(actual)


def smape(actual, predicted) -> float:
    """Return the symmetric MAPE, in percent, rounded to two decimals.

    Each term is ``|predicted - actual| / ((|predicted| + |actual|) / 2)``;
    the terms are averaged and multiplied by 100.

    Args:
        actual: Observed values (any array-like).
        predicted: Forecast values (any array-like). A list of one-element
            arrays, i.e. a column of shape (n, 1), is accepted.

    Returns:
        The sMAPE as a Python float rounded to 2 decimal places.
    """
    # Flatten both inputs: an (n, 1) column against an (n,) vector would
    # otherwise broadcast to an (n, n) matrix and silently distort the score.
    actual = np.asarray(actual, dtype=float).ravel()
    predicted = np.asarray(predicted, dtype=float).ravel()

    abs_error = np.abs(predicted - actual)
    scale = (np.abs(predicted) + np.abs(actual)) / 2

    # A term where both values are zero is a perfect forecast: count it as 0
    # instead of letting 0/0 turn the whole mean into NaN.
    terms = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)

    return round(float(np.mean(terms) * 100), 2)


# Seed every random generator once, using the default seed (SEED).
reset_seed()
# Pass the CUDA data directory to XLA through the XLA_FLAGS environment variable.
# NOTE(review): '/path/to/cuda' looks like a placeholder path - confirm before running.
os.environ['XLA_FLAGS'] = '--xla_gpu_cuda_data_dir=/path/to/cuda'

# Carregar Datasets

In [9]:
# Merged dataset, read with `pd` (bound to cudf in the imports above).
df_mesclado = pd.read_csv("./dados/dados_mesclados.csv", sep=';', decimal='.')
# Feature-selection results, read with plain pandas.
df_features = pandas.read_csv("./dados/fitness_features_agrupado.csv", sep=";", decimal=".")

# Keep only the row with the smallest "MAPE".
df_features = df_features.sort_values("MAPE").head(1).reset_index(drop=True)
# Turn the textual "FEATURES" value of that row into column names of an empty
# DataFrame by stripping parentheses and single quotes and splitting on ", ".
# NOTE(review): presumably "FEATURES" holds the str() of a tuple of names; a
# one-element tuple such as "('A',)" would leave a trailing comma - confirm.
df_features = pandas.DataFrame(
    columns=str(df_features.iloc[0]["FEATURES"]).replace("(", '').replace(")", '').replace("'", "").split(", "))



# Melhores Parâmetros

In [7]:
# Maps each model name to a dict holding its best parameter row.
best = {}
for modelo in ["ESN", "LSTM", "RF", "XGB"]:
    # Accumulates the rows of every optimizer/seed result file for this model.
    df_aux = pd.DataFrame()
    for optimizer in ["GA", "PSO"]:
        for seed in ["10000", "20000", "30000"]:
            new_df = pd.read_csv(f'parâmetros/{optimizer}-{modelo} ITERS SEED {seed}.csv', sep=";", decimal=".",
                                 header=0)
            df_aux = pd.concat([df_aux, new_df], axis=0)

    # Ascending sort: the first row has the lowest "Fitness".
    df_aux = df_aux.sort_values(by=["Fitness"])
    # Assign None wherever a value is null.
    df_aux[df_aux.isnull()] = None
    # Keep the first (lowest-"Fitness") row as a plain dict of parameters.
    best[f"{modelo}"] = df_aux[:1].iloc[0].to_dict()

# Criação dos Modelos

In [10]:
# ESN: echo state network configured from the best "ESN" parameter row.
# NOTE(review): n_inputs is the column count of df_mesclado[df_features];
# df_features is an empty pandas DataFrame at the point it is built above -
# confirm this indexing yields exactly the selected feature columns.
esn = ESN(n_inputs=df_mesclado[df_features].shape[1],
          n_outputs=1,
          n_reservoir=int(best["ESN"]["Reservoirs"]),
          sparsity=best["ESN"]["Sparsity"],
          spectral_radius=best["ESN"]["Spectral Radius"],
          noise=best["ESN"]["Noise"],
          random_state=SEED)

# LSTM: one LSTM layer followed by a single-unit Dense output, compiled with
# the 'mape' loss. The Keras session is cleared before the model is built.
tf.keras.backend.clear_session()
lstm = Sequential([
    Input((df_mesclado[df_features].shape[1], 1)),
    LSTM(best["LSTM"]["Units"],
         activation=best["LSTM"]["Activation"],
         use_bias=best["LSTM"]["Bias"],
         seed=SEED),
    Dense(1),
])
lstm.compile(loss='mape')

# RF: random forest regressor; the tuned values are cast to int.
rf = RandomForest(random_state=SEED,
                  n_estimators=int(best["RF"]["N_estimators"]),
                  max_depth=int(best["RF"]["Max_depth"]),
                  min_samples_split=int(best["RF"]["Min_samples_split"]),
                  min_samples_leaf=int(best["RF"]["Min_samples_leaf"]))

# XGB: the "coord_descent" updater is requested only for the "gblinear"
# booster; otherwise updater is left as None.
updater = "coord_descent" if best["XGB"]["Booster"] == "gblinear" else None
xgb = XGBRegressor(random_state=SEED,
                   n_estimators=int(best["XGB"]["N_estimators"]),
                   max_depth=int(best["XGB"]["Max_depth"]),
                   booster=best["XGB"]["Booster"],
                   reg_lambda=best["XGB"]["Lambda"],
                   reg_alpha=best["XGB"]["Alpha"],
                   updater=updater)


I0000 00:00:1739235042.913290    5203 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1739235043.184368    5203 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1739235043.184647    5203 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1739235043.193707    5203 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1739235043.193935    5203 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

# Treino e Teste
## Divisão dos Dados

In [None]:
# Build lag features per campus and hold out the last 12 rows of each campus.
dataframes_treino = []
dataframes_teste = []

for campus, dados in df_mesclado.groupby("CAMPUS"):
    dados["CAMPUS"] = campus
    # LAG_01 .. LAG_12: the target shifted by 1..12 rows within this campus.
    # NOTE(review): assumes the rows of each campus are already in
    # chronological order - confirm against the input file.
    for i in range(1, 12 + 1):
        lag = dados['CONSUMO'].shift(i)
        dados[f'LAG_{i:02d}'] = lag
    # Drops every row that has a missing value in any column (this includes
    # the first rows, whose lags are undefined).
    dados.dropna(inplace=True)

    # shuffle=False keeps row order, so the test set is the final 12 rows.
    treino, teste = train_test_split(dados, test_size=12, random_state=SEED, shuffle=False)

    dataframes_treino.append(treino)
    dataframes_teste.append(teste)

# Keep only the columns of df_mesclado that are also named in df_features.
df_features = df_mesclado.drop(df_mesclado.drop(df_features, axis=1).columns.to_list(), axis=1).columns
# A single multi-key sort orders by DATA and then CAMPUS deterministically;
# chaining two single-key sorts only does so when the second sort is stable.
df_treino = pd.concat(dataframes_treino, ignore_index=True).sort_values(["DATA", "CAMPUS"])
df_teste = pd.concat(dataframes_teste, ignore_index=True).sort_values(["DATA", "CAMPUS"])


## Regressão


In [26]:
def treino_kfold(modelo, dataframe, features):
    """Evaluate ``modelo`` with a 5-split time-series cross-validation.

    Rows are grouped by "DATA"; each split trains on the earlier date groups
    and tests on one later date group. The same model object is fitted again
    on every split.

    Args:
        modelo: One of the RandomForest / XGBRegressor / ESN models; any other
            object is fitted with the Keras-style call that uses the "LSTM"
            entry of ``best``.
        dataframe: Frame holding the "CAMPUS", "DATA" and "CONSUMO" columns
            plus the feature columns.
        features: Column labels used as model inputs.

    Returns:
        Tuple ``(modelo, cvs)`` where ``cvs`` is the list of MAPE values, one
        per split.
    """
    por_data = {data: dados for data, dados in dataframe.sort_values("CAMPUS", ascending=True).groupby('DATA')}
    datas = list(por_data.keys())
    modelo_arvore = isinstance(modelo, (RandomForest, XGBRegressor))

    cvs = []
    for idx_treino, idx_teste in TimeSeriesSplit(n_splits=5, test_size=1).split(por_data):
        # Translate positional split indices back into the date groups.
        dados_treino = pd.concat([por_data[datas[pos]] for pos in idx_treino], ignore_index=True)
        dados_teste = pd.concat([por_data[datas[pos]] for pos in idx_teste], ignore_index=True)

        x_treino = dados_treino[features].astype(np.float32).to_cupy().get()
        y_treino = dados_treino["CONSUMO"].astype(np.float32).to_cupy().get()
        x_teste = dados_teste[features].astype(np.float32).to_cupy().get()
        y_teste = dados_teste["CONSUMO"].to_cupy().get()

        if modelo_arvore or isinstance(modelo, ESN):
            modelo.fit(x_treino, y_treino)
        else:
            modelo.fit(x_treino, y_treino,
                       epochs=best["LSTM"]["Epochs"],
                       batch_size=best["LSTM"]["Batch Size"],
                       shuffle=False,
                       verbose=False)

        # Predictions are made one test row at a time.
        if modelo_arvore:
            y_previsto = [modelo.predict(linha.reshape(1, -1)) for linha in x_teste]
        else:
            y_previsto = [modelo.predict(linha.reshape(1, -1))[0] for linha in x_teste]

        cvs.append(mean_absolute_percentage_error(y_teste, y_previsto))

    return modelo, cvs


def teste_split(modelo, dataframe_treino, dataframe_teste, features, horizonte):
    """Fit ``modelo`` on the training frame and forecast ``horizonte`` steps recursively.

    For every forecast step, each ``LAG_xx`` feature present in ``features``
    is replaced by the consumption observed ``xx`` rows earlier in the running
    history (training targets followed by the predictions made so far), so
    later steps are fed with earlier predictions instead of true values.

    Args:
        modelo: One of the RandomForest / XGBRegressor / ESN models; any other
            object is fitted with the Keras-style call that uses the "LSTM"
            entry of ``best``.
        dataframe_treino: Training rows with "CAMPUS", "DATA", "CONSUMO" and
            the feature columns.
        dataframe_teste: Test rows with the same columns.
        features: Column labels used as model inputs.
        horizonte: Number of test rows to forecast, starting from the first.

    Returns:
        Tuple ``(medidas, preds, datasets)``: a Series with "RRMSE" and
        "sMAPE", the list of predictions as Python floats, and the training
        frame followed by the forecast rows (feature columns plus the
        predicted "CONSUMO").
    """
    # One multi-key sort gives a deterministic (DATA, CAMPUS) order; chaining
    # two single-key sorts only does so when the second sort is stable.
    dataframe_treino = dataframe_treino.sort_values(["DATA", "CAMPUS"])
    dataframe_teste = dataframe_teste.sort_values(["DATA", "CAMPUS"])

    x_treino = dataframe_treino[features].astype(np.float32).to_cupy().get()
    y_treino = dataframe_treino["CONSUMO"].astype(np.float32).to_cupy().get()
    x_teste = dataframe_teste[features].astype(np.float32)
    y_teste = dataframe_teste["CONSUMO"].astype(np.float32).to_cupy().get()

    if isinstance(modelo, (RandomForest, XGBRegressor, ESN)):
        modelo.fit(x_treino, y_treino)
    else:
        modelo.fit(x_treino, y_treino, shuffle=False, verbose=False, epochs=best["LSTM"]["Epochs"],
                   batch_size=best["LSTM"]["Batch Size"])

    # Lag columns that are actually model inputs, mapped to their lag distance.
    colunas_lag = {}
    for lag in range(1, 12 + 1):
        coluna = f'LAG_{lag:02d}'
        if coluna in x_teste.columns:
            colunas_lag[coluna] = lag

    # Running consumption history in row order; predictions are appended so
    # that the following steps build their lags from them.
    # NOTE(review): with several campi in one frame the history interleaves
    # campi, as the row-wise shift in the previous implementation did.
    historico = [float(valor) for valor in y_treino]

    preds = []
    linhas_previstas = []
    for i_test in range(horizonte):
        sx_test_aux = x_teste.iloc[[i_test]].copy()

        for coluna, lag in colunas_lag.items():
            # Scalar assignment avoids index alignment against a frame whose
            # index labels may repeat; NaN mirrors shift() on a short history.
            valor = historico[-lag] if lag <= len(historico) else np.nan
            sx_test_aux[coluna] = np.float32(valor)

        saida = modelo.predict(sx_test_aux.to_cupy().get().reshape(1, -1))
        # Models return arrays of different ranks; keep a plain scalar so the
        # metrics compare element by element.
        pred = float(np.ravel(saida)[0])

        # Store the forecast under the target column used by the data
        # ("CONSUMO"), so it is part of the returned frame.
        sx_test_aux["CONSUMO"] = pred
        historico.append(pred)
        preds.append(pred)
        linhas_previstas.append(sx_test_aux)

    datasets = pd.concat([dataframe_treino.copy()] + linhas_previstas, axis=0)

    # Compare only the rows that were actually forecast.
    y_avaliado = y_teste[:horizonte]
    medidas = pd.Series([rrmse(y_avaliado, preds), smape(y_avaliado, preds)], index=["RRMSE", "sMAPE"])

    return medidas, preds, datasets


### Treino com todos os campi

### Treino com campi individuais