
# Importações

In [323]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas
import random
import cudf as pd
import tensorflow as tf
import shap

from keras import Sequential
from keras.src.layers import Input, LSTM, Dense
from pyESN import ESN
from shap.plots import colors
from sklearn.ensemble import RandomForestRegressor as RandomForest
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from xgboost import XGBRegressor

# Seed reused across the notebook: default for reset_seed() and random state of the models/splits.
SEED = 100


def reset_seed(rnd_seed=SEED):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Also sets the ``PYTHONHASHSEED`` environment variable to ``'0'``.
    NOTE(review): Python reads ``PYTHONHASHSEED`` at interpreter start-up, so
    setting it here presumably only affects child processes - confirm intent.

    Args:
        rnd_seed: Seed handed to each generator; defaults to ``SEED``.
    """
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(rnd_seed)
    os.environ['PYTHONHASHSEED'] = '0'


def rrmse(actual, predicted) -> float:
    """Relative root mean squared error.

    Computes the RMSE between ``actual`` and ``predicted`` and divides it by
    the mean of ``actual``.

    Args:
        actual: Observed values.
        predicted: Predicted values, one per observed value.

    Returns:
        The RMSE / mean(actual) ratio as a float (not a percentage). The
        result is not finite when the mean of ``actual`` is zero.
    """
    rmse = root_mean_squared_error(actual, predicted)
    return rmse / np.mean(actual)


def smape(actual, predicted) -> float:
    """Symmetric mean absolute percentage error, in percent, rounded to 2 decimals.

    Each term is ``|predicted - actual| / ((|predicted| + |actual|) / 2)``.

    Both inputs are flattened to 1-D before the element-wise comparison.
    Without this, a list of length-1 prediction arrays (shape ``(n, 1)``)
    compared against a flat target vector (shape ``(n,)``) broadcasts to an
    ``(n, n)`` matrix and the metric is averaged over every pair of points.

    Args:
        actual: Observed values (array-like).
        predicted: Predicted values (array-like) with the same number of
            elements as ``actual``.

    Returns:
        The sMAPE as a percentage rounded to two decimals. Pairs where both
        values are zero contribute an error of 0 instead of NaN.
    """
    actual = np.asarray(actual, dtype=float).ravel()
    predicted = np.asarray(predicted, dtype=float).ravel()

    numerator = np.abs(predicted - actual)
    denominator = (np.abs(predicted) + np.abs(actual)) / 2

    # A zero denominator means prediction and target are both exactly zero,
    # i.e. a perfect prediction; leave the pre-filled 0 for those positions.
    ratio = np.divide(numerator, denominator,
                      out=np.zeros_like(numerator),
                      where=denominator != 0)

    return round(np.mean(ratio) * 100, 2)


# Seed the random number generators before any model is created.
reset_seed()
# NOTE(review): '/path/to/cuda' looks like a placeholder - point it at the real CUDA directory.
# NOTE(review): this is set after TensorFlow was imported; confirm XLA still picks it up.
os.environ['XLA_FLAGS'] = '--xla_gpu_cuda_data_dir=/path/to/cuda'

# Carregar Datasets

In [324]:
df_mesclado = pandas.read_csv("./dados/dados_mesclados.csv", sep=';', decimal='.')

# Pick the feature subset with the lowest MAPE and keep it as the columns of an
# empty frame. The FEATURES cell is handled as text: parentheses and single
# quotes are stripped, then the remainder is split on ", ".
df_features = pandas.read_csv("./dados/fitness_features_agrupado.csv", sep=";", decimal=".")
melhor_subconjunto = str(df_features.sort_values("MAPE").iloc[0]["FEATURES"])
for caractere in ("(", ")", "'"):
    melhor_subconjunto = melhor_subconjunto.replace(caractere, "")
df_features = pandas.DataFrame(columns=melhor_subconjunto.split(", "))

df_mesclado

Unnamed: 0,CONSUMO,DATA,TEMP_MIN_MÉD_MENS,TEMP_MÉD_MIN_MENS,TEMP_MÉD_MÉD_MENS,TEMP_MÉD_MAX_MENS,TEMP_MÉD_ACC_MENS,TEMP_MAX_MÉD_MENS,PRECIPITAÇÃO_MÉD_MENS,TEMP_MIN_MIN_MENS,...,CURSOS_GRAD_VESPERTINO,CURSOS_GRAD_NOTURNO,CURSOS_POS,FÉRIAS,FERIADO,COVID,GREVE,CAMPUS,REGIÃO,ORDEM
0,0.0,2021-11-30,14.0,21.0,25.0,29.0,764.0,35.0,2.0,14.0,...,0.0,0.0,0.0,0,0,0,0,ARAPONGAS,REGIÃO NORTE,1
1,0.0,2021-12-31,16.0,22.0,26.0,30.0,802.0,38.0,2.0,16.0,...,0.0,0.0,0.0,0,0,0,0,ARAPONGAS,REGIÃO NORTE,2
2,0.0,2022-01-31,18.0,22.0,26.0,30.0,795.0,36.0,7.0,18.0,...,0.0,0.0,0.0,31,0,0,0,ARAPONGAS,REGIÃO NORTE,3
3,0.0,2022-02-28,18.0,24.0,27.0,31.0,750.0,37.0,3.0,18.0,...,0.0,0.0,0.0,0,1,0,0,ARAPONGAS,REGIÃO NORTE,4
4,0.0,2022-03-31,18.0,22.0,26.0,29.0,809.0,36.0,15.0,18.0,...,0.0,0.0,0.0,0,1,0,0,ARAPONGAS,REGIÃO NORTE,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2532,5645.0,2024-06-30,-0.0,8.0,17.0,20.0,499.0,26.0,1.0,-0.0,...,1.0,1.0,0.0,0,0,0,30,UNIÃO DA VITÓRIA,REGIÃO SUL,81
2533,11690.0,2024-07-31,1.0,9.0,13.0,18.0,413.0,25.0,6.0,1.0,...,1.0,1.0,0.0,12,0,0,0,UNIÃO DA VITÓRIA,REGIÃO SUL,82
2534,11127.0,2024-08-31,-3.0,7.0,15.0,21.0,468.0,31.0,1.0,-3.0,...,1.0,1.0,0.0,0,0,0,0,UNIÃO DA VITÓRIA,REGIÃO SUL,83
2535,9690.0,2024-09-30,10.0,13.0,20.0,24.0,591.0,34.0,3.0,10.0,...,1.0,1.0,0.0,0,0,0,0,UNIÃO DA VITÓRIA,REGIÃO SUL,84


# Melhores Parâmetros

In [325]:
# For every model, gather the optimisation results of all optimizer/seed runs
# and keep the row with the lowest "Fitness" as a plain dict in `best`.
best = {}
for modelo in ["ESN", "LSTM", "RF", "XGB"]:
    # Read every result file first and concatenate once: growing a frame with
    # concat inside the loop is quadratic and starts from an empty DataFrame.
    resultados = [
        pandas.read_csv(f'parâmetros/{optimizer}-{modelo} ITERS SEED {seed}.csv', sep=";", decimal=".",
                        header=0)
        for optimizer in ["GA", "PSO"]
        for seed in ["10000", "20000", "30000"]
    ]
    df_aux = pandas.concat(resultados, axis=0, ignore_index=True).sort_values(by=["Fitness"])
    # Missing cells are overwritten with None (numeric columns still show NaN).
    df_aux[df_aux.isnull()] = None
    best[modelo] = df_aux.iloc[0].to_dict()
best

{'ESN': {'Unnamed: 0': 17.0,
  'Reservoirs': 112.0,
  'Sparsity': 0.3547389248897835,
  'Spectral Radius': 1.5801417429771525,
  'Noise': 0.2967047759609361,
  'Fitness': 0.257027595065815},
 'LSTM': {'Unnamed: 0': 17,
  'Units': 141,
  'Epochs': 67,
  'Batch Size': 139,
  'Activation': 'linear',
  'Bias': False,
  'Fitness': 0.2272814080695359},
 'RF': {'Unnamed: 0': 15.0,
  'N_estimators': 27.0,
  'Max_depth': 266.0,
  'Min_samples_split': 8.0,
  'Min_samples_leaf': 2.0,
  'Fitness': 0.2381346468531902,
  'Base Seed': nan},
 'XGB': {'Unnamed: 0': 14,
  'N_estimators': 155,
  'Max_depth': 8,
  'Booster': 'dart',
  'Lambda': 10.180190699205,
  'Alpha': 8.58630569207298,
  'Fitness': 0.188930536787283,
  'Base Seed': 20000}}

# Criação dos Modelos

In [326]:
# Number of selected input features, shared by the ESN and the LSTM input layer.
n_features = df_features.columns.shape[0]

# ESN
esn_params = best["ESN"]
esn = ESN(
    n_inputs=n_features,
    n_outputs=1,
    n_reservoir=int(esn_params["Reservoirs"]),
    sparsity=esn_params["Sparsity"],
    spectral_radius=esn_params["Spectral Radius"],
    noise=esn_params["Noise"],
    random_state=SEED,
)

# LSTM
lstm_params = best["LSTM"]
tf.keras.backend.clear_session()
lstm = Sequential([
    Input((n_features, 1)),
    LSTM(
        lstm_params["Units"],
        activation=lstm_params["Activation"],
        use_bias=lstm_params["Bias"],
        seed=SEED,
    ),
    Dense(1),
])
lstm.compile(loss='mape')

# RF
rf_params = best["RF"]
rf = RandomForest(
    random_state=SEED,
    n_estimators=int(rf_params["N_estimators"]),
    max_depth=int(rf_params["Max_depth"]),
    min_samples_split=int(rf_params["Min_samples_split"]),
    min_samples_leaf=int(rf_params["Min_samples_leaf"]),
)

# XGB
xgb_params = best["XGB"]
# The coordinate-descent updater is only requested for the linear booster.
updater = None
if xgb_params["Booster"] == "gblinear":
    updater = "coord_descent"
xgb = XGBRegressor(
    random_state=SEED,
    n_estimators=int(xgb_params["N_estimators"]),
    max_depth=int(xgb_params["Max_depth"]),
    booster=xgb_params["Booster"],
    reg_lambda=xgb_params["Lambda"],
    reg_alpha=xgb_params["Alpha"],
    updater=updater,
)


# Treino e Teste
## Divisão dos Dados

In [327]:
# Per campus: add 12 lagged copies of CONSUMO (LAG_01 .. LAG_12), drop rows with
# missing values and hold out the last 12 rows as the test set.
# Assumes the rows of each campus are already in chronological order - TODO confirm.
dataframes_treino = []
dataframes_teste = []

for campus, dados in df_mesclado.groupby("CAMPUS"):
    # Work on a copy so the lag columns are never written into a slice of df_mesclado.
    dados = dados.copy()
    for i in range(1, 12 + 1):
        dados[f'LAG_{i:02d}'] = dados['CONSUMO'].shift(i)
    dados = dados.dropna()

    # shuffle=False keeps row order: the final 12 rows become the test set.
    treino, teste = train_test_split(dados, test_size=12, random_state=SEED, shuffle=False)

    dataframes_treino.append(treino)
    dataframes_teste.append(teste)

# A single multi-key sort orders by DATA and breaks ties by CAMPUS. Chaining two
# single-key sorts only does that with a stable algorithm, which is not the default.
df_treino = pd.DataFrame(pandas.concat(dataframes_treino, ignore_index=True).sort_values(["DATA", "CAMPUS"]))
df_teste = pd.DataFrame(pandas.concat(dataframes_teste, ignore_index=True).sort_values(["DATA", "CAMPUS"]))

# Selected features, in the column order of df_treino. A selected feature that
# is absent from the data is an error.
colunas_ausentes = [coluna for coluna in df_features.columns if coluna not in df_treino.columns]
if colunas_ausentes:
    raise KeyError(f"{colunas_ausentes} not found in df_treino columns")
features = df_treino.columns[df_treino.columns.isin(df_features.columns)]
features

Index(['TEMP_MÉD_MÉD_MENS', 'TEMP_MÉD_MAX_MENS', 'TEMP_MÉD_ACC_MENS',
       'TEMP_MIN_MIN_MENS', 'TEMP_MIN_MAX_MENS', 'TEMP_MIN_ACC_MENS',
       'TEMP_MAX_MAX_MENS', 'TEMP_MAX_ACC_MENS', 'PRECIPITAÇÃO_MAX_MENS',
       'DIA_DA_SEMANA_dom', 'DIA_DA_SEMANA_sex', 'MÊS_dez', 'MÊS_fev',
       'MÊS_mar', 'ANO_2021', 'ANO_2022', 'ANO_2023', 'ANO_2024', 'ANO_2015',
       'ANO_2016', 'ANO_2017', 'ANO_2018', 'ANO_2019', 'ANO_2020', 'ANO_2014',
       'CAMPUS_ARAPONGAS', 'CAMPUS_ASSIS CHATEAUBRIAND', 'CAMPUS_ASTORGA',
       'CAMPUS_BARRACÃO', 'CAMPUS_CAMPO LARGO', 'CAMPUS_CAPANEMA',
       'CAMPUS_CASCAVEL', 'CAMPUS_COLOMBO', 'CAMPUS_CORONEL VIVIDA',
       'CAMPUS_CURITIBA', 'CAMPUS_FOZ DO IGUAÇU', 'CAMPUS_GOIOERÊ',
       'CAMPUS_IRATI', 'CAMPUS_IVAIPORÃ', 'CAMPUS_JACAREZINHO',
       'CAMPUS_JAGUARIAÍVA', 'CAMPUS_LONDRINA - CENTRO',
       'CAMPUS_LONDRINA - NORTE', 'CAMPUS_PALMAS', 'CAMPUS_PARANAGUÁ',
       'CAMPUS_PARANAVAÍ', 'CAMPUS_PINHAIS', 'CAMPUS_PITANGA',
       'CAMPUS_QUEDAS DO

## Regressão


In [332]:
def treino_kfold_regr(modelo, dataframe, features):
    """Cross-validate ``modelo`` over time, one distinct ``DATA`` value per test fold.

    Rows are grouped by ``DATA``. ``TimeSeriesSplit(n_splits=5, test_size=1)``
    is applied to the groups, so each fold trains on the groups listed by the
    splitter and tests on a single later group. The model is refit on every
    fold and predictions are requested one row at a time.

    NOTE(review): the same ``modelo`` object is refit on every fold; whether a
    refit starts from scratch depends on the model class - confirm for Keras.

    Args:
        modelo: A RandomForest, XGBRegressor or ESN instance; any other object
            is fit with the Keras-style arguments taken from ``best["LSTM"]``.
        dataframe: Frame containing ``DATA``, ``CAMPUS``, ``CONSUMO`` and the
            feature columns (a cuDF frame, since ``to_cupy`` is used).
        features: Column labels used as model inputs.

    Returns:
        Tuple ``(modelo, cvs)`` where ``cvs`` holds one MAPE value per fold.
    """
    cvs = []
    subdf = {data: dados for data, dados in dataframe.sort_values("CAMPUS", ascending=True).groupby('DATA')}
    # Build the key list once instead of rebuilding it for every fold index.
    datas = list(subdf.keys())

    modelo_arvore = isinstance(modelo, (RandomForest, XGBRegressor))
    modelo_keras = not modelo_arvore and not isinstance(modelo, ESN)

    for i_treino, i_teste in TimeSeriesSplit(n_splits=5, test_size=1).split(datas):
        dados_treino = pd.concat([subdf[datas[index]] for index in i_treino], ignore_index=True)
        dados_teste = pd.concat([subdf[datas[index]] for index in i_teste], ignore_index=True)

        x_treino = dados_treino[features].astype(np.float32).to_cupy().get()
        y_treino = dados_treino["CONSUMO"].astype(np.float32).to_cupy().get()
        x_teste = dados_teste[features].astype(np.float32).to_cupy().get()
        y_teste = dados_teste["CONSUMO"].to_cupy().get()

        if modelo_keras:
            modelo.fit(x_treino, y_treino, shuffle=False, verbose=False, epochs=best["LSTM"]["Epochs"],
                       batch_size=best["LSTM"]["Batch Size"])
        else:
            modelo.fit(x_treino, y_treino)

        y_previsto = []
        for row in x_teste:
            previsao = modelo.predict(row.reshape(1, -1))
            # Tree models: keep the returned array; other models: keep its first element.
            y_previsto.append(previsao if modelo_arvore else previsao[0])

        cvs.append(mean_absolute_percentage_error(y_teste, y_previsto))

    return modelo, cvs


def teste_regr(modelo, dataframe_treino, dataframe_teste, features, horizonte):
    """Fit ``modelo`` on the training frame and forecast ``horizonte`` test rows one at a time.

    After each prediction the predicted CONSUMO is written back into the
    running frame, and the LAG_xx columns are recomputed from that frame's
    CONSUMO column before the next prediction.

    NOTE(review): lags are recomputed with a row-wise ``shift`` over the whole
    running frame. If ``dataframe_treino`` holds several campi, the shifted
    values come from neighbouring rows regardless of campus - confirm intended.
    NOTE(review): each element of ``preds`` is what ``predict`` returns (an
    array, or its first element), not a scalar - confirm ``rrmse``/``smape``
    handle that shape.

    Args:
        modelo: RandomForest, XGBRegressor or ESN instance; any other object is
            fit with the Keras-style arguments taken from ``best["LSTM"]``.
        dataframe_treino: Training frame with ``DATA``, ``CONSUMO`` and the
            feature columns.
        dataframe_teste: Test frame with the same columns.
        features: Column labels used as model inputs.
        horizonte: Number of leading test rows to forecast.

    Returns:
        Tuple ``(medidas, preds, dataframe_conjunto)``: a pandas Series with
        ``RRMSE`` and ``sMAPE``, the list of predictions, and the training
        frame extended with the forecast rows.
    """
    # Order both frames by date and use the date as index.
    dataframe_treino = dataframe_treino.sort_values("DATA")
    dataframe_teste = dataframe_teste.sort_values("DATA")

    dataframe_treino = dataframe_treino.set_index("DATA")
    dataframe_teste = dataframe_teste.set_index("DATA")

    # Host (NumPy) copies of the training data; test data is cut to the horizon.
    x_treino = dataframe_treino[features].astype(np.float32).to_cupy().get()
    y_treino = dataframe_treino["CONSUMO"].astype(np.float32).to_cupy().get()
    xy_teste = dataframe_teste[np.append(features, "CONSUMO")].astype(np.float32)[:horizonte]
    y_teste = dataframe_teste["CONSUMO"].astype(np.float32)[:horizonte].to_cupy().get()

    # Running frame: starts as the training data and grows by one row per step.
    dataframe_conjunto = dataframe_treino[np.append(features, "CONSUMO")].copy().astype(np.float32)

    preds = []

    # Only the last branch (neither tree model nor ESN) receives Keras fit arguments.
    if isinstance(modelo, RandomForest) or isinstance(modelo, XGBRegressor):
        modelo.fit(x_treino, y_treino)
    else:
        if isinstance(modelo, ESN):
            modelo.fit(x_treino, y_treino)
        else:
            modelo.fit(x_treino, y_treino, shuffle=False, verbose=False, epochs=best["LSTM"]["Epochs"],
                       batch_size=best["LSTM"]["Batch Size"])

    for i_test in range(horizonte):
        # Single-row frame for the step being forecast.
        row = xy_teste.iloc[[i_test]].copy()

        # Appended with its observed CONSUMO; replaced by the prediction below.
        dataframe_conjunto = pd.concat([dataframe_conjunto, row], axis=0)

        # Recompute every lag column present in the test data from the running CONSUMO.
        for lag in range(1, 12 + 1):
            if 'LAG_' + "{:02d}".format(lag) in xy_teste.columns:
                # presumably aligned on the DATA index - confirm behaviour with duplicate dates
                row[f'LAG_' + '{:02d}'.format(lag)] = dataframe_conjunto["CONSUMO"].shift(lag)
                dataframe_conjunto[f'LAG_' + '{:02d}'.format(lag)] = dataframe_conjunto["CONSUMO"].shift(lag)

        # Tree models keep the returned array; other models keep its first element.
        if isinstance(modelo, RandomForest) or isinstance(modelo, XGBRegressor):
            pred = modelo.predict(row[features].to_cupy().get().reshape(1, -1))
        else:
            pred = modelo.predict(row[features].to_cupy().get().reshape(1, -1))[0]

        # Feed the prediction back so later steps build their lags from it.
        row["CONSUMO"] = pred
        preds.append(pred)
        dataframe_conjunto.update(row)

    medidas = pandas.Series([rrmse(y_teste, preds), smape(y_teste, preds)], index=["RRMSE", "sMAPE"])

    return medidas, preds, dataframe_conjunto

### Treino com todos os campi

In [333]:
# Cross-validate the ESN on the training data and keep the per-fold MAPE values
# (previously the returned scores were discarded).
cvs_esn = treino_kfold_regr(esn, df_treino, features)[1]

# Number of leading test rows forecast for each campus.
HORIZONTE = 3

medidas = {}
previsoes = {}
testes = {}

# The model is trained on all campi and evaluated on each campus' test rows.
for campus, dados_teste in df_teste.groupby("CAMPUS"):
    medida, previsao, teste = teste_regr(esn, df_treino, dados_teste, features, HORIZONTE)
    medidas[campus] = medida
    previsoes[campus] = previsao
    testes[campus] = teste

testes["ARAPONGAS"]

            TEMP_MÉD_MÉD_MENS  TEMP_MÉD_MAX_MENS  TEMP_MÉD_ACC_MENS  \
DATA                                                                  
2023-11-30               26.0               32.0              772.0   

            TEMP_MIN_MIN_MENS  TEMP_MIN_MAX_MENS  TEMP_MIN_ACC_MENS  \
DATA                                                                  
2023-11-30               14.0               27.0              612.0   

            TEMP_MAX_MAX_MENS  TEMP_MAX_ACC_MENS  PRECIPITAÇÃO_MAX_MENS  \
DATA                                                                      
2023-11-30               38.0              933.0                   58.0   

            DIA_DA_SEMANA_dom  ...  LAG_02  LAG_03   LAG_05   LAG_06  LAG_07  \
DATA                           ...                                             
2023-11-30                4.0  ...  5076.0  5793.0  20136.0  12120.0  3927.0   

             LAG_08   LAG_09  LAG_10   LAG_11   LAG_12  
DATA                                            

Unnamed: 0_level_0,TEMP_MÉD_MÉD_MENS,TEMP_MÉD_MAX_MENS,TEMP_MÉD_ACC_MENS,TEMP_MIN_MIN_MENS,TEMP_MIN_MAX_MENS,TEMP_MIN_ACC_MENS,TEMP_MAX_MAX_MENS,TEMP_MAX_ACC_MENS,PRECIPITAÇÃO_MAX_MENS,DIA_DA_SEMANA_dom,...,LAG_03,LAG_05,LAG_06,LAG_07,LAG_08,LAG_09,LAG_10,LAG_11,LAG_12,CONSUMO
DATA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-31,26.0,30.0,813.0,19.0,25.0,681.0,36.0,944.0,52.0,4.0,...,,,,,,,,,,20538.000000
2015-02-28,25.0,29.0,702.0,19.0,24.0,599.0,35.0,804.0,78.0,4.0,...,,,,,,,,,,30308.000000
2015-02-28,26.0,28.0,730.0,20.0,24.0,607.0,33.0,853.0,56.0,4.0,...,,,,,,,,,,14467.000000
2015-03-31,24.0,26.0,738.0,18.0,23.0,637.0,30.0,839.0,62.0,5.0,...,20538.0,,,,,,,,,39407.000000
2015-03-31,25.0,27.0,786.0,18.0,23.0,648.0,33.0,924.0,11.0,5.0,...,30308.0,,,,,,,,,21303.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-31,22.0,26.0,685.0,13.0,22.0,531.0,33.0,840.0,116.0,5.0,...,20136.0,3927.0,10346.0,10213.0,6292.0,32063.0,23703.0,26103.0,11286.0,5076.000000
2023-10-31,19.0,23.0,590.0,10.0,19.0,456.0,30.0,724.0,110.0,5.0,...,6572.0,12120.0,3927.0,10346.0,10213.0,6292.0,32063.0,23703.0,26103.0,7539.000000
2023-11-30,26.0,32.0,772.0,14.0,27.0,612.0,38.0,933.0,58.0,4.0,...,5793.0,20136.0,12120.0,3927.0,10346.0,10213.0,6292.0,32063.0,23703.0,12934.928711
2023-12-31,27.0,30.0,838.0,18.0,24.0,664.0,36.0,1012.0,25.0,5.0,...,5076.0,6572.0,20136.0,12120.0,3927.0,10346.0,10213.0,6292.0,32063.0,7608.417480




### Treino com campi individuais