## **IMPORTAÇÃO DE BIBLIOTECAS**

In [68]:
### BIBLIOTECAS UTILIZADAS NA ETAPA DE MACHINE LEARNING

import pandas as pd
import numpy as np
import glob

## PRÉ-PROCESSAMENTO
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error
from mlforecast import MLForecast
from mlforecast.lag_transforms import RollingMean
from numba import njit

## RANDOM FOREST
from sklearn.ensemble import RandomForestRegressor

## SARIMA
import statsmodels as sm
import pmdarima as pmd

## LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [69]:
### LOAD THE FINAL DATASETS
# Folder holding the CSV files that are combined into a single frame.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path_combs = r'C:\Users\Emanuel\Desktop\PUCMG\TCC\Dataframes'

# glob returns matches in arbitrary order; sorting makes the concatenated
# row order identical on every run and every machine.
files = sorted(glob.glob(path_combs + '/*.csv'))
if not files:
    # Fail with an explicit message instead of the opaque
    # "No objects to concatenate" error pd.concat raises on an empty list.
    raise FileNotFoundError(f'No CSV files found in {path_combs!r}')

many_dfs = [pd.read_csv(file, sep=',') for file in files]
# ignore_index=False keeps each file's own row index, so index labels may repeat across files.
df_fuels = pd.concat(many_dfs, ignore_index=False)

# Parse the 'data' column as datetimes and store 'tipo_comb' with pandas' string dtype.
df_fuels['data'] = pd.to_datetime(df_fuels['data'])
df_fuels['tipo_comb'] = df_fuels['tipo_comb'].astype('string')

## **PRÉ-PROCESSAMENTO DOS DADOS PARA MODELAGEM**

In [75]:
## Rename the date, target and series-identifier columns to the names MLForecast is given below.
df_fuels = df_fuels.rename(
    columns={
        'data': 'ds',              # date column
        'preco_medio': 'y',        # target variable
        'tipo_comb': 'unique_id',  # time-series identifier
    }
)
# Per-series frames can be taken by filtering on 'unique_id' (e.g. 'GASOLINA', 'ETANOL').

In [82]:
@njit
def difference(x, lag):
    """Return the lag-``lag`` difference of ``x``.

    The result is created with ``np.full_like(x, np.nan)``, so it has the
    same shape and dtype as ``x``. Every position ``i`` in
    ``range(lag, len(x))`` is set to ``x[i] - x[i - lag]``; positions the
    loop does not visit keep the fill value.
    """
    out = np.full_like(x, np.nan)
    n_obs = len(x)
    for pos in range(lag, n_obs):
        previous = x[pos - lag]
        out[pos] = x[pos] - previous
    return out


# Feature-engineering configuration: no models are attached (models=[]),
# the series frequency is daily, and features are built from lags,
# lag transforms and calendar attributes of the date column.
get_features = MLForecast(
    models=[],
    freq='D',
    lags=[1, 7],
    lag_transforms={
        # Transforms registered under lag 1: rolling means over 3, 7 and 30
        # periods, followed by `difference` with arguments 1 and 7.
        1: [
            *(RollingMean(window_size=window) for window in (3, 7, 30)),
            *((difference, step) for step in (1, 7)),
        ],
    },
    date_features=['dayofweek', 'month', 'year'],
    num_threads=2,
)

In [83]:
# Build the feature columns configured in `get_features`.
# NOTE(review): the result overwrites `df_fuels`, so re-running this cell
# passes the already-transformed frame back into `preprocess`; re-run from
# the data-loading cell instead.
df_fuels = get_features.preprocess(
    df_fuels,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
    static_features=[],  # no column is declared static
)

In [42]:
# Split the frame by series and drop the target column 'y' from each part.
X_gasolina = df_fuels.loc[df_fuels['unique_id'] == 'GASOLINA'].drop(columns=['y'])
X_etanol = df_fuels.loc[df_fuels['unique_id'] == 'ETANOL'].drop(columns=['y'])

In [53]:
# Display the frame as the cell output.
# NOTE(review): the stored output below shows none of the lag / rolling-mean /
# date feature columns, and this cell's execution count (53) is lower than the
# preprocessing cell's (83) — the output looks stale; confirm with
# Restart Kernel -> Run All.
df_fuels

Unnamed: 0,ds,unique_id,y,preco_std,preco_min,preco_max,num_postos,lat_medio,lon_medio,ultimo_dolar,variacao_dolar
0,2022-01-03,ETANOL,5.322093,0.282539,4.88,6.99,86,-19.505373,-44.322997,5.6818,2.00
1,2022-01-04,ETANOL,5.330513,0.232159,4.78,5.99,117,-19.894815,-44.500890,5.6770,-0.08
2,2022-01-05,ETANOL,5.159517,0.274655,4.67,5.79,145,-19.534614,-45.781852,5.7087,0.56
3,2022-01-06,ETANOL,5.242297,0.214000,4.85,5.89,74,-19.373599,-45.097493,5.6834,-0.44
4,2022-01-07,ETANOL,5.242297,0.214000,4.85,5.89,74,-19.373599,-45.097493,5.6834,-0.44
...,...,...,...,...,...,...,...,...,...,...,...
625,2024-05-27,GASOLINA,5.840876,0.235639,5.25,6.24,137,-19.832879,-44.560129,5.1708,0.09
626,2024-05-28,GASOLINA,5.811558,0.245491,5.18,6.29,199,-19.991100,-44.690489,5.1611,-0.19
627,2024-05-29,GASOLINA,5.834096,0.250862,5.25,6.49,83,-19.294516,-44.008790,5.2018,0.79
628,2024-05-30,GASOLINA,5.784462,0.220773,5.33,5.99,65,-20.084759,-45.083176,5.2034,0.03
