# Sumário
1. Inicialização
    * Importando Bibliotecas
    * Funções
2. Pré-Processamento
    * Seleção dos arquivos nos quais os dados se encontram
    * Seleção do estado a ser analisado
    * Teste de estacionariedade de Dickey-Fuller
    * Diferenciação da série e teste de estacionariedade na série diferenciada
    * Autocorrelação Mensal
3. Modelagem
    * Pré-processamento e Divisão entre Treino e Teste
    * Algoritmo Sarimax
    * Seleção do modelo a ser executado baseado no estado em análise
    * Métricas de Validação do Modelo
4. Gráficos
    * Gráfico com comportamento da variável a ser predita
    * Gráfico com comportamento da variável a ser predita, da média móvel e do desvio padrão móvel
    * Gráficos de decomposição da série
    * Gráficos de diferenciação, autocorrelação (acf) e autocorrelação parcial (pacf)
    * Gráfico Real vs Predito
    * Gráfico com o comportamento do ano predito
    * Gráfico com o comportamento da série original e predita
5. Execução da função principal

# Inicialização
* Importando Bibliotecas
* Funções

## Importando Bibliotecas

In [1]:
## data
import pandas as pd
import numpy as np
import datetime as dt

## viz
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

## model
import statsmodels.api as sm
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima.arima import auto_arima
from statsmodels.tsa.arima_model import ARIMA

import warnings
warnings.filterwarnings("ignore")
import matplotlib.dates as mdates
import os

%matplotlib inline

# Funções

## Seleção dos arquivos nos quais os dados se encontram

In [2]:
def file_selection(uf, PATH):
    """Load the regional CSV that contains ``uf`` and return that state's series.

        Args:
            uf (string): The state under analysis (e.g. 'SP').
            PATH (string): Directory prefix where the regional .csv files are
                located. It is concatenated directly with the file name, so it
                must end with a path separator.

        Returns:
            The dataframe produced by ``uf_selection`` for the requested state.

        Raises:
            ValueError: If ``uf`` does not belong to any configured region.
    """
    # One entry per region: (states, CSV file name, column-name prefix to strip).
    # Put the file name of each region's database in the second field.
    regions = (
        (('SP', 'RJ', 'MG', 'ES'),
         'base_sudeste.csv',
         'ml_ca_energia_consumo_armarios_sudeste_atualizada.'),
        (('PR', 'SC', 'RS'),
         'base_sul.csv',
         'ml_ca_energia_consumo_armarios_sul_atualizada.'),
        (('DF', 'MT', 'MS', 'GO'),
         'base_centrooeste_V2.csv',
         'ml_ca_energia_consumo_armarios_centrooeste_atualizada.'),
        (('PB', 'PE', 'RN', 'CE', 'SE', 'BA', 'AL'),
         'base_nordeste_V2.csv',
         'ml_ca_energia_consumo_armarios_nordeste_atualizada.'),
    )

    for states, file_train, header in regions:
        if uf in states:
            break
    else:
        # Without this guard an unknown state surfaced as an UnboundLocalError
        # on `file_train`, which hid the real cause.
        raise ValueError(
            'Unknown state {!r}: it does not belong to any configured region'.format(uf)
        )

    df = pd.read_csv(PATH + file_train, sep='|')
    return uf_selection(uf, df, 'consumoacumuladodomes_soma', header)

## Seleção do estado a ser analisado

In [3]:
def uf_selection(uf, dataset, target, pattern):
    """Extract the monthly series of one state from a regional dataframe.

        Args:
            uf (string): The state under analysis.
            dataset (object/spreadsheet): The regional dataset. It is not modified.
            target (string): The target column that will be predicted
                (name after the prefix has been stripped).
            pattern (string): Literal prefix to remove from every column name.

        Returns:
            A new dataframe with the columns 'mes' (as datetime, first day of
            the month) and ``target``, restricted to the rows of ``uf``.
    """
    # Plain str.replace treats the prefix literally; the dots in it must not
    # act as regex wildcards. rename() also leaves the caller's frame untouched.
    renamed = dataset.rename(columns=lambda name: name.replace(pattern, ''))

    selected = renamed.loc[renamed['uf'] == uf, ['mes', target]].copy()

    # 'mes' + '01' is parsed as a date -- assumes 'mes' is stored as YYYYMM,
    # TODO confirm against the CSV export. Converting only the selected rows,
    # in one vectorised call, avoids a per-row to_datetime over the whole region.
    selected['mes'] = pd.to_datetime(selected['mes'].astype(str) + '01')

    return selected

## Teste de estacionariedade de Dickey-Fuller

In [4]:
def adf_test(dataset, output_file):
    """Run the Augmented Dickey-Fuller test and log a labelled summary.

        Args:
            dataset (array-like): The series under analysis.
            output_file (file-like): Open text handle that receives the report.

        Returns:
            None. The summary is only written to ``output_file``.
    """
    output_file.write('Resultado do Teste Dickey-Fuller:')

    adf_result = adfuller(dataset, autolag='AIC')

    # The first four entries of the result are labelled below; the fifth
    # entry maps each significance level to its critical value.
    labels = ['Teste', 'Valor p', '# de lags', '# de observações']
    summary = pd.Series(adf_result[0:4], index=labels)
    for level, critical_value in adf_result[4].items():
        summary['Valores Críticos ({})'.format(level)] = critical_value

    output_file.write(str(summary) + '\n')

## Diferenciação da série e teste de estacionariedade na série diferenciada

In [5]:
def serie_differentiation(dataset, target, output_file, PATH, uf):
    """Difference the series once, plot it and log its stationarity test.

        Args:
            dataset (object/spreadsheet): The dataset under analysis.
            target (string): The target under analysis.
            output_file (file-like): Open text handle that receives the log.
            PATH (string): Folder where the plot is saved.
            uf (string): The state under analysis (used in the file name).

        Returns:
            The first-order differences of ``dataset[target]`` as a numpy array.
            The plot is saved as ``serie_differentiation_<uf>.png`` in ``PATH``.
    """
    fig = plt.figure(figsize=(15, 5))
    df_diff = np.diff(dataset[target])
    plt.plot(df_diff)

    output_file.write('\n\nSérie Diferenciada: ' + '\n')
    # adf_test writes its own report to output_file and returns nothing, so its
    # result must not be stringified (that used to put "None" in the log).
    adf_test(df_diff, output_file)
    output_file.write('\n')

    plt.savefig(PATH + '/serie_differentiation_' + uf + '.png')
    plt.close(fig)

    return df_diff

## Autocorrelação Mensal

In [6]:
def autocorrelation_lag(dataset, target, output_file):
    """Log the autocorrelation of the target at lags of 1, 3, 6, 9 and 12 rows.

        Args:
            dataset (object/spreadsheet): The dataset under analysis.
            target (string): The target under analysis.
            output_file (file-like): Open text handle that receives the log.

        Returns:
            None. One line per lag is written to ``output_file``.
    """
    # (label written to the log, lag passed to Series.autocorr)
    labelled_lags = (
        ("Um mês", 1),
        ("Três meses", 3),
        ("Seis meses", 6),
        ("Nove meses", 9),
        ("Doze meses", 12),
    )

    output_file.write('\n\nAutocorrelação : ' + '\n')
    for label, lag in labelled_lags:
        coefficient = dataset[target].autocorr(lag=lag)
        output_file.write(label + ": " + str(coefficient) + '\n')

## Pré-processamento e Divisão entre Treino e Teste

In [7]:
def train_test_split_data(dataset, target, start, period, frequency, date_split, date_end, output_file):
    """Append the months to be forecast and split the series into train and test.

        Args:
            dataset (object/spreadsheet): The dataset under analysis, with a
                'mes' datetime column and the ``target`` column.
            target (string): The target under analysis.
            start (string): First date of the block filled with NaNs to be predicted.
            period (int): Number of periods filled with NaNs to be predicted.
            frequency (string): Pandas frequency alias for those periods, e.g. 'M'.
            date_split (datetime): First date of the test set; earlier rows are train.
            date_end (datetime): Last date (inclusive) of the test set.
            output_file (file-like): Open text handle that receives the log.

        Returns:
            A tuple ``(xtrain, xtest, interval_start, interval_end, df)``:
            train and test frames indexed by 'mes', the first and last row
            index of the test rows inside ``df``, and ``df`` itself (history
            plus the NaN rows, with a fresh 0..n-1 index).
    """
    future = pd.DataFrame(pd.date_range(start=start, periods=period, freq=frequency), columns=['mes'])
    # Snap every generated date to the first day of its month.
    future['mes'] = pd.to_datetime(future['mes'].dt.strftime('%Y-%m-01'))
    future[target] = np.nan

    df = pd.concat([dataset, future]).reset_index(drop=True)

    train_rows = df[df['mes'] < date_split]
    test_rows = df.loc[(df['mes'] >= date_split) & (df['mes'] <= date_end)]

    output_file.write('\n\nTrain interval: ' + str(train_rows['mes'].min().date()) + ' até ' + str(train_rows['mes'].max().date()))
    output_file.write('\nTest interval: ' + str(test_rows['mes'].min().date()) + ' até ' + str(test_rows['mes'].max().date()))

    interval_start = test_rows.index.min()
    interval_end = test_rows.index.max()

    # This second "Test interval" line logs the row-index bounds, not dates.
    output_file.write('\nTest interval: ' + str(interval_start) + ' até ' + str(interval_end))

    # set_index returns new frames; re-indexing the filtered slices in place
    # would modify objects derived from `df` (SettingWithCopy hazard).
    xtrain = train_rows.set_index('mes')
    xtest = test_rows.set_index('mes')

    return xtrain, xtest, interval_start, interval_end, df

## Algoritmo Sarimax

In [8]:
def sarimax_algorithm(train_dataset, test_dataset, target, interval_start, interval_end, order, seasonal_order, normalization):
    """Fit a SARIMAX model on the train data and predict the test interval.

        Args:
            train_dataset (object/spreadsheet): The time series used to fit the model.
            test_dataset (object/spreadsheet): The test dataset under analysis.
                It is not modified; the predictions are added to a copy.
            target (string): The name of the target column under analysis.
            interval_start (int): The start index of the test interval.
            interval_end (int): The end index of the test interval.
            order (tuple): The (p,d,q) order of the model.
            seasonal_order (tuple): The (P,D,Q,s) order of the seasonal component.
            normalization (boolean): True to min-max scale the target to [0, 1]
                before fitting and to scale the predictions back afterwards.

        Returns:
            A tuple ``(sarimax, predictions)``: the fitted model and a copy of
            ``test_dataset`` with the extra column 'pred'.
    """
    xtrain = train_dataset.copy()
    scaler = None

    if normalization:
        # The scaler is fitted on the training target only and reused below
        # to bring the predictions back to the original scale.
        scaler = MinMaxScaler(feature_range=(0, 1))
        xtrain[target] = scaler.fit_transform(xtrain[target].values.reshape(-1, 1))

    sarimax = sm.tsa.statespace.SARIMAX(xtrain, order=order, seasonal_order=seasonal_order).fit()

    # Work on a copy so the caller's test frame does not silently gain a column.
    predictions = test_dataset.copy()
    # NOTE(review): the assignment aligns on the index, so it presumably relies
    # on predict() returning values labelled with the same dates as the test
    # frame -- confirm.
    predictions['pred'] = sarimax.predict(start=interval_start, end=interval_end, dynamic=True)

    if normalization:
        predictions['pred'] = scaler.inverse_transform(predictions['pred'].values.reshape(-1, 1))

    return sarimax, predictions

## Seleção do modelo a ser executado baseado no estado em análise

In [9]:
def uf_models(uf, xtrain, xtest, target, interval_start, interval_end):
    """Pick the SARIMAX configuration of a state and run the model with it.

        Args:
            uf (string): The state under analysis.
            xtrain (object/spreadsheet): The time series used to fit the model.
            xtest (object/spreadsheet): The test dataset under analysis.
            target (string): The name of the target column under analysis.
            interval_start (int): The start index of the test interval.
            interval_end (int): The end index of the test interval.

        Returns:
            The trained model and the test dataset with the predictions.
    """
    # State -> [order (p, d, q), seasonal order (P, D, Q, s)].
    uf_dict = {
        'SP': [(1, 1, 2), (0, 1, 1, 12)],
        'BA': [(0, 1, 1), (1, 1, 0, 12)],
        'PE': [(0, 1, 1), (0, 1, 0, 12)],
        'RN': [(0, 1, 1), (2, 1, 0, 12)],
        'SE': [(0, 1, 2), (1, 1, 0, 12)],
        'PR': [(0, 1, 0), (0, 1, 2, 12)],
        'DF': [(2, 1, 0), (0, 0, 0, 12)],
        'PB': [(0, 1, 0), (0, 1, 0, 12)],
    }
    # Groups of states that share one configuration.
    uf_dict.update({
        state: [(1, 1, 1), (1, 1, 1, 12)]
        for state in ('RS', 'SC', 'MG', 'RJ', 'MT', 'GO', 'MS', 'CE')
    })
    uf_dict.update({
        state: [(0, 1, 1), (0, 1, 1, 12)]
        for state in ('ES', 'AL')
    })

    # These states are modelled on the raw scale; all others are normalized.
    normalization = uf not in ('PR', 'PB', 'DF')

    order, seasonal_order = uf_dict[uf]
    sarimax, df_test = sarimax_algorithm(
        xtrain, xtest, target, interval_start, interval_end,
        order=order, seasonal_order=seasonal_order, normalization=normalization,
    )

    return sarimax, df_test

## Métricas de Validação do Modelo

In [10]:
def validation_metrics(y_true, y_pred, results):
    """Compute MAPE, RMSE and MAE, log them and return the formatted summary.

        Args:
            y_true (array): The observed values.
            y_pred (array): The predicted values, aligned with ``y_true``.
            results (file-like): Open text handle that receives the summary.

        Returns:
            The string 'MAPE: x, RMSE: y, MAE: z' with two decimals each.
    """
    # MAPE is expressed in percent; it is undefined where y_true is zero.
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = np.mean(np.abs(y_pred - y_true))

    summary = 'MAPE: {:.2f}, RMSE: {:.2f}, MAE: {:.2f}'.format(mape, rmse, mae)
    results.write(summary + '\n\n')

    return summary

## Gráfico com comportamento da variável a ser predita

In [11]:
def plot_target_behavior(dataset, target, PATH, uf):
    """Save a line plot of the target variable over time.

        Args:
            dataset (object/spreadsheet): The dataset under analysis.
            target (string): The target under analysis.
            PATH (string): Folder where the plot is saved.
            uf (string): The state under analysis (used in the file name).

        Returns:
            None. The plot is saved as ``Consumo_Total_<uf>.png`` in ``PATH``.
    """
    destination = PATH + '/Consumo_Total_' + uf + '.png'
    # Months are shown as 'YYYYMM' labels on the x axis.
    month_labels = dataset['mes'].dt.strftime('%Y%m')

    fig = plt.figure(figsize=(15, 8))
    sns.lineplot(data=dataset, x=month_labels, y=target)
    plt.xticks(rotation=90)
    plt.ylabel('Consumo Total')
    plt.savefig(destination)
    plt.close(fig)

## Gráfico com comportamento da variável a ser predita, da média móvel e do desvio padrão móvel

In [12]:
def plot_moving_average(dataset, target, window, PATH, uf):
    """Save a plot of the target with its moving average and moving std.

        Args:
            dataset (object/spreadsheet): The dataset under analysis.
            target (string): The target under analysis.
            window (int): The moving window size.
            PATH (string): Folder where the plot is saved.
            uf (string): The state under analysis (used in the file name).

        Returns:
            None. The plot is saved as ``Media_Movel_<uf>.png`` in ``PATH``.
    """
    fig = plt.figure(figsize=(20, 6))

    dates = dataset['mes']
    rolling = dataset[target].rolling(window)

    # (values, legend label, line colour), drawn in this order.
    curves = (
        (dataset[target], 'Real', '#3CADF2'),
        (rolling.mean(), 'Média Móvel de ' + str(window), 'red'),
        (rolling.std(), 'Std de ' + str(window), 'green'),
    )
    for values, label, color in curves:
        plt.plot(dates, values, label=label, color=color)

    plt.legend()
    # One major tick every two months, with slanted date labels.
    plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=2))
    plt.gcf().autofmt_xdate()

    destination = PATH + '/Media_Movel_' + uf + '.png'
    plt.savefig(destination)

    plt.close(fig)

## Gráficos de decomposição da série

In [13]:
def serie_decompose(dataset, target, PATH, uf):
    """Decompose the series into trend, seasonality and residuals and plot it.

        Args:
            dataset (object/spreadsheet): The dataset under analysis.
            target (string): The target under analysis.
            PATH (string): Folder where the plot is saved.
            uf (string): The state under analysis (used in the file name).

        Returns:
            The decomposition result. The plot is saved as
            ``Serie_Decompose_<uf>.png`` in ``PATH``.
    """
    decompose = dataset.set_index("mes")

    # CE is decomposed additively, every other state multiplicatively.
    # The model name used to be misspelled "aditive"; spell it as documented
    # by statsmodels instead of relying on lenient parsing.
    model = "additive" if uf == 'CE' else "multiplicative"
    decompose_data = seasonal_decompose(decompose[target], model=model)

    fig = decompose_data.plot()
    fig.set_size_inches((12, 12))

    fig.savefig(PATH + '/Serie_Decompose_' + uf + '.png')
    plt.close(fig)

    return decompose_data

## Gráficos de diferenciação, autocorrelação (acf) e autocorrelação parcial (pacf) 

In [14]:
def plot_diff_acf_pacf(dataset, target, PATH, uf):
    """Save a 3x3 grid with the series, its ACF and its PACF per differencing order.

        Args:
            dataset (object/spreadsheet): The dataset under analysis.
            target (string): The name of the target column under analysis.
            PATH (string): Folder where the plot is saved.
            uf (string): The state under analysis (used in the file name).

        Returns:
            None. The plot is saved as ``Serie_ACF_PACF_<uf>.png`` in ``PATH``.
    """
    fig, axes = plt.subplots(3, 3, figsize=(15, 15))

    original = dataset[target]
    first_diff = original.diff()
    second_diff = first_diff.diff()

    # One grid row per differencing order:
    # (series drawn in column 0, series given to ACF/PACF, title of column 0).
    # The differenced series lose their leading NaNs before ACF/PACF.
    rows = (
        (original, original, 'Original Series'),
        (first_diff, first_diff.dropna(), '1st Order Differencing'),
        (second_diff, second_diff.dropna(), '2st Order Differencing'),
    )

    for row, (plotted, analysed, title) in enumerate(rows):
        axes[row, 0].plot(plotted)
        axes[row, 0].set_title(title)
        sm.graphics.tsa.plot_acf(analysed, lags=40, ax=axes[row, 1])
        sm.graphics.tsa.plot_pacf(analysed, lags=20, ax=axes[row, 2])

    plt.savefig(PATH + '/Serie_ACF_PACF_' + uf + '.png')
    plt.close(fig)

## Gráfico Real vs Predito

In [15]:
def plot_predict_x_real(test_dataset, target, prediction, PATH, uf):
    """Save a real vs predicted plot for the rows that have a real value.

        Args:
            test_dataset (object/spreadsheet): The test dataset under analysis.
            target (string): The name of the target column under analysis.
            prediction (string): The name of the column with predicted values.
            PATH (string): Folder where the plot is saved.
            uf (string): The state under analysis (used in the file name).

        Returns:
            None. The plot is saved as ``PredictedxReal_<uf>.png`` in ``PATH``.
    """
    fig = plt.figure(figsize=(20, 6))

    # Only rows where the real value is known can be compared.
    observed = test_dataset[~test_dataset[target].isna()]
    dates = observed.index

    # (values, legend label, line colour), drawn in this order.
    curves = (
        (observed[target], 'Real', '#3CADF2'),
        (observed[prediction], 'Predicted', 'red'),
    )
    for values, label, color in curves:
        plt.plot(dates, values, label=label, color=color)

    # Write every value just above its point.
    for values, _label, _color in curves:
        for when, value in zip(dates, values):
            plt.annotate("{:.2f}".format(value), (when, value), textcoords="offset points", xytext=(0,10), ha='center')

    plt.legend()
    plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=1))
    plt.gcf().autofmt_xdate()

    plt.savefig(PATH + '/PredictedxReal_' + uf + '.png')
    plt.close(fig)

## Gráfico com o comportamento do ano predito

In [16]:
def plot_year_behavior(test_dataset, target, prediction, PATH, uf):
    """Save a real/predicted plot for the whole test period.

        Rows without a value (NaN) are left as gaps in the line and are not
        annotated.

        Args:
            test_dataset (object/spreadsheet): The test dataset under analysis.
            target (string): The name of the target column under analysis.
            prediction (string): The name of the column with predicted values.
            PATH (string): Folder where the plot is saved.
            uf (string): The state under analysis (used in the file name).

        Returns:
            None. The plot is saved as ``Year_Behavior_<uf>.png`` in ``PATH``.
    """
    fig = plt.figure(figsize=(20, 6))

    x = test_dataset.index
    y = test_dataset[target]
    p = test_dataset[prediction]
    plt.plot(x, y, label='Real', color='#3CADF2')
    plt.plot(x, p, label='Predicted', color='red')

    for values in (y, p):
        for a, b in zip(x, values):
            # The months still to be forecast have no real value; annotating
            # them would create "nan" labels anchored at a non-finite position.
            if pd.isna(b):
                continue
            plt.annotate("{:.2f}".format(b), (a, b), textcoords="offset points", xytext=(0,10), ha='center')

    plt.legend()
    plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=1))
    plt.gcf().autofmt_xdate()

    plt.savefig(PATH + '/Year_Behavior_' + uf + '.png')
    plt.close(fig)

## Gráfico com o comportamento da série original e predita

In [17]:
def plot_serie_behavior(dataset, test_dataset, interval_start, target, prediction, PATH, uf, title=None):
    """Save a plot of the history followed by the test period and its predictions.

        Args:
            dataset (object/spreadsheet): The full dataset under analysis.
            test_dataset (object/spreadsheet): The test dataset under analysis,
                indexed by 'mes' and containing the ``prediction`` column.
            interval_start (int): The start index of the test interval; the
                first ``interval_start`` rows of ``dataset`` are the history.
            target (string): The name of the target column under analysis.
            prediction (string): The name of the column with predicted values.
            PATH (string): Folder where the plot is saved.
            uf (string): The state under analysis (used in the file name).
            title (string): Title of the plot, used verbatim. Defaults to the
                second-semester-of-2022 forecast title followed by ``uf``.

        Returns:
            None. The plot is saved as ``Serie_Behavior_<uf>.png`` in ``PATH``.
    """
    if title is None:
        title = "PREVISÃO PARA O SEGUNDO SEMESTRE DO ANO DE 2022 EM " + uf

    fig = plt.figure(figsize=(20, 6))

    # History rows have no prediction column, so the predicted curve only
    # exists over the test period.
    df_final = pd.concat([dataset[:interval_start].set_index('mes'), test_dataset])
    df_final.reset_index(inplace = True)

    x = df_final['mes'].dt.date
    _real = df_final[target]
    _pred = df_final[prediction]

    plt.plot(x, _real, label='Real', color='#3CADF2')
    plt.plot(x, _pred, label='Predicted', color='red')
    plt.legend()
    plt.title(title)

    plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=2))
    plt.gcf().autofmt_xdate()

    plt.savefig(PATH + '/Serie_Behavior_' + uf + '.png')
    plt.close(fig)

# Função principal

In [18]:
def main():
    """Run the analysis and the SARIMAX forecast for every configured state.

        For each state a folder named after it is created under PATH and
        receives the plots, a statistics log ('log_statistics.txt') and the
        validation metrics plus predictions ('predictions_validation.txt').
        Both text files are opened in append mode.
    """
    # Put here the path where the files are located
    PATH = '../Dados/Armarios/Bases/'
    target = 'consumoacumuladodomes_soma'

    ufs = ['SP', 'RJ', 'MG', 'ES', 'PR', 'SC', 'RS', 'DF', 'MT', 'MS', 'GO', 'PB', 'PE', 'RN', 'CE', 'SE', 'BA', 'AL']

    # States whose history used by the model does not start in 2017,
    # mapped to the number of leading rows that are discarded.
    leading_rows_to_skip = {'MS': 12, 'MT': 12, 'PB': 12, 'DF': 48}

    # date_split is the first date of the test set and date_end the last one
    date_split = dt.datetime(2022, 1, 1)
    date_end = dt.datetime(2022, 12, 1)

    for uf in ufs:

        print("Executando previsão para o estado de " + uf)

        # Select the file that holds the history of the state
        df = file_selection(uf, PATH)

        # Create one folder per state
        temporary_PATH = PATH + uf
        os.makedirs(temporary_PATH, exist_ok=True)

        # Statistics log and validation results; the with-block closes both
        # files even when a step fails for this state.
        with open(temporary_PATH + '/log_statistics.txt', 'a') as output_file, \
                open(temporary_PATH + '/predictions_validation.txt', 'a') as results:

            # Plot the history of the variable to be predicted
            plot_target_behavior(df, target, temporary_PATH, uf)

            # Run the stationarity test
            adf_test(df[target], output_file)

            # Plot the history together with its moving average
            plot_moving_average(df, target, 3, temporary_PATH, uf)

            # Decompose the series into trend, seasonality and residuals
            serie_decompose(df, target, temporary_PATH, uf)

            # Plot the ACF and PACF of the variable to be predicted
            plot_diff_acf_pacf(df, target, temporary_PATH, uf)

            # Difference the series and test the differenced series for stationarity
            serie_differentiation(df, target, output_file, temporary_PATH, uf)

            # Log the autocorrelation at 1, 3, 6, 9 and 12 months
            autocorrelation_lag(df, target, output_file)

            # Drop the leading rows of the states whose model history starts later
            if uf in leading_rows_to_skip:
                df = df.reset_index()
                df = df[['mes', target]].iloc[leading_rows_to_skip[uf]:]

            # '2022-06-01' (first month without data in the base), 7 (forecast
            # 7 periods ahead), 'M' (months): forecast from June/22 up to December/22
            xtrain, xtest, interval_start, interval_end, _ = train_test_split_data(
                df, target, '2022-06-01', 7, 'M', date_split, date_end, output_file)

            # Run the model of the state; the validation metrics are computed on
            # the test months that already have a real value in the base
            # (January to May 2022) and saved to the results file
            _, df_test = uf_models(uf, xtrain, xtest, target, interval_start, interval_end)

            has_real_value = ~df_test[target].isna()
            y_true = df_test[has_real_value][target].values
            y_pred = df_test[has_real_value]['pred'].values

            validation_metrics(y_true, y_pred, results)

            # Plot real vs predicted values
            plot_predict_x_real(df_test, target, 'pred', temporary_PATH, uf)

            # Plot real vs predicted values over the year 2022
            plot_year_behavior(df_test, target, 'pred', temporary_PATH, uf)

            # Plot the history of the state with the real values (blue) and
            # the predicted values (red)
            plot_serie_behavior(df, df_test, interval_start, target, 'pred', temporary_PATH, uf)

            # Save the real and predicted test values. The index holds the
            # month, so it is written on purpose (the former index='False' was
            # a truthy string and had the same effect). `lineterminator`
            # replaces `line_terminator`, which pandas 2.0 removed.
            df_test.to_csv(results, index=True, lineterminator='\n')

## Execução da função principal

In [19]:
# Entry point: run the forecasting pipeline for every configured state.
main()

Executando previsão para o estado de SP
Executando previsão para o estado de RJ
Executando previsão para o estado de MG
Executando previsão para o estado de ES
Executando previsão para o estado de PR
Executando previsão para o estado de SC
Executando previsão para o estado de RS
Executando previsão para o estado de DF
Executando previsão para o estado de MT
Executando previsão para o estado de MS
Executando previsão para o estado de GO
Executando previsão para o estado de PB
Executando previsão para o estado de PE
Executando previsão para o estado de RN
Executando previsão para o estado de CE
Executando previsão para o estado de SE
Executando previsão para o estado de BA
Executando previsão para o estado de AL
