In [1]:
#!pip install seaborn
#!pip install openpyxl


In [2]:
import pandas as pd
import numpy as np
import random as rd
import time
import csv
import seaborn as sbs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split ,GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.neural_network import MLPRegressor
import tensorflow as tf
from tensorflow import keras
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error

In [3]:
file_data = 'DS_Agua_2017_2022_por_ponto.csv'
path_name='data/'
path_name_results='results/'
file_result = 'Result_MLP_Model_Day.csv'

In [4]:
dataset = pd.read_csv(f'{path_name}{file_data}', sep =';', encoding = 'latin1', decimal='.')


In [5]:
dataset.head()      

Unnamed: 0,DT_MEDICAO_HORA,PRECIPITACAO,PRESSAO_ATMOSFERICA,TEMPERATURA_DO_AR_C,UMIDADE_RELATIVA_DO_AR,VELOCIDADE_VENTO,SK_PONTO,VL_MEDICAO
1,2017-01-01 00:00:00,0.0,1009.5,25.9,75.0,1.3,5,938.69
2,2017-01-01 00:00:00,0.0,1009.5,25.9,75.0,1.3,6,297.1078
3,2017-01-01 00:00:00,0.0,1009.5,25.9,75.0,1.3,7,544.5944
4,2017-01-01 00:00:00,0.0,1009.5,25.9,75.0,1.3,11,816.790594
5,2017-01-01 00:00:00,0.0,1009.5,25.9,75.0,1.3,12,336.596097


In [6]:
#verifica se existe variáveis nulas 
dataset.isna().sum()

DT_MEDICAO_HORA           0
PRECIPITACAO              0
PRESSAO_ATMOSFERICA       0
TEMPERATURA_DO_AR_C       0
UMIDADE_RELATIVA_DO_AR    0
VELOCIDADE_VENTO          0
SK_PONTO                  0
VL_MEDICAO                0
dtype: int64

In [7]:
def salvar_resultado(sk_ponto, ds_best_param, n_time_steps, MSE, RMSE, MAE, MAPE, Duration):
  #Script to write training cycle results
  data = [sk_ponto, ds_best_param, n_time_steps, MSE, RMSE, MAE, MAPE, Duration]
  fields = ['SK_PONTO','Best Params','N_Past_Vl','MSE','RMSE','MAE','MAPE','Duration']
  with open(f'{path_name_results}{file_result}', "a",newline='') as csv_file:
    writer = csv.writer(csv_file,delimiter=';')
    writer.writerow(data)  
  print(fields)
  print(data)
    
#Script to create the results file
def criar_arquivo_resultado():
  fields = ['SK_PONTO','Best Params','N_Past_Vl','MSE','RMSE','MAE','MAPE','Duration']
  with open(f'{path_name_results}{file_result}', "w",newline='') as csv_file:
    writer = csv.writer(csv_file,delimiter=';')
    writer.writerow(fields)    

In [8]:
#crop date time to date 
dataset['DATA']=dataset['DT_MEDICAO_HORA'].str.split(' ').str[0]
dataset.drop(['DT_MEDICAO_HORA'],axis=1, inplace=True)

In [9]:
# Grouping data for Day
# #cols = 'PRECIPITACAO':'sum','PRESSAO_ATMOSFERICA':'mean','TEMPERATURA_DO_AR_C':'mean','UMIDADE_RELATIVA_DO_AR':'mean','VELOCIDADE_VENTO':'mean','VL_MEDICAO':'sum'
dataset = dataset.groupby(['SK_PONTO','DATA'], as_index=False).agg({'PRECIPITACAO':'sum','PRESSAO_ATMOSFERICA':'mean','TEMPERATURA_DO_AR_C':'mean','UMIDADE_RELATIVA_DO_AR':'mean','VELOCIDADE_VENTO':'mean','VL_MEDICAO':'sum'})
#df_result.rename(columns=cols)

In [10]:
def previsao_MLP(sk_ponto, dataset, n_time_steps):
  # dataframe tratament
  df = pd.DataFrame()
  df['VL_MEDICAO']=dataset['VL_MEDICAO']  

    # time serire transform - shit 1 step time

  for n_step in range(1,n_time_steps+1,1):
    df['vl-'+str(n_step)]=dataset['VL_MEDICAO'].shift(n_step)  
    
  df['tp-1']=dataset['TEMPERATURA_DO_AR_C']
  df['pr-1']=dataset['PRESSAO_ATMOSFERICA']
  df['vv-1']=dataset['VELOCIDADE_VENTO']
  df['ur-1']=dataset['UMIDADE_RELATIVA_DO_AR']
  df['ch-1']=dataset['PRECIPITACAO']  

  df.dropna(inplace=True)
  
  
  
  #Split dataset in treinam /  75% treinam  25% test
  nlinhas = int(np.round(df.shape[0] *0.75)) # 
  
  max_size_train_split = int(np.round(nlinhas / 5)) 
  max_size__test_split = int(np.round((df.shape[0] - nlinhas) / 5))
  size_split = 5
  X_train = df.iloc[0:nlinhas,1:6 + n_time_steps]
  y_train = df.iloc[0:nlinhas,0].values

  X_test = df.iloc[nlinhas:dataset.shape[0],1:6 + n_time_steps] 
  y_test = df.iloc[nlinhas:dataset.shape[0],0].values
  
  
  # Stores the training execution start time
  Hora_Inicio = time.time()
  
  # Cross-validated for time series
  ts_cv = TimeSeriesSplit(
      n_splits=size_split, # Number of divisions
      max_train_size=max_size_train_split,    # maximum size of each set. of training
      gap=2, # number of samples to exclude between each training and testing set
      test_size=max_size__test_split, # maximum size of each set. of test.
  )
  param_grid = {
      'hidden_layer_sizes': [(4,6,1),(2,6,1),(6,12,1),(6,18,1)], # MLP layers
      'max_iter': [ 500], # maximum iterations
      'activation': [ 'relu','identity'], # activation function
      'solver': ['adam'], # weight optimization algorithm
      'alpha': [0.0001, 0.001, 0.01],  # alpha strength of regularization
  }
  
  modelo = MLPRegressor(random_state=0)
  
  grid = GridSearchCV(modelo, param_grid, n_jobs= -1,scoring='neg_mean_absolute_percentage_error', cv=ts_cv, verbose=1)
  grid.fit(np.array(X_train), np.array(y_train))
  
  resultado = str(grid.best_params_)
  

  predict=grid.predict(np.array(X_test))  

  # Stores the training execution end time
  Hora_Fim = time.time()  

  #Calculate the duration of the training execution
  Duracao = Hora_Fim - Hora_Inicio  

  #Mean Squared Error (Mean Squared Difference Between Estimated Values and Actual Values) - MSE
  MSE = mean_squared_error(y_test, predict)  

  # Square Root of Mean Error - RMSE
  RMSE = np.sqrt(mean_squared_error(y_test, predict))  

  # Mean Absolute Distance or Mean Absolute Error - MAE
  MAE= median_absolute_error(y_pred=predict, y_true = y_test)  

  #Calculate the MAPE (Mean Absolute Percentage Error)
  MAPE = ((np.mean(np.abs(y_test -predict) / (y_test)))) * 100   

  salvar_resultado(sk_ponto, resultado, n_time_steps, MSE, RMSE, MAE, MAPE, Duracao)

In [11]:
#selects the identification points of each reservoir
df_unique_pontos = dataset['SK_PONTO'].unique()

#create file to results
criar_arquivo_resultado()

for sk in np.array(df_unique_pontos):
    print('forecast for sk_ponto = ',sk)
    df_ponto = dataset[dataset['SK_PONTO']==sk]
    df_ponto = df_ponto.drop('SK_PONTO',axis=1)
    for n_time_steps in range(1,7): #predict with 1 to 6 past values of medition
        grid = previsao_MLP(sk, df_ponto, n_time_steps)

forecast for sk_ponto =  1
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[1, "{'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (6, 12, 1), 'max_iter': 200, 'solver': 'adam'}", 1, 4897656.338495648, 2213.0649196297086, 679.030729274391, 18.724169271767874, 25.921007871627808]
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[1, "{'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (6, 12, 1), 'max_iter': 200, 'solver': 'adam'}", 2, 6149392.913405437, 2479.796950035514, 745.627891601458, 25.905418534848906, 18.94400668144226]
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[1, "{'activation': 'identity', 'alpha': 0.05, 'hidden_layer_sizes': (4, 6, 1), 'max_iter': 100, 'solver': 'adam'}", 3, 6600632.934615946, 2569.1696975123978, 645.0729762487717, 28.06752588452589, 18.667006254196167]
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[1, "{'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (4, 6, 1), 'max_iter': 200, 'solver': 'adam'}", 4, 5479945.842777855, 2340.9284147059802, 731.2060699057929, 24.904532779259846, 23.384008169174194]
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[1, "{'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (2, 6, 1), 'max_iter': 200, 'solver': 'adam'}", 5, 5650478.1130696535, 2377.073434513468, 788.7985023870497, 25.568802237243844, 20.14000940322876]
forecast for sk_ponto =  4
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[4, "{'activation': 'identity', 'alpha': 0.05, 'hidden_layer_sizes': (2, 6, 1), 'max_iter': 200, 'solver': 'adam'}", 1, 2957203.840207362, 1719.652243974741, 359.5431522957151, 9.167545467052877, 40.27400851249695]
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[4, "{'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (2, 6, 1), 'max_iter': 150, 'solver': 'adam'}", 2, 3036864.009431306, 1742.6600383985701, 364.8647661303203, 10.893834834843867, 41.07401490211487]
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[4, "{'activation': 'identity', 'alpha': 0.05, 'hidden_layer_sizes': (2, 6, 1), 'max_iter': 150, 'solver': 'adam'}", 3, 3928793.9410816515, 1982.1185486952215, 362.01940418602317, 14.603654010121394, 32.21301054954529]
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[4, "{'activation': 'identity', 'alpha': 0.05, 'hidden_layer_sizes': (4, 6, 1), 'max_iter': 200, 'solver': 'adam'}", 4, 2934761.6064022714, 1713.1145923149074, 397.11201780105057, 12.408396806704552, 34.990012645721436]
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[4, "{'activation': 'identity', 'alpha': 0.05, 'hidden_layer_sizes': (2, 6, 1), 'max_iter': 200, 'solver': 'adam'}", 5, 2869851.621454345, 1694.063641500621, 399.6459286388772, 12.696033261856101, 31.576010942459106]
forecast for sk_ponto =  5
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[5, "{'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (4, 6, 1), 'max_iter': 150, 'solver': 'adam'}", 1, 4646822.677568973, 2155.6490153939653, 770.3326044684527, 9.640894058350213, 40.35701322555542]
Fitting 5 folds for each of 72 candidates, totalling 360 fits
['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[5, "{'activation': 'identity', 'alpha': 0.05, 'hidden_layer_sizes': (2, 6, 1), 'max_iter': 200, 'solver': 'adam'}", 2, 4213275.48131633, 2052.6264836341584, 659.4295986703946, 9.177371497046002, 37.95801258087158]
Fitting 5 folds for each of 72 candidates, totalling 360 fits
['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[5, "{'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (6, 12, 1), 'max_iter': 200, 'solver': 'adam'}", 3, 3933343.5481503527, 1983.265879339014, 687.3808586119903, 8.83131522303



['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[6, "{'activation': 'identity', 'alpha': 0.05, 'hidden_layer_sizes': (4, 6, 1), 'max_iter': 100, 'solver': 'adam'}", 3, 262875.39501533576, 512.7137554379985, 184.59714293857087, 5.465488862420855, 35.396013021469116]
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[6, "{'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (2, 6, 1), 'max_iter': 200, 'solver': 'adam'}", 4, 260553.5691760359, 510.4444819723648, 191.26019542020958, 5.438640013202156, 48.794015407562256]
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[6, "{'activation': 'identity', 'alpha': 0.05, 'hidden_layer_sizes': (2, 6, 1), 'max_iter': 150, 'solver': 'adam'}", 5, 269535.7112598803, 519.1682879952128, 200.94856303374718, 5.776430846917637, 41.05601167678833]
forecast for sk_ponto =  7
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[7, "{'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (6, 12, 1), 'max_iter': 150, 'solver': 'adam'}", 1, 6144314.952226244, 2478.7728722547863, 1264.591174295414, 17.018839214189462, 45.467015981674194]
Fitting 5 folds for each of 72 candidates, totalling 360 fits
['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[7, "{'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (2, 6, 1), 'max_iter': 200, 'solver': 'adam'}", 2, 5061143.135097463, 2249.6984542594732, 1219.9455918743897, 16.183581680771045, 44.80901551246643]
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[7, "{'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (6, 12, 1), 'max_iter': 200, 'solver': 'adam'}", 3, 4719338.123309196, 2172.4037661791135, 1184.2678309963849, 16.031978950831373, 49.07201647758484]
Fitting 5 folds for each of 72 candidates, totalling 360 fits
['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[7, "{'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (4, 6, 1), 'max_iter': 200, 'solver': 'adam'}", 4, 4518706.427075381, 2125.7249180162944, 1151.4053504534768, 15.867943049002095, 52.58301782608032]
Fitting 5 folds for each of 72 candidates, totalling 360 fits
['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[7, "{'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (2, 6, 1), 'max_iter': 200, 'solver': 'adam'}", 5, 4464403.328048157, 2112.913469134067, 1163.4737404058687, 15.926



['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[8, "{'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (2, 6, 1), 'max_iter': 150, 'solver': 'adam'}", 1, 65586.73343832105, 256.09906957722643, 48.91062142014607, 2.9974694472389434, 48.92617678642273]
Fitting 5 folds for each of 72 candidates, totalling 360 fits




['SK_PONTO', 'Best Params', 'N_Past_Vl', 'MSE', 'RMSE', 'MAE', 'MAPE', 'Duration']
[8, "{'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': (2, 6, 1), 'max_iter': 150, 'solver': 'adam'}", 2, 60278.29698132269, 245.5163884169908, 46.68785313944636, 3.054010175929144, 52.478017807006836]
Fitting 5 folds for each of 72 candidates, totalling 360 fits


PermissionError: [Errno 13] Permission denied: 'results/Result_MLP_Model_Day.csv'