In [1]:
#!pip install seaborn
#!pip install openpyxl
#!pip install sklearn

In [2]:
import pandas as pd
import numpy as np
import random as rd
import seaborn as sbs
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import SVR
from matplotlib import pyplot
import time
import csv
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error

In [3]:

file_data = 'lynx.csv'
path_name='../datasets/'
path_name_results='../results/'
file_result = 'Result_SVR_canadian_lynx.csv'

In [4]:
dataset = pd.read_csv(f'{path_name}{file_data}', sep =',', encoding = 'latin1', decimal='.')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   rownames  114 non-null    int64
 1   time      114 non-null    int64
 2   value     114 non-null    int64
dtypes: int64(3)
memory usage: 2.8 KB


In [5]:
#verifica se existe variáveis nulas 
dataset.isna().sum()

rownames    0
time        0
value       0
dtype: int64

In [6]:
def salvar_resultado(nm_dataset, ds_best_param, n_time_steps, MSE, RMSE, MAE, MAPE, sMAPE, Duration):
  #Script to write training cycle results
  data = [nm_dataset, ds_best_param, n_time_steps, MSE, RMSE, MAE, MAPE, sMAPE, Duration]
  fields = ['Dataset','Best Params','n_time_steps','MSE','RMSE','MAE','MAPE','sMAPE','Duration']
  with open(f'{path_name_results}{file_result}', "a",newline='') as csv_file:
    writer = csv.writer(csv_file,delimiter=';')
    writer.writerow(data)  
  print(fields)
  print(data)
    
#Script to create the results file
def criar_arquivo_resultado():
  fields = ['Dataset','Best Params','n_time_steps','MSE','RMSE','MAE','MAPE','sMAPE','Duration']
  with open(f'{path_name_results}{file_result}', "w",newline='') as csv_file:
    writer = csv.writer(csv_file,delimiter=';')
    writer.writerow(fields)    

In [7]:
def previsao_MLP(nm_dataset, dataset, n_time_steps):

    # dataframe tratament
  df = pd.DataFrame()
  df['value']=dataset['value']  

  # time serire transform - shit 1 step time

  for n_step in range(1,n_time_steps+1,1):
    df['vl-'+str(n_step)]=dataset['value'].shift(n_step)  
      
  df.dropna(inplace=True)
  
  
  
  #Split dataset in treinam /  80% treinam  20% test
  nlinhas = int(np.round(df.shape[0] *0.80)) # 
  
  max_size_train_split = int(np.round(nlinhas / 5)) 
  max_size__test_split = int(np.round((df.shape[0] - nlinhas) / 5))
  size_split = 5
  X_train = df.iloc[0:nlinhas,1: 1 + n_time_steps]
  y_train = df.iloc[0:nlinhas,0].values

  X_test = df.iloc[nlinhas:dataset.shape[0],1: 1 + n_time_steps] 
  y_test = df.iloc[nlinhas:dataset.shape[0],0].values
  
  
  # Stores the training execution start time
  Hora_Inicio = time.time()
  
  #Create SVR Regressor
  modelo  = SVR(kernel='rbf')
  
  # Validados cruzado para séries temporais para 5 conjuntos de dados
  ts_cv = TimeSeriesSplit(
      n_splits=size_split, # Number of divisions
      max_train_size=max_size_train_split,    # maximum size of each set. of training
      gap=2, # number of samples to exclude between each training and testing set
      test_size=max_size__test_split, # maximum size of each set. of test.
  )
  C = [ 12550, 125550, 1255555] # Parâmetro de regularização
  gamma = [ 0.00001, 0.000001, 0.0000001, 0.00000001] # Coeficiente da função kernel  
  epsilon = [0.1, 0.01, 0.001, 0.0001]

  hyper_params = [{'kernel': ['rbf'],'C': C, 'gamma':gamma, 'epsilon':epsilon}]  

  modelo = SVR(max_iter=500)
  grid = GridSearchCV(modelo,param_grid=hyper_params,verbose=20,n_jobs=-4,cv=ts_cv, scoring='neg_mean_absolute_percentage_error')
  grid.fit(np.array(X_train), np.array(y_train))
  
  resultado = str(grid.best_params_)
  

  predict=grid.predict(np.array(X_test))   

  # Stores the training execution end time
  Hora_Fim = time.time()   

  #Calculate the duration of the training execution
  Duracao = Hora_Fim - Hora_Inicio 
  
  #Mean Squared Error (Mean Squared Difference Between Estimated Values and Actual Values) - MSE
  MSE = mean_squared_error(y_test, predict)   

  # Square Root of Mean Error - RMSE
  RMSE = np.sqrt(mean_squared_error(y_test, predict))   

  # Mean Absolute Distance or Mean Absolute Error - MAE
  MAE= median_absolute_error(y_pred=predict, y_true = y_test)    

  #Calculate the MAPE (Mean Absolute Percentage Error)
  MAPE = ((np.mean(np.abs(y_test -predict) / (y_test)))) * 100    

  sMAPE = round(
		np.mean(
			np.abs(predict - y_test) /
			((np.abs(predict) + np.abs(y_test)))
		)*100, 2
	)

  salvar_resultado(nm_dataset, resultado, n_time_steps, MSE, RMSE, MAE, MAPE, sMAPE, Duracao)



In [8]:
#create file to results
criar_arquivo_resultado()

print('forecast for new sunspot number')
for n_time_steps in range(1,13): #predict with 1 to 12 past values of medition
    grid = previsao_MLP('c.lynx', dataset, n_time_steps)

forecast for new sunspot number
Fitting 5 folds for each of 48 candidates, totalling 240 fits




['Dataset', 'Best Params', 'n_time_steps', 'MSE', 'RMSE', 'MAE', 'MAPE', 'sMAPE', 'Duration']
['c.lynx', "{'C': 12550, 'epsilon': 0.0001, 'gamma': 1e-05, 'kernel': 'rbf'}", 1, 2518400.557963053, 1586.9469297878404, 625.5845624242525, 120.55838673168083, 29.13, 29.529592990875244]
Fitting 5 folds for each of 48 candidates, totalling 240 fits
['Dataset', 'Best Params', 'n_time_steps', 'MSE', 'RMSE', 'MAE', 'MAPE', 'sMAPE', 'Duration']
['c.lynx', "{'C': 12550, 'epsilon': 0.1, 'gamma': 1e-08, 'kernel': 'rbf'}", 2, 639219.6785151914, 799.5121503236779, 476.0247939558194, 80.98749987504651, 22.53, 1.2035036087036133]
Fitting 5 folds for each of 48 candidates, totalling 240 fits




['Dataset', 'Best Params', 'n_time_steps', 'MSE', 'RMSE', 'MAE', 'MAPE', 'sMAPE', 'Duration']
['c.lynx', "{'C': 12550, 'epsilon': 0.0001, 'gamma': 1e-08, 'kernel': 'rbf'}", 3, 616059.0459098592, 784.8942896402415, 406.85125604620134, 79.36534010542911, 24.13, 0.9286584854125977]
Fitting 5 folds for each of 48 candidates, totalling 240 fits




['Dataset', 'Best Params', 'n_time_steps', 'MSE', 'RMSE', 'MAE', 'MAPE', 'sMAPE', 'Duration']
['c.lynx', "{'C': 12550, 'epsilon': 0.0001, 'gamma': 1e-07, 'kernel': 'rbf'}", 4, 629685.1640984992, 793.5270405591099, 232.01536267640165, 56.269532270867174, 22.36, 0.7125344276428223]
Fitting 5 folds for each of 48 candidates, totalling 240 fits




['Dataset', 'Best Params', 'n_time_steps', 'MSE', 'RMSE', 'MAE', 'MAPE', 'sMAPE', 'Duration']
['c.lynx', "{'C': 12550, 'epsilon': 0.1, 'gamma': 1e-08, 'kernel': 'rbf'}", 5, 576528.6199609194, 759.2948175517329, 393.18258862345556, 97.28817632926442, 30.31, 0.8168356418609619]
Fitting 5 folds for each of 48 candidates, totalling 240 fits




['Dataset', 'Best Params', 'n_time_steps', 'MSE', 'RMSE', 'MAE', 'MAPE', 'sMAPE', 'Duration']
['c.lynx', "{'C': 12550, 'epsilon': 0.1, 'gamma': 1e-08, 'kernel': 'rbf'}", 6, 568688.0288824802, 754.1140688798215, 363.4116776408764, 77.61999609086675, 29.29, 0.8418071269989014]
Fitting 5 folds for each of 48 candidates, totalling 240 fits




['Dataset', 'Best Params', 'n_time_steps', 'MSE', 'RMSE', 'MAE', 'MAPE', 'sMAPE', 'Duration']
['c.lynx', "{'C': 12550, 'epsilon': 0.0001, 'gamma': 1e-07, 'kernel': 'rbf'}", 7, 825997.6200071902, 908.8441120495803, 515.3196603534321, 114.46267846233376, 33.02, 0.7331738471984863]
Fitting 5 folds for each of 48 candidates, totalling 240 fits




['Dataset', 'Best Params', 'n_time_steps', 'MSE', 'RMSE', 'MAE', 'MAPE', 'sMAPE', 'Duration']
['c.lynx', "{'C': 12550, 'epsilon': 0.0001, 'gamma': 1e-08, 'kernel': 'rbf'}", 8, 549384.6626770089, 741.2048722701496, 339.1571724776363, 78.9955976054279, 31.21, 0.6903131008148193]
Fitting 5 folds for each of 48 candidates, totalling 240 fits




['Dataset', 'Best Params', 'n_time_steps', 'MSE', 'RMSE', 'MAE', 'MAPE', 'sMAPE', 'Duration']
['c.lynx', "{'C': 12550, 'epsilon': 0.0001, 'gamma': 1e-07, 'kernel': 'rbf'}", 9, 1236136.0973614135, 1111.8165754122456, 703.7499917572811, 242.1715828284818, 38.84, 0.6286771297454834]
Fitting 5 folds for each of 48 candidates, totalling 240 fits




['Dataset', 'Best Params', 'n_time_steps', 'MSE', 'RMSE', 'MAE', 'MAPE', 'sMAPE', 'Duration']
['c.lynx', "{'C': 12550, 'epsilon': 0.0001, 'gamma': 1e-08, 'kernel': 'rbf'}", 10, 536799.1305776215, 732.6657700327083, 252.61457357922154, 56.65705106987951, 32.08, 0.6401553153991699]
Fitting 5 folds for each of 48 candidates, totalling 240 fits




['Dataset', 'Best Params', 'n_time_steps', 'MSE', 'RMSE', 'MAE', 'MAPE', 'sMAPE', 'Duration']
['c.lynx', "{'C': 12550, 'epsilon': 0.0001, 'gamma': 1e-08, 'kernel': 'rbf'}", 11, 611168.0547431485, 781.7723803916, 268.7614324236879, 58.92083493003375, 33.38, 0.6707601547241211]
Fitting 5 folds for each of 48 candidates, totalling 240 fits
['Dataset', 'Best Params', 'n_time_steps', 'MSE', 'RMSE', 'MAE', 'MAPE', 'sMAPE', 'Duration']
['c.lynx', "{'C': 12550, 'epsilon': 0.1, 'gamma': 1e-08, 'kernel': 'rbf'}", 12, 641315.2299439554, 800.8215968266312, 265.5970080366327, 60.31721383113291, 32.61, 0.843402624130249]


