In [1]:
import optuna
import itertools
import numpy as np
import pandas as pd #from prophet import Prophet
import matplotlib.pyplot as plt
from neuralprophet import NeuralProphet
from sklearn.metrics import mean_squared_error as mse 
from neuralprophet import set_random_seed

# Seed NeuralProphet's random number generation (through its own helper) so that
# repeated runs start from the same state.
set_random_seed(0)
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas / Optuna / scikit-learn.
warnings.filterwarnings('ignore')




## Obtenção dos dados e pré-processamento no formato do Prophet

In [2]:
# Directory that holds one '<state>_dengue.parquet' file per state.
# NOTE(review): absolute, machine-specific location; adjust before running elsewhere.
PATH = '/Users/eduardoaraujo/Documents/Github/paper-dengue-sc/data/cases'


def get_data(state, geocode):
    """Load the case series of one municipality as a two-column ``ds`` / ``y`` frame.

    Reads ``{PATH}/{state}_dengue.parquet``, keeps the rows whose
    ``municipio_geocodigo`` equals ``geocode``, parses the index as datetimes,
    sorts chronologically, and renames ``data_iniSE`` -> ``ds`` and
    ``casos`` -> ``y``.

    Parameters
    ----------
    state : str
        Prefix of the parquet file name (``<state>_dengue.parquet``).
    geocode : int
        Value matched against the ``municipio_geocodigo`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``ds`` (timestamps, ascending) and ``y`` (natural log of
        ``casos``), with a fresh 0..n-1 index.

    Notes
    -----
    * The index of the parquet file must be named ``data_iniSE``; otherwise
      the rename yields no ``ds`` column and the column selection raises
      ``KeyError``.
    * ``np.log`` maps a count of zero to ``-inf``; such rows are passed
      through unchanged.
    """
    cases = pd.read_parquet(f'{PATH}/{state}_dengue.parquet')

    # Boolean-mask selection of the requested municipality.
    city = cases.loc[cases.municipio_geocodigo == geocode]
    city.index = pd.to_datetime(city.index)

    series = (
        city.sort_index()
        .reset_index()
        .rename(columns={'data_iniSE': 'ds', 'casos': 'y'})
        .loc[:, ['ds', 'y']]
    )

    # The series is kept on the log scale.
    return series.assign(y=np.log(series.y))

def update_columns(df):
    """Exponentiate, in place, every column whose name starts with ``'y'``.

    This is the inverse of a natural-log transform applied to those columns.
    The frame passed in is modified and the very same object is returned;
    columns with other names are left untouched.
    """
    for name in df.columns:
        if not name.startswith('y'):
            continue
        df[name] = np.exp(df[name])
    return df

In [3]:
# Example series used to inspect the output of get_data.
state = 'PR'
geocode = 4108304

# Load the log-scale series and keep only observations dated before 2023.
df = get_data(state, geocode)
df = df[df.ds < '2023-01-01']

# Preview the last rows of the retained period.
df.tail()

Unnamed: 0,ds,y
673,2022-11-27,5.811141
674,2022-12-04,5.669881
675,2022-12-11,5.774552
676,2022-12-18,5.327876
677,2022-12-25,5.298317


## Tuning dos hiperparâmetros usando o Optuna

In [10]:
def objective(trial, df):
    """Optuna objective: mean RMSE of the furthest-horizon NeuralProphet forecast.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_lags``, ``n_changepoints``, ``learning_rate``,
        ``ar_layers``, ``ar_reg`` and ``seasonality_mode``.
    df : pandas.DataFrame
        Weekly series with columns ``ds`` and ``y``. ``y`` is expected on the
        natural-log scale: both ``y`` and the forecasts are exponentiated
        before the error is computed.

    Returns
    -------
    float
        Mean, over the cross-validation folds (requested with ``k=3``), of the
        RMSE between ``exp(y)`` and ``exp(yhat<n_forecasts>)``.
    """
    n_forecasts = 4
    # predict() names the forecast made `n_forecasts` steps ahead 'yhat<n_forecasts>'.
    yhat_col = f'yhat{n_forecasts}'

    params = {
        # `step` is passed by keyword (the positional form is deprecated in Optuna).
        # Starting at 8 with step 16 the reachable values are 8, 24 and 40; the
        # former upper bound of 52 was never reachable and Optuna clamped it to 40.
        'n_lags': trial.suggest_int('n_lags', 8, 40, step=16),
        'quantiles': [0.025, 0.975],
        'n_changepoints': trial.suggest_int('n_changepoints', 15, 25),
        'n_forecasts': n_forecasts,
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.01),
        # NOTE(review): Optuna only supports None/bool/int/float/str categorical
        # choices for persistent storage; list choices work with the in-memory
        # study used here but emit a warning.
        'ar_layers': trial.suggest_categorical(
            'ar_layers',
            [[32, 32, 32], [32, 32, 32, 32, 32], [16, 16, 16, 16, 16, 16], [8, 8, 8, 8]],
        ),
        'ar_reg': trial.suggest_float('ar_reg', 0, 1),
        # NOTE(review): 0.05 confines all trend changepoints to the first 5% of
        # the training window -- confirm this is intended (0.95 would be the
        # more usual choice).
        'changepoints_range': 0.05,
        'seasonality_mode': trial.suggest_categorical(
            'seasonality_mode', ['multiplicative', 'additive']
        ),
        # trend_reg, lagged_reg_layers and loss_func are not passed, so they
        # keep their NeuralProphet defaults.
    }

    # This instance is used only for its data-splitting helpers.
    m = NeuralProphet(**params)
    df_train_val, df_test = m.split_df(df=df, freq="W", valid_p=0.2)
    folds = m.crossvalidation_split_df(
        df_train_val, freq="W", k=3, fold_pct=0.20, fold_overlap_pct=0.5
    )

    rmse_fold = []
    for df_train, df_val in folds:
        # A fresh model per fold so no fitted state carries over.
        m = NeuralProphet(**params)
        # NOTE(review): early_stopping is passed as 20; presumably any truthy
        # value just enables early stopping -- confirm against the installed
        # NeuralProphet version.
        m.fit(df=df_train, freq="W", epochs=150, validation_df=df_val,
              early_stopping=20, progress=None)

        # NOTE(review): every fold is scored on the common hold-out `df_test`,
        # while `df_val` is only used for early stopping. If `df_test` is also
        # meant to be the final evaluation set, tuning on it leaks information.
        forecast = m.predict(df_test)

        # Leading rows have no forecast yet (NaN); rows with a missing
        # observation cannot be scored either.
        scored = forecast[['ds', 'y', yhat_col]].dropna(subset=['y', yhat_col])

        # RMSE on the original (count) scale. np.sqrt(MSE) replaces
        # `squared=False`, which newer scikit-learn releases no longer accept.
        rmse_fold.append(np.sqrt(mse(np.exp(scored['y']), np.exp(scored[yhat_col]))))

    return np.mean(rmse_fold)

In [5]:
# Municipality geocodes to tune and, position by position, the state whose
# parquet file each one is read from.
cities = [2704302, 2927408, 2111300, 2211001,
          2800308, 2408102, 2304400, 2507507, 2611606]
states = ['AL', 'BA', 'MA', 'PI', 'SE', 'RN', 'CE', 'PB', 'PE']

# zip() silently truncates to the shorter list, so catch a mismatch here.
assert len(cities) == len(states), 'cities and states must be paired one-to-one'

# One row per city; the tuning loop fills `params` and `best_rmse`.
df_pars_br = pd.DataFrame({'geocode': cities})

# `params` later receives str(best_params): keep it as an object column so the
# assignment does not collide with a float64 column initialised from NaN.
df_pars_br['params'] = pd.Series([None] * len(cities), dtype=object)
df_pars_br['best_rmse'] = np.nan

In [6]:
%%time 
# `params` receives str(best_params); make sure the column can hold strings
# even if it was initialised with NaN (float64).
df_pars_br['params'] = df_pars_br['params'].astype(object)

for c, s in zip(cities, states):
    # Log-scale weekly series for this city, restricted to dates before 2023.
    df = get_data(s, c)
    df = df.loc[df.ds < '2023-01-01']

    # Seed the sampler so the sequence of suggested hyper-parameters is
    # repeatable (TPE is Optuna's default sampler, so the search algorithm
    # is unchanged).
    # NOTE(review): full reproducibility still depends on the model training
    # itself being deterministic.
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=0),
    )
    study.optimize(lambda trial: objective(trial, df=df), n_trials=75)

    # Record the best configuration (as its dict repr) and its score.
    row = df_pars_br.geocode == c
    df_pars_br.loc[row, 'params'] = str(study.best_params)
    df_pars_br.loc[row, 'best_rmse'] = study.best_value


[I 2023-12-03 14:23:52,282] A new study created in memory with name: no-name-3100de65-74c2-4f19-8a4d-6cde682cc42f
INFO - (NP.df_utils._infer_frequency) - Major frequency W-SUN corresponds to 99.558% of the data.
INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - W
INFO - (NP.data.processing._handle_missing_data_single_id) - 1 NaN values in column y were auto-imputed.
INFO - (NP.df_utils.return_df_in_original_format) - Returning df with no ID column
INFO - (NP.df_utils.return_df_in_original_format) - Returning df with no ID column
INFO - (NP.df_utils._infer_frequency) - Major frequency W-SUN corresponds to 99.818% of the data.
INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - W
INFO - (NP.df_utils._infer_frequency) - Major frequency W-SUN corresponds to 99.711% of the data.
INFO - (NP.df_utils._infer_frequency) - Defined frequency is equal to major frequency - W
INFO - (NP.config.init_data_params) - Setting norma

CPU times: user 3h 4min 43s, sys: 10min 11s, total: 3h 14min 54s
Wall time: 3h 14min 50s


In [7]:
# Display the per-city tuning results collected in df_pars_br.
df_pars_br

Unnamed: 0,geocode,params,best_rmse
0,2704302,"{'n_lags': 8, 'n_changepoints': 20, 'learning_...",166.46569
1,2927408,"{'n_lags': 40, 'n_changepoints': 15, 'learning...",21.762376
2,2111300,"{'n_lags': 8, 'n_changepoints': 22, 'learning_...",30.012326
3,2211001,"{'n_lags': 40, 'n_changepoints': 16, 'learning...",183.789246
4,2800308,"{'n_lags': 8, 'n_changepoints': 15, 'learning_...",45.831992
5,2408102,"{'n_lags': 8, 'n_changepoints': 19, 'learning_...",138.859212
6,2304400,"{'n_lags': 40, 'n_changepoints': 18, 'learning...",565.926428
7,2507507,"{'n_lags': 8, 'n_changepoints': 18, 'learning_...",172.322693
8,2611606,"{'n_lags': 8, 'n_changepoints': 17, 'learning_...",220.989447


In [9]:
# Persist the tuning results to the current working directory. The frame's
# index is written too, as the first (unnamed) CSV column, and `params` is
# stored as the string repr of the best-parameters dict.
df_pars_br.to_csv('best_params.csv')