In [1]:
import os
import time
import warnings

import numpy as np
import pandas as pd
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, LearningRateScheduler

import preprocess_data as prep
from models import build_lstm_att, build_baseline, sum_regions_predictions

warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

'''
This script is used to train the model for a specific STATE and forecast the cases on a 
specific year (TEST_YEAR). The model is trained with the regional health data before the year selected. 
'''

# Load the cases and enso data
df_all = prep.load_cases_data()
enso = prep.load_enso_data()
df_pop_region = pd.read_csv('./data/pop_regional.csv')
df_all_epi = pd.read_csv('./data/episcanner_regional.csv.gz')


# flag to decide if the model will be applied or not
apply = True 

### Defining the model callbacks:

In [2]:
import tensorflow as tf
TB_callback = TensorBoard(
                log_dir="./tensorboard",
                histogram_freq=0,
                write_graph=True,
                write_images=True,
                update_freq='epoch',
                # embeddings_freq=10
            )

class ResetStatesCallback(tf.keras.callbacks.Callback):
    #def __init__(self):
    #   self.counter = 0

    def on_batch_begin(self, batch, logs={}):
        for layer in self.model.layers:
            if hasattr(layer, 'reset_states'):
                layer.reset_states()
        #if self.counter % max_len == 0:
        #self.counter += 1
        

In [3]:
def regional_train_samples(model, df, enso, test_year, columns_to_normalize = ['casos', 'epiweek', 'enso'], episcanner = False, clima = False, min_year = None):
    '''
    Generate the train date from all the health regions 
    '''

    features = len(columns_to_normalize)+1
    X_train = np.empty((0, 89, features))
    y_train = np.empty((0, 52))

    list_of_enso_indicators = ['enso', 'iod', 'pdo']

    indicators = [item for item in list_of_enso_indicators if item in columns_to_normalize]

    for geo in df.regional_geocode.unique():
        
        if clima:
            df_w = prep.aggregate_data_clima(df, geo, column = 'regional_geocode')
        else: 
            df_w = prep.aggregate_data(df, geo, column = 'regional_geocode')

        
        #df_w['inc'] = 10*df_w['casos']/df_pop_region.loc[df_pop_region.regional_geocode==geo]['pop'].values[0]
        df_w['pop_norm'] = df_pop_region.loc[df_pop_region.regional_geocode==geo]['pop_norm'].values[0]

        if episcanner: 

            df_w = df_w.reset_index().merge( df_all_epi.loc[df_all_epi.code_region == geo][['year',
                                                          'R0', 
                                                          'peak_week',
                                                          'total_cases', 
                                                          'perc_geocode']], how = 'left', left_on = 'year', right_on = 'year').set_index('date')

            df_w = df_w.fillna(0)
        
        data = df_w.merge(enso[indicators], left_index = True, right_index = True)

        X_train, y_train, norm_values = prep.get_train_data(data.loc[data.year < test_year], columns_to_normalize= columns_to_normalize, min_year = min_year)

        X_train, X_val, y_train, y_val = train_test_split(
                                        X_train, y_train, test_size=0.2, shuffle = False, random_state=42)

        #print(X_val.shape)
        
        hist = model.fit(
                    X_train,
                    y_train,
                    batch_size=1,
                    epochs=100,
                    verbose=0,
                    shuffle = False, 
                    validation_data=(X_val, y_val),
                    callbacks=[TB_callback,ResetStatesCallback(), EarlyStopping(monitor='val_loss', min_delta=0,  patience=15)]
                )
        
    return model 

In [4]:
STATE  = 'CE'

TEST_YEAR = 2023 

if STATE == 'PR': 

    min_year = 2019

else: 

    min_year = 2013

#columns used in the model
cols_to_norm = ['casos','epiweek', 'enso',  'R0', 'total_cases',
                          'peak_week', 'perc_geocode'] 

print(STATE)
print(TEST_YEAR)

df = df_all.loc[df_all.uf == STATE]

CE
2023


In [5]:
LOSS = 'msle'
batch_size = 1
model_name = 'base_reg_stat'

#create model
model = build_baseline(hidden=64, features=8, predict_n=52, look_back=89, loss=LOSS, 
                    stateful = True, batch_size = batch_size,  optimizer = 'adam', activation = 'relu')

  super().__init__(**kwargs)


None


In [6]:
# train model 
model = regional_train_samples(model, df, enso, TEST_YEAR, columns_to_normalize = cols_to_norm, episcanner = True, clima = False, min_year = None)
   
# save model 
model.save(f'saved_models/model_{STATE}_{TEST_YEAR-1}_{model_name}.keras')

if apply:
    df_preds = sum_regions_predictions(model, df, enso, TEST_YEAR, cols_to_norm, True, False)
    df_preds['adm_1'] = STATE
    df_preds['adm_0'] = 'BR'
    df_preds['adm_2'] = pd.NA
    df_preds.to_csv(f'./predictions/preds_{STATE}_{TEST_YEAR}_{model_name}.csv', index = False)

TEST_YEAR = 2024 
print(TEST_YEAR)

model.compile(loss=LOSS, optimizer = Adam(learning_rate = 0.0005), metrics=["accuracy", "mape", "mse"])

model = regional_train_samples(model, df, enso, TEST_YEAR, columns_to_normalize = cols_to_norm, episcanner = True, clima = False, min_year = None)
  
# save the model
model.save(f'saved_models/model_{STATE}_{TEST_YEAR-1}_{model_name}.keras')

if apply: 
    df_preds = sum_regions_predictions(model, df, enso, TEST_YEAR, cols_to_norm, True, False)
    df_preds['adm_1'] = STATE
    df_preds['adm_0'] = 'BR'
    df_preds['adm_2'] = pd.NA
    df_preds.to_csv(f'./predictions/preds_{STATE}_{TEST_YEAR}_{model_name}.csv', index = False)



2024
