<a href="https://colab.research.google.com/github/gveloso9983/CSC/blob/main/csc_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt


# Fix the TensorFlow and NumPy random seeds so repeated runs are replicable
tf.random.set_seed(91195003)
np.random.seed(91190530)
# Reset the Keras backend session state so the cell can be re-run from a clean session
tf.keras.backend.clear_session()

################
#1ª PARTE
################

#Load dataset
def load_dataset(path):
    """Read the CSV found at ``path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(path)
    return frame


#split data into training and validation sets
def split_data(training, perc=10):
    """Split the positions ``0 .. len(training) - 1`` into two index ranges.

    The trailing ``perc`` percent of the positions become the validation
    range and everything before them the training range, so the original
    (chronological) order is preserved. Only the length of ``training`` is
    used; its values are never read.

    Args:
        training: Any sized sequence (e.g. the index array of one CV fold).
        perc: Percentage of the positions reserved for validation.

    Returns:
        Tuple ``(train_idx, val_idx)`` of contiguous NumPy integer ranges
        that together cover every position exactly once.
    """
    total = len(training)
    cut = int(total * (100 - perc) / 100)
    train_idx = np.arange(0, cut)
    # Validation starts exactly where training stops: no sample is skipped
    # and short inputs still get a non-empty validation range.
    val_idx = np.arange(cut, total)
    return train_idx, val_idx


#Plot time series data
def plot_confirmed_cases(data):
    """Draw ``data`` as a line chart against its positional index and show it."""
    positions = range(len(data))
    plt.figure(figsize=(8, 6))
    plt.plot(positions, data)
    # Labels and title are independent of each other; set them before showing.
    plt.xlabel('Days')
    plt.ylabel('Cases')
    plt.title('Confirmed Cases of COVID-19')
    plt.show()

def data_normalization(df, norm_range=(-1, 1), column='Nr_acidentes'):
    """Scale one column of ``df`` in place and return the fitted scaler.

    Args:
        df: DataFrame holding the column to scale. It is modified in place:
            the column is overwritten with its scaled values.
        norm_range: ``(min, max)`` target range handed to ``MinMaxScaler``.
        column: Name of the column to scale. Defaults to ``'Nr_acidentes'``,
            the column this function has always scaled.

    Returns:
        The fitted ``MinMaxScaler``; keep it to map predictions back to the
        original scale with ``inverse_transform``.
    """
    #[-1, 1] for LSTM due to the internal use of tanh by the memory cell
    scaler = MinMaxScaler(feature_range=norm_range)
    # Double brackets keep the selection 2-D, as fit_transform expects.
    df[[column]] = scaler.fit_transform(df[[column]])
    return scaler

#plot learning curve
def plot_learning_curves(hist_list, approach):
    """Placeholder for learning-curve plotting; currently does nothing.

    Both arguments are accepted and ignored, and ``None`` is returned.
    """
    return None

################
#2ª PARTE
################

# build our supervised problem
#Preparing the dataset for the LSTM
def to_supervised(df, timesteps):
    """Turn a time-ordered frame into (input window, next value) pairs.

    Each sample in ``X`` is a run of ``timesteps`` consecutive rows (all
    columns) and the matching entry in ``y`` is the first column of the row
    that immediately follows that run.

    Args:
        df: Two-dimensional frame (rows = time steps, columns = features);
            its ``.values`` array is what gets sliced.
        timesteps: Number of consecutive rows in each input window.

    Returns:
        Tuple ``(X, y)`` of ``float32`` arrays. With ``n`` rows there are
        ``n - timesteps`` samples: ``X`` has shape
        ``(samples, timesteps, features)`` and ``y`` has shape
        ``(samples, 1)``. When there are not enough rows for a single
        window both arrays are empty.
    """
    data = df.values
    dataset_size = len(data)

    X, y = [], []
    # A window starting at `start` uses rows start .. start+timesteps-1 as
    # input and row start+timesteps as label, so the last usable start is
    # dataset_size - timesteps - 1. This includes the final row as a label.
    for start in range(dataset_size - timesteps):
        input_end = start + timesteps
        X.append(data[start:input_end, :])
        y.append(data[input_end:input_end + 1, 0])

    #using np.float32 for GPU performance
    return np.array(X).astype('float32'), np.array(y).astype('float32')


################
#3ª PARTE
################

#Building the model
def rmse(y_true, y_pred):
    """Root of the mean squared difference between ``y_pred`` and ``y_true``.

    Built from Keras backend ops so it can be passed to ``model.compile``
    as a loss or as a metric.
    """
    backend = tf.keras.backend
    squared_error = backend.square(y_pred - y_true)
    return backend.sqrt(backend.mean(squared_error))


def build_model(timesteps, features, h_neurons=64, activation='tanh'):
    """Assemble the LSTM -> Dense -> Dense(1) network (not compiled).

    Args:
        timesteps: Length of each input sequence.
        features: Number of features per time step.
        h_neurons: Units used by both the LSTM layer and the hidden Dense layer.
        activation: Activation of the hidden Dense layer.

    Returns:
        The ``Sequential`` model. As a side effect a diagram of the model is
        written to ``covid19_model.png``.
    """
    layers = tf.keras.layers

    network = tf.keras.models.Sequential()
    network.add(layers.LSTM(units=h_neurons, input_shape=(timesteps, features)))
    network.add(layers.Dense(units=h_neurons, activation=activation))
    # Single linear output unit: one predicted value per input sequence.
    network.add(layers.Dense(units=1, activation='linear'))

    # Save a picture of the architecture (with tensor shapes) as PNG.
    tf.keras.utils.plot_model(model=network, to_file='covid19_model.png', show_shapes=True)

    return network


################
#4ª PARTE
################
#Compile and fit the model

def compile_and_fit(model, epochs, batch_size):
    """Compile ``model`` and fit it fold by fold using a time-series split.

    Reads the module-level names ``X``, ``y`` and ``cv_splits``; they must be
    defined before this function is called. The same model instance is
    fitted on every fold in turn (it is not re-created between folds).

    Args:
        model: Keras model to compile and train.
        epochs: Epochs per fold.
        batch_size: Batch size used by ``fit``.

    Returns:
        Tuple ``(model, hist_list, loss_list)``: the trained model, the
        object returned by ``fit`` for each fold, and element ``[2]`` of
        each fold's ``evaluate`` result (with the metrics compiled here that
        should be the rmse metric - confirm against the Keras version in use).
    """
    model.compile(loss=rmse, optimizer=tf.keras.optimizers.Adam(), metrics=['mae', rmse])

    # Time series cross validator: every test fold lies after its train fold.
    splitter = TimeSeriesSplit(n_splits=cv_splits)
    histories = []
    fold_scores = []

    for fold_train, fold_test in splitter.split(X):
        # Carve a validation tail out of this fold's training positions.
        fit_idx, val_idx = split_data(fold_train, perc=10)

        fit_data = (X[fit_idx], y[fit_idx])
        val_data = (X[val_idx], y[val_idx])
        test_data = (X[fold_test], y[fold_test])

        # shuffle=False keeps the samples in chronological order.
        fold_history = model.fit(*fit_data, validation_data=val_data,
                                 epochs=epochs, batch_size=batch_size, shuffle=False)
        evaluation = model.evaluate(*test_data)

        histories.append(fold_history)
        fold_scores.append(evaluation[2])

    return model, histories, fold_scores





################
#5ª PARTE - Previsão para os próximos 5 dias
################
#Recursive Multi-Step Forecast!!!
def forecast(model, df, timesteps, multisteps, scaler):
    """Predict ``multisteps`` future values by feeding predictions back in.

    Starts from the last ``timesteps`` rows of ``df``. On every step the
    window is reshaped to ``(1, timesteps, 1)``, passed to ``model.predict``,
    and the raw prediction is appended to the window while the oldest value
    falls off.

    Args:
        model: Object exposing ``predict``.
        df: Frame whose last ``timesteps`` rows seed the first window.
        timesteps: Window length expected by the model.
        multisteps: Number of successive predictions to make.
        scaler: Object exposing ``inverse_transform``; applied to each raw
            prediction before it is stored.

    Returns:
        List with one ``scaler.inverse_transform(prediction)`` result per step.
    """
    window = df[-timesteps:].values  # last known sequence
    predictions = []

    for _ in range(multisteps):
        window = window.reshape(1, timesteps, 1)
        yhat = model.predict(window)  # prediction on the model's (scaled) range
        predictions.append(scaler.inverse_transform(yhat))
        # Slide the window: flatten, append the raw prediction, and keep
        # only the most recent `timesteps` values for the next step.
        extended = np.concatenate((np.ravel(window[0]), np.ravel(yhat)))
        window = extended[-timesteps:]

    return predictions

def plot_forecast(data, forecasts):
    """Plot the known series in green and the forecast values in red.

    Each entry of ``forecasts`` is printed and its ``[0][0]`` element is
    taken as the value to plot; the list of extracted values is printed too.
    The forecast curve starts at the x position of the last known point.
    """
    plt.figure(figsize=(8, 6))
    plt.plot(range(len(data)), data, color='green', label='Confirmed')

    # Pull the scalar out of every forecast entry, echoing each entry.
    forecast_values = []
    for entry in forecasts:
        forecast_values.append(entry[0][0])
        print(entry)
    print(forecast_values)

    first_x = len(data) - 1
    forecast_x = range(first_x, first_x + len(forecasts))
    plt.plot(forecast_x, forecast_values, color='red', label='Forecasts')

    plt.xlabel('Days')
    plt.ylabel('Cases')
    plt.title('Confirmed Cases of COVID-19')
    plt.legend()
    plt.show()


# Main Execution
timesteps = 7  # number of days that make up one input sequence
univariate = 1  # number of features used by the model
multisteps = 2  # number of days to forecast recursively
cv_splits = 3  # number of splits of the time series cross validator
epochs = 25
batch_size = 7  # number of sequences per training batch
# NOTE(review): `path` is not used below; the data is read from dataset_test.csv.
path = 'time_series_covid19_confirmed_global.csv'

# Load the raw data and index it chronologically by the "Data" (date) column.
df_raw = pd.read_csv("dataset_test.csv")
df_raw["Data"] = pd.to_datetime(df_raw["Data"])
df_raw = df_raw.sort_values("Data")
df_raw = df_raw.set_index("Data")

# Work on a copy so the raw frame keeps its original (unscaled) values.
df = df_raw.copy()
scaler = data_normalization(df)  # scaling data to [-1, 1]

# our supervised problem
X, y = to_supervised(df, timesteps)
print("Training shape:", X.shape)
print("Training labels shape:", y.shape)

# fitting the model
model = build_model(timesteps, univariate)
model, hist_list, loss_list = compile_and_fit(model, epochs, batch_size)

# NOTE: the model is used as left by the last cross-validation fold; it is
# not retrained on all the available data before forecasting.

# Recursive multi-step forecast of the next `multisteps` days.
forecasts = forecast(model, df, timesteps, multisteps, scaler)
# The loop variable must not be called `forecast`: that would rebind the
# forecast() function name and break any later call to it.
for predicted in forecasts:
    print(predicted)


