In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error
import os

In [2]:
files = os.listdir("C:\\Users\\kevin\\ziknet-trends-rolling\\data\\Brazil\\processed_data\\")
weeks_ahead = 3

In [3]:
datasets = {}
for file in files:
    datasets[file] = pd.read_csv("C:\\Users\\kevin\\ziknet-trends-rolling\\data\\Brazil\\processed_data\\{}".format(file))
    print(file)
    print(datasets[file].head())
    print("Number of observations: ", len(datasets[file]))

Bahia_2016-2017.csv
         Date  Searches        Cases
0  04/01/2016        20   316.198743
1  11/01/2016        28  1053.232914
2  18/01/2016        22  1790.267086
3  25/01/2016        40  2527.301257
4  01/02/2016        45  3264.335428
Number of observations:  104
MatoGrosso_2016-2017.csv
         Date  Searches        Cases
0  04/01/2016       100  1455.781452
1  11/01/2016       100  1676.927151
2  18/01/2016       100  1898.072849
3  25/01/2016       100  2119.218548
4  01/02/2016        91  2340.364247
Number of observations:  104
MinasGerais_2016-2017.csv
         Date  Searches       Cases
0  04/01/2016        16 -139.037456
1  11/01/2016        23   70.654181
2  18/01/2016        36  280.345819
3  25/01/2016        53  490.037456
4  01/02/2016        66  699.729093
Number of observations:  104
RioDeJaneiro_2016-2017.csv
         Date  Searches        Cases
0  04/01/2016        50  3123.170605
1  11/01/2016        64  3387.890202
2  18/01/2016        70  3652.609798
3  25/0

In [4]:
def series_to_supervised(df, outputColumn, n_in=1, n_out=1, dropnan=True):
    n_vars = df.shape[1]
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [("{}(t-{})".format(col, i)) for col in df.columns]

    # Append next observation[outputColumn] at n_out obs
    cols.append(df[outputColumn].shift(-n_out+1))
    names+=[outputColumn + "(t+{})".format(n_out-1)]

    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [5]:
def getXY(dataset, state, weeksAhead):
    n_features = dataset.shape[1]
    
    n_weeks = 4
    reframed = series_to_supervised(dataset, "Cases",  n_weeks, weeksAhead)
    values = reframed.values
    
    totalFeatures = values.shape[1]

    x,y = values[:, :totalFeatures-1], values[:, totalFeatures-1] #Y is the last column, X is all the previous columns 

    x = x.reshape((x.shape[0], n_weeks, n_features)) # Reshape as 3-D
    return x, y

In [6]:
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Flatten, Dense, LSTM
from keras.optimizers import Adam
from keras.layers.merge import concatenate

def LSTM_NN_Model():
    input_layer = Input(shape=(4,2))
    b1_out = LSTM(64, return_sequences=False)(input_layer)

    b2_out = Dense(32, activation="relu", kernel_regularizer="l2")(input_layer)
    b2_out = Flatten()(b2_out)

    concatenated = concatenate([b1_out, b2_out])
    out = Dense(4, activation="relu", kernel_regularizer="l2")(concatenated)
    out = Dense(4, activation="relu", kernel_regularizer="l2")(out)
    # out = Dense(1, activation="linear", kernel_constraint=non_neg(), name='output_layer')(out)
    out = Dense(1, activation="linear", name='output_layer')(out)

    model = Model([input_layer], out)
    model.compile(loss=["mse"], optimizer=Adam(0.0001), metrics=["mae"])

    return model

Using TensorFlow backend.


In [7]:
for file in datasets:
    dataset = datasets[file]
    x, y = getXY(dataset[["Searches", "Cases"]], "", weeks_ahead)
    model = LSTM_NN_Model()
    
    xDim1 = x[0].shape[0]
    xDim2 = x[0].shape[1]
    
    splitIndex = len(x) - 52

    train_X = x[:splitIndex]
    train_y = y[:splitIndex]

    test_y = y[splitIndex:]
    outDataset = pd.DataFrame()
    outDataset["Observerd"] = test_y

    model.fit(
        train_X,
        train_y,
        epochs=200,
        batch_size=32,
        verbose=0,
        shuffle=False)

    predicted_y_history = []

    while(splitIndex < len(y)):
        predicted_y = \
            model.predict(x[splitIndex].reshape(1, xDim1, xDim2))[0]

        model.fit(
            x[:splitIndex+1],
            y[:splitIndex+1],
            epochs=1,
            batch_size=32,
            verbose=0,
            shuffle=False)

        predicted_y_history.append(predicted_y[0])
        splitIndex += 1

    outDataset["PREDICTED"] = predicted_y_history
    print("{} RMSE: {}".format(file, mean_squared_error(test_y, predicted_y_history)))
    outDataset.to_csv(file)

Bahia_2016-2017.csv RMSE: 1840.6563177792336
MatoGrosso_2016-2017.csv RMSE: 6382.556362647708
MinasGerais_2016-2017.csv RMSE: 515.9185777977034
RioDeJaneiro_2016-2017.csv RMSE: 6526.937578130544
SaoPaulo_2016-2017.csv RMSE: 239.36991364772882
