In [1]:
# Mount Google Drive into the Colab filesystem so the data files can be read.
# NOTE(review): later cells use relative paths ("drive/My Drive/..."); that
# presumably works because the Colab working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error
import os

In [0]:
# Forecast horizon; handed to getXY() as `weeksAhead` by the training cell.
weeks_ahead = 2

# Names of all entries in the processed-data directory (one file per region).
files = os.listdir(
    "drive/My Drive/ziknet-trends-rolling/data/Colombia/processed_data"
)

In [4]:
# Load every CSV listed in `files` into `datasets`, keyed by file name.
datasets = {}
# Never populated in the visible code; kept so existing references do not break.
minNDictionary = {}
processed_dir = "drive/My Drive/ziknet-trends-rolling/data/Colombia/processed_data"
for file in files:
    # The directory listing may contain non-CSV entries (for example a
    # `.ipynb_checkpoints` folder); feeding those to read_csv would crash.
    if not file.lower().endswith(".csv"):
        continue
    frame = pd.read_csv(os.path.join(processed_dir, file))
    # Rescale the search values by a factor of 1/100.
    frame["Searches"] /= 100
    datasets[file] = frame
    print(file)
    print(frame.tail())
    print("Number of observations: ", len(frame))

    

valle_cauca_2016-2017.csv
           Date  Searches  Cases
99   26/11/2017      0.21    -12
100  03/12/2017      0.14     24
101  10/12/2017      0.01      8
102  17/12/2017      0.10    -49
103  24/12/2017      0.00      3
Number of observations:  104
santander_norte_2016-2017.csv
           Date  Searches  Cases
99   26/11/2017      0.28      0
100  03/12/2017      0.00      1
101  10/12/2017      0.04      4
102  17/12/2017      0.00      4
103  24/12/2017      0.65      3
Number of observations:  104
huila_2016-2017.csv
           Date  Searches  Cases
99   26/11/2017      1.00     -1
100  03/12/2017      1.00      1
101  10/12/2017      0.00      1
102  17/12/2017      0.71      0
103  24/12/2017      0.00      0
Number of observations:  104
santander_2016-2017.csv
           Date  Searches  Cases
99   26/11/2017      0.30     -3
100  03/12/2017      0.00      7
101  10/12/2017      0.02      4
102  17/12/2017      0.00     -1
103  24/12/2017      0.33     -5
Number of observation

In [0]:
def series_to_supervised(df, outputColumn, n_in=1, n_out=1, dropnan=True):
    """Reframe a time-indexed DataFrame as a supervised-learning table.

    Each output row holds the values of every column of ``df`` at lags
    ``t-n_in .. t-1`` followed by a single target column: ``outputColumn``
    shifted ``n_out - 1`` rows into the future (so the target lies ``n_out``
    rows after the most recent lagged input row).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data, one row per time step.
    outputColumn : column label
        Column of ``df`` used as the prediction target. Any label type that
        ``str.format`` can render is accepted.
    n_in : int, default 1
        Number of lagged time steps to include as inputs.
    n_out : int, default 1
        Forecast horizon; the target is ``df[outputColumn].shift(-(n_out - 1))``.
    dropnan : bool, default True
        Drop the rows made incomplete by shifting.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``"<col>(t-<lag>)"`` for the inputs (oldest lag
        first) and ``"<outputColumn>(t+<n_out-1>)"`` for the target.

    Raises
    ------
    KeyError
        If ``outputColumn`` is not a column of ``df``.
    """
    cols, names = [], []

    # Input sequence (t-n_in, ..., t-1): every column, oldest lag first.
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names.extend("{}(t-{})".format(col, lag) for col in df.columns)

    # Target: the output column n_out - 1 rows ahead of the current row.
    cols.append(df[outputColumn].shift(-(n_out - 1)))
    # Use format() like the input names so non-string labels work too.
    names.append("{}(t+{})".format(outputColumn, n_out - 1))

    # Put it all together.
    agg = pd.concat(cols, axis=1)
    agg.columns = names

    # Rows at either end have NaNs introduced by the shifts.
    if dropnan:
        agg = agg.dropna()
    return agg

In [0]:
def getXY(dataset, state, weeksAhead, n_weeks=4, outputColumn="Cases"):
    """Build model inputs and targets from a per-region DataFrame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns, one row per time step; must contain ``outputColumn``.
    state : any
        Unused; kept so existing callers keep working.
    weeksAhead : int
        Forecast horizon, forwarded to ``series_to_supervised`` as ``n_out``.
    n_weeks : int, default 4
        Number of lagged time steps in each input window.
    outputColumn : str, default "Cases"
        Column used as the prediction target.

    Returns
    -------
    (x, y) : tuple of numpy.ndarray
        ``x`` has shape ``(samples, n_weeks, n_features)`` and ``y`` has
        shape ``(samples,)``.
    """
    n_features = dataset.shape[1]

    reframed = series_to_supervised(dataset, outputColumn, n_weeks, weeksAhead)
    values = reframed.values

    # The target is the last column; everything before it is lagged input.
    x, y = values[:, :-1], values[:, -1]

    # Inputs are laid out lag-major (all features of the oldest lag first),
    # so a plain reshape yields (samples, time steps, features).
    x = x.reshape((x.shape[0], n_weeks, n_features))
    return x, y

In [7]:
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Flatten, Dense, LSTM, BatchNormalization
from keras.optimizers import Adam
from keras.layers.merge import concatenate
from keras.constraints import non_neg
def LSTM_NN_Model(n_weeks=4, n_features=2):
    """Build and compile the two-branch LSTM + dense regression model.

    Parameters
    ----------
    n_weeks : int, default 4
        Number of time steps in each input window.
    n_features : int, default 2
        Number of features per time step.

    Returns
    -------
    keras.models.Model
        Compiled model taking input of shape ``(n_weeks, n_features)`` and
        producing one linear output; loss is MSE, MAE is tracked as a metric.
    """
    input_layer = Input(shape=(n_weeks, n_features))

    # Branch 1: recurrent summary of the whole window (last state only).
    b1_out = LSTM(16, return_sequences=False)(input_layer)

    # Branch 2: dense layer applied to the raw window, then flattened.
    b2_out = Dense(128, activation="relu", kernel_regularizer="l2")(input_layer)
    b2_out = Flatten()(b2_out)

    concatenated = concatenate([b1_out, b2_out])

    out = Dense(8, activation="relu", kernel_regularizer="l2")(concatenated)
    out = Dense(8, activation="relu", kernel_regularizer="l2")(out)
    # Unconstrained linear output: the "Cases" values shown in the loaded
    # data include negative numbers.
    out = Dense(1, activation="linear", name='output_layer')(out)

    model = Model([input_layer], out)
    model.compile(loss=["mse"], optimizer=Adam(0.0001), metrics=["mae"])

    return model

Using TensorFlow backend.


In [8]:
# Walk-forward evaluation: for each region, train on the early part of the
# series, then predict the held-out samples one at a time, refitting the
# model after every prediction. Results are written to one CSV per region.

# Number of trailing samples held out for evaluation.
TEST_WEEKS = 52

# The output folder is derived from the forecast horizon so that changing
# `weeks_ahead` does not overwrite results of another horizon.
output_dir = "drive/My Drive/ziknet-trends-rolling/Colombia/LSTM-NN/{}Week".format(weeks_ahead)
# Create the folder up front; otherwise a missing directory only surfaces
# after the first region has finished training.
os.makedirs(output_dir, exist_ok=True)

for file in datasets:
    dataset = datasets[file]
    x, y = getXY(dataset[["Searches", "Cases"]], "", weeks_ahead)
    model = LSTM_NN_Model()

    splitIndex = len(x) - TEST_WEEKS
    if splitIndex <= 0:
        raise ValueError(
            "{}: need more than {} samples to hold out a test set, got {}".format(
                file, TEST_WEEKS, len(x)))

    train_X = x[:splitIndex]
    train_y = y[:splitIndex]

    test_y = y[splitIndex:]
    outDataset = pd.DataFrame()
    # NOTE(review): the header is misspelled ("Observerd"); kept unchanged so
    # the CSV schema matches files already produced by this notebook.
    outDataset["Observerd"] = test_y

    # Initial fit on the training portion only.
    model.fit(
        train_X,
        train_y,
        epochs=200,
        batch_size=4,
        verbose=0,
        shuffle=False)

    predicted_y_history = []

    # NOTE(review): with weeks_ahead > 1 the target of the newest training
    # sample lies after the last input of the sample being predicted, so the
    # training slices may look ahead by weeks_ahead - 1 steps — confirm
    # whether a gap between the training data and the prediction is wanted.
    while splitIndex < len(y):
        # Predict the next unseen sample (slice keeps the batch dimension).
        predicted_y = model.predict(x[splitIndex:splitIndex + 1])[0]

        # Refit on everything up to and including the sample just predicted.
        model.fit(
            x[:splitIndex + 1],
            y[:splitIndex + 1],
            epochs=40,
            batch_size=4,
            verbose=0,
            shuffle=False)

        predicted_y_history.append(predicted_y[0])
        splitIndex += 1

    outDataset["PREDICTED"] = predicted_y_history
    print("{} MSE: {}".format(file, mean_squared_error(test_y, predicted_y_history)))
    outDataset.to_csv(os.path.join(output_dir, file))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
valle_cauca_2016-2017.csv MSE: 872.6370848436125
santander_norte_2016-2017.csv MSE: 40.160101343968286
huila_2016-2017.csv MSE: 9.693074431589839
santander_2016-2017.csv MSE: 58.578134715413434
tolima_2016-2017.csv MSE: 27.809384799788955
