# Long Short-term Cognitive Networks

In [1]:
import numpy as np
import pandas as pd

import os, sys, random, time
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer
from sklearn.base import MultiOutputMixin, BaseEstimator

from hampel import hampel

import lstcn
from lstcn.LSTCN import LSTCN
from lstcn.STCN import STCN

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
def reset_random_seeds():

    """ Re-seed every random source used in this file with the value 42.

    Sets the PYTHONHASHSEED environment variable and seeds both the
    built-in `random` module and NumPy's global generator.

    """

    seed = 42
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

# Preparing the data

In [3]:
def load_data(source, n_steps, split=0.8):

    """ Prepare the time series for learning.

    Each column is imputed, cast to float64, passed through `hampel` and
    min-max normalized over the whole series; the series is then split
    chronologically and turned into input/target windows.

    Parameters
    ----------
    source  :   {string} path to the CSV (variables must appear by column);
                '?' entries are read as missing values.
    n_steps :   {int} Number of steps-ahead to be forecast.
    split   :   {float} Proportion of data used for training.
    Returns
    ----------
    data_train, X_train, Y_train, X_test, Y_test, n_features

    """

    df = pd.read_csv(source, na_values='?')
    n_features = len(df.columns)

    for col in df.columns:

        # imputing missing values with the nearest observed value; the result
        # is assigned back because an in-place call on `df[col]` modifies a
        # temporary under pandas Copy-on-Write and would leave `df` untouched
        df[col] = df[col].interpolate(method='nearest')
        df[col] = df[col].astype('float64')

        # removing outliers to obtain more realistic errors
        # NOTE(review): relies on a hampel release whose function accepts
        # `n` and `imputation` -- confirm against the installed version
        df[col] = hampel(df[col], window_size=5, n=3, imputation=True)

        # normalize dataset to facilitate error analysis; a constant column
        # has zero range, so divide by 1 to map it to 0 instead of 0/0 = NaN
        col_min = df[col].min()
        col_range = df[col].max() - col_min
        df[col] = (df[col] - col_min) / (col_range if col_range != 0 else 1.0)

    data = df.to_numpy()
    n_rows = data.shape[0]

    # splitting the data to create the datasets; the training portion keeps
    # n_steps extra rows past the split point
    n_train = int(split * n_rows) + n_steps
    n_test = int((1 - split) * n_rows)

    data_train = data[:n_train,:]
    # an explicit start index is used because `data[-0:]` would return the
    # whole series when the test portion is empty
    data_test = data[max(n_rows - n_test, 0):,:]

    # creating X_train, Y_train, X_test, Y_test
    X_train, Y_train = create_dataset(data_train, n_features, n_steps)
    X_test, Y_test = create_dataset(data_test, n_features, n_steps)

    return data_train, X_train, Y_train, X_test, Y_test, n_features

def create_dataset(data, n_features, n_steps):

    """ Slice a portion of the time series into input/target windows.

    A window of n_steps consecutive rows forms one input sample and the
    n_steps rows that immediately follow it form the matching target.
    Consecutive samples start one row apart.

    Parameters
    ----------
    data         :   {array-like} Portion of the time series (rows are
                     time steps, columns are features).
    n_features   :   {int} Number of features in the time series.
    n_steps      :   {int} Number of steps-ahead to be forecast.
    Returns
    ----------
    X, Y : 2D arrays with one flattened window of n_steps*n_features
           values per row.

    """

    # each sample needs 2*n_steps rows: n_steps inputs plus n_steps targets
    n_windows = data.shape[0] - (2*n_steps - 1)
    starts = range(0, n_windows)

    inputs = [data[s:s + n_steps, :] for s in starts]
    targets = [data[s + n_steps:s + 2*n_steps, :] for s in starts]

    # flatten every (n_steps, n_features) window into a single row
    width = n_steps * n_features
    X = np.array(inputs).reshape((len(inputs), width))
    Y = np.array(targets).reshape((len(targets), width))

    return X, Y

# Performing grid-search

In [4]:
# Runs a grid search of LSTCN hyperparameters for every dataset in `folder`
# and every forecasting horizon from 1 to 5, writing one CSV row per
# (dataset, horizon) pair to output.csv.
folder = 'data'

# the hyperparameter grid is the same for every dataset and horizon
param_search = {
    'alpha': [1.0E-4, 1.0E-2, 1.0, 1.0E+2, 1.0E+4],
    'n_blocks' : range(1,11)
}

# list the folder once, in sorted order, so that the rows of the output file
# come out in the same order on every run; sub-directories (for example
# notebook checkpoint folders) are skipped because they cannot be read as CSV
datasets = sorted(name for name in os.listdir(folder)
                  if os.path.isfile(os.path.join(folder, name)))
# not used below; kept in case other cells read it
n_datasets = len(datasets)

with open("output.csv", "w") as output:

    # printing the header of the output file
    output.write('dataset,train,test,step,time\n')

    for dataset in datasets:

        for n_steps in range(1,6):

            data, X_train, Y_train, X_test, Y_test, n_features = load_data(
                os.path.join(folder, dataset), n_steps)

            reset_random_seeds()
            tscv = TimeSeriesSplit(n_splits=5)
            model = LSTCN(n_features, n_steps)

            # the reported time covers the grid search and the final refit
            start = time.time()
            # greater_is_better=False: the model's score is treated as an
            # error to be minimized
            scorer = make_scorer(model.score, greater_is_better=False)
            gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_search, refit=True,
                                   n_jobs=-1, error_score='raise', scoring=scorer)

            gsearch.fit(X_train, Y_train)
            best_model = gsearch.best_estimator_
            end = time.time()

            # NOTE(review): predictions are passed as the first argument here,
            # whereas make_scorer calls score(y_true, y_pred) -- confirm that
            # LSTCN.score is symmetric in its arguments
            train_error = round(best_model.score(best_model.predict(X_train), Y_train),4)
            test_error = round(best_model.score(best_model.predict(X_test), Y_test),4)

            row = (dataset, train_error, test_error, n_steps, end-start)
            output.write(','.join(str(value) for value in row) + '\n')
            # flush after every row so partial results survive an interruption
            output.flush()