# 1. Load Required Packages

In [35]:
!pip install keras-tuner
!pip install msoffcrypto-tool



In [36]:
from datetime import datetime
from matplotlib import pyplot
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Activation, Dense
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from tensorflow.keras.layers import concatenate
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
#from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
from matplotlib import pyplot
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Activation, Dense, Dropout
from kerastuner.applications import HyperResNet
from kerastuner.tuners import Hyperband
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold

import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io
import msoffcrypto
import tensorflow as tf
import kerastuner as kt

# 3. Load Raw Data

In [None]:

# Frames to be combined into the raw dataset. Nothing in this cell populates
# the list; the loading step has to append to it before the concat below.
df_list = list()

# pd.concat raises ValueError("No objects to concatenate") on an empty list,
# which breaks Restart & Run All. Fall back to an empty frame in that case.
raw = pd.concat(df_list) if df_list else pd.DataFrame()

# 6. Functions for Models

## Backfill Data

In [67]:
def missing_data(df, targeted_length):
    """Backfill missing monthly rows so ``df`` reaches ``targeted_length`` rows.

    Three cases are handled:

    * ``len(df) == targeted_length``: the frame is returned unchanged.
    * Exactly one row is missing and an April-2016 row exists: the April-2016
      row is duplicated with ``Month`` set to 3, and the result is sorted by
      ``Year``/``Month``.
    * Anything else: the frame is reindexed over every
      ``placeholder`` x year 2015-2020 x month 1-12 combination, filling the
      absent rows with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'Year'`` and ``'Month'`` columns; the reindex path also
        needs a ``'placeholder'`` column.
    targeted_length : int
        Number of rows the completed frame is expected to have.

    Returns
    -------
    pandas.DataFrame
        The completed frame (``df`` itself when nothing was missing).
    """
    curr_length = len(df)

    # Nothing to backfill. This path used to fall through to an unassigned
    # local and raise UnboundLocalError.
    if curr_length == targeted_length:
        return df

    april_2016 = df[(df['Year'] == 2016) & (df['Month'] == 4)]

    if len(april_2016) != 0 and (curr_length + 1) == targeted_length:
        # drop=True keeps the old index out of the columns; otherwise the
        # concat below gains an 'index' column that is NaN for every other row.
        march_2016 = april_2016.reset_index(drop=True)
        march_2016.loc[0, 'Month'] = 3

        df_final = pd.concat([df, march_2016])
        # Sort first, then renumber, so the positional index follows
        # chronological order and a later sort_index() keeps that order.
        df_final = df_final.sort_values(['Year', 'Month'], ascending=[True, True])
        df_final = df_final.reset_index(drop=True)
    else:
        mux = pd.MultiIndex.from_product([df['placeholder'].unique(),
                                          range(2015, 2021),
                                          range(1, 13)],
                                         names=['placeholder', 'Year', 'Month'])

        df_final = (df.set_index(['placeholder', 'Year', 'Month'])
                      .reindex(mux, fill_value=0)
                      .reset_index())

    return df_final

## Stationarity Test Functions

In [68]:
def adt_check(ts):
    """Run the Augmented Dickey-Fuller test on ``ts.data``.

    Returns a list holding one ``[stationary, non_stationary]`` one-hot pair:
    ``[0, 1]`` when the 5% critical value is below the test statistic
    (unit root not rejected), ``[1, 0]`` otherwise.
    """
    def _classify(series):
        result = adfuller(series, autolag='AIC')
        statistic = result[0]
        critical_5pct = result[4]['5%']
        return [0, 1] if critical_5pct < statistic else [1, 0]

    # Only the raw series is tested.
    return [_classify(series) for series in [ts.data]]
    
def kpss_check(ts):
    """Run the KPSS test (``regression='c'``) on ``ts.data``.

    Returns a list holding one ``[stationary, non_stationary]`` one-hot pair:
    ``[0, 1]`` when the 5% critical value is below the test statistic
    (stationarity rejected), ``[1, 0]`` otherwise.
    """
    patterns = []
    # Only the raw series is tested.
    candidates = [ts.data]
    for series in candidates:
        # Test the loop item itself. The earlier version always re-tested
        # ts.data, which would silently repeat the same result for every
        # additional series placed in the list.
        result = kpss(series, regression='c')
        statistic, critical_5pct = result[0], result[3]['5%']
        if critical_5pct < statistic:
            patterns.append([0, 1])
        else:
            patterns.append([1, 0])
    return patterns
    
def stationary_check(ts):
    """Classify a series by combining the ADF and KPSS test outcomes.

    Parameters
    ----------
    ts : pandas.DataFrame
        Frame with a ``'data'`` column. Two helper columns, ``'z_data'``
        (natural log of ``data``) and ``'zp_data'`` (first difference of
        ``z_data``), are written onto the frame passed in.

    Returns
    -------
    str
        ``'stationary'``      both tests indicate stationarity;
        ``'non_stationary'``  both tests indicate non-stationarity;
        ``'trend'``           KPSS stationary, ADF non-stationary
                              (trend-stationary: remove the trend);
        ``'difference'``      KPSS non-stationary, ADF stationary
                              (difference-stationary: difference the series).
    """
    # Kept for callers that read these columns afterwards. np.log yields
    # -inf/NaN (with a RuntimeWarning) for non-positive values.
    ts['z_data'] = np.log(ts['data'])
    ts['zp_data'] = ts['z_data'] - ts['z_data'].shift(1)

    # Each check returns [[stationary_flag, non_stationary_flag]].
    adf_stationary = adt_check(ts)[0][0] == 1
    kpss_stationary = kpss_check(ts)[0][0] == 1

    if adf_stationary and kpss_stationary:
        return 'stationary'
    if not adf_stationary and not kpss_stationary:
        return 'non_stationary'
    # Mixed outcomes. The two labels were previously swapped relative to the
    # statsmodels ADF/KPSS guidance, so the wrong transform was applied.
    if kpss_stationary:
        return 'trend'
    return 'difference'

In [69]:
def log_transform(data, inverse=False):
    """Apply ``log(1 + x)`` or, with ``inverse=True``, undo it.

    Parameters
    ----------
    data : scalar, array-like or pandas object
        Values to transform.
    inverse : bool, default False
        When True, map transformed values back with ``exp(y) - 1``.

    Returns
    -------
    Same kind of object NumPy returns for the input (Series in, Series out).
    """
    if inverse:
        # The inverse of log(x + 1) is exp(y) - 1. The previous "+ 1" shifted
        # every back-transformed value up by 2.
        return np.expm1(data)
    return np.log1p(data)

def trend_transform(data, mean=None, std=None, inverse=False):
    """Standardise a series against its 12-period rolling mean/std, or undo it.

    Forward (``inverse=False``): returns a 3-tuple of the standardised series,
    the rolling-mean values and the rolling-std values (the latter two as
    arrays via ``.values``).

    Inverse (``inverse=True``): takes the last ``len(data)`` entries of ``std``
    and ``mean`` and returns ``value * std + mean`` element-wise as a list.
    """
    if not inverse:
        window = data.rolling(window=12)
        rolling_mean = window.mean()
        rolling_std = window.std()
        standardised = (data - rolling_mean) / rolling_std
        return standardised, rolling_mean.values, rolling_std.values

    count = len(data)
    # Align the stored statistics with the tail of the series being restored.
    tail_std = std[-count:]
    tail_mean = mean[-count:]
    return [value * scale + offset
            for value, scale, offset in zip(data, tail_std, tail_mean)]

def difference_transform(data, inverse=False, initial=0.0):
    """First-difference a series, or rebuild levels from differences.

    Parameters
    ----------
    data : pandas.Series (forward) or array-like (inverse)
        Forward: the series to difference. Inverse: the differenced values.
    inverse : bool, default False
        When True, integrate the differences back into levels.
    initial : float, default 0.0
        Level immediately preceding ``data[0]``; only used when
        ``inverse=True``. With the default, the result is the level relative
        to that unknown starting point.

    Returns
    -------
    pandas.Series (forward; the first element is NaN) or list (inverse; same
    length as ``data``).
    """
    if inverse:
        # Undoing a first difference is a running sum. The previous code
        # returned 2 * x_i + x_0 with one element fewer, and raised TypeError
        # for plain Python lists.
        steps = np.asarray(data, dtype=float)
        return list(initial + np.cumsum(steps))
    return data - data.shift(1)

## LSTM Model Functions

In [70]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe a series as a supervised-learning table of lagged columns.

    Parameters
    ----------
    data : list or 2-D array-like
        Observations; a list is treated as a single variable, otherwise the
        number of variables is ``data.shape[1]``.
    n_in : int, default 1
        Number of lagged input steps (columns ``t-n_in`` ... ``t-1``).
    n_out : int, default 1
        Number of output steps (columns ``t`` ... ``t+n_out-1``).
    dropnan : bool, default True
        Drop rows that contain NaN introduced by shifting.

    Returns
    -------
    pandas.DataFrame with columns named ``var<j>(t-<i>)``, ``var<j>(t)`` and
    ``var<j>(t+<i>)``.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    frame = DataFrame(data)
    var_ids = range(1, n_vars + 1)

    shifted, labels = [], []

    # Inputs: oldest lag first (t-n_in, ..., t-1).
    for lag in range(n_in, 0, -1):
        shifted.append(frame.shift(lag))
        labels.extend('var%d(t-%d)' % (var, lag) for var in var_ids)

    # Outputs: t, t+1, ..., t+n_out-1.
    for lead in range(n_out):
        shifted.append(frame.shift(-lead))
        if lead == 0:
            labels.extend('var%d(t)' % var for var in var_ids)
        else:
            labels.extend('var%d(t+%d)' % (var, lead) for var in var_ids)

    agg = concat(shifted, axis=1)
    agg.columns = labels
    return agg.dropna() if dropnan else agg

In [71]:
def loss_plots(model, train_X, train_y, test_X, test_y):
    """Fit ``model`` and save a plot of its training and validation loss.

    The model is fitted for 50 epochs with batch size 72 and no shuffling,
    using ``(test_X, test_y)`` as validation data. The ``'loss'`` and
    ``'val_loss'`` histories are drawn on the current pyplot figure, which is
    saved as ``placeholder.png``. Nothing is returned.
    """
    fit_history = model.fit(
        train_X, train_y,
        epochs=50,
        batch_size=72,
        validation_data=(test_X, test_y),
        verbose=2,
        shuffle=False,
    )

    curves = (('loss', 'train'), ('val_loss', 'test'))
    for history_key, legend_label in curves:
        pyplot.plot(fit_history.history[history_key], label=legend_label)

    pyplot.legend()
    pyplot.title('placeholder')
    pyplot.savefig('placeholder' + '.png')

In [72]:
from statsmodels.tsa.holtwinters import SimpleExpSmoothing, Holt
import xgboost
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA
!pip install pmdarima
#!pip install pandas==0.24
import pmdarima as pm



# 7. Run Models

In [73]:

# Holt ES
def holt_es(train, test):
    """Fit Holt's exponential smoothing on ``train`` and forecast ``len(test)`` steps.

    ``train`` is converted with ``np.asarray`` before fitting; ``test`` is only
    used for its length. Returns the forecast produced by the fitted model.
    """
    horizon = len(test)
    fitted = Holt(np.asarray(train)).fit(optimized=True)
    return fitted.forecast(horizon)

# baseline model - persistence model
def model_persistence(x):
    """Persistence forecast: return the input observation unchanged."""
    return x

def baseline_model(test):
    """Apply the persistence model to every element of ``test``.

    Returns a list with one prediction per element, in iteration order.
    """
    return [model_persistence(observation) for observation in test]

def arima_model(train, test):
    """Fit a non-seasonal auto-ARIMA on ``train`` and forecast ``len(test)`` steps.

    The order search is first run with ``test='adf'`` for choosing ``d``. If
    that raises, the search is retried with the same settings but without the
    ``test`` argument.

    Parameters
    ----------
    train : array-like
        Training observations passed to ``pm.auto_arima``.
    test : sized
        Only its length is used, as the forecast horizon.

    Returns
    -------
    The result of ``model.predict(n_periods=len(test))``.
    """
    # Shared by both attempts so the two searches cannot drift apart.
    search_kwargs = dict(
        start_p=0, start_q=1,
        max_p=6, max_q=6,        # maximum p and q
        m=1,                     # frequency of series
        d=None,                  # let model determine 'd'
        seasonal=False,          # No Seasonality
        D=0,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True,
    )
    try:
        # use adftest to find optimal 'd'
        model = pm.auto_arima(train, test='adf', **search_kwargs)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit still
        # stop a long-running search.
        model = pm.auto_arima(train, **search_kwargs)

    # make predictions
    return model.predict(n_periods=len(test))




In [None]:
# Evaluate baseline (persistence), Ridge and ARIMA forecasts per series and
# collect RMSE/MAE for each into output_dict[k].
# NOTE(review): df_final is read before it is assigned in this cell, so it must
# already exist in the kernel. It is also transformed in place on every pass -
# confirm it is rebuilt per pass upstream.
output_dict = dict()
for k in ['hi']:
    output_dict[k] = dict()
    for i in range(len('placeholder')):
        # Fresh accumulators per pass. They were never initialised before
        # (NameError) and would otherwise pile up duplicate column names.
        names, tests = [], []
        try:
            df_final = df_final.astype(int)

            # Do Stationary Checks for Cases
            ts = pd.DataFrame(df_final[k])
            ts.columns = ['data']
            ts = ts.sort_index()
            stnry = stationary_check(ts)
        except Exception as e:
            # Best-effort: skip a series that cannot be tested, but report it
            # instead of dropping it silently.
            print(f'Skipping {k} (pass {i}): {e!r}')
            continue
        if stnry == 'non_stationary':
            df_final[k] = log_transform(df_final[k])
        elif stnry == 'trend':
            df_final[k], mean_k, std_k = trend_transform(df_final[k])
        elif stnry == 'difference':
            df_final[k] = difference_transform(df_final[k])
        df_final = df_final.dropna()

        # UNIVARIATE
        values = pd.DataFrame(df_final[k])
        # Number of trailing observations held out for evaluation
        num_holdout = 16

        # ---- Baseline (persistence) ----
        # create lag
        lagged_df = pd.concat([values.shift(1), values], axis=1)
        lagged_df.columns = ['t-1', 't+1']
        # load dataset
        values = lagged_df.values
        # ensure all data is float
        values = values.astype('float32')
        # scale data
        # NOTE(review): the scaling is never inverted before the inverse
        # stationarity transforms below - confirm this is intended.
        scaler = MinMaxScaler(feature_range=(0, 3))
        scaled = scaler.fit_transform(values)
        values = scaled

        # split into train and test set (row 0 holds the NaN from the shift)
        train = values[1:len(values)-num_holdout, :]
        test = values[len(values)-num_holdout:, :]
        train_X, train_y = train[:, 0], train[:, 1]
        test_X, test_y = test[:, 0], test[:, 1]
        predictions = baseline_model(test_X)

        if stnry == 'non_stationary':
            test_y_t, predictions = log_transform(test_y, inverse=True), log_transform(predictions, inverse=True)
        elif stnry == 'trend':
            test_y_t, predictions = trend_transform(test_y, mean=mean_k, std=std_k, inverse=True), trend_transform(predictions, mean=mean_k, std=std_k, inverse=True)
        elif stnry == 'difference':
            test_y_t, predictions = difference_transform(test_y, inverse=True), difference_transform(predictions, inverse=True)
        else:
            test_y_t = test_y
        rmse_test = sqrt(mean_squared_error(test_y_t, predictions))
        mae_test = mean_absolute_error(test_y_t, predictions)
        names += [f'{k}_baseline_rmse', f'{k}_baseline_mae']
        tests += [rmse_test, mae_test]

        # ---- Ridge on six lags ----
        df_final = df_final.sort_index()
        # Explicit copy so the lag columns are not assigned onto a slice.
        df_lagged = df_final[[k]].copy()

        # 'lag' rather than 'i' so the outer loop variable is not clobbered.
        for lag in range(1, 7):
            df_lagged["lag_{}".format(lag)] = df_lagged[k].shift(lag)
        df_lagged = df_lagged.dropna()
        y = df_lagged[k]
        X = df_lagged.drop([k], axis=1)
        # Split from the end so the holdout is num_holdout rows like the other
        # models. The old split used the length of the unlagged array, which
        # left only num_holdout - 6 test rows.
        train_y, test_y = y.iloc[:-num_holdout], y.iloc[-num_holdout:]
        train_X, test_X = X.iloc[:-num_holdout], X.iloc[-num_holdout:]
        Ridge = RidgeCV(alphas=np.arange(0.1, 10, 0.1), cv=10)
        Ridge.fit(train_X, train_y)
        predictions = Ridge.predict(test_X)
        if stnry == 'non_stationary':
            test_y_t, predictions = log_transform(test_y, inverse=True), log_transform(predictions, inverse=True)
        elif stnry == 'trend':
            test_y_t, predictions = trend_transform(test_y, mean=mean_k, std=std_k, inverse=True), trend_transform(predictions, mean=mean_k, std=std_k, inverse=True)
        elif stnry == 'difference':
            test_y_t, predictions = difference_transform(test_y, inverse=True), difference_transform(predictions, inverse=True)
        else:
            test_y_t = test_y
        rmse_test = sqrt(mean_squared_error(test_y_t, predictions))
        mae_test = mean_absolute_error(test_y_t, predictions)
        names += [f'{k}_ridge_rmse', f'{k}_ridge_mae']
        tests += [rmse_test, mae_test]

        # ---- ARIMA ----
        values = pd.DataFrame(df_final[k]).values
        train = values[:len(values)-num_holdout, 0]
        test = values[len(values)-num_holdout:, 0]
        predictions = arima_model(train, test)
        # evaluate model
        if stnry == 'non_stationary':
            test_y_t, predictions = log_transform(test, inverse=True), log_transform(predictions, inverse=True)
        elif stnry == 'trend':
            test_y_t, predictions = trend_transform(test, mean=mean_k, std=std_k, inverse=True), trend_transform(predictions, mean=mean_k, std=std_k, inverse=True)
        elif stnry == 'difference':
            test_y_t, predictions = difference_transform(test, inverse=True), difference_transform(predictions, inverse=True)
        else:
            test_y_t = test
        rmse_test = sqrt(mean_squared_error(test_y_t, predictions))
        mae_test = mean_absolute_error(test_y_t, predictions)
        names += [f'{k}_arima_rmse', f'{k}_arima_mae']
        tests += [rmse_test, mae_test]

        # One row of metrics, one column per model/metric pair.
        output = pd.DataFrame([tests], columns=names)
        output_dict[k]['placeholder'] = output



