# seq2seq

## Basic structure of encoder - decoder

In [None]:
from IPython.display import Image
from IPython.core.display import HTML 

Image(url='https://miro.medium.com/max/1400/1*w9hrrIEEIl5Uk3BiL4WFog.png')

## Basic structure of an LSTM cell

In [None]:
Image(url='https://media.springernature.com/full/springer-static/image/art%3A10.1007%2Fs42979-020-0101-1/MediaObjects/42979_2020_101_Fig1_HTML.png?as=webp')

In [None]:
Image(url='https://www.mcdonalds.com/is/image/content/dam/ch/nutrition/nfl-product/product/hero/t-mcdonalds-cheeseburger.jpg?$Product_Desktop$')

In [None]:
Image(url='https://www.mcdonalds.com/is/image/content/dam/de/nutrition/items/hero/desktop/6163_thumb.jpg?$Product_Desktop$')

# Encoder - Decoder 好處

1. 減少計算量
2. 客製化 Encoder- Decoder 結構
3. 客製化 Encoder -> Decoder 的過程


Reading: 
https://levelup.gitconnected.com/building-seq2seq-lstm-with-luong-attention-in-keras-for-time-series-forecasting-1ee00958decb

In [None]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, RepeatVector, TimeDistributed, Flatten, LSTM, Input, Concatenate, Conv1D, Dropout, \
MaxPooling1D, BatchNormalization
from tensorflow.keras.optimizers import Adam
# from IPython.display import Image
# from IPython.core.display import HTML 

os.chdir(os.path.dirname(os.getcwd()))

from main.utils.data import load_data

# forecast horizon: number of weekly totals predicted per sample
n_out = 4
# gap, in weeks, between the end of the input window and the first predicted week
n_gap = 7
# length of the daily input window, in days
n_input = 28


def raw_data_preparation():
    """Load the raw data and return it as whole weeks of daily totals.

    The frame returned by ``load_data()`` is resampled to one row per day
    (summing within each day, gaps filled with 0), a ``week_day`` column
    (Monday=0 ... Sunday=6) is added, and the result is trimmed so that it
    starts on the first Monday and ends on the last Sunday.

    Returns:
        The trimmed daily DataFrame.

    Raises:
        ValueError: if the resampled data contains no Monday or no Sunday.
    """
    daily = load_data().resample('1D').sum()
    daily.fillna(0, inplace=True)

    weekday = daily.index.weekday
    daily['week_day'] = weekday

    # locate the boundaries directly on the index instead of scanning rows
    mondays = daily.index[weekday == 0]
    sundays = daily.index[weekday == 6]
    if len(mondays) == 0 or len(sundays) == 0:
        raise ValueError('data must contain at least one Monday and one Sunday')

    return daily.loc[mondays[0]: sundays[-1]]


def split_dataset(data):
    """Split a daily feature matrix into scaled weekly train/test windows.

    The last 50 weeks are held out of training. The test block additionally
    reaches back ``n_gap + n_out`` weeks (module-level settings), so it
    overlaps the tail of the training block. Labels are column 0 divided by
    100000; features are min-max scaled with a scaler fitted on the training
    block only.

    Args:
        data: 2-D array of daily rows; each block's row count must be a
            multiple of 7.

    Returns:
        ``(train_weeks, test_weeks, y_train, y_test)`` where the ``*_weeks``
        arrays are the scaled features grouped into chunks of 7 rows and the
        ``y_*`` arrays are the daily labels.
    """
    label_scale = 100000
    holdout_days = 50 * 7
    test_span_days = (50 + n_gap + n_out) * 7

    train_days = data[:-holdout_days]
    test_days = data[-test_span_days:]

    y_train = train_days[:, 0] / label_scale
    y_test = test_days[:, 0] / label_scale

    # fit the scaler on the training rows only, then reuse it for the test rows
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(train_days)
    test_scaled = scaler.transform(test_days)

    # regroup the daily rows into weekly chunks of 7
    train_weeks = np.array(np.split(train_scaled, len(train_scaled) / 7))
    test_weeks = np.array(np.split(test_scaled, len(test_scaled) / 7))
    return train_weeks, test_weeks, y_train, y_test


def evaluate_forecasts(actual, predicted):
    """Return ``(msle, mse)`` for a set of forecasts.

    The MSLE is computed on predictions clipped to [0, 100]; the MSE uses the
    raw predictions.
    """
    clipped = np.clip(predicted, 0, 100)
    return (mean_squared_log_error(actual, clipped),
            mean_squared_error(actual, predicted))
            

def to_supervised(train, train_label, n_input, n_out=6, n_gap=7):
    """Turn weekly windows into supervised (input, target) samples.

    Args:
        train: array of shape (weeks, days_per_week, features).
        train_label: 1-D array of daily labels aligned with the flattened
            days of ``train``.
        n_input: length of the input window, in days. Must be divisible by
            ``n_out`` because the window is also split into ``n_out`` equal
            chunks for the weekly-aggregated input.
        n_out: number of predicted weeks.
        n_gap: gap between the end of the input window and the first
            predicted week, in weeks.

    Returns:
        ``(X, y, X_weekly)``:
        X holds the daily input windows (samples, n_input, features),
        y holds the weekly label sums (samples, n_out),
        X_weekly holds the input window summed in ``n_out`` equal chunks
        (samples, n_out, features).
        All three are empty arrays when the history is too short for a
        single sample.
    """
    # flatten the weeks into one continuous daily history (multivariate input)
    n_weeks, days_per_week, n_features = train.shape
    data = train.reshape((n_weeks * days_per_week, n_features))

    X, y, X_weekly = [], [], []

    # last start index that still leaves room for input + gap + output
    last_start = len(data) - n_input - 7 * (n_gap + n_out)

    # slide the window forward one week at a time
    for in_start in range(0, last_start + 1, 7):
        in_end = in_start + n_input
        out_start = in_end + 7 * n_gap
        out_end = out_start + 7 * n_out

        window = data[in_start:in_end, :]
        X.append(window)
        # weekly totals of the target over the forecast horizon
        y.append(np.array(np.split(train_label[out_start: out_end], n_out)).sum(axis=1))
        # the same input window, aggregated into n_out chunks
        X_weekly.append(np.array(np.split(window, n_out)).sum(axis=1))

    return np.array(X), np.array(y), np.array(X_weekly)


def forecast(model, history, n_input, n_out):
    """Predict the next ``n_out`` values from the most recent observations.

    Args:
        model: object exposing ``predict([daily, weekly], verbose=0)``.
        history: sequence of weekly arrays, each (days_per_week, features).
        n_input: number of trailing days used as input.
        n_out: number of equal chunks the input window is summed into for
            the weekly input.

    Returns:
        The first (and only) element of the model's batch prediction.
    """
    stacked = np.array(history)
    n_weeks, week_len, n_features = stacked.shape
    # flatten the weeks into one continuous daily history
    flat = stacked.reshape((n_weeks * week_len, n_features))

    # last n_input days, shaped [1, n_input, n_features]
    recent = flat[-n_input:, :]
    daily_batch = recent.reshape((1, recent.shape[0], recent.shape[1]))

    # the same window aggregated into n_out chunks, shaped [1, n_out, n_features]
    weekly_sums = np.array(np.split(recent, n_out)).sum(axis=1)
    weekly_batch = np.expand_dims(weekly_sums, axis=0)

    # the batch holds one sample, so return just that forecast
    return model.predict([daily_batch, weekly_batch], verbose=0)[0]
                     
                     
def evaluation_model(train, test, label_train, label_test, n_input, n_out=6, n_gap=7):
    """Fit the current ``build_model`` and walk forward over the test weeks.

    At step ``i`` the test week ``i`` is appended to the history, a forecast
    is made from the last ``n_input`` days, and it is compared with the
    weekly label sums of the ``n_out`` weeks that start ``n_gap`` weeks after
    the end of the input window.

    Args:
        train, test: weekly windows of shape (weeks, 7, features).
        label_train, label_test: daily labels aligned with ``train``/``test``.
        n_input: input window length in days.
        n_out: number of predicted weeks.
        n_gap: gap in weeks between input end and first predicted week.

    Returns:
        ``(predictions, observations)``, both (steps, n_out).
    """
    model = build_model(train, label_train, n_input=n_input, n_out=n_out, n_gap=n_gap)

    horizon = n_out + n_gap
    # the test block already contains the last `horizon` weeks of train
    history = list(train[:-horizon])

    predictions, observations = [], []
    for i in tqdm(range(len(test) - horizon)):
        history.append(test[i, :])
        predictions.append(forecast(model, history, n_input, n_out))

        # The input now ends after test week i, so the target starts at week
        # i + 1 + n_gap. This matches to_supervised (out_start = in_end +
        # 7 * n_gap); the previous offset of i + n_gap scored the model
        # against a window one week earlier than it was trained for.
        first_week = i + 1 + n_gap
        target_days = label_test[first_week * 7: (first_week + n_out) * 7]
        observations.append(np.array(np.split(target_days, n_out)).sum(axis=1))

    return np.array(predictions)[:, :, 0], np.array(observations)

In [None]:
df = raw_data_preparation()

# first difference of the target; the first row has no predecessor and is NaN
df['A_diff'] = df['A'].diff()

# Select the model features and fill the NaN left by diff().
# fillna returns a new frame, which avoids calling an in-place fill on a
# selection of df (pandas warns about that pattern).
# Further features (week, month, holiday, ...) could be appended to this list.
daily_data = df[['A', 'C', 'G', 'A_diff']].fillna(0)

Considering data with four features: A, C, G, A_diff

## 365-day baseline

In [None]:
train, test, label_train, label_test = split_dataset(daily_data.values)

horizon = n_out + n_gap

# the test labels start where this truncated training history ends
prediction_input = label_train[:-horizon * 7]

predictions = []
observations = []

for i in tqdm(range(len(test) - horizon)):

    # reveal one more week of observed labels
    prediction_input = np.concatenate((prediction_input, label_test[i * 7: (i + 1) * 7]))

    # baseline: every forecast week equals 7 x the mean daily value of the last 365 days
    yhat = np.full(n_out, np.mean(prediction_input[-365:]) * 7)

    # weekly totals of the n_out weeks starting n_gap weeks after the revealed week
    first_week = i + 1 + n_gap
    y = label_test[first_week * 7: (first_week + n_out) * 7]  # in days
    y = np.array(np.split(y, n_out)).sum(axis=1)

    predictions.append(yhat)
    observations.append(y)

predictions = np.array(predictions)
observations = np.array(observations)

In [None]:
baseline_rmse = np.sqrt(np.square(predictions-observations).mean(axis=0))

In [None]:
baseline_rmse

## Encoder - Decoder LSTM Model

In [None]:
Image(url= "https://media.springernature.com/lw685/springer-static/image/art%3A10.1038%2Fs41598-019-55320-6/MediaObjects/41598_2019_55320_Fig3_HTML.png?as=webp")

In [None]:
def build_model(train, train_label, n_input, n_out=6, n_gap=7):
    """Build and fit the encoder-decoder LSTM.

    The encoder LSTM reads the daily input window; its final hidden and cell
    states initialise the decoder LSTM, which reads the weekly-aggregated
    window and emits one value per forecast week.

    Args:
        train: weekly windows passed to ``to_supervised``.
        train_label: daily labels passed to ``to_supervised``.
        n_input: input window length in days.
        n_out: number of predicted weeks.
        n_gap: gap in weeks between input and first predicted week.

    Returns:
        The fitted Keras model taking ``[main_inputs, weekly_inputs]``.
    """
    # prepare data
    train_x, train_y, train_x_weekly = to_supervised(train, train_label, n_input, n_out=n_out, n_gap=n_gap) 
    
    # define parameters
    verbose, epochs, batch_size = 1, 35, 16
    n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
    # reshape output into [samples, timesteps, features]
    train_y = train_y.reshape((train_y.shape[0], train_y.shape[1], 1))
    # define model
    # fixed seed for reproducible weight initialisation
    tf.random.set_seed(42)
    
    # encoder: only the final states are used below
    main_inputs = Input(shape=(n_timesteps, n_features), name='main_inputs')
    x, state_h, state_c = LSTM(64, activation='relu', return_state=True)(main_inputs)
    
    _, _, dim = train_x_weekly.shape
    
    # decoder: reads the weekly aggregates, starting from the encoder states
    weekly_inputs = Input(shape=(n_outputs, dim), name='weekly_inputs')
    y = LSTM(64, activation='relu', return_sequences=True)(weekly_inputs, initial_state=[state_h, state_c])
    
    # per-timestep regression head: one value per forecast week
    y = TimeDistributed(Dense(12, activation='relu'))(y)
    outputs = TimeDistributed(Dense(1), name='outputs')(y)
    
    model = Model(inputs=[main_inputs, weekly_inputs], outputs=outputs)
    model.compile(loss='mse', optimizer='adam')
    # fit network
    model.fit({'main_inputs': train_x, 'weekly_inputs': train_x_weekly}, 
              {'outputs': train_y}, epochs=epochs, batch_size=batch_size, verbose=verbose)
    return model

In [None]:
# split the daily data into scaled weekly windows and daily labels
train, test, label_train, label_test = split_dataset(daily_data.values)

# fit the current build_model and walk forward over the test weeks
predictions, observations = evaluation_model(train, test, label_train, label_test, n_input=n_input,
                                             n_out=n_out, n_gap=n_gap)

# RMSE per forecast week (mean over the walk-forward steps)
lstm_rmse = np.sqrt(np.square(predictions-observations).mean(axis=0))

In [None]:
lstm_rmse

In [None]:
baseline_rmse

In [None]:
# what if we simply use the last hidden state as inputs for the decoder?

def build_model(train, train_label, n_input, n_out=6, n_gap=7):
    """Build and fit an encoder-decoder LSTM whose decoder input is the repeated encoder output.

    The encoder's last output is repeated ``n_out`` times and fed to the
    decoder, which is also initialised with the encoder's final states. The
    weekly input is declared but not used by the network.

    Args:
        train: weekly windows passed to ``to_supervised``.
        train_label: daily labels passed to ``to_supervised``.
        n_input: input window length in days.
        n_out: number of predicted weeks.
        n_gap: gap in weeks between input and first predicted week.

    Returns:
        The fitted Keras model taking ``[main_inputs, weekly_inputs]``.
    """
    # prepare data
    train_x, train_y, train_x_weekly = to_supervised(train, train_label, n_input, n_out=n_out, n_gap=n_gap) 
    
    # define parameters
    verbose, epochs, batch_size = 1, 30, 16
    n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
    # reshape output into [samples, timesteps, features]
    train_y = train_y.reshape((train_y.shape[0], train_y.shape[1], 1))
    # define model
    # fixed seed for reproducible weight initialisation
    tf.random.set_seed(42)
    
    main_inputs = Input(shape=(n_timesteps, n_features), name='main_inputs')
    x, state_h, state_c = LSTM(64, activation='relu', return_state=True)(main_inputs)
    
    # one copy of the encoder output per forecast week
    decoder_input = RepeatVector(n_out)(x)  # alternative: RepeatVector(n_out)(state_h)
    
#     state_h = BatchNormalization(momentum=0.6)(state_h)
#     state_c = BatchNormalization(momentum=0.6)(state_c)
    
    _, _, dim = train_x_weekly.shape
    weekly_inputs = Input(shape=(n_outputs, dim), name='weekly_inputs') # has no effect: not connected to the outputs
    y = LSTM(64, activation='elu', return_sequences=True, dropout=0.3, 
             recurrent_dropout=0.1)(decoder_input, initial_state=[state_h, state_c])
    
    # per-timestep regression head: one value per forecast week
    y = TimeDistributed(Dense(12, activation='relu'))(y)
    outputs = TimeDistributed(Dense(1), name='outputs')(y)
    
    # weekly_inputs stays in the input list so the model accepts the two inputs that forecast() passes
    model = Model(inputs=[main_inputs, weekly_inputs], outputs=outputs)
    model.compile(loss='mse', optimizer='adam')
    # fit network
    model.fit({'main_inputs': train_x, 'weekly_inputs': train_x_weekly}, 
              {'outputs': train_y}, epochs=epochs, batch_size=batch_size, verbose=verbose)
    return model

In [None]:
# split the daily data into scaled weekly windows and daily labels
train, test, label_train, label_test = split_dataset(daily_data.values)

# fit the current build_model and walk forward over the test weeks
predictions, observations = evaluation_model(train, test, label_train, label_test, n_input=n_input,
                                             n_out=n_out, n_gap=n_gap)

# RMSE per forecast week (mean over the walk-forward steps)
lstm_rmse_v2 = np.sqrt(np.square(predictions-observations).mean(axis=0))

In [None]:
lstm_rmse_v2

## CNN-LSTM

In [None]:
def build_model(train, train_label, n_input, n_out=6, n_gap=7):
    """Build and fit the CNN-LSTM encoder-decoder.

    A Conv1D layer and dropout are applied to the daily window before the
    encoder LSTM; the decoder LSTM reads the weekly-aggregated window and is
    initialised with the encoder's final states.

    Args:
        train: weekly windows passed to ``to_supervised``.
        train_label: daily labels passed to ``to_supervised``.
        n_input: input window length in days.
        n_out: number of predicted weeks.
        n_gap: gap in weeks between input and first predicted week.

    Returns:
        The fitted Keras model taking ``[main_inputs, weekly_inputs]``.
    """
    # prepare data
    train_x, train_y, train_x_weekly = to_supervised(train, train_label, n_input, n_out=n_out, n_gap=n_gap) 
    
    # define parameters
    verbose, epochs, batch_size = 1, 25, 16
    n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
    # reshape output into [samples, timesteps, features]
    train_y = train_y.reshape((train_y.shape[0], train_y.shape[1], 1))
    # define model
    # fixed seed for reproducible weight initialisation
    tf.random.set_seed(42)
    
    main_inputs = Input(shape=(n_timesteps, n_features), name='main_inputs')
    x = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same', # weighted average over neighbouring time steps
               input_shape=(n_timesteps,n_features))(main_inputs)
    x = Dropout(0.3)(x)
    # encoder: only the final states are used below
    x, state_h, state_c = LSTM(64, activation='relu', return_state=True)(x)
    
    _, _, dim = train_x_weekly.shape
    
    # decoder: reads the weekly aggregates, starting from the encoder states
    weekly_inputs = Input(shape=(n_outputs, dim), name='weekly_inputs')
    x = LSTM(64, activation='relu', return_sequences=True)(weekly_inputs, initial_state=[state_h, state_c])
    x = TimeDistributed(Dense(12, activation='relu'))(x)
    outputs = TimeDistributed(Dense(1), name='outputs')(x)
    
    model = Model(inputs=[main_inputs, weekly_inputs], outputs=outputs)
    model.compile(loss='mse', optimizer='adam')
    # fit network
    model.fit({'main_inputs': train_x, 'weekly_inputs': train_x_weekly}, 
              {'outputs': train_y}, epochs=epochs, batch_size=batch_size, verbose=verbose)
    return model

In [None]:
# split the daily data into scaled weekly windows and daily labels
train, test, label_train, label_test = split_dataset(daily_data.values)

# fit the current build_model and walk forward over the test weeks
predictions, observations = evaluation_model(train, test, label_train, label_test, n_input=n_input,
                                             n_out=n_out, n_gap=n_gap)
# RMSE per forecast week (mean over the walk-forward steps)
cnnlstm_rmse = np.sqrt(np.square(predictions-observations).mean(axis=0))

In [None]:
cnnlstm_rmse

In [None]:
def build_model(train, train_label, n_input, n_out=6, n_gap=7):
    """Build and fit the CNN-LSTM whose decoder sees both weekly inputs and the encoder output.

    The encoder output is repeated ``n_out`` times and concatenated, along
    the feature axis, with the weekly-aggregated window; the result feeds a
    decoder LSTM initialised with the encoder's final states.

    Args:
        train: weekly windows passed to ``to_supervised``.
        train_label: daily labels passed to ``to_supervised``.
        n_input: input window length in days.
        n_out: number of predicted weeks.
        n_gap: gap in weeks between input and first predicted week.

    Returns:
        The fitted Keras model taking ``[main_inputs, weekly_inputs]``.
    """
    # prepare data
    train_x, train_y, train_x_weekly = to_supervised(train, train_label, n_input, n_out=n_out, n_gap=n_gap) 
    
    # define parameters
    verbose, epochs, batch_size = 1, 25, 16
    n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
    # reshape output into [samples, timesteps, features]
    train_y = train_y.reshape((train_y.shape[0], train_y.shape[1], 1))
    # define model
    # fixed seed for reproducible weight initialisation
    tf.random.set_seed(42)
    
    main_inputs = Input(shape=(n_timesteps, n_features), name='main_inputs')
    x = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same', 
               input_shape=(n_timesteps,n_features))(main_inputs)
    x = Dropout(0.3)(x)
    x, state_h, state_c = LSTM(64, activation='relu', return_state=True)(x)
    # one copy of the encoder output per forecast week
    decoder_input = RepeatVector(n_out)(x)
    _, _, dim = train_x_weekly.shape
    
    weekly_inputs = Input(shape=(n_outputs, dim), name='weekly_inputs')
    
    # join weekly aggregates and repeated encoder output along the feature axis
    decoder_input = Concatenate(axis=2)([weekly_inputs, decoder_input])
    
    x = LSTM(64, activation='elu', return_sequences=True, dropout=0.3, 
             recurrent_dropout=0.1)(decoder_input, initial_state=[state_h, state_c])
    x = TimeDistributed(Dense(12, activation='relu'))(x)
    outputs = TimeDistributed(Dense(1), name='outputs')(x)
    
    model = Model(inputs=[main_inputs, weekly_inputs], outputs=outputs)
    model.compile(loss='mse', optimizer='adam')
    # fit network
    model.fit({'main_inputs': train_x, 'weekly_inputs': train_x_weekly}, 
              {'outputs': train_y}, epochs=epochs, batch_size=batch_size, verbose=verbose)
    return model

In [None]:
# split the daily data into scaled weekly windows and daily labels
train, test, label_train, label_test = split_dataset(daily_data.values)

# fit the current build_model and walk forward over the test weeks
predictions, observations = evaluation_model(train, test, label_train, label_test, n_input=n_input,
                                             n_out=n_out, n_gap=n_gap)
# RMSE per forecast week; later cells display and plot cnnlstm_rmse_v2,
# so it has to be assigned here
cnnlstm_rmse_v2 = np.sqrt(np.square(predictions-observations).mean(axis=0))

In [None]:
cnnlstm_rmse

In [None]:
cnnlstm_rmse_v2

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))

# x positions of the four forecast weeks
_week_pos = np.arange(4)

# (legend label, per-week RMSE) in plotting order
_curves = [
    ('365 days prediction', baseline_rmse),
    ('LSTM', lstm_rmse),
    ('CNN-LSTM', cnnlstm_rmse),
    ('LSTM_v2', lstm_rmse_v2),
    ('CNN-LSTM_v2', cnnlstm_rmse_v2),
]
for _name, _values in _curves:
    ax.plot(_week_pos, _values, '-o', label=_name)

ax.legend()
ax.set_ylabel('RMSE (100k)')
ax.set_xticks(_week_pos)
ax.set_xticklabels(_week_pos + 1)
ax.set_xlabel("7 + n Week")

## Hyperparameter Optimization

- feature extractor:
    LSTM filter
    TimeDistributed Dense filter

- Adam optimizer:
    learning rate: 0.001
    beta_1: 0.9
    beta_2: 0.999
    epsilon: 1e-7
    
- training:
    epochs (large epoch may result in overfitting)
    

### LSTM

In [None]:
# from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# callbacks = [EarlyStopping(monitor='val_msle', patience=20, restore_best_weights=True),
#              ModelCheckpoint("best_model.h5", monitor="val_msle", save_weights_only=False,
#                              save_best_only=True, verbose=1),
#              ReduceLROnPlateau(monitor='val_msle', patience=5, factor=0.9, min_lr=0.00001, verbose=1)]

In [None]:
def evaluation_lstm_model(train, test, label_train, label_test, n_input, lstm_filter, dense_filter_decoder,
                          learning_rate, beta_1, beta_2, epsilon, epochs, n_out=6, n_gap=7):
    """Fit ``build_lstm_model`` with the given hyperparameters and walk forward over the test weeks.

    ``lstm_filter``, ``dense_filter_decoder`` and ``epochs`` are cast to int,
    so float values coming from the optimiser are accepted.

    Args:
        train, test: weekly windows of shape (weeks, 7, features).
        label_train, label_test: daily labels aligned with ``train``/``test``.
        n_input: input window length in days.
        lstm_filter: units of both LSTM layers.
        dense_filter_decoder: units of the decoder's hidden Dense layer.
        learning_rate, beta_1, beta_2, epsilon: Adam settings.
        epochs: number of training epochs.
        n_out: number of predicted weeks.
        n_gap: gap in weeks between input end and first predicted week.

    Returns:
        ``(predictions, observations)``, both (steps, n_out).
    """
    model = build_lstm_model(train, label_train, n_input=n_input, lstm_filter=int(lstm_filter),
                             dense_filter_decoder=int(dense_filter_decoder), learning_rate=learning_rate,
                             beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, epochs=int(epochs),
                             n_out=n_out, n_gap=n_gap)

    horizon = n_out + n_gap
    # the test block already contains the last `horizon` weeks of train
    history = list(train[:-horizon])

    predictions, observations = [], []
    for i in tqdm(range(len(test) - horizon)):
        history.append(test[i, :])
        predictions.append(forecast(model, history, n_input, n_out))

        # The input ends after test week i, so the target starts at week
        # i + 1 + n_gap, matching to_supervised (out_start = in_end +
        # 7 * n_gap). The previous offset of i + n_gap was one week early.
        first_week = i + 1 + n_gap
        target_days = label_test[first_week * 7: (first_week + n_out) * 7]
        observations.append(np.array(np.split(target_days, n_out)).sum(axis=1))

    return np.array(predictions)[:, :, 0], np.array(observations)


def build_lstm_model(train, train_label, n_input, lstm_filter, dense_filter_decoder,
                     learning_rate, beta_1, beta_2, epsilon, epochs=35, n_out=6, n_gap=7):
    """Build and fit the encoder-decoder LSTM with configurable hyperparameters.

    Args:
        train: weekly windows passed to ``to_supervised``.
        train_label: daily labels passed to ``to_supervised``.
        n_input: input window length in days.
        lstm_filter: units of both the encoder and decoder LSTM.
        dense_filter_decoder: units of the decoder's hidden Dense layer.
        learning_rate, beta_1, beta_2, epsilon: Adam settings.
        epochs: number of training epochs.
        n_out: number of predicted weeks.
        n_gap: gap in weeks between input and first predicted week.

    Returns:
        The fitted Keras model taking ``[main_inputs, weekly_inputs]``.
    """
    # prepare data
    train_x, train_y, train_x_weekly = to_supervised(train, train_label, n_input, n_out=n_out, n_gap=n_gap) 
    
    # define parameters
    verbose, batch_size = 0, 16
    n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
    # reshape output into [samples, timesteps, features]
    train_y = train_y.reshape((train_y.shape[0], train_y.shape[1], 1))
    # define model
    # fixed seed for reproducible weight initialisation
    tf.random.set_seed(42)
    
    # encoder: only the final states are used below
    main_inputs = Input(shape=(n_timesteps, n_features), name='main_inputs')
    x, state_h, state_c = LSTM(lstm_filter, activation='relu', return_state=True)(main_inputs)
    
    _, _, dim = train_x_weekly.shape
    
    # decoder: reads the weekly aggregates, starting from the encoder states
    weekly_inputs = Input(shape=(n_outputs, dim), name='weekly_inputs')
    y = LSTM(lstm_filter, activation='relu', return_sequences=True)(weekly_inputs, initial_state=[state_h, state_c])
    
    y = TimeDistributed(Dense(dense_filter_decoder, activation='relu'))(y)
    outputs = TimeDistributed(Dense(1), name='outputs')(y)
    
    # gradients are clipped element-wise to [-0.1, 0.1]
    optimizer = Adam(learning_rate=learning_rate, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon,
                     clipvalue=0.1)
    
    model = Model(inputs=[main_inputs, weekly_inputs], outputs=outputs)
    model.compile(loss='mse', optimizer=optimizer, metrics=['msle'])
    # fit network
    # samples are fed in their original order (no shuffling)
    model.fit({'main_inputs': train_x, 'weekly_inputs': train_x_weekly}, 
              {'outputs': train_y}, epochs=epochs, batch_size=batch_size, verbose=verbose, 
              shuffle=False)
    
    return model

In [None]:
from typing import Dict

from bayes_opt import BayesianOptimization

# Bayesian Optimization

def optimization_process(fn, pbounds: Dict) -> Dict:

    """
    Run Bayesian optimization over ``fn`` and return the best parameters found.

    The optimizer is seeded with ``random_state=1`` and runs 8 initial
    points followed by 24 iterations, maximizing the value returned by ``fn``.

    Args:
        fn: function to maximize; it is called with the keys of ``pbounds``
            as keyword arguments
        pbounds: a dictionary mapping each parameter of fn to its
            (lower, upper) bound

    Returns:
        A dictionary with the parameter values of the best evaluation
    """

    optimizer = BayesianOptimization(
        f=fn,
        pbounds=pbounds,
        random_state=1)

    optimizer.maximize(init_points=8, n_iter=24)
    
    # parameters of the evaluation with the highest target value
    optimized_parameters = optimizer.max['params']

    return optimized_parameters

def lstm_training_process(epochs, lstm_filter, dense_filter_decoder,
                          learning_rate, beta_1, beta_2, epsilon):
    """Objective for the Bayesian search over the LSTM hyperparameters.

    Trains ``build_lstm_model`` on a fresh split of ``daily_data`` with a
    28-day input, 4 predicted weeks and a 7-week gap, then walks forward
    over the test weeks.

    Args:
        epochs, lstm_filter, dense_filter_decoder: cast to int before use.
        learning_rate, beta_1, beta_2, epsilon: Adam settings.

    Returns:
        The negative mean squared error over all steps and forecast weeks,
        so that maximizing the return value minimizes the error.
    """
    n_input = 28
    n_out = 4
    n_gap = 7

    train, test, label_train, label_test = split_dataset(daily_data.values)

    model = build_lstm_model(train, label_train, n_input=n_input, lstm_filter=int(lstm_filter),
                             dense_filter_decoder=int(dense_filter_decoder), learning_rate=learning_rate,
                             beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, epochs=int(epochs),
                             n_out=n_out, n_gap=n_gap)

    horizon = n_out + n_gap
    # the test block already contains the last `horizon` weeks of train
    history = list(train[:-horizon])

    predictions, observations = [], []
    for i in range(len(test) - horizon):
        history.append(test[i, :])
        predictions.append(forecast(model, history, n_input, n_out))

        # The input ends after test week i, so the target starts at week
        # i + 1 + n_gap, matching to_supervised (out_start = in_end +
        # 7 * n_gap). The previous offset of i + n_gap was one week early.
        first_week = i + 1 + n_gap
        target_days = label_test[first_week * 7: (first_week + n_out) * 7]
        observations.append(np.array(np.split(target_days, n_out)).sum(axis=1))

    predictions = np.array(predictions)[:, :, 0]
    observations = np.array(observations)

    # mean_squared_error returns the MSE (no square root is taken)
    mse = mean_squared_error(observations, predictions)

    return -mse

# Values used by the untuned model, for reference:
#   lstm_filter = 64, dense_filter_decoder = 12,
#   learning_rate = 0.001, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-7

# (lower, upper) search bounds per hyperparameter
pbounds = dict(
    epochs=(10, 40),
    lstm_filter=(48, 130),
    dense_filter_decoder=(8, 20),
    learning_rate=(1e-4, 3e-3),
    beta_1=(0.5, 0.95),
    beta_2=(0.7, 0.9999),
    epsilon=(1e-7, 1e-5),
)

optimized_parameters = optimization_process(lstm_training_process, pbounds)

Optimized parameters:

- beta_1: 0.5
- beta_2: 0.7
- dense_filter_decoder: 11
- epochs: 17
- epsilon: 1e-5
- learning_rate: 0.0018176
- lstm_filter: 48

In [None]:
# best hyperparameters found by the Bayesian search, hard-coded so the
# search does not have to be re-run
optimized_parameters = dict(
    beta_1=0.5,
    beta_2=0.7,
    dense_filter_decoder=11,
    epochs=17,
    epsilon=1e-5,
    learning_rate=0.0018176,
    lstm_filter=48,
)

train, test, label_train, label_test = split_dataset(daily_data.values)

predictions, observations = evaluation_lstm_model(
    train, test, label_train, label_test, n_input,
    n_out=4, n_gap=7, **optimized_parameters)

In [None]:
optimized_lstm_rmse = np.sqrt(np.square(predictions-observations).mean(axis=0))

In [None]:
optimized_lstm_rmse

### CNN-LSTM

In [None]:
def evaluation_cnn_lstm_model(train, test, label_train, label_test, n_input, attention_filter, lstm_filter, dense_filter_decoder,
                              learning_rate, beta_1, beta_2, epsilon, epochs, n_out=6, n_gap=7):
    """Fit ``build_cnn_lstm_model`` with the given hyperparameters and walk forward over the test weeks.

    ``attention_filter``, ``lstm_filter``, ``dense_filter_decoder`` and
    ``epochs`` are cast to int, so float values coming from the optimiser
    are accepted.

    Args:
        train, test: weekly windows of shape (weeks, 7, features).
        label_train, label_test: daily labels aligned with ``train``/``test``.
        n_input: input window length in days.
        attention_filter: number of Conv1D filters.
        lstm_filter: units of both LSTM layers.
        dense_filter_decoder: units of the decoder's hidden Dense layer.
        learning_rate, beta_1, beta_2, epsilon: Adam settings.
        epochs: number of training epochs.
        n_out: number of predicted weeks.
        n_gap: gap in weeks between input end and first predicted week.

    Returns:
        ``(predictions, observations)``, both (steps, n_out).
    """
    model = build_cnn_lstm_model(train, label_train, n_input=n_input, attention_filter=int(attention_filter),
                                 lstm_filter=int(lstm_filter), dense_filter_decoder=int(dense_filter_decoder),
                                 learning_rate=learning_rate, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon,
                                 epochs=int(epochs), n_out=n_out, n_gap=n_gap)

    horizon = n_out + n_gap
    # the test block already contains the last `horizon` weeks of train
    history = list(train[:-horizon])

    predictions, observations = [], []
    for i in tqdm(range(len(test) - horizon)):
        history.append(test[i, :])
        predictions.append(forecast(model, history, n_input, n_out))

        # The input ends after test week i, so the target starts at week
        # i + 1 + n_gap, matching to_supervised (out_start = in_end +
        # 7 * n_gap). The previous offset of i + n_gap was one week early.
        first_week = i + 1 + n_gap
        target_days = label_test[first_week * 7: (first_week + n_out) * 7]
        observations.append(np.array(np.split(target_days, n_out)).sum(axis=1))

    return np.array(predictions)[:, :, 0], np.array(observations)


def build_cnn_lstm_model(train, train_label, n_input, attention_filter, lstm_filter, dense_filter_decoder,
                         learning_rate, beta_1, beta_2, epsilon, epochs=35, n_out=6, n_gap=7):
    """Build and fit the CNN-LSTM encoder-decoder with configurable hyperparameters.

    Args:
        train: weekly windows passed to ``to_supervised``.
        train_label: daily labels passed to ``to_supervised``.
        n_input: input window length in days.
        attention_filter: number of Conv1D filters.
        lstm_filter: units of both the encoder and decoder LSTM.
        dense_filter_decoder: units of the decoder's hidden Dense layer.
        learning_rate, beta_1, beta_2, epsilon: Adam settings.
        epochs: number of training epochs.
        n_out: number of predicted weeks.
        n_gap: gap in weeks between input and first predicted week.

    Returns:
        The fitted Keras model taking ``[main_inputs, weekly_inputs]``.
    """
    # prepare data
    train_x, train_y, train_x_weekly = to_supervised(train, train_label, n_input, n_out=n_out, n_gap=n_gap) 
    
    # define parameters
    verbose, batch_size = 0, 16
    n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
    # reshape output into [samples, timesteps, features]
    train_y = train_y.reshape((train_y.shape[0], train_y.shape[1], 1))
    # define model
    # fixed seed for reproducible weight initialisation
    tf.random.set_seed(42)
    
    
    main_inputs = Input(shape=(n_timesteps, n_features), name='main_inputs')
    # convolution over neighbouring days, followed by dropout, before the encoder
    x = Conv1D(filters=attention_filter, kernel_size=3, activation='relu', padding='same', 
               input_shape=(n_timesteps,n_features))(main_inputs)
    x = Dropout(0.3)(x)
    # encoder: only the final states are used below
    x, state_h, state_c = LSTM(lstm_filter, activation='relu', return_state=True)(x)
    
    _, _, dim = train_x_weekly.shape
    
    # decoder: reads the weekly aggregates, starting from the encoder states
    weekly_inputs = Input(shape=(n_outputs, dim), name='weekly_inputs')
    x = LSTM(lstm_filter, activation='relu', return_sequences=True)(weekly_inputs, initial_state=[state_h, state_c])
    x = TimeDistributed(Dense(dense_filter_decoder, activation='relu'))(x)
    outputs = TimeDistributed(Dense(1), name='outputs')(x)
    
    # gradients are clipped element-wise to [-0.1, 0.1]
    optimizer = Adam(learning_rate=learning_rate, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon,
                     clipvalue=0.1)
    
    model = Model(inputs=[main_inputs, weekly_inputs], outputs=outputs)
    model.compile(loss='mse', optimizer=optimizer, metrics=['msle'])
    # fit network
    # samples are fed in their original order (no shuffling)
    model.fit({'main_inputs': train_x, 'weekly_inputs': train_x_weekly}, 
              {'outputs': train_y}, epochs=epochs, batch_size=batch_size, verbose=verbose, 
              shuffle=False)
    
    return model

In [None]:
def cnn_lstm_training_process(epochs, attention_filter, lstm_filter, dense_filter_decoder,
                              learning_rate, beta_1, beta_2, epsilon):
    """Objective for the Bayesian search over the CNN-LSTM hyperparameters.

    Trains ``build_cnn_lstm_model`` on a fresh split of ``daily_data`` with a
    28-day input, 4 predicted weeks and a 7-week gap, then walks forward
    over the test weeks.

    Args:
        epochs, attention_filter, lstm_filter, dense_filter_decoder: cast to
            int before use.
        learning_rate, beta_1, beta_2, epsilon: Adam settings.

    Returns:
        The negative mean squared error over all steps and forecast weeks,
        so that maximizing the return value minimizes the error.
    """
    n_input = 28
    n_out = 4
    n_gap = 7

    train, test, label_train, label_test = split_dataset(daily_data.values)

    model = build_cnn_lstm_model(train, label_train, n_input=n_input, attention_filter=int(attention_filter),
                                 lstm_filter=int(lstm_filter), dense_filter_decoder=int(dense_filter_decoder),
                                 learning_rate=learning_rate, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon,
                                 epochs=int(epochs), n_out=n_out, n_gap=n_gap)

    horizon = n_out + n_gap
    # the test block already contains the last `horizon` weeks of train
    history = list(train[:-horizon])

    predictions, observations = [], []
    for i in range(len(test) - horizon):
        history.append(test[i, :])
        predictions.append(forecast(model, history, n_input, n_out))

        # The input ends after test week i, so the target starts at week
        # i + 1 + n_gap, matching to_supervised (out_start = in_end +
        # 7 * n_gap). The previous offset of i + n_gap was one week early.
        first_week = i + 1 + n_gap
        target_days = label_test[first_week * 7: (first_week + n_out) * 7]
        observations.append(np.array(np.split(target_days, n_out)).sum(axis=1))

    predictions = np.array(predictions)[:, :, 0]
    observations = np.array(observations)

    # mean_squared_error returns the MSE (no square root is taken)
    mse = mean_squared_error(observations, predictions)

    return -mse

# Values used by the untuned model, for reference:
#   attention_filter = 64, lstm_filter = 64, dense_filter_decoder = 12,
#   learning_rate = 0.001, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-7

# (lower, upper) search bounds per hyperparameter
pbounds = dict(
    epochs=(10, 25),
    attention_filter=(48, 130),
    lstm_filter=(16, 130),
    dense_filter_decoder=(10, 40),
    learning_rate=(1e-4, 1e-2),
    beta_1=(0.3, 0.95),
    beta_2=(0.3, 0.9999),
    epsilon=(1e-7, 1e-5),
)

cnn_lstm_optimized_parameters = optimization_process(cnn_lstm_training_process, pbounds)

Optimized parameters:

- attention_filter: 95
- beta_1: 0.3954
- beta_2: 0.7125
- dense_filter_decoder: 30
- epochs: 11
- epsilon: 4.19915e-06
- learning_rate: 0.006974
- lstm_filter: 63

In [None]:
# best hyperparameters found by the Bayesian search, hard-coded so the
# search does not have to be re-run
cnn_lstm_optimized_parameters = dict(
    attention_filter=95,
    beta_1=0.3954,
    beta_2=0.7125,
    dense_filter_decoder=30,
    epochs=11,
    epsilon=4.19915e-06,
    learning_rate=0.006974,
    lstm_filter=63,
)

train, test, label_train, label_test = split_dataset(daily_data.values)

predictions, observations = evaluation_cnn_lstm_model(
    train, test, label_train, label_test, n_input,
    n_out=4, n_gap=7, **cnn_lstm_optimized_parameters)

# RMSE per forecast week (mean over the walk-forward steps)
_squared_error = np.square(predictions - observations)
optimized_cnn_lstm_rmse = np.sqrt(_squared_error.mean(axis=0))

In [None]:
optimized_cnn_lstm_rmse

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))

# x positions of the four forecast weeks
_week_pos = np.arange(4)

# (legend label, per-week RMSE) in plotting order
_curves = [
    ('365 days prediction', baseline_rmse),
    ('LSTM', lstm_rmse),
    ('CNN-LSTM', cnnlstm_rmse),
    ('LSTM_v2', lstm_rmse_v2),
    ('CNN-LSTM_v2', cnnlstm_rmse_v2),
    ('LSTM, with fine tune', optimized_lstm_rmse),
    ('CNN-LSTM, with fine tune', optimized_cnn_lstm_rmse),
]
for _name, _values in _curves:
    ax.plot(_week_pos, _values, '-o', label=_name)

ax.legend()
ax.set_ylabel('RMSE (100k)')
ax.set_xticks(_week_pos)
ax.set_xticklabels(_week_pos + 1)
ax.set_xlabel("7 + n Week")

# Prophet

- Dependence on time

In [None]:
from prophet import Prophet

# dates treated as scheduled off days (not in chronological order)
scheduled_off = pd.to_datetime(['2018-02-15', '2018-02-16', '2018-02-17', '2018-02-18', '2018-12-30', '2018-12-31', '2019-01-01', 
                                '2019-02-02', '2019-02-03', '2019-02-04', '2019-02-05', '2019-02-06', '2019-02-07', '2019-02-08', 
                                '2019-02-09', '2019-02-10', '2019-12-30', '2019-12-31', '2020-01-23', '2020-01-24', '2020-01-25', 
                                '2020-01-26', '2020-01-27', '2021-01-01', '2021-02-11', '2021-02-12', '2021-02-13', '2021-02-14', 
                                '2019-04-05', '2019-04-06', '2019-04-07', '2016-02-06', '2016-02-07', '2016-02-08', '2016-02-09',
                                '2016-02-10', '2016-04-04', '2016-09-16', '2016-12-29', '2016-12-30', '2016-12-31', '2017-01-27', 
                                '2017-01-28', '2017-01-29', '2017-01-30'])

# holiday table passed to Prophet: one row per off day, all under a single holiday name
scheduled_off_df = pd.DataFrame({'holiday': 'scheduled_off',
                                 'ds': scheduled_off,
                                 'lower_window': -2,  # the effect extends 2 days before each off day ...
                                 'upper_window': 2})  # ... and 2 days after it

In [None]:
# In order to achieve the specification of Prophet

# NOTE: this cell modifies df in place (index reset, column renamed)
df.index.name = 'ds'  # specify timestamp index name as ds
df.reset_index(inplace=True)  # make index into column
df.rename(columns={'A': 'y'}, inplace=True)  # rename the target column as y

# train-test split: the last 350 days (50 weeks) are held out

df_train = df.iloc[:-350]
df_test = df.iloc[-350:] 

In [None]:
# Prophet also has built-in yearly, weekly, and daily seasonality
# Because we have only consumptions per day, we also turned off daily seasonality

m = Prophet(holidays=scheduled_off_df, 
            growth='linear',
            changepoint_prior_scale=0.005,
            changepoint_range=0.9,
            yearly_seasonality=10,
            weekly_seasonality=True,
            daily_seasonality=False)

#  Add a monthly seasonality by hand (30-day period)
#  Give the name of your seasonality and the corresponding period
m.add_seasonality(name='month', period=30, fourier_order=5) 
m.fit(df_train)

In [None]:
#timestamp

In [None]:
scheduled_off_df.head(5)

In [None]:
# extend the training dates by 350 days, matching the size of the held-out test set
future = m.make_future_dataframe(periods=350)

# predict over the extended date range and plot the fitted components
prediction = m.predict(future)
fig = m.plot_components(prediction)

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(16, 6))

# held-out observations against the last 350 predicted days
ax.plot(df_test['y'].values, label='Observation')
ax.plot(prediction.iloc[-350:]['yhat'].values, label='Prophet prediction')

# label every 50th day with its date (YYYY-MM-DD)
ax.set_xticks(np.arange(0, 350, 50))
ax.set_xticklabels([str(a)[:10] for a in df_test['ds'].values[0:350:50]])
# the two lines carry labels, so show the legend to tell them apart
ax.legend()
ax.set_title("Prophet Prediction")

# Embedding

In [None]:
Image(url='https://blog.floydhub.com/content/images/2018/12/queen-man-woman.png')

月，週，日，季(1-3, 4-6, 7-9, 10-12)