In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import time
import os

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from src.utils.time_series_procs import split_sequences
from src.models.transformer_ts_tf import Transformer

In [2]:
def get_times_series_data(df, cut_date, time_steps=None):
    """Split ``df`` chronologically at ``cut_date`` and window both parts.

    The ``R28D`` column is moved to the last position, ``Data/Hora`` is parsed
    as a datetime, and the rows are divided into a training part
    (``Data/Hora < cut_date``) and a test part (``Data/Hora >= cut_date``).
    The date column is dropped before each part is handed to
    ``split_sequences``.

    Args:
        df: DataFrame containing at least the ``Data/Hora`` and ``R28D``
            columns. The caller's frame is not modified (``reindex`` returns
            a new frame before the date column is converted).
        cut_date: Boundary compared against the parsed ``Data/Hora`` values.
        time_steps: Window length forwarded to ``split_sequences``. When
            omitted, the module-level ``TIME_STEPS`` is used, which was the
            only behaviour of the original implementation.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` as produced by
        ``split_sequences``.
        NOTE(review): presumably the last column (``R28D``) is used as the
        target by ``split_sequences`` - confirm in
        ``src.utils.time_series_procs``.

    Raises:
        NameError: if ``time_steps`` is not given and no global
            ``TIME_STEPS`` exists.
    """
    if time_steps is None:
        # Backward-compatible fallback to the global the original relied on.
        time_steps = TIME_STEPS

    # Put the R28D column last.
    columns = [*df.columns.drop(['R28D']), 'R28D']
    df = df.reindex(columns=columns)
    df['Data/Hora'] = pd.to_datetime(df['Data/Hora'])

    df_train = df[df['Data/Hora'] < cut_date].drop(['Data/Hora'], axis=1)
    df_test = df[df['Data/Hora'] >= cut_date].drop(['Data/Hora'], axis=1)

    x_train, y_train = split_sequences(df_train.values, time_steps)
    x_test, y_test = split_sequences(df_test.values, time_steps)
    return x_train, y_train, x_test, y_test

In [3]:
class StandardScaler3DShape():
    """Apply a ``StandardScaler`` to arrays with more than two dimensions.

    Every method collapses all leading axes of ``X`` so that only the last
    axis is kept as columns, delegates to the wrapped scaler, and (for the
    transforming methods) restores the original shape of ``X``.
    """

    def __init__(self):
        self.scaler = StandardScaler()

    @staticmethod
    def _as_2d(X):
        """Collapse every axis except the last one into rows."""
        return X.reshape(-1, X.shape[-1])

    def fit_transform(self, X, y=None):
        """Fit the wrapped scaler on ``X`` and return ``X`` scaled, same shape."""
        scaled = self.scaler.fit_transform(self._as_2d(X))
        return scaled.reshape(X.shape)

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X``; returns ``self`` for chaining."""
        self.scaler.fit(self._as_2d(X))
        return self

    def transform(self, X, y=None):
        """Scale ``X`` with the already fitted scaler, keeping its shape."""
        scaled = self.scaler.transform(self._as_2d(X))
        return scaled.reshape(X.shape)

In [4]:
def get_baseline_model(num_features, time_steps, num_heads=2, num_layers_enc=1, num_feed_forward=32):
    """Build and compile the baseline ``Transformer`` model.

    Args:
        num_features: Forwarded to ``Transformer`` as ``num_hid``.
        time_steps: Forwarded to ``Transformer`` as ``time_steps``.
        num_heads: Forwarded to ``Transformer`` as ``num_head``.
        num_layers_enc: Forwarded to ``Transformer`` as ``num_layers_enc``.
        num_feed_forward: Forwarded to ``Transformer`` as ``num_feed_forward``.

    Returns:
        The model compiled with Adam (learning rate 0.01), MSE loss and a
        single metric, a root mean squared error named ``"RMSE"``.
    """
    transformer_config = {
        'num_hid': num_features,
        'time_steps': time_steps,
        'num_head': num_heads,
        'num_layers_enc': num_layers_enc,
        'num_feed_forward': num_feed_forward,
    }
    model = Transformer(**transformer_config)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
    rmse_metric = tf.keras.metrics.RootMeanSquaredError(name="RMSE")
    model.compile(optimizer=optimizer, loss='mse', metrics=[rmse_metric])
    return model

In [5]:
def make_repeated_time_series_k_fold(x,
                                     y,
                                     train_period,
                                     test_period,
                                     repeats=10,
                                     show_individual_results=True,
                                     show_final_result=True):
    """Run a repeated, blocked time-series cross-validation of the baseline model.

    The samples are cut into ``x.shape[0] // train_period`` consecutive,
    non-overlapping training blocks of ``train_period`` samples. Each block is
    evaluated on the ``test_period`` samples that immediately follow it (these
    are the first samples of the next training block). For every fold the
    features are standardised with statistics fitted on the training block
    only, a fresh model is built with ``get_baseline_model``, trained for 100
    epochs with batch size 32, and scored on the test window.

    Args:
        x: Input array; axis 0 indexes samples, axis -2 is passed to the model
            as ``time_steps`` and axis -1 as ``num_features``.
        y: Targets aligned with ``x`` along axis 0.
        train_period: Number of samples in each training block (must be > 0).
        test_period: Maximum number of samples in each test window (must be > 0).
        repeats: How many times the whole fold sequence is repeated.
        show_individual_results: Print mean/std RMSE after each repeat.
        show_final_result: Print mean/std RMSE over all repeats at the end.

    Returns:
        A list with one entry per repeat; each entry is the list of per-fold
        RMSE values of that repeat.

    Raises:
        ValueError: if ``train_period`` or ``test_period`` is not positive.
    """
    if train_period <= 0 or test_period <= 0:
        raise ValueError('train_period and test_period must be positive')

    results = []
    max_samples = x.shape[0]
    n_folds = max_samples // train_period

    for _ in range(repeats):
        scores = []
        for fold in range(n_folds):
            train_start = fold * train_period
            test_start = train_start + train_period
            test_end = test_start + test_period

            x_train, y_train = x[train_start:test_start], y[train_start:test_start]
            x_test, y_test = x[test_start:test_end], y[test_start:test_end]
            if len(x_test) == 0:
                # The last training block can end exactly at the end of the
                # data, leaving nothing to evaluate on; skip it instead of
                # failing inside the scaler / model.evaluate.
                break

            # A scaler local to the fold: no dependency on a notebook-level
            # `scaler` variable, and test data never influences the statistics.
            fold_scaler = StandardScaler3DShape()
            x_train = fold_scaler.fit_transform(x_train)
            x_test = fold_scaler.transform(x_test)

            model = get_baseline_model(num_features=x_train.shape[-1], time_steps=x_train.shape[-2])
            model.fit(x_train, y_train, epochs=100, batch_size=32, verbose=0)
            # evaluate() returns [loss, RMSE] because the model is compiled
            # with exactly one metric.
            rmse = model.evaluate(x_test, y_test, verbose=0)
            scores.append(rmse[1])
        results.append(scores)
        if show_individual_results:
            print('RMSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
    if show_final_result:
        print('\nRMSE: %.3f (%.3f)' % (np.mean(results), np.std(results)))
    return results

In [6]:
def make_timesteps_repeated_time_series_k_fold(df,
                                               train_period,
                                               test_period,
                                               repeats=10,
                                               timesteps_list=(3,),
                                               show_results=True):
    """Repeat the blocked time-series cross-validation for several window lengths.

    For every value in ``timesteps_list`` the frame is windowed with
    ``split_sequences(df.values, timesteps)`` and the resulting arrays are
    evaluated with ``make_repeated_time_series_k_fold``.

    Args:
        df: DataFrame whose ``.values`` are passed to ``split_sequences``.
        train_period: Forwarded to ``make_repeated_time_series_k_fold``.
        test_period: Forwarded to ``make_repeated_time_series_k_fold``.
        repeats: Number of repetitions per timesteps value. The original code
            ignored this argument and always ran 10 repetitions.
        timesteps_list: Iterable of window lengths to evaluate.
        show_results: Print one summary line (mean/std RMSE) per window length.

    Returns:
        Dict mapping each timesteps value to the nested score list returned by
        ``make_repeated_time_series_k_fold``.
    """
    results = {}

    for timesteps in timesteps_list:
        x, y = split_sequences(df.values, timesteps)
        scores = make_repeated_time_series_k_fold(x,
                                                  y,
                                                  train_period,
                                                  test_period,
                                                  repeats=repeats,  # honour the caller's value
                                                  show_individual_results=False,
                                                  show_final_result=False)
        results[timesteps] = scores
        if show_results:
            print('TIMESTEPS: %d RMSE: %.3f (%.3f)' % (timesteps, np.mean(scores), np.std(scores)))
    return results

<h3>Disable Tensorflow Warnings</h3>

In [7]:
# NOTE(review): TF_CPP_MIN_LOG_LEVEL configures TensorFlow's native (C++) logging
# and is normally expected to be set before `import tensorflow`. TensorFlow is
# already imported in the first cell, so this assignment may come too late -
# consider moving it above the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Silence the Python-side TensorFlow logger as well; this works after import.
tf.get_logger().setLevel('ERROR')

In [8]:
# Load the three interim variants of the dataset; they share the same base
# file name and differ only in the prefix of the file.
name = "dados-historicos-partner_i-cement-CPIIE40.csv"
df_r3d_and_r7d = pd.read_csv(f"../../data/interim/r3d_and_r7d-no-resampling-with-fillna-{name}")
df_r3d_only = pd.read_csv(f"../../data/interim/r3d_only-no-resampling-with-fillna-{name}")
df_no_r3d_r7d = pd.read_csv(f"../../data/interim/no-r3d-r7d-no-resampling-with-fillna-{name}")

In [9]:
seed = 47
# Actually apply the seed: it was defined but never used, so weight
# initialisation and batch shuffling differed on every run.
# NOTE(review): some GPU kernels can remain non-deterministic even when seeded.
np.random.seed(seed)
tf.random.set_seed(seed)

# Notebook-level scaler, kept for cells/functions that refer to `scaler` by name.
scaler = StandardScaler()

# Window lengths evaluated in every experiment and repetitions per setting.
timesteps_list = [1, 3, 5, 7, 10, 15, 20]
repeats = 10

# Transformer Encoder

# Configs:

1. TIMESTEPS: [1, 3, 5, 7, 10, 15, 20]

2. Epochs: 100

3. Batch size: 32

4. num heads: 2

5. FF Layers enc: 1

      5.1 Units 32

6. Optimization Algorithm: Adam
    
    6.1. Learning Rate: 1e-2

# Time Series Repeated KFold Cross-Validation - different timesteps values

Here we run a repeated Time Series KFold cross-validation on all 3 datasets, using the Blocking Time Series Split method and the Time Series Split method. We also repeat it with different numbers of timesteps.


<h3> Time Series Repeated KFold Cross Validation with varying timesteps - df_no_r3d_r7d</h3>

<b>Dataset: df_no_r3d_r7d:</b> In this dataset the R3D and R7D variables are not considered.

In [10]:
# Work on a copy, move R28D to the last column and drop the date column.
df = df_no_r3d_r7d.copy()
columns = [*df.columns.drop(['R28D']), 'R28D']
df = df.reindex(columns=columns).drop(columns=['Data/Hora'])

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [11]:
# Periods are expressed in samples; the data is spaced every 2 days.
train_period = 180 // 2  # 6 months (~180 days) of data
test_period = 30 // 2  # 1 month (~30 days)

In [12]:
# Run the experiment for every window length in `timesteps_list`.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 2.667 (0.309)
TIMESTEPS: 3 RMSE: 2.518 (0.698)
TIMESTEPS: 5 RMSE: 2.593 (0.400)
TIMESTEPS: 7 RMSE: 2.448 (0.393)
TIMESTEPS: 10 RMSE: 2.592 (0.443)
TIMESTEPS: 15 RMSE: 2.833 (0.542)
TIMESTEPS: 20 RMSE: 3.196 (1.218)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [13]:
# Periods are expressed in samples; the data is spaced every 2 days.
train_period = (180 + 150) // 2  # 11 months (~330 days) of data
test_period = 30 // 2  # 1 month (~30 days)

In [14]:
# Run the experiment for every window length in `timesteps_list`.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 3.013 (0.715)
TIMESTEPS: 3 RMSE: 3.388 (0.319)
TIMESTEPS: 5 RMSE: 3.492 (0.202)
TIMESTEPS: 7 RMSE: 3.570 (0.475)
TIMESTEPS: 10 RMSE: 2.727 (0.460)
TIMESTEPS: 15 RMSE: 3.132 (0.310)
TIMESTEPS: 20 RMSE: 2.693 (0.551)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [15]:
# NOTE(review): the heading of this experiment says "2 years and 10 months", but
# 200 samples spaced every 2 days cover only about 400 days - confirm the intended value.
train_period = 200 # 200 samples, i.e. ~400 days (~13 months) since the data is spaced every 2 days
test_period = 30 # 2 months

In [16]:
# Run the experiment for every window length in `timesteps_list`.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 2.830 (0.577)
TIMESTEPS: 3 RMSE: 2.783 (0.483)
TIMESTEPS: 5 RMSE: 2.351 (0.447)
TIMESTEPS: 7 RMSE: 2.616 (0.888)
TIMESTEPS: 10 RMSE: 2.640 (0.627)
TIMESTEPS: 15 RMSE: 2.497 (0.381)
TIMESTEPS: 20 RMSE: 2.156 (0.377)


<h3> Time Series Repeated KFold Cross Validation - df_r3d_only</h3>

<b>Dataset: df_r3d_only:</b> In this dataset only R3D variable is considered.

In [17]:
# Work on a copy, move R28D to the last column and drop the date column.
df = df_r3d_only.copy()
columns = [*df.columns.drop(['R28D']), 'R28D']
df = df.reindex(columns=columns).drop(columns=['Data/Hora'])

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [18]:
# Periods are expressed in samples; the data is spaced every 2 days.
train_period = 180 // 2  # 6 months (~180 days) of data
test_period = 30 // 2  # 1 month (~30 days)

In [19]:
# Run the experiment for every window length in `timesteps_list`.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 2.661 (0.296)
TIMESTEPS: 3 RMSE: 2.528 (0.644)
TIMESTEPS: 5 RMSE: 2.658 (0.426)
TIMESTEPS: 7 RMSE: 2.386 (0.313)
TIMESTEPS: 10 RMSE: 2.606 (0.414)
TIMESTEPS: 15 RMSE: 2.864 (0.545)
TIMESTEPS: 20 RMSE: 2.942 (0.520)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [20]:
# Periods are expressed in samples; the data is spaced every 2 days.
train_period = (180 + 150) // 2  # 11 months (~330 days) of data
test_period = 30 // 2  # 1 month (~30 days)

In [21]:
# Run the experiment for every window length in `timesteps_list`.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 2.833 (0.510)
TIMESTEPS: 3 RMSE: 3.392 (0.308)
TIMESTEPS: 5 RMSE: 3.677 (0.331)
TIMESTEPS: 7 RMSE: 3.711 (0.552)
TIMESTEPS: 10 RMSE: 2.646 (0.353)
TIMESTEPS: 15 RMSE: 3.295 (0.359)
TIMESTEPS: 20 RMSE: 2.813 (0.470)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [22]:
# NOTE(review): the heading of this experiment says "2 years and 10 months", but
# 200 samples spaced every 2 days cover only about 400 days - confirm the intended value.
train_period = 200 # 200 samples, i.e. ~400 days (~13 months) since the data is spaced every 2 days
test_period = 30 # 2 months

In [23]:
# Run the experiment for every window length in `timesteps_list`.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 2.632 (0.452)
TIMESTEPS: 3 RMSE: 2.749 (0.576)
TIMESTEPS: 5 RMSE: 2.466 (0.323)
TIMESTEPS: 7 RMSE: 2.938 (0.616)
TIMESTEPS: 10 RMSE: 3.025 (0.854)
TIMESTEPS: 15 RMSE: 2.747 (0.450)
TIMESTEPS: 20 RMSE: 2.289 (1.003)


<h3> Time Series Repeated KFold Cross Validation - df_r3d_and_r7d</h3>

<b>Dataset: df_r3d_and_r7d:</b> In this dataset both R3D and R7D variables are considered.

In [24]:
# Work on a copy, move R28D to the last column and drop the date column.
df = df_r3d_and_r7d.copy()
columns = [*df.columns.drop(['R28D']), 'R28D']
df = df.reindex(columns=columns).drop(columns=['Data/Hora'])

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [25]:
# Periods are expressed in samples; the data is spaced every 2 days.
train_period = 180 // 2  # 6 months (~180 days) of data
test_period = 30 // 2  # 1 month (~30 days)

In [26]:
# Run the experiment for every window length in `timesteps_list`.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 2.665 (0.346)
TIMESTEPS: 3 RMSE: 2.390 (0.621)
TIMESTEPS: 5 RMSE: 2.610 (0.485)
TIMESTEPS: 7 RMSE: 2.511 (0.435)
TIMESTEPS: 10 RMSE: 2.631 (0.598)
TIMESTEPS: 15 RMSE: 2.833 (0.555)
TIMESTEPS: 20 RMSE: 2.721 (0.538)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [27]:
# Periods are expressed in samples; the data is spaced every 2 days.
train_period = (180 + 150) // 2  # 11 months (~330 days) of data
test_period = 30 // 2  # 1 month (~30 days)

In [28]:
# Run the experiment for every window length in `timesteps_list`.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 2.448 (0.592)
TIMESTEPS: 3 RMSE: 3.433 (0.277)
TIMESTEPS: 5 RMSE: 3.457 (0.354)
TIMESTEPS: 7 RMSE: 3.539 (0.260)
TIMESTEPS: 10 RMSE: 2.862 (0.599)
TIMESTEPS: 15 RMSE: 3.514 (0.485)
TIMESTEPS: 20 RMSE: 2.724 (0.356)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [29]:
# NOTE(review): the heading of this experiment says "2 years and 10 months", but
# 200 samples spaced every 2 days cover only about 400 days - confirm the intended value.
train_period = 200 # 200 samples, i.e. ~400 days (~13 months) since the data is spaced every 2 days
test_period = 30 # 2 months

In [30]:
# Run the experiment for every window length in `timesteps_list`.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 2.809 (0.605)
TIMESTEPS: 3 RMSE: 2.693 (0.402)
TIMESTEPS: 5 RMSE: 2.626 (0.586)
TIMESTEPS: 7 RMSE: 2.665 (0.392)
TIMESTEPS: 10 RMSE: 2.610 (0.746)
TIMESTEPS: 15 RMSE: 2.730 (0.486)
TIMESTEPS: 20 RMSE: 2.231 (0.577)
