In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import time
import os

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from src.utils.time_series_procs import split_sequences
from src.models.transformer_ts_tf import Transformer

In [2]:
def get_times_series_data(df, cut_date, time_steps=None):
    """Split a dataframe chronologically and window both halves into sequences.

    The 'R28D' column is moved to the last position, the rows are split on the
    parsed 'Data/Hora' column (train: strictly before ``cut_date``; test: on or
    after it), the timestamp column is dropped, and each half is handed to
    ``split_sequences``.

    Args:
        df: DataFrame holding at least the 'Data/Hora' and 'R28D' columns.
            The caller's frame is not modified (``reindex`` returns a new one).
        cut_date: Value comparable with the parsed 'Data/Hora' column.
        time_steps: Window length forwarded to ``split_sequences``. When left
            as ``None`` the module-level ``TIME_STEPS`` is used, which is the
            only behaviour the function had before this parameter existed.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` built from the two
        ``split_sequences`` results.
    """
    if time_steps is None:
        # Backward-compatible fallback; raises NameError if the notebook never
        # defines TIME_STEPS, exactly as the original implementation did.
        time_steps = TIME_STEPS

    columns = list(df.columns.drop(['R28D']))
    columns.append('R28D')
    df = df.reindex(columns=columns)
    df['Data/Hora'] = pd.to_datetime(df['Data/Hora'])

    # Two explicit comparisons (rather than a negated mask) so rows with a
    # missing timestamp stay out of both partitions.
    df_train = df[df['Data/Hora'] < cut_date].drop(['Data/Hora'], axis=1)
    df_test = df[df['Data/Hora'] >= cut_date].drop(['Data/Hora'], axis=1)

    x_train, y_train = split_sequences(df_train.values, time_steps)
    x_test, y_test = split_sequences(df_test.values, time_steps)
    return x_train, y_train, x_test, y_test

In [3]:
class StandardScaler3DShape():
    """Apply a ``StandardScaler`` to arrays that have more than two dimensions.

    Every input is flattened to 2-D by collapsing all axes except the last one,
    passed to the wrapped scaler, and the scaled values are reshaped back to
    the shape of the input.
    """

    def __init__(self):
        self.scaler = StandardScaler()

    @staticmethod
    def _as_2d(X):
        """Collapse all leading axes of ``X`` while keeping its last axis."""
        last_axis_size = X.shape[-1]
        return X.reshape(-1, last_axis_size)

    def fit(self, X, y=None):
        """Fit the wrapped scaler on the flattened ``X`` (``y`` is unused) and return ``self``."""
        self.scaler.fit(self._as_2d(X))
        return self

    def transform(self, X, y=None):
        """Scale ``X`` with the already fitted scaler and return it in ``X``'s shape."""
        flat = self._as_2d(X)
        scaled = self.scaler.transform(flat)
        return scaled.reshape(X.shape)

    def fit_transform(self, X, y=None):
        """Fit the wrapped scaler on ``X`` and return the scaled array in ``X``'s shape."""
        flat = self._as_2d(X)
        scaled = self.scaler.fit_transform(flat)
        return scaled.reshape(X.shape)

In [4]:
def get_baseline_model(num_features, time_steps, num_heads=2, num_layers_enc=1, num_feed_forward=32):
    """Build and compile the baseline ``Transformer`` model.

    Args:
        num_features: Forwarded to ``Transformer`` as ``num_hid``.
        time_steps: Forwarded to ``Transformer`` as ``time_steps``.
        num_heads: Forwarded to ``Transformer`` as ``num_head``.
        num_layers_enc: Forwarded to ``Transformer`` as ``num_layers_enc``.
        num_feed_forward: Forwarded to ``Transformer`` as ``num_feed_forward``.

    Returns:
        The model compiled with Adam (learning rate 0.01), an MSE loss and a
        root-mean-squared-error metric named "RMSE".
    """
    architecture = {
        'num_hid': num_features,
        'time_steps': time_steps,
        'num_head': num_heads,
        'num_layers_enc': num_layers_enc,
        'num_feed_forward': num_feed_forward,
    }
    model = Transformer(**architecture)

    adam = tf.keras.optimizers.Adam(learning_rate=0.01)
    rmse_metric = tf.keras.metrics.RootMeanSquaredError(name="RMSE")
    model.compile(optimizer=adam, loss='mse', metrics=[rmse_metric])
    return model

In [5]:
def make_repeated_time_series_k_fold(x,
                                     y,
                                     train_period,
                                     test_period,
                                     repeats=10,
                                     show_individual_results=True,
                                     show_final_result=True,
                                     scaler=None,
                                     epochs=100,
                                     batch_size=32):
    """Repeated blocked cross-validation of the baseline model on windowed data.

    The samples are cut into consecutive, non-overlapping training blocks of
    ``train_period`` samples. Each block is followed by a test window of up to
    ``test_period`` samples (the last window is shorter when the data ends
    early). For every fold a fresh baseline model is trained on the scaled
    training block and its RMSE on the scaled test window is recorded. The
    whole procedure is run ``repeats`` times.

    Args:
        x: Array whose first axis indexes samples; ``x.shape[-2]`` is used as
            the number of time steps and ``x.shape[-1]`` as the number of
            features.
        y: Targets aligned with ``x`` along the first axis.
        train_period: Number of samples in each training block.
        test_period: Maximum number of samples in each test window.
        repeats: How many times the full set of folds is evaluated.
        show_individual_results: Print mean/std RMSE after each repetition.
        show_final_result: Print mean/std RMSE over all repetitions.
        scaler: Object exposing ``fit_transform``/``transform`` on 2-D arrays.
            Defaults to a new ``StandardScaler``; it is refitted on every
            fold's training block, so no statistics leak between folds.
        epochs: Training epochs per fold.
        batch_size: Mini-batch size per fold.

    Returns:
        A list with one entry per repetition, each being the list of per-fold
        RMSE values.
    """
    if scaler is None:
        # Local scaler instead of relying on a notebook-level `scaler` global.
        scaler = StandardScaler()

    results = []
    max_samples = x.shape[0]
    n_folds = max_samples // train_period

    for _ in range(repeats):
        scores = []
        for fold in range(n_folds):
            train_start = fold * train_period
            test_start = train_start + train_period
            test_stop = test_start + test_period

            x_train, y_train = x[train_start:test_start], y[train_start:test_start]
            x_test, y_test = x[test_start:test_stop], y[test_start:test_stop]

            # When the number of samples is an exact multiple of train_period
            # the last block has nothing left to be tested on; scaling or
            # evaluating an empty window would fail, so the fold is dropped.
            if x_test.shape[0] == 0:
                break

            # Many models are created in this loop; release the Keras global
            # state held by the previous ones so memory does not keep growing.
            tf.keras.backend.clear_session()

            x_train = scaler.fit_transform(x_train.reshape(-1, x_train.shape[-1])).reshape(x_train.shape)
            x_test = scaler.transform(x_test.reshape(-1, x_test.shape[-1])).reshape(x_test.shape)

            model = get_baseline_model(num_features=x_train.shape[-1], time_steps=x_train.shape[-2])
            model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
            # evaluate() returns [loss, RMSE] for the compiled baseline model.
            rmse = model.evaluate(x_test, y_test, verbose=0)
            scores.append(rmse[1])
        results.append(scores)
        if show_individual_results:
            print('RMSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
    if show_final_result:
        print('\nRMSE: %.3f (%.3f)' % (np.mean(results), np.std(results)))
    return results

In [6]:
def make_timesteps_repeated_time_series_k_fold(df,
                                               train_period,
                                               test_period,
                                               repeats=10,
                                               timesteps_list=(3,),
                                               show_results=True):
    """Run the repeated blocked cross-validation once per window length.

    For every value in ``timesteps_list`` the frame's values are windowed with
    ``split_sequences`` and evaluated with
    ``make_repeated_time_series_k_fold``.

    Args:
        df: DataFrame whose ``.values`` are passed to ``split_sequences``.
        train_period: Number of samples in each training block.
        test_period: Maximum number of samples in each test window.
        repeats: Number of repetitions of the cross-validation. This value is
            now forwarded to the inner routine (it used to be ignored in
            favour of a hard-coded 10).
        timesteps_list: Iterable of window lengths to evaluate.
        show_results: Print mean/std RMSE for each window length.

    Returns:
        Dict mapping each window length to the nested list of RMSE scores
        returned by ``make_repeated_time_series_k_fold``.
    """
    results = {}

    for timesteps in timesteps_list:
        x, y = split_sequences(df.values, timesteps)
        scores = make_repeated_time_series_k_fold(x,
                                                  y,
                                                  train_period,
                                                  test_period,
                                                  repeats=repeats,
                                                  show_individual_results=False,
                                                  show_final_result=False)
        results[timesteps] = scores
        if show_results:
            print('TIMESTEPS: %d RMSE: %.3f (%.3f)' % (timesteps, np.mean(scores), np.std(scores)))
    return results

<h3>Disable Tensorflow Warnings</h3>

In [7]:
# Request quiet TensorFlow native logging through the TF_CPP_MIN_LOG_LEVEL environment variable.
# NOTE(review): tensorflow is already imported in the first cell; this variable is
# usually expected to be set before `import tensorflow` — confirm it still takes effect here.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [8]:
# The three dataset variants sit in the interim data folder and share the same
# file-name suffix; only the prefix describing the R3D/R7D handling differs.
name = "dados-historicos-partner_i-cement-CPIIF40.csv"
df_r3d_and_r7d, df_r3d_only, df_no_r3d_r7d = (
    pd.read_csv("../../data/interim/" + prefix + name)
    for prefix in (
        "r3d_and_r7d-no-resampling-with-fillna-",
        "r3d_only-no-resampling-with-fillna-",
        "no-r3d-r7d-no-resampling-with-fillna-",
    )
)

In [9]:
# Seed the NumPy and TensorFlow global random generators so that a fresh
# "Restart & Run All" starts every experiment from the same random state.
seed = 47
np.random.seed(seed)
tf.random.set_seed(seed)

scaler = StandardScaler()

# Window lengths evaluated by every experiment and number of CV repetitions.
timesteps_list = [1, 3, 5, 7, 10, 15, 20]
repeats = 10

# Transformer Encoder

# Configs:

1. TIMESTEPS: [1, 3, 5, 7, 10, 15, 20]

2. Epochs: 100

3. Batch size: 32

4. num heads: 2

5. FF Layers enc: 1

      5.1 Units 32

6. Optimization Algorithm: Adam
    
    6.1. Learning Rate: 1e-2

# Time Series Repeated KFold Cross-Validation - Different Timesteps Values

Here we run a repeated time series K-fold cross-validation on all 3 datasets, using both the blocking time series split method and the time series split method. We also repeat it with different numbers of timesteps.


<h3> Time Series Repeated KFold Cross Validation with varying timesteps - df_no_r3d_r7d</h3>

<b>Dataset: df_no_r3d_r7d:</b> In this dataset the R3D and R7D variables are not considered.

In [10]:
# Work on a copy so the loaded frame stays untouched.
df = df_no_r3d_r7d.copy()
# Move 'R28D' to the last column, keeping the other columns in their original
# order, then discard the timestamp column.
columns = [*df.columns.drop('R28D'), 'R28D']
df = df.reindex(columns=columns).drop(columns='Data/Hora')

<h4>Experiment 1</h4>
    
<b>Train on 6 months and predict one month every year</b>

In [11]:
# Window sizes are counted in samples; with one sample every 2 days a month
# holds 15 samples.
train_period = 6 * 15  # 6 months
test_period = 1 * 15  # 1 month

In [12]:
# Evaluate every window length in `timesteps_list` and print mean/std RMSE for each.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 1.999 (0.232)
TIMESTEPS: 3 RMSE: 2.057 (0.324)
TIMESTEPS: 5 RMSE: 1.974 (0.356)
TIMESTEPS: 7 RMSE: 2.234 (0.541)
TIMESTEPS: 10 RMSE: 2.478 (0.461)
TIMESTEPS: 15 RMSE: 3.050 (0.779)
TIMESTEPS: 20 RMSE: 2.812 (0.451)


<h4>Experiment 2</h4>
    
<b>Train on 11 months and predict one month every year</b>

In [13]:
# Window sizes are counted in samples; with one sample every 2 days a month
# holds 15 samples.
train_period = 11 * 15  # 11 months, i.e. (180 + 150) / 2 samples
test_period = 1 * 15  # 1 month

In [14]:
# Evaluate every window length in `timesteps_list` and print mean/std RMSE for each.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 1.940 (0.272)
TIMESTEPS: 3 RMSE: 2.316 (0.441)
TIMESTEPS: 5 RMSE: 2.479 (0.406)
TIMESTEPS: 7 RMSE: 2.408 (0.914)
TIMESTEPS: 10 RMSE: 2.573 (0.339)
TIMESTEPS: 15 RMSE: 2.076 (0.140)
TIMESTEPS: 20 RMSE: 2.508 (0.271)


<h4>Experiment 3</h4>
    
<b>Train on 2 years and 10 months and predict October and November of 2021</b>

In [15]:
# NOTE(review): 200 samples at one sample every 2 days is roughly 13 months, not the
# "2 years and 10 months" announced in this experiment's heading — confirm the intended value.
train_period = 200 # training samples per fold (the data is spaced every 2 days)
test_period = 30 # 2 months

In [16]:
# Evaluate every window length in `timesteps_list` and print mean/std RMSE for each.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 2.902 (0.380)
TIMESTEPS: 3 RMSE: 3.101 (0.403)
TIMESTEPS: 5 RMSE: 3.370 (0.343)
TIMESTEPS: 7 RMSE: 3.627 (0.231)
TIMESTEPS: 10 RMSE: 3.399 (0.504)
TIMESTEPS: 15 RMSE: 3.266 (0.336)
TIMESTEPS: 20 RMSE: 2.879 (0.418)


<h3> Time Series Repeated KFold Cross Validation - df_r3d_only</h3>

<b>Dataset: df_r3d_only:</b> In this dataset only R3D variable is considered.

In [17]:
# Work on a copy so the loaded frame stays untouched.
df = df_r3d_only.copy()
# Move 'R28D' to the last column, keeping the other columns in their original
# order, then discard the timestamp column.
columns = [*df.columns.drop('R28D'), 'R28D']
df = df.reindex(columns=columns).drop(columns='Data/Hora')

<h4>Experiment 1</h4>
    
<b>Train on 6 months and predict one month every year</b>

In [18]:
# Window sizes are counted in samples; with one sample every 2 days a month
# holds 15 samples.
train_period = 6 * 15  # 6 months
test_period = 1 * 15  # 1 month

In [19]:
# Evaluate every window length in `timesteps_list` and print mean/std RMSE for each.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 1.939 (0.334)
TIMESTEPS: 3 RMSE: 2.136 (0.659)
TIMESTEPS: 5 RMSE: 2.366 (0.730)
TIMESTEPS: 7 RMSE: 2.573 (0.490)
TIMESTEPS: 10 RMSE: 2.092 (0.572)
TIMESTEPS: 15 RMSE: 1.919 (0.641)
TIMESTEPS: 20 RMSE: 2.428 (0.965)


<h4>Experiment 2</h4>
    
<b>Train on 11 months and predict one month every year</b>

In [20]:
# Window sizes are counted in samples; with one sample every 2 days a month
# holds 15 samples.
train_period = 11 * 15  # 11 months, i.e. (180 + 150) / 2 samples
test_period = 1 * 15  # 1 month

In [21]:
# Evaluate every window length in `timesteps_list` and print mean/std RMSE for each.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 2.607 (0.334)
TIMESTEPS: 3 RMSE: 3.869 (0.478)
TIMESTEPS: 5 RMSE: 3.514 (0.378)
TIMESTEPS: 7 RMSE: 3.153 (0.518)
TIMESTEPS: 10 RMSE: 3.643 (0.424)
TIMESTEPS: 15 RMSE: 2.044 (0.418)
TIMESTEPS: 20 RMSE: 2.245 (0.582)


<h4>Experiment 3</h4>
    
<b>Train on 2 years and 10 months and predict October and November of 2021</b>

In [22]:
# NOTE(review): 200 samples at one sample every 2 days is roughly 13 months, not the
# "2 years and 10 months" announced in this experiment's heading — confirm the intended value.
train_period = 200 # training samples per fold (the data is spaced every 2 days)
test_period = 30 # 2 months

In [23]:
# Evaluate every window length in `timesteps_list` and print mean/std RMSE for each.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 1.426 (0.126)
TIMESTEPS: 3 RMSE: 2.332 (0.326)
TIMESTEPS: 5 RMSE: 2.665 (0.376)
TIMESTEPS: 7 RMSE: 2.425 (0.410)
TIMESTEPS: 10 RMSE: 2.520 (0.408)
TIMESTEPS: 15 RMSE: 2.771 (0.699)
TIMESTEPS: 20 RMSE: 2.846 (0.514)


<h3> Time Series Repeated KFold Cross Validation - df_r3d_and_r7d</h3>

<b>Dataset: df_r3d_and_r7d:</b> In this dataset both R3D and R7D variables are considered.

In [24]:
# Work on a copy so the loaded frame stays untouched.
df = df_r3d_and_r7d.copy()
# Move 'R28D' to the last column, keeping the other columns in their original
# order, then discard the timestamp column.
columns = [*df.columns.drop('R28D'), 'R28D']
df = df.reindex(columns=columns).drop(columns='Data/Hora')

<h4>Experiment 1</h4>
    
<b>Train on 6 months and predict one month every year</b>

In [25]:
# Window sizes are counted in samples; with one sample every 2 days a month
# holds 15 samples.
train_period = 6 * 15  # 6 months
test_period = 1 * 15  # 1 month

In [26]:
# Evaluate every window length in `timesteps_list` and print mean/std RMSE for each.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 2.076 (0.338)
TIMESTEPS: 3 RMSE: 2.228 (0.649)
TIMESTEPS: 5 RMSE: 2.533 (0.829)
TIMESTEPS: 7 RMSE: 2.162 (0.728)
TIMESTEPS: 10 RMSE: 2.630 (0.621)
TIMESTEPS: 15 RMSE: 2.152 (1.007)
TIMESTEPS: 20 RMSE: 2.343 (0.676)


<h4>Experiment 2</h4>
    
<b>Train on 11 months and predict one month every year</b>

In [27]:
# Window sizes are counted in samples; with one sample every 2 days a month
# holds 15 samples.
train_period = 11 * 15  # 11 months, i.e. (180 + 150) / 2 samples
test_period = 1 * 15  # 1 month

In [28]:
# Evaluate every window length in `timesteps_list` and print mean/std RMSE for each.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 2.403 (0.348)
TIMESTEPS: 3 RMSE: 3.185 (0.614)
TIMESTEPS: 5 RMSE: 3.388 (0.710)
TIMESTEPS: 7 RMSE: 3.562 (0.243)
TIMESTEPS: 10 RMSE: 3.158 (0.450)
TIMESTEPS: 15 RMSE: 2.393 (0.620)
TIMESTEPS: 20 RMSE: 1.860 (0.716)


<h4>Experiment 3</h4>
    
<b>Train on 2 years and 10 months and predict October and November of 2021</b>

In [29]:
# NOTE(review): 200 samples at one sample every 2 days is roughly 13 months, not the
# "2 years and 10 months" announced in this experiment's heading — confirm the intended value.
train_period = 200 # training samples per fold (the data is spaced every 2 days)
test_period = 30 # 2 months

In [30]:
# Evaluate every window length in `timesteps_list` and print mean/std RMSE for each.
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period=train_period,
    test_period=test_period,
    repeats=repeats,
    timesteps_list=timesteps_list,
    show_results=True,
)

TIMESTEPS: 1 RMSE: 1.639 (0.235)
TIMESTEPS: 3 RMSE: 2.269 (0.598)
TIMESTEPS: 5 RMSE: 2.625 (0.384)
TIMESTEPS: 7 RMSE: 2.685 (0.779)
TIMESTEPS: 10 RMSE: 2.399 (0.420)
TIMESTEPS: 15 RMSE: 3.332 (0.838)
TIMESTEPS: 20 RMSE: 2.854 (0.338)
