In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import time
import os

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from src.utils.time_series_procs import split_sequences
from src.models.transformer_ts_tf import Transformer

In [2]:
def get_times_series_data(df, cut_date):
    columns = list(df.columns.drop(["R28D"]))
    columns.append("R28D")
    df = df.reindex(columns=columns)
    df["Data/Hora"] = pd.to_datetime(df["Data/Hora"])
    df_train = df[df["Data/Hora"] < cut_date]
    df_test = df[df["Data/Hora"] >= cut_date]
    df_train = df_train.drop(["Data/Hora"], axis=1)
    df_test = df_test.drop(["Data/Hora"], axis=1)
    x_train, y_train = split_sequences(df_train.values, TIME_STEPS)
    x_test, y_test = split_sequences(df_test.values, TIME_STEPS)
    return x_train, y_train, x_test, y_test

In [3]:
class StandardScaler3DShape:
    def __init__(self):
        self.scaler = StandardScaler()

    def fit_transform(self, X, y=None):
        X_new = self.scaler.fit_transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)
        return X_new

    def fit(self, X, y=None):
        X_new = X.reshape(-1, X.shape[-1])
        self.scaler.fit(X_new)
        return self

    def transform(self, X, y=None):
        X_new = self.scaler.transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)
        return X_new

In [4]:
def get_baseline_model(
    num_features, time_steps, num_heads=2, num_layers_enc=1, num_feed_forward=32
):
    model = Transformer(
        num_hid=num_features,
        time_steps=time_steps,
        num_head=num_heads,
        num_layers_enc=num_layers_enc,
        num_feed_forward=num_feed_forward,
    )

    opt = tf.keras.optimizers.Adam(learning_rate=0.01)
    model.compile(
        optimizer=opt,
        loss="mse",
        metrics=[tf.keras.metrics.RootMeanSquaredError(name="RMSE")],
    )
    return model

In [5]:
def make_repeated_time_series_k_fold(
    x,
    y,
    train_period,
    test_period,
    repeats=10,
    show_individual_results=True,
    show_final_result=True,
):
    results = []
    max_samples = x.shape[0]

    for _ in range(repeats):
        i = 0
        scores = []
        for _ in range(0, max_samples // train_period):
            x_train = x[i : i + train_period]
            y_train = y[i : i + train_period]
            x_test = x[i + train_period : i + train_period + test_period]
            y_test = y[i + train_period : i + train_period + test_period]
            i += train_period
            x_train = scaler.fit_transform(
                x_train.reshape(-1, x_train.shape[-1])
            ).reshape(x_train.shape)
            x_test = scaler.transform(x_test.reshape(-1, x_test.shape[-1])).reshape(
                x_test.shape
            )
            model = get_baseline_model(
                num_features=x_train.shape[-1], time_steps=x_train.shape[-2]
            )
            model.fit(x_train, y_train, epochs=100, batch_size=32, verbose=0)
            rmse = model.evaluate(x_test, y_test, verbose=0)
            scores.append(rmse[1])
        results.append(scores)
        if show_individual_results:
            print("RMSE: %.3f (%.3f)" % (np.mean(scores), np.std(scores)))
    if show_final_result:
        print("\nRMSE: %.3f (%.3f)" % (np.mean(results), np.std(results)))
    return results

In [6]:
def make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats=10, timesteps_list=[3], show_results=True
):
    results = {}

    for timesteps in timesteps_list:
        x, y = split_sequences(df.values, timesteps)
        scores = make_repeated_time_series_k_fold(
            x,
            y,
            train_period,
            test_period,
            repeats=10,
            show_individual_results=False,
            show_final_result=False,
        )
        results[timesteps] = scores
        if show_results:
            print(
                "TIMESTEPS: %d RMSE: %.3f (%.3f)"
                % (timesteps, np.mean(scores), np.std(scores))
            )
    return results

<h3>Disable Tensorflow Warnings</h3>

In [7]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [8]:
df_r3d_and_r7d = pd.read_csv(
    "../../../../data/interim/"
    + "r3d_and_r7d-no-resampling-with-fillna-"
    + "dados-historicos-partner_i-cement-CPIIE40.csv"
)
df_r3d_only = pd.read_csv(
    "../../../../data/interim/"
    + "r3d_only-no-resampling-with-fillna-"
    + "dados-historicos-partner_i-cement-CPIIE40.csv"
)
df_no_r3d_r7d = pd.read_csv(
    "../../../../data/interim/"
    + "no-r3d-r7d-no-resampling-with-fillna-"
    + "dados-historicos-partner_i-cement-CPIIE40.csv"
)

In [9]:
seed = 47
scaler = StandardScaler()

timesteps_list = [1, 3, 5, 7, 10, 15, 20]
repeats = 10

## Feature Selection

### In this notebook we remove variables that could potentially cause data leakage. Thus we are performing testings removing the following variables:

#### IP - Initial setting time
#### FP - Final setting time


#### BL - Blaine specific surface (This is both removed and use)

# Transformer Encoder

# Configs:

1. TIMESTEPS: [1, 3, 5, 7, 10, 15, 20]

2. Epochs: 100

3. Batch size: 32

4. num heads: 2

5. FF Layers enc: 1

      5.1 Units 32

6. Optimization Algorithm: Adam
    
    6.1. Learning Rate: 1e-2

# Times Series Repeated KFold Cross validation - different Timesteps values

Here we will make a Times Series KFold Cross validation with all 3 datasets. Here we are using the Blocking Time Series Split method. And with timeseries Split method. We also do it with different number of timesteps


## 1.1 Dataset: no-r3d-r7d-no-resampling-with-fillna

<h3>1.1.1 Time Series Repeated KFold Cross Validation with vary timesteps - df_no_r3d_r7d</h3>

<b>Dataset: df_no_r3d_r7d:</b> In this dataset the R3D and R7D variables are not considered.

<b>Feature Selection:</b> The IP and FP variables are discarded.

In [10]:
df = df_no_r3d_r7d.copy()
columns = list(df.columns.drop(['R28D']))
columns.append('R28D')
df = df.reindex(columns=columns)
df = df.drop(['Data/Hora', "IP", "FP"], axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [11]:
train_period = 90 # 6 months of data since the data is spaced every 2 days
test_period = 15 # 1 month

In [12]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.537 (0.315)
TIMESTEPS: 3 RMSE: 2.529 (0.574)
TIMESTEPS: 5 RMSE: 2.624 (0.408)
TIMESTEPS: 7 RMSE: 2.474 (0.414)
TIMESTEPS: 10 RMSE: 2.437 (0.420)
TIMESTEPS: 15 RMSE: 2.583 (0.546)
TIMESTEPS: 20 RMSE: 2.698 (0.463)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [13]:
train_period = 165 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 15 # 1 month

In [14]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.913 (0.544)
TIMESTEPS: 3 RMSE: 3.053 (0.372)
TIMESTEPS: 5 RMSE: 3.414 (0.368)
TIMESTEPS: 7 RMSE: 3.634 (0.300)
TIMESTEPS: 10 RMSE: 2.575 (0.367)
TIMESTEPS: 15 RMSE: 3.146 (0.439)
TIMESTEPS: 20 RMSE: 2.891 (0.466)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [15]:
train_period = 200 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 30 # 2 months

In [16]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.329 (0.563)
TIMESTEPS: 3 RMSE: 2.560 (0.502)
TIMESTEPS: 5 RMSE: 2.679 (0.632)
TIMESTEPS: 7 RMSE: 2.969 (0.818)
TIMESTEPS: 10 RMSE: 3.398 (1.095)
TIMESTEPS: 15 RMSE: 2.324 (0.404)
TIMESTEPS: 20 RMSE: 2.423 (0.642)


<h3>1.1.2 Time Series Repeated KFold Cross Validation with vary timesteps - df_no_r3d_r7d</h3>

<b>Dataset: df_no_r3d_r7d:</b> In this dataset the R3D and R7D variables are not considered.

<b>Feature Selection:</b> The IP, FP and BL variables are discarded.

In [17]:
df = df_no_r3d_r7d.copy()
columns = list(df.columns.drop(['R28D']))
columns.append('R28D')
df = df.reindex(columns=columns)
df = df.drop(['Data/Hora', "IP", "FP", "BL"], axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [18]:
train_period = 90 # 6 months of data since the data is spaced every 2 days
test_period = 15 # 1 month

In [19]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.521 (0.268)
TIMESTEPS: 3 RMSE: 2.537 (0.551)
TIMESTEPS: 5 RMSE: 2.563 (0.570)
TIMESTEPS: 7 RMSE: 2.223 (0.318)
TIMESTEPS: 10 RMSE: 2.573 (0.348)
TIMESTEPS: 15 RMSE: 2.696 (0.469)
TIMESTEPS: 20 RMSE: 2.796 (0.452)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [20]:
train_period = 165 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 15 # 1 month

In [21]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 3.001 (0.580)
TIMESTEPS: 3 RMSE: 3.203 (0.281)
TIMESTEPS: 5 RMSE: 3.567 (0.239)
TIMESTEPS: 7 RMSE: 3.226 (0.505)
TIMESTEPS: 10 RMSE: 2.700 (0.230)
TIMESTEPS: 15 RMSE: 3.274 (0.343)
TIMESTEPS: 20 RMSE: 2.544 (0.489)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [22]:
train_period = 200 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 30 # 2 months

In [23]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.275 (0.517)
TIMESTEPS: 3 RMSE: 2.227 (0.245)
TIMESTEPS: 5 RMSE: 2.911 (0.822)
TIMESTEPS: 7 RMSE: 2.189 (0.498)
TIMESTEPS: 10 RMSE: 2.962 (0.733)
TIMESTEPS: 15 RMSE: 2.287 (0.354)
TIMESTEPS: 20 RMSE: 2.495 (0.711)


<h3>1.1.3 Time Series Repeated KFold Cross Validation with vary timesteps - df_no_r3d_r7d</h3>

<b>Dataset: df_no_r3d_r7d:</b> In this dataset the R3D and R7D variables are not considered.

<b>Feature Selection:</b> 

1. The IP, FP and BL variables are discarded.

2. High correlated variables are removed

In [24]:
to_drop = [
    "Data/Hora",
    "IP",
    "FP",
    "BL",
    "CALCITA",
    "DESIDRATAÇÃO",
    "GIPSITA",
    "TIO2",
    "AL2O3",
    "PF",
    "FERRITA",
]

df = df_no_r3d_r7d.copy()
columns = list(df.columns.drop(['R28D']))
columns.append('R28D')
df = df.reindex(columns=columns)
df = df.drop(to_drop, axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [25]:
train_period = 90 # 6 months of data since the data is spaced every 2 days
test_period = 15 # 1 month

In [26]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.367 (0.237)
TIMESTEPS: 3 RMSE: 2.345 (0.472)
TIMESTEPS: 5 RMSE: 2.480 (0.397)
TIMESTEPS: 7 RMSE: 2.293 (0.292)
TIMESTEPS: 10 RMSE: 2.487 (0.304)
TIMESTEPS: 15 RMSE: 2.751 (0.344)
TIMESTEPS: 20 RMSE: 2.571 (0.509)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [27]:
train_period = 165 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 15 # 1 month

In [28]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.816 (0.270)
TIMESTEPS: 3 RMSE: 3.106 (0.355)
TIMESTEPS: 5 RMSE: 3.151 (0.390)
TIMESTEPS: 7 RMSE: 2.936 (0.433)
TIMESTEPS: 10 RMSE: 2.765 (0.602)
TIMESTEPS: 15 RMSE: 2.653 (0.578)
TIMESTEPS: 20 RMSE: 2.373 (0.594)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [29]:
train_period = 200 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 30 # 2 months

In [30]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 1.689 (0.568)
TIMESTEPS: 3 RMSE: 2.289 (0.421)
TIMESTEPS: 5 RMSE: 2.186 (0.590)
TIMESTEPS: 7 RMSE: 2.402 (0.640)
TIMESTEPS: 10 RMSE: 2.514 (0.636)
TIMESTEPS: 15 RMSE: 2.311 (0.591)
TIMESTEPS: 20 RMSE: 2.377 (0.508)


## 1.2 Dataset: df_r3d_only

<h3>1.2.1 Time Series Repeated KFold Cross Validation - df_r3d_only</h3>

<b>Dataset: df_r3d_only:</b> In this dataset only R3D variable is considered.

<b>Feature Selection:</b> The IP and FP variables are discarded.

In [31]:
df = df_r3d_only.copy()
columns = list(df.columns.drop(['R28D']))
columns.append('R28D')
df = df.reindex(columns=columns)
df = df.drop(['Data/Hora', "IP", "FP"], axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [32]:
train_period = 90 # 6 months of data since the data is spaced every 2 days
test_period = 15 # 1 month

In [33]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.580 (0.252)
TIMESTEPS: 3 RMSE: 2.505 (0.628)
TIMESTEPS: 5 RMSE: 2.502 (0.304)
TIMESTEPS: 7 RMSE: 2.346 (0.281)
TIMESTEPS: 10 RMSE: 2.670 (0.438)
TIMESTEPS: 15 RMSE: 2.717 (0.409)
TIMESTEPS: 20 RMSE: 3.153 (1.506)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [34]:
train_period = 165 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 15 # 1 month

In [35]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.664 (0.428)
TIMESTEPS: 3 RMSE: 3.393 (0.331)
TIMESTEPS: 5 RMSE: 3.549 (0.231)
TIMESTEPS: 7 RMSE: 3.221 (0.744)
TIMESTEPS: 10 RMSE: 2.575 (0.365)
TIMESTEPS: 15 RMSE: 3.474 (0.348)
TIMESTEPS: 20 RMSE: 2.662 (0.654)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [36]:
train_period = 200 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 30 # 2 months

In [37]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.452 (0.368)
TIMESTEPS: 3 RMSE: 2.785 (0.527)
TIMESTEPS: 5 RMSE: 2.516 (0.464)
TIMESTEPS: 7 RMSE: 2.899 (0.625)
TIMESTEPS: 10 RMSE: 2.770 (0.853)
TIMESTEPS: 15 RMSE: 2.704 (0.752)
TIMESTEPS: 20 RMSE: 2.094 (0.540)


<h3>1.2.2 Time Series Repeated KFold Cross Validation with vary timesteps - df_r3d_only</h3>

<b>Dataset: df_r3d_only:</b> In this dataset the R3D and R7D variables are not considered.

<b>Feature Selection:</b> The IP, FP and BL variables are discarded.

In [38]:
df = df_r3d_only.copy()
columns = list(df.columns.drop(['R28D']))
columns.append('R28D')
df = df.reindex(columns=columns)
df = df.drop(['Data/Hora', "IP", "FP", "BL"], axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [39]:
train_period = 90 # 6 months of data since the data is spaced every 2 days
test_period = 15 # 1 month

In [40]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.581 (0.263)
TIMESTEPS: 3 RMSE: 2.392 (0.518)
TIMESTEPS: 5 RMSE: 2.528 (0.484)
TIMESTEPS: 7 RMSE: 2.347 (0.368)
TIMESTEPS: 10 RMSE: 2.590 (0.454)
TIMESTEPS: 15 RMSE: 2.800 (0.583)
TIMESTEPS: 20 RMSE: 3.405 (1.119)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [41]:
train_period = 165 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 15 # 1 month

In [42]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.661 (0.409)
TIMESTEPS: 3 RMSE: 3.132 (0.434)
TIMESTEPS: 5 RMSE: 3.454 (0.211)
TIMESTEPS: 7 RMSE: 3.327 (0.367)
TIMESTEPS: 10 RMSE: 2.437 (0.292)
TIMESTEPS: 15 RMSE: 3.162 (0.369)
TIMESTEPS: 20 RMSE: 2.702 (0.354)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [43]:
train_period = 200 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 30 # 2 months

In [44]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.200 (0.413)
TIMESTEPS: 3 RMSE: 2.883 (0.467)
TIMESTEPS: 5 RMSE: 2.732 (0.509)
TIMESTEPS: 7 RMSE: 2.754 (0.318)
TIMESTEPS: 10 RMSE: 2.643 (0.555)
TIMESTEPS: 15 RMSE: 3.052 (1.436)
TIMESTEPS: 20 RMSE: 2.472 (0.930)


<h3>1.2.3 Time Series Repeated KFold Cross Validation with vary timesteps - df_r3d_only</h3>

<b>Dataset: df_r3d_only:</b> In this dataset the R3D and R7D variables are not considered.

<b>Feature Selection:</b>

1. The IP, FP and BL variables are discarded.

2. High correlated variables are removed

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [45]:
train_period = 90 # 6 months of data since the data is spaced every 2 days
test_period = 15 # 1 month

In [46]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.563 (0.332)
TIMESTEPS: 3 RMSE: 2.455 (0.580)
TIMESTEPS: 5 RMSE: 2.670 (0.915)
TIMESTEPS: 7 RMSE: 2.338 (0.456)
TIMESTEPS: 10 RMSE: 2.791 (0.347)
TIMESTEPS: 15 RMSE: 2.677 (0.479)
TIMESTEPS: 20 RMSE: 2.953 (0.639)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [47]:
train_period = 165 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 15 # 1 month

In [48]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.807 (0.375)
TIMESTEPS: 3 RMSE: 3.393 (0.399)
TIMESTEPS: 5 RMSE: 3.414 (0.224)
TIMESTEPS: 7 RMSE: 3.395 (0.460)
TIMESTEPS: 10 RMSE: 2.540 (0.240)
TIMESTEPS: 15 RMSE: 3.203 (0.322)
TIMESTEPS: 20 RMSE: 2.924 (0.407)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [49]:
train_period = 200 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 30 # 2 months

In [50]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.135 (0.444)
TIMESTEPS: 3 RMSE: 2.580 (0.380)
TIMESTEPS: 5 RMSE: 2.573 (0.685)
TIMESTEPS: 7 RMSE: 2.533 (0.541)
TIMESTEPS: 10 RMSE: 2.599 (0.381)
TIMESTEPS: 15 RMSE: 2.824 (0.592)
TIMESTEPS: 20 RMSE: 2.738 (1.051)


## 1.3 Dataset: df_r3d_and_r7d

<h3>1.3.1 Time Series Repeated KFold Cross Validation - df_r3d_and_r7d</h3>

<b>Dataset: df_r3d_and_r7d:</b> In this dataset both R3D and R7D variables are considered.

<b>Feature Selection:</b> The IP and FP variables are discarded.

In [51]:
df = df_r3d_and_r7d.copy()
columns = list(df.columns.drop(['R28D']))
columns.append('R28D')
df = df.reindex(columns=columns)
df = df.drop(['Data/Hora', "IP", "FP"], axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [52]:
train_period = 90 # 6 months of data since the data is spaced every 2 days
test_period = 15 # 1 month

In [53]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.658 (0.287)
TIMESTEPS: 3 RMSE: 2.372 (0.517)
TIMESTEPS: 5 RMSE: 2.459 (0.425)
TIMESTEPS: 7 RMSE: 2.454 (0.403)
TIMESTEPS: 10 RMSE: 2.646 (0.427)
TIMESTEPS: 15 RMSE: 2.980 (0.542)
TIMESTEPS: 20 RMSE: 3.283 (0.828)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [54]:
train_period = 165 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 15 # 1 month

In [55]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.495 (0.426)
TIMESTEPS: 3 RMSE: 3.256 (0.318)
TIMESTEPS: 5 RMSE: 3.523 (0.332)
TIMESTEPS: 7 RMSE: 3.320 (0.549)
TIMESTEPS: 10 RMSE: 2.534 (0.405)
TIMESTEPS: 15 RMSE: 3.374 (0.491)
TIMESTEPS: 20 RMSE: 2.964 (0.465)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [56]:
train_period = 200 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 30 # 2 months

In [57]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.234 (0.537)
TIMESTEPS: 3 RMSE: 2.717 (0.510)
TIMESTEPS: 5 RMSE: 2.528 (0.531)
TIMESTEPS: 7 RMSE: 2.432 (0.445)
TIMESTEPS: 10 RMSE: 2.326 (0.499)
TIMESTEPS: 15 RMSE: 2.525 (0.472)
TIMESTEPS: 20 RMSE: 2.033 (0.287)


<h3>1.3.2 Time Series Repeated KFold Cross Validation - df_r3d_and_r7d</h3>

<b>Dataset: df_r3d_and_r7d:</b> In this dataset both R3D and R7D variables are considered.

<b>Feature Selection:</b> The IP, FP, and BL variables are discarded.

In [58]:
df = df_r3d_and_r7d.copy()
columns = list(df.columns.drop(['R28D']))
columns.append('R28D')
df = df.reindex(columns=columns)
df = df.drop(['Data/Hora', "IP", "FP", "BL"], axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [59]:
train_period = 90 # 6 months of data since the data is spaced every 2 days
test_period = 15 # 1 month

In [60]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.652 (0.299)
TIMESTEPS: 3 RMSE: 2.398 (0.552)
TIMESTEPS: 5 RMSE: 2.470 (0.413)
TIMESTEPS: 7 RMSE: 2.315 (0.382)
TIMESTEPS: 10 RMSE: 2.636 (0.386)
TIMESTEPS: 15 RMSE: 2.673 (0.428)
TIMESTEPS: 20 RMSE: 3.123 (0.602)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [61]:
train_period = 165 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 15 # 1 month

In [62]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.400 (0.316)
TIMESTEPS: 3 RMSE: 3.298 (0.314)
TIMESTEPS: 5 RMSE: 3.416 (0.330)
TIMESTEPS: 7 RMSE: 3.466 (0.411)
TIMESTEPS: 10 RMSE: 2.626 (0.276)
TIMESTEPS: 15 RMSE: 3.305 (0.526)
TIMESTEPS: 20 RMSE: 2.667 (0.500)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [63]:
train_period = 200 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 30 # 2 months

In [64]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.204 (0.599)
TIMESTEPS: 3 RMSE: 2.712 (0.499)
TIMESTEPS: 5 RMSE: 2.313 (0.653)
TIMESTEPS: 7 RMSE: 2.683 (0.626)
TIMESTEPS: 10 RMSE: 2.551 (0.555)
TIMESTEPS: 15 RMSE: 2.405 (0.541)
TIMESTEPS: 20 RMSE: 2.700 (0.794)


<h3>1.3.3 Time Series Repeated KFold Cross Validation - df_r3d_and_r7d</h3>

<b>Dataset: df_r3d_and_r7d:</b> In this dataset both R3D and R7D variables are considered.

<b>Feature Selection:</b>
    1. The IP, FP and BL variables are discarded.
    2. High correlated variables are removed

In [65]:
to_drop = [
    "Data/Hora",
    "IP",
    "FP",
    "BL",
    "CALCITA",
    "DESIDRATAÇÃO",
    "GIPSITA",
    "TIO2",
    "AL2O3",
    "PF",
    "FERRITA",
]

df = df_r3d_and_r7d.copy()
columns = list(df.columns.drop(['R28D']))
columns.append('R28D')
df = df.reindex(columns=columns)
df = df.drop(to_drop, axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [66]:
train_period = 90 # 6 months of data since the data is spaced every 2 days
test_period = 15 # 1 month

In [67]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.444 (0.266)
TIMESTEPS: 3 RMSE: 2.234 (0.431)
TIMESTEPS: 5 RMSE: 2.422 (0.365)
TIMESTEPS: 7 RMSE: 2.215 (0.382)
TIMESTEPS: 10 RMSE: 2.431 (0.400)
TIMESTEPS: 15 RMSE: 2.729 (0.411)
TIMESTEPS: 20 RMSE: 2.867 (0.871)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [68]:
train_period = 165 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 15 # 1 month

In [69]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.382 (0.293)
TIMESTEPS: 3 RMSE: 2.929 (0.394)
TIMESTEPS: 5 RMSE: 3.161 (0.364)
TIMESTEPS: 7 RMSE: 3.174 (0.478)
TIMESTEPS: 10 RMSE: 2.311 (0.351)
TIMESTEPS: 15 RMSE: 2.850 (0.658)
TIMESTEPS: 20 RMSE: 2.524 (0.490)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [70]:
train_period = 200 # 11 months of data since the data is spaced every 2 days (180 + 150)/2
test_period = 30 # 2 months

In [71]:
results = make_timesteps_repeated_time_series_k_fold(
    df, train_period, test_period, repeats, timesteps_list, True
)

TIMESTEPS: 1 RMSE: 2.027 (0.504)
TIMESTEPS: 3 RMSE: 2.655 (0.394)
TIMESTEPS: 5 RMSE: 2.385 (0.650)
TIMESTEPS: 7 RMSE: 2.661 (0.456)
TIMESTEPS: 10 RMSE: 2.605 (0.848)
TIMESTEPS: 15 RMSE: 2.550 (0.881)
TIMESTEPS: 20 RMSE: 3.122 (1.513)
