In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow as tf
import time
import os 

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from src.utils.time_series_procs import split_sequences

In [2]:
def get_times_series_data(df, cut_date, time_steps=3):
    """Split a frame chronologically at ``cut_date`` and window both halves.

    The ``R28D`` column is moved to the last position and ``Data/Hora`` is
    parsed as datetime. Rows dated strictly before ``cut_date`` form the
    training part, rows dated on or after it form the test part. The date
    column is then dropped and each part is passed to ``split_sequences``.

    Args:
        df: Frame containing at least the ``Data/Hora`` and ``R28D`` columns.
        cut_date: Boundary compared against the parsed ``Data/Hora`` values.
        time_steps: Window length forwarded to ``split_sequences``.

    Returns:
        ``(x_train, y_train, x_test, y_test)`` as produced by
        ``split_sequences`` for the two parts.
    """
    ordered_columns = [*df.columns.drop(["R28D"]), "R28D"]
    frame = df.reindex(columns=ordered_columns)
    frame["Data/Hora"] = pd.to_datetime(frame["Data/Hora"])

    train_part = frame[frame["Data/Hora"] < cut_date].drop(["Data/Hora"], axis=1)
    test_part = frame[frame["Data/Hora"] >= cut_date].drop(["Data/Hora"], axis=1)

    x_train, y_train = split_sequences(train_part.values, time_steps)
    x_test, y_test = split_sequences(test_part.values, time_steps)
    return x_train, y_train, x_test, y_test

In [3]:
class StandardScaler3DShape:
    """Apply a ``StandardScaler`` to arrays whose last axis holds the features.

    Every input is flattened to two dimensions, ``(-1, X.shape[-1])``, before
    it is handed to the wrapped scaler; transformed data is reshaped back to
    the shape of the input.
    """

    def __init__(self):
        self.scaler = StandardScaler()

    def fit_transform(self, X, y=None):
        """Fit the wrapped scaler on ``X`` and return ``X`` scaled, same shape."""
        original_shape = X.shape
        flat = X.reshape(-1, original_shape[-1])
        scaled = self.scaler.fit_transform(flat)
        return scaled.reshape(original_shape)

    def fit(self, X, y=None):
        """Fit the wrapped scaler on the flattened ``X`` and return ``self``."""
        self.scaler.fit(X.reshape(-1, X.shape[-1]))
        return self

    def transform(self, X, y=None):
        """Scale ``X`` with the already-fitted scaler, keeping its shape."""
        original_shape = X.shape
        flat = X.reshape(-1, original_shape[-1])
        scaled = self.scaler.transform(flat)
        return scaled.reshape(original_shape)

In [4]:
def get_baseline_model(n_units=64, learning_rate=0.01):
    """Build and compile the baseline network: one LSTM layer and a Dense(1) head.

    Args:
        n_units: Number of units of the LSTM layer.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Sequential`` model using MSE loss and a
        root-mean-squared-error metric named ``"RMSE"``.
    """
    layers = tf.keras.layers

    baseline = tf.keras.Sequential()
    baseline.add(layers.LSTM(units=n_units))
    baseline.add(layers.Dense(1))

    baseline.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss="mse",
        metrics=[tf.keras.metrics.RootMeanSquaredError(name="RMSE")],
    )
    return baseline

In [5]:
def create_keras_model(
    activation="tanh", dropout_rate=0.0, n_units=64, n_layers=1, learning_rate=1e-3
):
    """Build and compile a stacked-LSTM regressor.

    Args:
        activation: Activation passed to every LSTM layer.
        dropout_rate: When greater than 0, a Dropout layer with this rate is
            inserted after the LSTM stack.
        n_units: Number of units of every LSTM layer.
        n_layers: Number of LSTM layers to stack.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Sequential`` model ending in ``Dense(1)``, using
        MSE loss and a root-mean-squared-error metric named ``"RMSE"``.
    """
    network = tf.keras.Sequential()

    for depth in range(n_layers):
        # Every LSTM except the last one returns the full sequence so that the
        # next LSTM layer receives one vector per timestep.
        feeds_another_lstm = depth + 1 < n_layers
        network.add(
            tf.keras.layers.LSTM(
                units=n_units,
                activation=activation,
                return_sequences=feeds_another_lstm,
            )
        )

    if dropout_rate > 0.0:
        network.add(tf.keras.layers.Dropout(dropout_rate))
    network.add(tf.keras.layers.Dense(1))

    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    rmse_metric = tf.keras.metrics.RootMeanSquaredError(name="RMSE")
    network.compile(optimizer=adam, loss="mse", metrics=[rmse_metric])
    return network

In [6]:
def make_repeated_time_series_k_fold(
    x,
    y,
    train_period,
    test_period,
    grid=None,
    repeats=10,
    show_individual_results=True,
    show_final_result=True,
    params=None,
):
    """Repeatedly evaluate an LSTM on consecutive train/test windows.

    The samples are cut into ``x.shape[0] // train_period`` consecutive,
    non-overlapping training windows of ``train_period`` samples. Each
    training window is followed by a test window of up to ``test_period``
    samples that starts right after it (so it overlaps the next fold's
    training window). For every fold a fresh scaler is fitted on the training
    window only, a new model is built with ``create_keras_model``, trained,
    and evaluated on the test window. The whole procedure is run ``repeats``
    times.

    Args:
        x: Array of input windows; the first axis indexes samples and the last
            axis indexes features.
        y: Targets aligned with the first axis of ``x``.
        train_period: Number of samples in each training window.
        test_period: Maximum number of samples in each test window.
        grid: Optional fitted search object exposing ``best_params_`` with
            ``model__``-prefixed keys. Takes precedence over ``params``.
        repeats: Number of times the full set of folds is evaluated.
        show_individual_results: Print mean/std RMSE after each repeat.
        show_final_result: Print mean/std RMSE over all repeats at the end.
        params: Dict with the keys ``activation``, ``dropout_rate``,
            ``n_units``, ``n_layers``, ``learning_rate``, ``batch_size`` and
            ``epochs``; used when ``grid`` is not given.

    Returns:
        A list with one entry per repeat, each a list of per-fold RMSE values.

    Raises:
        ValueError: If neither ``grid`` nor ``params`` is provided.
    """
    if grid is not None:
        # Grid-search results store the hyperparameters under "model__<name>".
        source, prefix = grid.best_params_, "model__"
    elif params is not None:
        source, prefix = params, ""
    else:
        raise ValueError("Either `grid` or `params` must be provided.")

    activation = source[prefix + "activation"]
    dropout_rate = source[prefix + "dropout_rate"]
    n_units = source[prefix + "n_units"]
    n_layers = source[prefix + "n_layers"]
    learning_rate = source[prefix + "learning_rate"]
    batch_size = source[prefix + "batch_size"]
    epochs = source[prefix + "epochs"]

    max_samples = x.shape[0]
    # Fold boundaries do not depend on the repeat, so compute them once.
    fold_starts = [fold * train_period for fold in range(max_samples // train_period)]

    results = []
    for _ in range(repeats):
        scores = []
        for start in fold_starts:
            split = start + train_period
            x_train, y_train = x[start:split], y[start:split]
            x_test = x[split : split + test_period]
            y_test = y[split : split + test_period]
            if len(x_test) == 0:
                # Nothing is left after the last training window (e.g. the
                # sample count is an exact multiple of train_period); there is
                # nothing to evaluate, so skip the fold instead of failing.
                continue

            # A scaler local to the fold: fitted on training data only, and
            # independent of any notebook-level `scaler` variable.
            fold_scaler = StandardScaler()
            x_train = fold_scaler.fit_transform(
                x_train.reshape(-1, x_train.shape[-1])
            ).reshape(x_train.shape)
            x_test = fold_scaler.transform(
                x_test.reshape(-1, x_test.shape[-1])
            ).reshape(x_test.shape)

            model = create_keras_model(
                activation=activation,
                dropout_rate=dropout_rate,
                n_units=n_units,
                n_layers=n_layers,
                learning_rate=learning_rate,
            )
            model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
            # evaluate() returns [loss, RMSE]; keep the RMSE metric.
            evaluation = model.evaluate(x_test, y_test, verbose=0)
            scores.append(evaluation[1])
        results.append(scores)
        if show_individual_results:
            print("RMSE: %.3f (%.3f)" % (np.mean(scores), np.std(scores)))
    if show_final_result:
        print("\nRMSE: %.3f (%.3f)" % (np.mean(results), np.std(results)))
    return results


In [7]:
def make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=None,
    grid=None,
    repeats=10,
    timesteps_list=(3,),
    show_results=True,
):
    """Run the repeated time-series cross-validation for several window lengths.

    For every value in ``timesteps_list`` the frame's values are turned into
    windows with ``split_sequences`` and evaluated with
    ``make_repeated_time_series_k_fold`` (its own printing is disabled).

    Args:
        df: Frame whose ``.values`` are passed to ``split_sequences``.
        train_period: Forwarded to ``make_repeated_time_series_k_fold``.
        test_period: Forwarded to ``make_repeated_time_series_k_fold``.
        params: Hyperparameter dict forwarded unchanged.
        grid: Optional search object forwarded unchanged.
        repeats: Number of repeats forwarded unchanged.
        timesteps_list: Iterable of window lengths to evaluate. The default is
            an immutable tuple so it cannot be shared and mutated across calls.
        show_results: Print one summary line (mean/std RMSE) per window length.

    Returns:
        Dict mapping each window length to the nested list of scores returned
        by ``make_repeated_time_series_k_fold``.
    """
    results = {}

    for timesteps in timesteps_list:
        x, y = split_sequences(df.values, timesteps)
        # Keyword arguments keep this call correct even if the callee's
        # parameter order changes.
        scores = make_repeated_time_series_k_fold(
            x,
            y,
            train_period,
            test_period,
            grid=grid,
            repeats=repeats,
            show_individual_results=False,
            show_final_result=False,
            params=params,
        )
        results[timesteps] = scores
        if show_results:
            print(
                "TIMESTEPS: %d RMSE: %.3f (%.3f)"
                % (timesteps, np.mean(scores), np.std(scores))
            )
    return results


<h3>Disable Tensorflow Warnings</h3>

In [8]:
# Intended to silence TensorFlow warnings (see the section heading).
# NOTE(review): tensorflow is already imported in the first cell; this variable
# is usually expected to be set before that import - confirm it has an effect here.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [9]:
# The three interim datasets live in the same folder and share a filename
# suffix; only the dataset-specific prefix differs.
_interim_dir = "../../../../data/interim/"
_file_suffix = "dados-historicos-partner_i-cement-CPIIE40.csv"

df_r3d_and_r7d = pd.read_csv(
    _interim_dir + "r3d_and_r7d-no-resampling-with-fillna-" + _file_suffix
)
df_r3d_only = pd.read_csv(
    _interim_dir + "r3d_only-no-resampling-with-fillna-" + _file_suffix
)
df_no_r3d_r7d = pd.read_csv(
    _interim_dir + "no-r3d-r7d-no-resampling-with-fillna-" + _file_suffix
)

## Feature Selection

### In this notebook we remove variables that could potentially cause data leakage. The experiments are therefore run with the following variables removed:

#### IP - Initial setting time
#### FP - Final setting time


#### BL - Blaine specific surface (tested both removed and kept)

In [10]:
seed = 47
# Apply the seed: the models are randomly initialised and trained with
# shuffling, so without this the reported RMSE values change on every run.
np.random.seed(seed)
tf.random.set_seed(seed)

# Notebook-level scaler instance (kept for cells that refer to `scaler`).
scaler = StandardScaler()

# Neural Network Long Short Term Memory - LSTM

# Configs:

1. Train Period: [2019-01-01, 2021-05-02)

2. Test Period:  [2021-05-02, 2021-11-30]

3. TIMESTEPS: [1, 3, 5, 7, 10, 15, 20]

4. Epochs: 100

5. Batch size: 32

6. LSTM Cells: 64

7. LSTM Layers: 1

8. Optimization Algorithm: Adam
    
    8.1. Learning Rate: 1e-2
    
<h3>Repeats: 10</h3>
<h3>K: 10</h3>

# 1. Time Series Repeated KFold Cross Validation - different timesteps values

Here we run a time series K-fold cross-validation on all 3 datasets, using both the blocking time series split method and the time series split method. We also repeat it with different numbers of timesteps.


In [11]:
# Fixed hyperparameters used by every experiment below (no grid search).
params = {
    "activation": "tanh",
    "dropout_rate": 0.0,
    "n_units": 64,
    "n_layers": 1,
    "learning_rate": 1e-2,
    "batch_size": 32,
    "epochs": 100,
}

# Window lengths (number of timesteps per input sequence) to compare.
timesteps_list = [1, 3, 5, 7, 10, 15, 20]

# Number of times each cross-validation is repeated.
repeats = 10

## 1.1 Dataset: no-r3d-r7d-no-resampling-with-fillna

<h3>1.1.1 Time Series Repeated KFold Cross Validation with varying timesteps - df_no_r3d_r7d</h3>

<b>Dataset: df_no_r3d_r7d:</b> In this dataset the R3D and R7D variables are not considered.

<b>Feature Selection:</b> The IP and FP variables are discarded.

In [12]:
# Move the target column R28D to the last position (presumably where
# split_sequences expects the target - confirm), then discard the timestamp
# and the setting-time variables IP and FP.
df = df_no_r3d_r7d.copy()
columns = [*df.columns.drop(["R28D"]), "R28D"]
df = df.reindex(columns=columns).drop(["Data/Hora", "IP", "FP"], axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [13]:
train_period = 90  # 90 samples ~ 6 months (180 days), with one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [14]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 12.597 (3.238)
TIMESTEPS: 3 RMSE: 3.328 (1.709)
TIMESTEPS: 5 RMSE: 3.259 (3.325)
TIMESTEPS: 7 RMSE: 4.034 (4.999)
TIMESTEPS: 10 RMSE: 3.075 (3.297)
TIMESTEPS: 15 RMSE: 1.643 (0.360)
TIMESTEPS: 20 RMSE: 2.142 (0.425)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [15]:
train_period = 165  # 165 samples ~ 11 months: (180 + 150) days / 2, one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [16]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 12.918 (0.896)
TIMESTEPS: 3 RMSE: 2.275 (0.054)
TIMESTEPS: 5 RMSE: 2.325 (0.124)
TIMESTEPS: 7 RMSE: 2.390 (0.060)
TIMESTEPS: 10 RMSE: 2.356 (0.030)
TIMESTEPS: 15 RMSE: 1.157 (0.023)
TIMESTEPS: 20 RMSE: 1.543 (0.018)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [17]:
# 200 samples ~ 400 days (about 13 months) with one sample every 2 days.
# NOTE(review): the experiment heading says "2 years and 10 months" - confirm
# which training length is intended.
train_period = 200
test_period = 30  # 30 samples ~ 2 months

In [18]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 5.241 (0.489)
TIMESTEPS: 3 RMSE: 1.705 (0.104)
TIMESTEPS: 5 RMSE: 4.628 (6.221)
TIMESTEPS: 7 RMSE: 6.327 (13.460)
TIMESTEPS: 10 RMSE: 7.485 (12.820)
TIMESTEPS: 15 RMSE: 1.766 (0.022)
TIMESTEPS: 20 RMSE: 1.723 (0.103)


<h3>1.1.2 Time Series Repeated KFold Cross Validation with varying timesteps - df_no_r3d_r7d</h3>

<b>Dataset: df_no_r3d_r7d:</b> In this dataset the R3D and R7D variables are not considered.

<b>Feature Selection:</b> The IP, FP and BL variables are discarded.

In [19]:
# Move the target column R28D to the last position (presumably where
# split_sequences expects the target - confirm), then discard the timestamp
# and the IP, FP and BL variables.
df = df_no_r3d_r7d.copy()
columns = [*df.columns.drop(["R28D"]), "R28D"]
df = df.reindex(columns=columns).drop(["Data/Hora", "IP", "FP", "BL"], axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [20]:
train_period = 90  # 90 samples ~ 6 months (180 days), with one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [21]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 11.879 (3.775)
TIMESTEPS: 3 RMSE: 3.421 (1.816)
TIMESTEPS: 5 RMSE: 3.342 (3.154)
TIMESTEPS: 7 RMSE: 3.983 (5.810)
TIMESTEPS: 10 RMSE: 3.858 (5.152)
TIMESTEPS: 15 RMSE: 1.643 (0.358)
TIMESTEPS: 20 RMSE: 2.148 (0.419)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [22]:
train_period = 165  # 165 samples ~ 11 months: (180 + 150) days / 2, one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [23]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 11.476 (0.848)
TIMESTEPS: 3 RMSE: 2.423 (0.170)
TIMESTEPS: 5 RMSE: 2.353 (0.124)
TIMESTEPS: 7 RMSE: 2.355 (0.051)
TIMESTEPS: 10 RMSE: 2.380 (0.056)
TIMESTEPS: 15 RMSE: 1.184 (0.118)
TIMESTEPS: 20 RMSE: 1.562 (0.074)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [24]:
# 200 samples ~ 400 days (about 13 months) with one sample every 2 days.
# NOTE(review): the experiment heading says "2 years and 10 months" - confirm
# which training length is intended.
train_period = 200
test_period = 30  # 30 samples ~ 2 months

In [25]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 4.667 (0.416)
TIMESTEPS: 3 RMSE: 1.688 (0.093)
TIMESTEPS: 5 RMSE: 4.068 (3.299)
TIMESTEPS: 7 RMSE: 5.408 (7.257)
TIMESTEPS: 10 RMSE: 1.918 (0.478)
TIMESTEPS: 15 RMSE: 5.291 (10.554)
TIMESTEPS: 20 RMSE: 1.692 (0.051)


<h3>1.1.3 Time Series Repeated KFold Cross Validation with varying timesteps - df_no_r3d_r7d</h3>

<b>Dataset: df_no_r3d_r7d:</b> In this dataset the R3D and R7D variables are not considered.

<b>Feature Selection:</b> 

1. The IP, FP and BL variables are discarded.

2. Highly correlated variables are removed.


In [26]:
# Columns to discard: the timestamp, IP/FP/BL, and the variables this section
# lists as highly correlated.
to_drop = [
    "Data/Hora", "IP", "FP", "BL",
    "CALCITA", "DESIDRATAÇÃO", "GIPSITA", "TIO2", "AL2O3", "PF", "FERRITA",
]

# Move the target column R28D to the last position before dropping.
df = df_no_r3d_r7d.copy()
columns = [*df.columns.drop(["R28D"]), "R28D"]
df = df.reindex(columns=columns).drop(to_drop, axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [27]:
train_period = 90  # 90 samples ~ 6 months (180 days), with one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [28]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 10.777 (4.315)
TIMESTEPS: 3 RMSE: 6.326 (4.511)
TIMESTEPS: 5 RMSE: 2.812 (1.306)
TIMESTEPS: 7 RMSE: 3.941 (4.185)
TIMESTEPS: 10 RMSE: 3.872 (4.560)
TIMESTEPS: 15 RMSE: 1.649 (0.352)
TIMESTEPS: 20 RMSE: 2.149 (0.421)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [29]:
train_period = 165  # 165 samples ~ 11 months: (180 + 150) days / 2, one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [30]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 11.520 (0.863)
TIMESTEPS: 3 RMSE: 2.310 (0.040)
TIMESTEPS: 5 RMSE: 2.361 (0.119)
TIMESTEPS: 7 RMSE: 3.576 (3.552)
TIMESTEPS: 10 RMSE: 2.370 (0.052)
TIMESTEPS: 15 RMSE: 1.188 (0.072)
TIMESTEPS: 20 RMSE: 1.576 (0.052)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [31]:
# 200 samples ~ 400 days (about 13 months) with one sample every 2 days.
# NOTE(review): the experiment heading says "2 years and 10 months" - confirm
# which training length is intended.
train_period = 200
test_period = 30  # 30 samples ~ 2 months

In [32]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 4.712 (0.739)
TIMESTEPS: 3 RMSE: 1.622 (0.087)
TIMESTEPS: 5 RMSE: 6.520 (9.190)
TIMESTEPS: 7 RMSE: 2.785 (2.666)
TIMESTEPS: 10 RMSE: 3.670 (5.873)
TIMESTEPS: 15 RMSE: 1.774 (0.013)
TIMESTEPS: 20 RMSE: 1.683 (0.011)


## 1.2 Dataset: df_r3d_only

<h3>1.2.1 Time Series Repeated KFold Cross Validation - df_r3d_only</h3>

<b>Dataset: df_r3d_only:</b> In this dataset only R3D variable is considered.

<b>Feature Selection:</b> The IP and FP variables are discarded.

In [33]:
# Move the target column R28D to the last position (presumably where
# split_sequences expects the target - confirm), then discard the timestamp
# and the setting-time variables IP and FP.
df = df_r3d_only.copy()
columns = [*df.columns.drop(["R28D"]), "R28D"]
df = df.reindex(columns=columns).drop(["Data/Hora", "IP", "FP"], axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [34]:
train_period = 90  # 90 samples ~ 6 months (180 days), with one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [35]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 12.400 (2.863)
TIMESTEPS: 3 RMSE: 3.477 (1.841)
TIMESTEPS: 5 RMSE: 2.977 (2.746)
TIMESTEPS: 7 RMSE: 2.751 (3.388)
TIMESTEPS: 10 RMSE: 3.307 (3.122)
TIMESTEPS: 15 RMSE: 1.644 (0.358)
TIMESTEPS: 20 RMSE: 2.143 (0.423)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [36]:
train_period = 165  # 165 samples ~ 11 months: (180 + 150) days / 2, one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [37]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 12.808 (1.160)
TIMESTEPS: 3 RMSE: 2.249 (0.053)
TIMESTEPS: 5 RMSE: 2.270 (0.058)
TIMESTEPS: 7 RMSE: 3.674 (3.754)
TIMESTEPS: 10 RMSE: 2.363 (0.030)
TIMESTEPS: 15 RMSE: 1.151 (0.017)
TIMESTEPS: 20 RMSE: 1.563 (0.071)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [38]:
# 200 samples ~ 400 days (about 13 months) with one sample every 2 days.
# NOTE(review): the experiment heading says "2 years and 10 months" - confirm
# which training length is intended.
train_period = 200
test_period = 30  # 30 samples ~ 2 months

In [39]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 5.626 (0.655)
TIMESTEPS: 3 RMSE: 1.791 (0.061)
TIMESTEPS: 5 RMSE: 4.295 (3.532)
TIMESTEPS: 7 RMSE: 3.678 (5.359)
TIMESTEPS: 10 RMSE: 2.801 (3.305)
TIMESTEPS: 15 RMSE: 1.749 (0.042)
TIMESTEPS: 20 RMSE: 1.685 (0.011)


<h3>1.2.2 Time Series Repeated KFold Cross Validation with varying timesteps - df_r3d_only</h3>

<b>Dataset: df_r3d_only:</b> In this dataset only the R3D variable is considered.

<b>Feature Selection:</b> The IP, FP and BL variables are discarded.

In [40]:
# Move the target column R28D to the last position (presumably where
# split_sequences expects the target - confirm), then discard the timestamp
# and the IP, FP and BL variables.
df = df_r3d_only.copy()
columns = [*df.columns.drop(["R28D"]), "R28D"]
df = df.reindex(columns=columns).drop(["Data/Hora", "IP", "FP", "BL"], axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [41]:
train_period = 90  # 90 samples ~ 6 months (180 days), with one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [42]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 11.864 (3.234)
TIMESTEPS: 3 RMSE: 3.785 (2.197)
TIMESTEPS: 5 RMSE: 2.518 (1.072)
TIMESTEPS: 7 RMSE: 4.514 (6.096)
TIMESTEPS: 10 RMSE: 4.347 (5.503)
TIMESTEPS: 15 RMSE: 1.645 (0.358)
TIMESTEPS: 20 RMSE: 2.140 (0.424)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [43]:
train_period = 165  # 165 samples ~ 11 months: (180 + 150) days / 2, one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [44]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 12.177 (0.888)
TIMESTEPS: 3 RMSE: 2.270 (0.086)
TIMESTEPS: 5 RMSE: 4.674 (6.893)
TIMESTEPS: 7 RMSE: 3.468 (3.361)
TIMESTEPS: 10 RMSE: 2.371 (0.042)
TIMESTEPS: 15 RMSE: 1.151 (0.020)
TIMESTEPS: 20 RMSE: 1.536 (0.009)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [45]:
# 200 samples ~ 400 days (about 13 months) with one sample every 2 days.
# NOTE(review): the experiment heading says "2 years and 10 months" - confirm
# which training length is intended.
train_period = 200
test_period = 30  # 30 samples ~ 2 months

In [46]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 4.886 (0.978)
TIMESTEPS: 3 RMSE: 1.747 (0.081)
TIMESTEPS: 5 RMSE: 4.193 (6.762)
TIMESTEPS: 7 RMSE: 9.988 (12.823)
TIMESTEPS: 10 RMSE: 4.536 (6.485)
TIMESTEPS: 15 RMSE: 13.015 (15.975)
TIMESTEPS: 20 RMSE: 1.671 (0.026)


<h3>1.2.3 Time Series Repeated KFold Cross Validation with varying timesteps - df_r3d_only</h3>

<b>Dataset: df_r3d_only:</b> In this dataset only the R3D variable is considered.

<b>Feature Selection:</b>

1. The IP, FP and BL variables are discarded.

2. Highly correlated variables are removed.


In [47]:
# Columns to discard: the timestamp, IP/FP/BL, and the variables this section
# lists as highly correlated.
to_drop = [
    "Data/Hora", "IP", "FP", "BL",
    "CALCITA", "DESIDRATAÇÃO", "GIPSITA", "TIO2", "AL2O3", "PF", "FERRITA",
]

# Move the target column R28D to the last position before dropping.
df = df_r3d_only.copy()
columns = [*df.columns.drop(["R28D"]), "R28D"]
df = df.reindex(columns=columns).drop(to_drop, axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [48]:
train_period = 90  # 90 samples ~ 6 months (180 days), with one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [49]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 10.299 (3.789)
TIMESTEPS: 3 RMSE: 4.495 (2.657)
TIMESTEPS: 5 RMSE: 3.226 (3.847)
TIMESTEPS: 7 RMSE: 4.492 (5.552)
TIMESTEPS: 10 RMSE: 5.317 (6.771)
TIMESTEPS: 15 RMSE: 1.984 (1.922)
TIMESTEPS: 20 RMSE: 2.155 (0.414)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [50]:
train_period = 165  # 165 samples ~ 11 months: (180 + 150) days / 2, one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [51]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 11.069 (0.903)
TIMESTEPS: 3 RMSE: 2.242 (0.038)
TIMESTEPS: 5 RMSE: 2.258 (0.090)
TIMESTEPS: 7 RMSE: 2.387 (0.119)
TIMESTEPS: 10 RMSE: 2.356 (0.028)
TIMESTEPS: 15 RMSE: 1.161 (0.039)
TIMESTEPS: 20 RMSE: 1.547 (0.044)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [52]:
# 200 samples ~ 400 days (about 13 months) with one sample every 2 days.
# NOTE(review): the experiment heading says "2 years and 10 months" - confirm
# which training length is intended.
train_period = 200
test_period = 30  # 30 samples ~ 2 months

In [53]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 4.749 (0.990)
TIMESTEPS: 3 RMSE: 1.598 (0.079)
TIMESTEPS: 5 RMSE: 19.693 (21.178)
TIMESTEPS: 7 RMSE: 1.808 (0.198)
TIMESTEPS: 10 RMSE: 1.695 (0.023)
TIMESTEPS: 15 RMSE: 1.769 (0.024)
TIMESTEPS: 20 RMSE: 1.668 (0.008)


## 1.3 Dataset: df_r3d_and_r7d

<h3>1.3.1 Time Series Repeated KFold Cross Validation - df_r3d_and_r7d</h3>

<b>Dataset: df_r3d_and_r7d:</b> In this dataset both R3D and R7D variables are considered.

<b>Feature Selection:</b> The IP and FP variables are discarded.

In [54]:
# Move the target column R28D to the last position (presumably where
# split_sequences expects the target - confirm), then discard the timestamp
# and the setting-time variables IP and FP.
df = df_r3d_and_r7d.copy()
# `Index.drop` (the original had the typo `draop`, which raises AttributeError).
columns = list(df.columns.drop(['R28D']))
columns.append('R28D')
df = df.reindex(columns=columns)
df = df.drop(['Data/Hora', "IP", "FP"], axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [55]:
train_period = 90  # 90 samples ~ 6 months (180 days), with one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [56]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 12.161 (3.889)
TIMESTEPS: 3 RMSE: 3.497 (1.858)
TIMESTEPS: 5 RMSE: 2.379 (0.841)
TIMESTEPS: 7 RMSE: 2.826 (4.082)
TIMESTEPS: 10 RMSE: 4.485 (5.589)
TIMESTEPS: 15 RMSE: 1.654 (0.349)
TIMESTEPS: 20 RMSE: 2.150 (0.432)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [57]:
train_period = 165  # 165 samples ~ 11 months: (180 + 150) days / 2, one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [58]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 13.581 (0.841)
TIMESTEPS: 3 RMSE: 2.158 (0.105)
TIMESTEPS: 5 RMSE: 2.318 (0.109)
TIMESTEPS: 7 RMSE: 3.222 (2.631)
TIMESTEPS: 10 RMSE: 2.310 (0.118)
TIMESTEPS: 15 RMSE: 1.175 (0.040)
TIMESTEPS: 20 RMSE: 1.630 (0.288)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [59]:
# 200 samples ~ 400 days (about 13 months) with one sample every 2 days.
# NOTE(review): the experiment heading says "2 years and 10 months" - confirm
# which training length is intended.
train_period = 200
test_period = 30  # 30 samples ~ 2 months

In [60]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 6.163 (0.585)
TIMESTEPS: 3 RMSE: 1.737 (0.090)
TIMESTEPS: 5 RMSE: 4.720 (5.201)
TIMESTEPS: 7 RMSE: 7.833 (11.168)
TIMESTEPS: 10 RMSE: 4.114 (7.172)
TIMESTEPS: 15 RMSE: 1.761 (0.024)
TIMESTEPS: 20 RMSE: 1.671 (0.019)


<h3>1.3.2 Time Series Repeated KFold Cross Validation - df_r3d_and_r7d</h3>

<b>Dataset: df_r3d_and_r7d:</b> In this dataset both R3D and R7D variables are considered.

<b>Feature Selection:</b> The IP, FP, and BL variables are discarded.

In [61]:
# Move the target column R28D to the last position (presumably where
# split_sequences expects the target - confirm), then discard the timestamp
# and the IP, FP and BL variables.
df = df_r3d_and_r7d.copy()
columns = [*df.columns.drop(["R28D"]), "R28D"]
df = df.reindex(columns=columns).drop(["Data/Hora", "IP", "FP", "BL"], axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [62]:
train_period = 90  # 90 samples ~ 6 months (180 days), with one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [63]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 12.333 (4.615)
TIMESTEPS: 3 RMSE: 3.905 (2.344)
TIMESTEPS: 5 RMSE: 3.235 (3.761)
TIMESTEPS: 7 RMSE: 5.184 (6.942)
TIMESTEPS: 10 RMSE: 4.952 (6.035)
TIMESTEPS: 15 RMSE: 1.977 (1.832)
TIMESTEPS: 20 RMSE: 2.143 (0.424)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [64]:
train_period = 165  # 165 samples ~ 11 months: (180 + 150) days / 2, one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [65]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 12.201 (0.928)
TIMESTEPS: 3 RMSE: 2.169 (0.072)
TIMESTEPS: 5 RMSE: 2.483 (0.282)
TIMESTEPS: 7 RMSE: 2.335 (0.050)
TIMESTEPS: 10 RMSE: 2.343 (0.047)
TIMESTEPS: 15 RMSE: 1.160 (0.014)
TIMESTEPS: 20 RMSE: 1.536 (0.011)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [66]:
# 200 samples ~ 400 days (about 13 months) with one sample every 2 days.
# NOTE(review): the experiment heading says "2 years and 10 months" - confirm
# which training length is intended.
train_period = 200
test_period = 30  # 30 samples ~ 2 months

In [67]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 4.641 (0.500)
TIMESTEPS: 3 RMSE: 1.677 (0.107)
TIMESTEPS: 5 RMSE: 3.763 (4.519)
TIMESTEPS: 7 RMSE: 8.566 (10.854)
TIMESTEPS: 10 RMSE: 8.362 (12.682)
TIMESTEPS: 15 RMSE: 3.187 (4.312)
TIMESTEPS: 20 RMSE: 1.674 (0.024)


<h3>1.3.3 Time Series Repeated KFold Cross Validation - df_r3d_and_r7d</h3>

<b>Dataset: df_r3d_and_r7d:</b> In this dataset both R3D and R7D variables are considered.

<b>Feature Selection:</b>

1. The IP, FP and BL variables are discarded.

2. Highly correlated variables are removed.


In [68]:
# Columns to discard: the timestamp, IP/FP/BL, and the variables this section
# lists as highly correlated.
to_drop = [
    "Data/Hora", "IP", "FP", "BL",
    "CALCITA", "DESIDRATAÇÃO", "GIPSITA", "TIO2", "AL2O3", "PF", "FERRITA",
]

# Move the target column R28D to the last position before dropping.
df = df_r3d_and_r7d.copy()
columns = [*df.columns.drop(["R28D"]), "R28D"]
df = df.reindex(columns=columns).drop(to_drop, axis=1)

<h4>Experiment 1</h4>
    
<b>Train 6 months and predicts one month every year</b>

In [69]:
train_period = 90  # 90 samples ~ 6 months (180 days), with one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [70]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 10.535 (4.664)
TIMESTEPS: 3 RMSE: 6.445 (4.511)
TIMESTEPS: 5 RMSE: 3.864 (4.178)
TIMESTEPS: 7 RMSE: 5.186 (6.974)
TIMESTEPS: 10 RMSE: 4.669 (6.235)
TIMESTEPS: 15 RMSE: 2.166 (2.529)
TIMESTEPS: 20 RMSE: 2.156 (0.412)


<h4>Experiment 2</h4>
    
<b>Train 11 months and predicts one month every year</b>

In [71]:
train_period = 165  # 165 samples ~ 11 months: (180 + 150) days / 2, one sample every 2 days
test_period = 15  # 15 samples ~ 1 month

In [72]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 11.317 (0.558)
TIMESTEPS: 3 RMSE: 2.161 (0.075)
TIMESTEPS: 5 RMSE: 2.354 (0.058)
TIMESTEPS: 7 RMSE: 2.382 (0.028)
TIMESTEPS: 10 RMSE: 2.332 (0.065)
TIMESTEPS: 15 RMSE: 1.149 (0.021)
TIMESTEPS: 20 RMSE: 1.543 (0.029)


<h4>Experiment 3</h4>
    
<b>Train 2 years and 10 months and predicts october and november of 2021</b>

In [73]:
# 200 samples ~ 400 days (about 13 months) with one sample every 2 days.
# NOTE(review): the experiment heading says "2 years and 10 months" - confirm
# which training length is intended.
train_period = 200
test_period = 30  # 30 samples ~ 2 months

In [74]:
# Evaluate every timestep value with the fixed hyperparameters in `params`
# (no grid-search object is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 5.277 (0.878)
TIMESTEPS: 3 RMSE: 1.566 (0.045)
TIMESTEPS: 5 RMSE: 2.441 (2.289)
TIMESTEPS: 7 RMSE: 2.262 (1.335)
TIMESTEPS: 10 RMSE: 4.159 (5.156)
TIMESTEPS: 15 RMSE: 1.755 (0.026)
TIMESTEPS: 20 RMSE: 2.776 (3.295)
