In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Database Reading and Manipulation
import pandas as pd

# Linear Algebra
import numpy as np

# Plotting
import matplotlib.pyplot as plt

# Time
import time

# Random and os for reproducibility
import random
import os

# Model Selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

# Modeling
import tensorflow as tf

# Processing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

# Pipeline
from sklearn.pipeline import Pipeline

# Data imputation
from sklearn.impute import SimpleImputer

# Making keras compatible with scikit learn api
# https://scikit-learn.org/stable/developers/develop.html
from sklearn.base import BaseEstimator, RegressorMixin

# Custom modules
## Model selection
from src.cross_validation.blocking_time_series_split import BlockingTimeSeriesSplit

## Function to print scores
from src.utils.print_scores import print_scores

## Function to calculate score regression metrics
from src.utils.score_regression_metrics import score_regression_metrics

## Function to fill the results metric dict
from src.utils.fill_results_dict import fill_results_dict

# Converting Times Series Data to 3D format
from src.utils.split_sequences import split_sequences

2024-07-16 00:00:00.604914: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-16 00:00:00.608354: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-16 00:00:00.677990: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-16 00:00:00.679091: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<IPython.core.display.Javascript object>

# Functions and definitions

### Convert train/test data to 3D format

In [3]:
def generate_sequences_helper(
    dataset, cement_types, dates=None, timesteps=None, split_by_cement_type=False
):
    index_train = dataset["y_train"].index
    index_test = dataset["y_test"].index

    dataset["y_train"] = dataset["y_train"].reset_index(drop=True)
    dataset["y_test"] = dataset["y_test"].reset_index(drop=True)

    if dates is not None:
        dataset["dates_train"] = dates[index_train].reset_index(drop=True)
        dataset["dates_test"] = dates[index_test].reset_index(drop=True)

    dataset["cement_types_train"] = cement_types.loc[index_train].reset_index(drop=True)
    dataset["cement_types_test"] = cement_types.loc[index_test].reset_index(drop=True)

    dataset = generate_sequences(dataset, timesteps, split_by_cement_type)

    return dataset

<IPython.core.display.Javascript object>

In [4]:
def generate_sequences(
    dataset,
    timesteps,
    split_by_cement_type=False,
    train_columns=[],
    test_columns=[],
):
    if split_by_cement_type:
        dataset["x_train"], dataset["y_train"] = split_sequences_per_cement_type(
            pd.concat(
                [
                    dataset["dates_train"],
                    pd.DataFrame(dataset["x_train"], columns=train_columns),
                    dataset["cement_types_train"],
                    dataset["y_train"],
                ],
                axis=1,
            ),
            timesteps,
        )

        dataset["x_test"], dataset["y_test"] = split_sequences_per_cement_type(
            pd.concat(
                [
                    dataset["dates_test"],
                    pd.DataFrame(dataset["x_test"], columns=test_columns),
                    dataset["cement_types_test"],
                    dataset["y_test"],
                ],
                axis=1,
            ),
            timesteps,
        )
    else:
        dataset["x_train"], dataset["y_train"] = split_sequences(
            pd.concat(
                [
                    pd.DataFrame(dataset["x_train"], columns=train_columns),
                    dataset["y_train"],
                ],
                axis=1,
            ).values,
            timesteps,
        )

        dataset["x_test"], dataset["y_test"] = split_sequences(
            pd.concat(
                [
                    pd.DataFrame(dataset["x_test"], columns=test_columns),
                    dataset["y_test"],
                ],
                axis=1,
            ).values,
            timesteps,
        )
    return dataset

<IPython.core.display.Javascript object>

### Data preprocessing

In [5]:
def impute_data(dataset, imputer=None, imputer_params=None):
    x_train = dataset["x_train"]
    x_test = dataset["x_test"]

    # Apply imputation to the data
    if imputer is not None:
        imputer = imputer() if imputer_params is None else imputer(**imputer_params)
        x_train = imputer.fit_transform(x_train)
        x_test = imputer.transform(x_test)

    dataset["x_train"] = x_train
    dataset["x_test"] = x_test

    return dataset

<IPython.core.display.Javascript object>

In [6]:
def transform_data(dataset, transformer=None):
    x_train = dataset["x_train"]
    x_test = dataset["x_test"]

    # Apply data normalization/standardization to the data
    if transformer is not None:
        scaler = transformer()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

    dataset["x_train"] = x_train
    dataset["x_test"] = x_test

    return dataset

<IPython.core.display.Javascript object>

In [7]:
def preprocess_data(dataset, transformer=None, imputer=None, imputer_params=None):
    dataset = impute_data(dataset, imputer, imputer_params)
    dataset = transform_data(dataset, transformer)
    return dataset

<IPython.core.display.Javascript object>

### Train and evaluate the model

In [8]:
def train_and_evaluate_model(Estimator, dataset, estimator_params=None):
    """
    Purpose: Helper function to be used in conjunction with
    blocked time_series cross validation function
    """
    x_train = dataset["x_train"]
    y_train = dataset["y_train"]
    x_test = dataset["x_test"]
    y_test = dataset["y_test"]

    # Instantiate the model
    model = Estimator() if estimator_params is None else Estimator(estimator_params)

    # Fitting the model
    model.fit(x_train, y_train)

    # Making predictions on train/test sets
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Return regression metrics
    return model, score_regression_metrics(y_train, y_train_pred, y_test, y_test_pred)

<IPython.core.display.Javascript object>

In [9]:
def evaluate_model(model, dataset):
    """
    Purpose: Helper function to be used in conjunction with
    blocked time_series cross validation function
    """
    x_train = dataset["x_train"]
    y_train = dataset["y_train"]
    x_test = dataset["x_test"]
    y_test = dataset["y_test"]

    # Making predictions on train/test sets
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Return regression metrics
    return score_regression_metrics(y_train, y_train_pred, y_test, y_test_pred)

<IPython.core.display.Javascript object>

### Custom Cross Validate

In [10]:
def custom_cross_validate(
    Estimator,
    Imputer,
    Transform,
    x,
    y,
    cv,
    timesteps,
    dates=None,
    cement_types=None,
    estimator_params=None,
    imputer_params=None,
    split_by_cement_type=True,
):
    results = []
    scores = []

    for train_index, test_index in cv.split(x):
        dataset = {
            "dates_train": dates[train_index].reset_index(drop=True),
            "cement_types_train": cement_types.loc[train_index].reset_index(drop=True),
            "x_train": x.loc[train_index].reset_index(drop=True),
            "y_train": y[train_index].reset_index(drop=True),
            "dates_test": dates[test_index].reset_index(drop=True),
            "cement_types_test": cement_types.loc[test_index].reset_index(drop=True),
            "x_test": x.loc[test_index].reset_index(drop=True),
            "y_test": y[test_index].reset_index(drop=True),
        }

        # Preprocess the dataset
        dataset = preprocess_data(dataset, Transform, Imputer, imputer_params)

        # generate sequences (3D format)
        dataset = generate_sequences(dataset, timesteps, split_by_cement_type)

        # Train and Evaluate the model
        score = train_and_evaluate_model(Estimator, dataset, estimator_params)
        scores.append(score)

    # After every iteration metrics results are appended together
    scores_final = {key: [] for key, _ in scores[0].items()}
    for scores_dict in scores:
        for key, value in scores_dict.items():
            scores_final[key] += [value]
    results.append(scores_final)
    return results

<IPython.core.display.Javascript object>

## Model Definition

In [11]:
class Conv1D_1(BaseEstimator, RegressorMixin):
    def __init__(self, params):
        self.batch_size = 64
        self.epochs = 300
        self.verbose = params.get("verbose", 0)
        self.callbacks = params.get("callbacks", None)
        self.validation_split = params.get("validation_split", None)
        self.kernel_size = params.get("kernel_size", 1)
        self.activation = params.get("activation", "relu")
        self.padding = params.get("padding", "causal")
        self.strides = params.get("strides", 1)
        self.pool_size = params.get("pool_size", 1)
        self.model = self.get_model()

    def fit(self, X=None, y=None):
        self.history = self.model.fit(
            X,
            y,
            batch_size=self.batch_size,
            epochs=self.epochs,
            verbose=self.verbose,
            callbacks=self.callbacks,
            validation_split=self.validation_split,
        )

    def predict(self, X=None):
        return self.model.predict(X, verbose=self.verbose)

    def get_model(self):
        model = tf.keras.Sequential()
        model.add(
            tf.keras.layers.Conv1D(
                filters=64,
                kernel_size=self.kernel_size,
                activation=self.activation,
                padding=self.padding,
                strides=self.strides,
            )
        )
        model.add(tf.keras.layers.MaxPooling1D(pool_size=self.pool_size))
        model.add(tf.keras.layers.Flatten())
        model.add(tf.keras.layers.Dense(32, activation="relu"))
        model.add(tf.keras.layers.Dropout(rate=0.10))
        model.add(tf.keras.layers.Dense(units=1))
        model.compile(
            optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001),
            loss="mse",
            metrics=[tf.keras.metrics.RootMeanSquaredError(name="RMSE")],
        )
        return model

<IPython.core.display.Javascript object>

In [12]:
class Conv1D_2(BaseEstimator, RegressorMixin):
    def __init__(self, params):
        self.batch_size = 64
        self.epochs = 300
        self.verbose = params.get("verbose", 0)
        self.callbacks = params.get("callbacks", None)
        self.validation_split = params.get("validation_split", None)
        self.kernel_size = params.get("kernel_size", 1)
        self.activation = params.get("activation", "relu")
        self.padding = params.get("padding", "causal")
        self.strides = params.get("strides", 1)
        self.pool_size = params.get("pool_size", 1)
        self.model = self.get_model()

    def fit(self, X=None, y=None):
        self.history = self.model.fit(
            X,
            y,
            batch_size=self.batch_size,
            epochs=self.epochs,
            verbose=self.verbose,
            callbacks=self.callbacks,
            validation_split=self.validation_split,
        )

    def predict(self, X=None):
        return self.model.predict(X, verbose=self.verbose)

    def get_model(self):
        model = tf.keras.Sequential()
        model.add(
            tf.keras.layers.Conv1D(
                filters=64,
                kernel_size=self.kernel_size,
                activation=self.activation,
                padding=self.padding,
                strides=self.strides,
            )
        )
        model.add(tf.keras.layers.AveragePooling1D(pool_size=self.pool_size))
        model.add(tf.keras.layers.Flatten())
        model.add(tf.keras.layers.Dense(32, activation="relu"))
        model.add(tf.keras.layers.Dropout(rate=0.10))
        model.add(tf.keras.layers.Dense(units=1))
        model.compile(
            optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001),
            loss="mse",
            metrics=[tf.keras.metrics.RootMeanSquaredError(name="RMSE")],
        )
        return model

<IPython.core.display.Javascript object>

In [13]:
class Conv1D_3(BaseEstimator, RegressorMixin):
    def __init__(self, params):
        self.batch_size = 64
        self.epochs = 300
        self.verbose = params.get("verbose", 0)
        self.callbacks = params.get("callbacks", None)
        self.validation_split = params.get("validation_split", None)
        self.kernel_size = params.get("kernel_size", 1)
        self.activation = params.get("activation", "relu")
        self.padding = params.get("padding", "causal")
        self.strides = params.get("strides", 1)
        self.pool_size = params.get("pool_size", 1)
        self.model = self.get_model()

    def fit(self, X=None, y=None):
        self.history = self.model.fit(
            X,
            y,
            batch_size=self.batch_size,
            epochs=self.epochs,
            verbose=self.verbose,
            callbacks=self.callbacks,
            validation_split=self.validation_split,
        )

    def predict(self, X=None):
        return self.model.predict(X, verbose=self.verbose)

    def get_model(self):
        model = tf.keras.Sequential()
        model.add(
            tf.keras.layers.Conv1D(
                filters=128,
                kernel_size=self.kernel_size,
                activation=self.activation,
                padding=self.padding,
                strides=self.strides,
            )
        )
        model.add(tf.keras.layers.MaxPooling1D(pool_size=self.pool_size))
        model.add(tf.keras.layers.Flatten())
        model.add(tf.keras.layers.Dense(32, activation="relu"))
        model.add(tf.keras.layers.Dropout(rate=0.10))
        model.add(tf.keras.layers.Dense(units=1))
        model.compile(
            optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001),
            loss="mse",
            metrics=[tf.keras.metrics.RootMeanSquaredError(name="RMSE")],
        )
        return model

<IPython.core.display.Javascript object>

In [14]:
class Conv1D_4(BaseEstimator, RegressorMixin):
    def __init__(self, params):
        self.batch_size = 64
        self.epochs = 300
        self.verbose = params.get("verbose", 0)
        self.callbacks = params.get("callbacks", None)
        self.validation_split = params.get("validation_split", None)
        self.kernel_size = params.get("kernel_size", 1)
        self.activation = params.get("activation", "relu")
        self.padding = params.get("padding", "causal")
        self.strides = params.get("strides", 1)
        self.pool_size = params.get("pool_size", 1)
        self.model = self.get_model()

    def fit(self, X=None, y=None):
        self.history = self.model.fit(
            X,
            y,
            batch_size=self.batch_size,
            epochs=self.epochs,
            verbose=self.verbose,
            callbacks=self.callbacks,
            validation_split=self.validation_split,
        )

    def predict(self, X=None):
        return self.model.predict(X, verbose=self.verbose)

    def get_model(self):
        model = tf.keras.Sequential()
        model.add(
            tf.keras.layers.Conv1D(
                filters=128,
                kernel_size=self.kernel_size,
                activation=self.activation,
                padding=self.padding,
                strides=self.strides,
            )
        )
        model.add(tf.keras.layers.AveragePooling1D(pool_size=self.pool_size))
        model.add(tf.keras.layers.Flatten())
        model.add(tf.keras.layers.Dense(32, activation="relu"))
        model.add(tf.keras.layers.Dropout(rate=0.10))
        model.add(tf.keras.layers.Dense(units=1))
        model.compile(
            optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001),
            loss="mse",
            metrics=[tf.keras.metrics.RootMeanSquaredError(name="RMSE")],
        )
        return model

<IPython.core.display.Javascript object>

In [15]:
class Conv1D_5(BaseEstimator, RegressorMixin):
    def __init__(self, params):
        self.batch_size = 64
        self.epochs = 300
        self.verbose = params.get("verbose", 0)
        self.callbacks = params.get("callbacks", None)
        self.validation_split = params.get("validation_split", None)
        self.kernel_size = params.get("kernel_size", 1)
        self.activation = params.get("activation", "relu")
        self.padding = params.get("padding", "causal")
        self.strides = params.get("strides", 1)
        self.pool_size = params.get("pool_size", 1)
        self.model = self.get_model()

    def fit(self, X=None, y=None):
        self.history = self.model.fit(
            X,
            y,
            batch_size=self.batch_size,
            epochs=self.epochs,
            verbose=self.verbose,
            callbacks=self.callbacks,
            validation_split=self.validation_split,
        )

    def predict(self, X=None):
        return self.model.predict(X, verbose=self.verbose)

    def get_model(self):
        model = tf.keras.Sequential()
        model.add(
            tf.keras.layers.Conv1D(
                filters=64,
                kernel_size=self.kernel_size,
                activation=self.activation,
                padding=self.padding,
                strides=self.strides,
            )
        )
        model.add(
            tf.keras.layers.Conv1D(
                filters=32,
                kernel_size=self.kernel_size,
                activation=self.activation,
                padding=self.padding,
                strides=self.strides,
            )
        )
        model.add(tf.keras.layers.Dropout(rate=0.10))
        model.add(tf.keras.layers.AveragePooling1D(pool_size=self.pool_size))
        model.add(tf.keras.layers.Flatten())
        model.add(tf.keras.layers.Dense(32, activation="relu"))
        model.add(tf.keras.layers.Dropout(rate=0.10))
        model.add(tf.keras.layers.Dense(units=1))
        model.compile(
            optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001),
            loss="mse",
            metrics=[tf.keras.metrics.RootMeanSquaredError(name="RMSE")],
        )
        return model

<IPython.core.display.Javascript object>

In [16]:
def pad_time_series(dataframe, timesteps):
    """
    Pad timeseries with zeros
    """
    df_tmp = pd.DataFrame(
        dict(
            zip(
                dataframe.columns,
                [[0 for _ in range(timesteps - 1)] for _ in range(dataframe.shape[1])],
            )
        )
    )
    df_tmp[DATE] = dataframe[DATE].iloc[0]
    return pd.concat([df_tmp, dataframe], axis=0).reset_index(drop=True)

<IPython.core.display.Javascript object>

In [17]:
def split_sequences_per_cement_type(dataframe, timesteps, pad=False):
    """
    Create sequences per cement time
    to avoid having parts of the sequence
    of different types of cement.
    """
    if timesteps == 1:
        return split_sequences(
            dataframe.drop([DATE] + CEMENT_TYPES, axis=1).values, timesteps
        )

    dates = dataframe[DATE][timesteps - 1 :]
    data = []
    dataframes = []

    for cement_type in CEMENT_TYPES:
        data.append(dataframe[dataframe[cement_type] == 1])
    data.append(dataframe[(dataframe[CEMENT_TYPES] == 0).all(axis=1)])

    for df in data:
        if pad:
            dates = df[DATE].reset_index(drop=True)
            df = pad_time_series(df, timesteps).reset_index(drop=True)
        else:
            dates = df[DATE][timesteps - 1 :].reset_index(drop=True)
        x, y = split_sequences(df.drop([DATE] + CEMENT_TYPES, axis=1).values, timesteps)
        x = pd.DataFrame({"Sequences": [sample.tolist() for sample in x]})
        y = pd.DataFrame({"Target": y})
        dataframes.append(pd.concat([dates, x, y], axis=1))

    data = pd.concat(dataframes, axis=0)
    data[DATE] = pd.to_datetime(data[DATE])
    data = data.sort_values(by=DATE).reset_index(drop=True)
    x = data["Sequences"]
    y = data["Target"].values
    x = np.array(x.tolist())

    return x, y

<IPython.core.display.Javascript object>

# Settings for Reproducibility

In [18]:
def set_seeds():
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    os.environ["PYTHONHASHSEED"] = str(SEED)
    tf.random.set_seed(SEED)
    np.random.seed(SEED)
    random.seed(SEED)


# https://stackoverflow.com/questions/36288235/how-to-get-stable-results-with-tensorflow-setting-random-seed

<IPython.core.display.Javascript object>

In [19]:
def set_global_determinism():
    set_seeds()

    os.environ["TF_DETERMINISTIC_OPS"] = "1"
    os.environ["TF_CUDNN_DETERMINISTIC"] = "1"

    tf.config.threading.set_inter_op_parallelism_threads(1)
    tf.config.threading.set_intra_op_parallelism_threads(1)

<IPython.core.display.Javascript object>

In [20]:
index_to_save = 10
model_index = 1

<IPython.core.display.Javascript object>

In [21]:
SEED = 47
METRICS = (
    "neg_root_mean_squared_error",
    "neg_mean_absolute_error",
    "neg_mean_absolute_percentage_error",
    "r2",
)
METRICS_DICT = {
    "neg_root_mean_squared_error": "RMSE",
    "neg_mean_absolute_error": "MAE",
    "neg_mean_absolute_percentage_error": "MAPE",
    "r2": "R2",
}
DATE = "Date"
CEMENT_TYPES = [
    "Cement_Type_CEM B",
    "Cement_Type_CEM C",
    "Cement_Type_CP II-F-32",
    "Cement_Type_CP II-F-40",
    "Cement_Type_CP V-ARI",
    "Cement_Type_Type I-II",
    "Cement_Type_Type III",
    "Cement_Type_Type IL",
]

<IPython.core.display.Javascript object>

## Defining a dataframe structure to save the results

In [22]:
results_to_save = []

results_dict = {
    "Category": "Global Model",
    "Company": "INN",
    "Plant": "INN",
    "Features": "Chemical + Mineralogical + Properties CS Less",
    "Data Shape": None,
    "Timesteps": None,
    "Model": "Conv1D",
    "Model Params": None,
    "Scaler": "Standard Scaler",
    "Scaler Params": None,
    "Imputer": "Median",
    "Imputer Params": None,
    "Cross Validation": None,
    "Cross Validation Params": np.nan,
    "RMSE Train": np.nan,
    "MAE Train": np.nan,
    "MAPE Train": np.nan,
    "R2 Train": np.nan,
    "RMSE Test": np.nan,
    "MAE Test": np.nan,
    "MAPE Test": np.nan,
    "R2 Test": np.nan,
}

<IPython.core.display.Javascript object>

# Reading the dataset

In [23]:
df = pd.read_csv("../../../../../../../data/processed/inn/global_dataset_inn.csv")

<IPython.core.display.Javascript object>

<h2>1. Dataset: df_copy</h2> <br>In this dataset all features are used.

## Defining Features

In this set of experiments we use all available features

In [24]:
df_copy = df.copy()
df_copy = pd.get_dummies(data=df_copy, columns=["Cement_Type"], drop_first=True)

df_copy = df_copy.drop(
    [
        "CS1",
        "CS3",
        "CS7",
    ],
    axis=1,
)

<IPython.core.display.Javascript object>

In [25]:
df_copy[CEMENT_TYPES] = df_copy[CEMENT_TYPES].astype(int)
dates = df["Date"].copy()
x = df_copy.drop(["Date", "CS28", "Factory_Plant"] + CEMENT_TYPES, axis=1)
y = df_copy["CS28"]
train_columns = x.columns

<IPython.core.display.Javascript object>

In [26]:
def prepare_dataset(
    dataframe_original=df,
    dataframe_copy=df_copy,
    train_size=0.8,
    test_size=0.2,
    ignore_test_set=False,
    timesteps=1,
    split_by_cement_type=True,
):
    dataframe_original = df.copy()
    dataframe_copy = df_copy.copy()
    dataframe_copy[CEMENT_TYPES] = dataframe_copy[CEMENT_TYPES].astype(int).copy()
    dates = dataframe_original["Date"].copy()
    x = dataframe_copy.drop(["Date", "CS28"] + CEMENT_TYPES, axis=1)
    y = dataframe_copy["CS28"]
    cement_types = dataframe_copy[CEMENT_TYPES].copy()

    # Split the dataframe by plant
    plants = dataframe_original["Factory_Plant"].unique()
    train_indexes = []
    test_indexes = []
    plants_test_indexes = {}

    for plant in plants:
        plant_df = dataframe_original[dataframe_original["Factory_Plant"] == plant]
        plant_df = plant_df.sort_values(by="Date")
        plant_indices = plant_df.index
        train_end_index = int(len(plant_indices) * train_size)
        # plants_indexes[]

        if not ignore_test_set:
            train_indexes.extend(plant_indices[:train_end_index])
            test_indexes.extend(plant_indices[train_end_index:])
            plants_test_indexes[plant] = plant_indices[train_end_index:]

        else:
            train_indexes.extend(plant_indices[:train_end_index])
            test_indexes.extend(plant_indices[train_end_index:])
            plants_test_indexes[plant] = plant_indices[train_end_index:]

    train_index = dataframe_original.loc[train_indexes].sort_values(by="Date").index
    test_index = dataframe_original.loc[test_indexes].sort_values(by="Date").index

    plant_id_series = x["Factory_Plant"].copy()
    x = x.drop(["Factory_Plant"], axis=1)

    dataset = {
        "dates_train": dates[train_index].reset_index(drop=True).copy(),
        "cement_types_train": cement_types.loc[train_index]
        .reset_index(drop=True)
        .copy(),
        "x_train": x.loc[train_index].reset_index(drop=True).copy(),
        "y_train": y[train_index].reset_index(drop=True).copy(),
        "dates_test": dates[test_index].reset_index(drop=True).copy(),
        "cement_types_test": cement_types.loc[test_index].reset_index(drop=True).copy(),
        "x_test": x.loc[test_index].reset_index(drop=True).copy(),
        "y_test": y[test_index].reset_index(drop=True).copy(),
    }

    # Preprocess the dataset
    dataset = preprocess_data(dataset, None, SimpleImputer, {"strategy": "median"})
    # return dataset, plant_id_series.loc[test_index].reset_index(drop=True).copy()
    dataset["x_test"] = np.concatenate(
        [
            dataset["x_test"],
            plant_id_series.loc[test_index]
            .reset_index(drop=True)
            .copy()
            .values.reshape(-1, 1),
        ],
        axis=1,
    )
    index_list = x.columns.tolist()
    index_list.append("Factory_Plant")
    test_columns = pd.Index(index_list)

    # generate sequences (3D format)
    dataset = generate_sequences(
        dataset,
        timesteps=timesteps,
        split_by_cement_type=split_by_cement_type,
        train_columns=train_columns,
        test_columns=test_columns,
    )

    return dataset

<IPython.core.display.Javascript object>

In [27]:
def prepare_dataset_helper(timesteps):
    dataset = prepare_dataset(timesteps=timesteps)
    # Remove the last column (plant identification) for overall evaluation
    x_test_overall = np.delete(dataset["x_test"], -1, axis=2)

    # Initialize dictionaries to hold plant-specific datasets
    plant_specific_x_test = {}
    plant_specific_y_test = {}

    # Get unique plant identifiers
    unique_plants = np.unique(dataset["x_test"][:, 0, -1])

    for plant in unique_plants:
        # Create a mask for the current plant
        plant_mask = dataset["x_test"][:, 0, -1] == plant
        # Filter x_test and y_test using the mask
        x_test_plant = dataset["x_test"][plant_mask]
        y_test_plant = dataset["y_test"][plant_mask]

        # Remove the last column (plant identification) from x_test_plant
        x_test_plant = np.delete(x_test_plant, -1, axis=2)

        # Store the filtered arrays in the dictionaries
        plant_specific_x_test[plant] = x_test_plant
        plant_specific_y_test[plant] = y_test_plant

    # Output the shapes to verify
    # print("Overall x_test shape:", x_test_overall.shape)
    # for plant in plant_specific_x_test:
    #    print(f"Plant {plant} x_test shape:", plant_specific_x_test[plant].shape)
    #    print(f"Plant {plant} y_test shape:", plant_specific_y_test[plant].shape)

    plant_specific = {"x_test": plant_specific_x_test, "y_test": plant_specific_y_test}
    x_test_overall = x_test_overall.astype(float)
    dataset["x_test"] = x_test_overall
    return dataset, plant_specific

<IPython.core.display.Javascript object>

In [28]:
def print_scores_2(scores, METRICS, METRICS_DICT):
    for phase in ["test"]:
        print("******")
        print(f"[{phase.upper()}]")
        print("******")
        for metric in METRICS:
            name = METRICS_DICT[metric]
            print(
                f"{name}: %.3f (%.3f)"
                % (
                    np.mean(scores[f"{phase}_" + metric]),
                    np.std(scores[f"{phase}_" + metric]),
                )
            )
        print("\n======================\n")

<IPython.core.display.Javascript object>

In [29]:
def append_results(scores_dict):
    for plant, scores in scores_dict.items():
        results_dict_copy = results_dict.copy()
        results_dict_copy["Plant"] = plant
        results_dict_copy["Timesteps"] = timesteps
        results_dict_copy["Cross Validation"] = "Out of time"
        results_dict_copy[
            "Cross Validation Params"
        ] = '{"train_size": 0.8, "test_size": 0.2}'
        results_dict_copy["Data Shape"] = x.shape
        results_dict_copy["Model"] = f"Conv1D_{model_index}"
        scores = {key: [value] for key, value in scores.items()}
        df_results = fill_results_dict(results_dict_copy, scores)
        results_to_save.append(df_results)


<IPython.core.display.Javascript object>

In [30]:
def print_results_and_get_scores_dict(scores, plant_specific):
    print("=======================")
    print("Overall Score")
    print("=======================\n")

    print_scores(scores, METRICS, METRICS_DICT)

    print("=======================")
    print("Scores per plant")
    print("=======================\n")

    scores_dict = {"INN": scores}

    for plant in df_copy["Factory_Plant"].unique():
        x_test = plant_specific["x_test"][plant].astype(float)
        y_test = plant_specific["y_test"][plant].astype(float)
        dataset["x_test"] = x_test
        dataset["y_test"] = y_test
        print("=======================")
        print(
            "Plant",
            plant,
        )
        print("=======================\n")
        scores_plant = evaluate_model(model, dataset)
        scores_dict[plant] = scores_plant
        print_scores_2(scores_plant, METRICS, METRICS_DICT)
    return scores_dict

<IPython.core.display.Javascript object>

In [31]:
def get_conv1d_params(
    timesteps=1,
    activation="relu",
    padding="causal",
    kernel_size=1,
    pool_size=1,
    strides=1,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
):
    params = {}
    params["verbose"] = verbose
    params["callbacks"] = callbacks
    params["validation_split"] = validation_split
    params["activation"] = activation
    params["padding"] = padding
    params["kernel_size"] = kernel_size
    params["strides"] = strides
    params["pool_size"] = pool_size

    return params

<IPython.core.display.Javascript object>

# Conv1D

### Conv1D 1

1. TIMESTEPS: 1

In [32]:
set_seeds()
set_global_determinism()
timesteps = 1
model_early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
dataset, plant_specific = prepare_dataset_helper(timesteps=timesteps)
x_train = dataset["x_train"]
y_train = dataset["y_train"]
x_test = dataset["x_test"]
y_test = dataset["y_test"]

params = get_conv1d_params(
    timesteps=timesteps,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
    kernel_size=1,
    pool_size=1,
)


start = time.time()
model, scores = train_and_evaluate_model(Conv1D_1, dataset, estimator_params=params)
end = time.time()
print("Minutes Elapsed: ", (end - start) / 60)

2024-07-16 00:00:15.512509: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-07-16 00:00:15.512580: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: inspirada
2024-07-16 00:00:15.512596: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: inspirada
2024-07-16 00:00:15.512864: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 535.183.1
2024-07-16 00:00:15.512900: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 535.183.1
2024-07-16 00:00:15.512908: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:309] kernel version seems to match DSO: 535.183.1


Minutes Elapsed:  0.4279435435930888


<IPython.core.display.Javascript object>

In [33]:
scores_dict = print_results_and_get_scores_dict(scores, plant_specific)
append_results(scores_dict)
model_index += 1

Overall Score

******
[TRAIN]
******
RMSE: 4.956 (0.000)
MAE: 3.965 (0.000)
MAPE: 0.085 (0.000)
R2: 0.253 (0.000)


******
[TEST]
******
RMSE: 5.100 (0.000)
MAE: 4.309 (0.000)
MAPE: 0.095 (0.000)
R2: 0.247 (0.000)


Scores per plant

Plant partner_iv

******
[TEST]
******
RMSE: 5.983 (0.000)
MAE: 5.139 (0.000)
MAPE: 0.102 (0.000)
R2: 0.051 (0.000)


Plant partner_ii

******
[TEST]
******
RMSE: 2.868 (0.000)
MAE: 2.282 (0.000)
MAPE: 0.059 (0.000)
R2: -0.030 (0.000)


Plant partner_i

******
[TEST]
******
RMSE: 5.837 (0.000)
MAE: 5.438 (0.000)
MAPE: 0.121 (0.000)
R2: -0.372 (0.000)




<IPython.core.display.Javascript object>

### Conv1D 1

1. TIMESTEPS: 7

In [34]:
set_seeds()
set_global_determinism()
timesteps = 7
model_early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
dataset, plant_specific = prepare_dataset_helper(timesteps=timesteps)
x_train = dataset["x_train"]
y_train = dataset["y_train"]
x_test = dataset["x_test"]
y_test = dataset["y_test"]

params = get_conv1d_params(
    timesteps=timesteps,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
    kernel_size=timesteps,
    pool_size=timesteps,
)


start = time.time()
model, scores = train_and_evaluate_model(Conv1D_1, dataset, estimator_params=params)
end = time.time()
print("Minutes Elapsed: ", (end - start) / 60)

Minutes Elapsed:  0.5425240715344747


<IPython.core.display.Javascript object>

In [35]:
scores_dict = print_results_and_get_scores_dict(scores, plant_specific)
append_results(scores_dict)
model_index += 1

Overall Score

******
[TRAIN]
******
RMSE: 3.992 (0.000)
MAE: 3.018 (0.000)
MAPE: 0.068 (0.000)
R2: 0.515 (0.000)


******
[TEST]
******
RMSE: 4.509 (0.000)
MAE: 3.629 (0.000)
MAPE: 0.085 (0.000)
R2: 0.408 (0.000)


Scores per plant

Plant partner_iv

******
[TEST]
******
RMSE: 5.394 (0.000)
MAE: 4.541 (0.000)
MAPE: 0.089 (0.000)
R2: 0.182 (0.000)


Plant partner_ii

******
[TEST]
******
RMSE: 3.759 (0.000)
MAE: 3.236 (0.000)
MAPE: 0.084 (0.000)
R2: -0.891 (0.000)


Plant partner_i

******
[TEST]
******
RMSE: 4.391 (0.000)
MAE: 3.305 (0.000)
MAPE: 0.083 (0.000)
R2: 0.213 (0.000)




<IPython.core.display.Javascript object>

### Conv1D 1

1. TIMESTEPS: 14

In [36]:
set_seeds()
set_global_determinism()
timesteps = 14
model_early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
dataset, plant_specific = prepare_dataset_helper(timesteps=timesteps)
x_train = dataset["x_train"]
y_train = dataset["y_train"]
x_test = dataset["x_test"]
y_test = dataset["y_test"]

params = get_conv1d_params(
    timesteps=timesteps,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
    kernel_size=timesteps,
    pool_size=timesteps,
)


start = time.time()
model, scores = train_and_evaluate_model(Conv1D_1, dataset, estimator_params=params)
end = time.time()
print("Minutes Elapsed: ", (end - start) / 60)

Minutes Elapsed:  0.7507236242294312


<IPython.core.display.Javascript object>

In [37]:
scores_dict = print_results_and_get_scores_dict(scores, plant_specific)
append_results(scores_dict)
model_index += 1

Overall Score

******
[TRAIN]
******
RMSE: 4.123 (0.000)
MAE: 3.458 (0.000)
MAPE: 0.075 (0.000)
R2: 0.482 (0.000)


******
[TEST]
******
RMSE: 3.406 (0.000)
MAE: 2.750 (0.000)
MAPE: 0.062 (0.000)
R2: 0.658 (0.000)


Scores per plant

Plant partner_iv

******
[TEST]
******
RMSE: 3.753 (0.000)
MAE: 2.986 (0.000)
MAPE: 0.060 (0.000)
R2: 0.607 (0.000)


Plant partner_ii

******
[TEST]
******
RMSE: 2.542 (0.000)
MAE: 2.000 (0.000)
MAPE: 0.050 (0.000)
R2: 0.102 (0.000)


Plant partner_i

******
[TEST]
******
RMSE: 3.765 (0.000)
MAE: 3.212 (0.000)
MAPE: 0.073 (0.000)
R2: 0.416 (0.000)




<IPython.core.display.Javascript object>

### Conv1D 2

1. TIMESTEPS: 1

In [38]:
set_seeds()
set_global_determinism()
timesteps = 1
model_early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
dataset, plant_specific = prepare_dataset_helper(timesteps=timesteps)
x_train = dataset["x_train"]
y_train = dataset["y_train"]
x_test = dataset["x_test"]
y_test = dataset["y_test"]

params = get_conv1d_params(
    timesteps=timesteps,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
    kernel_size=timesteps,
    pool_size=timesteps,
)


start = time.time()
model, scores = train_and_evaluate_model(Conv1D_2, dataset, estimator_params=params)
end = time.time()
print("Minutes Elapsed: ", (end - start) / 60)

Minutes Elapsed:  0.4248672882715861


<IPython.core.display.Javascript object>

In [39]:
scores_dict = print_results_and_get_scores_dict(scores, plant_specific)
append_results(scores_dict)
model_index += 1

Overall Score

******
[TRAIN]
******
RMSE: 4.956 (0.000)
MAE: 3.965 (0.000)
MAPE: 0.085 (0.000)
R2: 0.253 (0.000)


******
[TEST]
******
RMSE: 5.100 (0.000)
MAE: 4.309 (0.000)
MAPE: 0.095 (0.000)
R2: 0.247 (0.000)


Scores per plant

Plant partner_iv

******
[TEST]
******
RMSE: 5.983 (0.000)
MAE: 5.139 (0.000)
MAPE: 0.102 (0.000)
R2: 0.051 (0.000)


Plant partner_ii

******
[TEST]
******
RMSE: 2.868 (0.000)
MAE: 2.282 (0.000)
MAPE: 0.059 (0.000)
R2: -0.030 (0.000)


Plant partner_i

******
[TEST]
******
RMSE: 5.837 (0.000)
MAE: 5.438 (0.000)
MAPE: 0.121 (0.000)
R2: -0.372 (0.000)




<IPython.core.display.Javascript object>

### Conv1D 2

1. TIMESTEPS: 7

In [40]:
set_seeds()
set_global_determinism()
timesteps = 7
model_early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
dataset, plant_specific = prepare_dataset_helper(timesteps=timesteps)
x_train = dataset["x_train"]
y_train = dataset["y_train"]
x_test = dataset["x_test"]
y_test = dataset["y_test"]

params = get_conv1d_params(
    timesteps=timesteps,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
    kernel_size=timesteps,
    pool_size=timesteps,
)


start = time.time()
model, scores = train_and_evaluate_model(Conv1D_2, dataset, estimator_params=params)
end = time.time()
print("Minutes Elapsed: ", (end - start) / 60)

Minutes Elapsed:  0.518039325873057


<IPython.core.display.Javascript object>

In [41]:
scores_dict = print_results_and_get_scores_dict(scores, plant_specific)
append_results(scores_dict)
model_index += 1

Overall Score

******
[TRAIN]
******
RMSE: 3.232 (0.000)
MAE: 2.580 (0.000)
MAPE: 0.056 (0.000)
R2: 0.682 (0.000)


******
[TEST]
******
RMSE: 3.207 (0.000)
MAE: 2.528 (0.000)
MAPE: 0.057 (0.000)
R2: 0.700 (0.000)


Scores per plant

Plant partner_iv

******
[TEST]
******
RMSE: 4.360 (0.000)
MAE: 3.587 (0.000)
MAPE: 0.071 (0.000)
R2: 0.466 (0.000)


Plant partner_ii

******
[TEST]
******
RMSE: 2.842 (0.000)
MAE: 2.289 (0.000)
MAPE: 0.060 (0.000)
R2: -0.080 (0.000)


Plant partner_i

******
[TEST]
******
RMSE: 2.422 (0.000)
MAE: 1.966 (0.000)
MAPE: 0.045 (0.000)
R2: 0.761 (0.000)




<IPython.core.display.Javascript object>

### Conv1D 2

1. TIMESTEPS: 14

In [42]:
set_seeds()
set_global_determinism()
timesteps = 14
model_early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
dataset, plant_specific = prepare_dataset_helper(timesteps=timesteps)
x_train = dataset["x_train"]
y_train = dataset["y_train"]
x_test = dataset["x_test"]
y_test = dataset["y_test"]

params = get_conv1d_params(
    timesteps=timesteps,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
    kernel_size=timesteps,
    pool_size=timesteps,
)


start = time.time()
model, scores = train_and_evaluate_model(Conv1D_2, dataset, estimator_params=params)
end = time.time()
print("Minutes Elapsed: ", (end - start) / 60)

Minutes Elapsed:  0.7373901605606079


<IPython.core.display.Javascript object>

In [43]:
scores_dict = print_results_and_get_scores_dict(scores, plant_specific)
append_results(scores_dict)
model_index += 1

Overall Score

******
[TRAIN]
******
RMSE: 3.652 (0.000)
MAE: 2.867 (0.000)
MAPE: 0.062 (0.000)
R2: 0.593 (0.000)


******
[TEST]
******
RMSE: 3.702 (0.000)
MAE: 3.036 (0.000)
MAPE: 0.068 (0.000)
R2: 0.596 (0.000)


Scores per plant

Plant partner_iv

******
[TEST]
******
RMSE: 4.465 (0.000)
MAE: 3.719 (0.000)
MAPE: 0.074 (0.000)
R2: 0.444 (0.000)


Plant partner_ii

******
[TEST]
******
RMSE: 3.299 (0.000)
MAE: 2.717 (0.000)
MAPE: 0.071 (0.000)
R2: -0.513 (0.000)


Plant partner_i

******
[TEST]
******
RMSE: 3.424 (0.000)
MAE: 2.829 (0.000)
MAPE: 0.062 (0.000)
R2: 0.517 (0.000)




<IPython.core.display.Javascript object>

### Conv1D 3

1. TIMESTEPS: 1

In [44]:
set_seeds()
set_global_determinism()
timesteps = 1
model_early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
dataset, plant_specific = prepare_dataset_helper(timesteps=timesteps)
x_train = dataset["x_train"]
y_train = dataset["y_train"]
x_test = dataset["x_test"]
y_test = dataset["y_test"]

params = get_conv1d_params(
    timesteps=timesteps,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
    kernel_size=timesteps,
    pool_size=timesteps,
)


start = time.time()
model, scores = train_and_evaluate_model(Conv1D_3, dataset, estimator_params=params)
end = time.time()
print("Minutes Elapsed: ", (end - start) / 60)

Minutes Elapsed:  0.48280519247055054


<IPython.core.display.Javascript object>

In [45]:
scores_dict = print_results_and_get_scores_dict(scores, plant_specific)
append_results(scores_dict)
model_index += 1

Overall Score

******
[TRAIN]
******
RMSE: 4.737 (0.000)
MAE: 3.759 (0.000)
MAPE: 0.080 (0.000)
R2: 0.317 (0.000)


******
[TEST]
******
RMSE: 4.653 (0.000)
MAE: 3.773 (0.000)
MAPE: 0.082 (0.000)
R2: 0.373 (0.000)


Scores per plant

Plant partner_iv

******
[TEST]
******
RMSE: 5.457 (0.000)
MAE: 4.546 (0.000)
MAPE: 0.090 (0.000)
R2: 0.210 (0.000)


Plant partner_ii

******
[TEST]
******
RMSE: 2.585 (0.000)
MAE: 2.048 (0.000)
MAPE: 0.053 (0.000)
R2: 0.163 (0.000)


Plant partner_i

******
[TEST]
******
RMSE: 5.339 (0.000)
MAE: 4.684 (0.000)
MAPE: 0.102 (0.000)
R2: -0.148 (0.000)




<IPython.core.display.Javascript object>

### Conv1D 3

1. TIMESTEPS: 7

In [46]:
set_seeds()
set_global_determinism()
timesteps = 7
model_early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
dataset, plant_specific = prepare_dataset_helper(timesteps=timesteps)
x_train = dataset["x_train"]
y_train = dataset["y_train"]
x_test = dataset["x_test"]
y_test = dataset["y_test"]

params = get_conv1d_params(
    timesteps=timesteps,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
    kernel_size=timesteps,
    pool_size=timesteps,
)


start = time.time()
model, scores = train_and_evaluate_model(Conv1D_3, dataset, estimator_params=params)
end = time.time()
print("Minutes Elapsed: ", (end - start) / 60)

Minutes Elapsed:  0.6624662240346273


<IPython.core.display.Javascript object>

In [47]:
scores_dict = print_results_and_get_scores_dict(scores, plant_specific)
append_results(scores_dict)
model_index += 1

Overall Score

******
[TRAIN]
******
RMSE: 3.433 (0.000)
MAE: 2.728 (0.000)
MAPE: 0.062 (0.000)
R2: 0.641 (0.000)


******
[TEST]
******
RMSE: 3.238 (0.000)
MAE: 2.530 (0.000)
MAPE: 0.061 (0.000)
R2: 0.695 (0.000)


Scores per plant

Plant partner_iv

******
[TEST]
******
RMSE: 2.809 (0.000)
MAE: 2.292 (0.000)
MAPE: 0.047 (0.000)
R2: 0.778 (0.000)


Plant partner_ii

******
[TEST]
******
RMSE: 2.812 (0.000)
MAE: 2.267 (0.000)
MAPE: 0.059 (0.000)
R2: -0.058 (0.000)


Plant partner_i

******
[TEST]
******
RMSE: 3.815 (0.000)
MAE: 2.925 (0.000)
MAPE: 0.072 (0.000)
R2: 0.406 (0.000)




<IPython.core.display.Javascript object>

### Conv1D 3

1. TIMESTEPS: 14

In [48]:
set_seeds()
set_global_determinism()
timesteps = 14
model_early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
dataset, plant_specific = prepare_dataset_helper(timesteps=timesteps)
x_train = dataset["x_train"]
y_train = dataset["y_train"]
x_test = dataset["x_test"]
y_test = dataset["y_test"]

params = get_conv1d_params(
    timesteps=timesteps,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
    kernel_size=timesteps,
    pool_size=timesteps,
)


start = time.time()
model, scores = train_and_evaluate_model(Conv1D_3, dataset, estimator_params=params)
end = time.time()
print("Minutes Elapsed: ", (end - start) / 60)

Minutes Elapsed:  1.087899136543274


<IPython.core.display.Javascript object>

In [49]:
scores_dict = print_results_and_get_scores_dict(scores, plant_specific)
append_results(scores_dict)
model_index += 1

Overall Score

******
[TRAIN]
******
RMSE: 3.403 (0.000)
MAE: 2.724 (0.000)
MAPE: 0.059 (0.000)
R2: 0.647 (0.000)


******
[TEST]
******
RMSE: 3.687 (0.000)
MAE: 3.059 (0.000)
MAPE: 0.071 (0.000)
R2: 0.599 (0.000)


Scores per plant

Plant partner_iv

******
[TEST]
******
RMSE: 4.631 (0.000)
MAE: 3.927 (0.000)
MAPE: 0.080 (0.000)
R2: 0.402 (0.000)


Plant partner_ii

******
[TEST]
******
RMSE: 3.592 (0.000)
MAE: 3.024 (0.000)
MAPE: 0.079 (0.000)
R2: -0.794 (0.000)


Plant partner_i

******
[TEST]
******
RMSE: 2.953 (0.000)
MAE: 2.486 (0.000)
MAPE: 0.058 (0.000)
R2: 0.641 (0.000)




<IPython.core.display.Javascript object>

### Conv1D 4

1. TIMESTEPS: 1

In [50]:
set_seeds()
set_global_determinism()
timesteps = 1
model_early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
dataset, plant_specific = prepare_dataset_helper(timesteps=timesteps)
x_train = dataset["x_train"]
y_train = dataset["y_train"]
x_test = dataset["x_test"]
y_test = dataset["y_test"]

params = get_conv1d_params(
    timesteps=timesteps,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
    kernel_size=timesteps,
    pool_size=timesteps,
)


start = time.time()
model, scores = train_and_evaluate_model(Conv1D_4, dataset, estimator_params=params)
end = time.time()
print("Minutes Elapsed: ", (end - start) / 60)

Minutes Elapsed:  0.519483995437622


<IPython.core.display.Javascript object>

In [51]:
scores_dict = print_results_and_get_scores_dict(scores, plant_specific)
append_results(scores_dict)
model_index += 1

Overall Score

******
[TRAIN]
******
RMSE: 4.737 (0.000)
MAE: 3.759 (0.000)
MAPE: 0.080 (0.000)
R2: 0.317 (0.000)


******
[TEST]
******
RMSE: 4.653 (0.000)
MAE: 3.773 (0.000)
MAPE: 0.082 (0.000)
R2: 0.373 (0.000)


Scores per plant

Plant partner_iv

******
[TEST]
******
RMSE: 5.457 (0.000)
MAE: 4.546 (0.000)
MAPE: 0.090 (0.000)
R2: 0.210 (0.000)


Plant partner_ii

******
[TEST]
******
RMSE: 2.585 (0.000)
MAE: 2.048 (0.000)
MAPE: 0.053 (0.000)
R2: 0.163 (0.000)


Plant partner_i

******
[TEST]
******
RMSE: 5.339 (0.000)
MAE: 4.684 (0.000)
MAPE: 0.102 (0.000)
R2: -0.148 (0.000)




<IPython.core.display.Javascript object>

### Conv1D 4

1. TIMESTEPS: 7

In [52]:
set_seeds()
set_global_determinism()
timesteps = 7
model_early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
dataset, plant_specific = prepare_dataset_helper(timesteps=timesteps)
x_train = dataset["x_train"]
y_train = dataset["y_train"]
x_test = dataset["x_test"]
y_test = dataset["y_test"]

params = get_conv1d_params(
    timesteps=timesteps,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
    kernel_size=timesteps,
    pool_size=timesteps,
)


start = time.time()
model, scores = train_and_evaluate_model(Conv1D_4, dataset, estimator_params=params)
end = time.time()
print("Minutes Elapsed: ", (end - start) / 60)

Minutes Elapsed:  0.6721399664878845


<IPython.core.display.Javascript object>

In [53]:
scores_dict = print_results_and_get_scores_dict(scores, plant_specific)
append_results(scores_dict)
model_index += 1

Overall Score

******
[TRAIN]
******
RMSE: 3.269 (0.000)
MAE: 2.649 (0.000)
MAPE: 0.058 (0.000)
R2: 0.675 (0.000)


******
[TEST]
******
RMSE: 3.026 (0.000)
MAE: 2.388 (0.000)
MAPE: 0.054 (0.000)
R2: 0.733 (0.000)


Scores per plant

Plant partner_iv

******
[TEST]
******
RMSE: 4.009 (0.000)
MAE: 3.364 (0.000)
MAPE: 0.067 (0.000)
R2: 0.548 (0.000)


Plant partner_ii

******
[TEST]
******
RMSE: 2.393 (0.000)
MAE: 1.896 (0.000)
MAPE: 0.049 (0.000)
R2: 0.234 (0.000)


Plant partner_i

******
[TEST]
******
RMSE: 2.659 (0.000)
MAE: 2.102 (0.000)
MAPE: 0.050 (0.000)
R2: 0.711 (0.000)




<IPython.core.display.Javascript object>

### Conv1D 4

1. TIMESTEPS: 14

In [54]:
set_seeds()
set_global_determinism()
timesteps = 14
model_early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
dataset, plant_specific = prepare_dataset_helper(timesteps=timesteps)
x_train = dataset["x_train"]
y_train = dataset["y_train"]
x_test = dataset["x_test"]
y_test = dataset["y_test"]

params = get_conv1d_params(
    timesteps=timesteps,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
    kernel_size=timesteps,
    pool_size=timesteps,
)


start = time.time()
model, scores = train_and_evaluate_model(Conv1D_4, dataset, estimator_params=params)
end = time.time()
print("Minutes Elapsed: ", (end - start) / 60)

Minutes Elapsed:  1.0932803670565288


<IPython.core.display.Javascript object>

In [55]:
scores_dict = print_results_and_get_scores_dict(scores, plant_specific)
append_results(scores_dict)
model_index += 1

Overall Score

******
[TRAIN]
******
RMSE: 2.893 (0.000)
MAE: 2.241 (0.000)
MAPE: 0.049 (0.000)
R2: 0.745 (0.000)


******
[TEST]
******
RMSE: 2.961 (0.000)
MAE: 2.261 (0.000)
MAPE: 0.052 (0.000)
R2: 0.742 (0.000)


Scores per plant

Plant partner_iv

******
[TEST]
******
RMSE: 3.901 (0.000)
MAE: 3.115 (0.000)
MAPE: 0.062 (0.000)
R2: 0.576 (0.000)


Plant partner_ii

******
[TEST]
******
RMSE: 3.307 (0.000)
MAE: 2.759 (0.000)
MAPE: 0.072 (0.000)
R2: -0.520 (0.000)


Plant partner_i

******
[TEST]
******
RMSE: 1.574 (0.000)
MAE: 1.253 (0.000)
MAPE: 0.029 (0.000)
R2: 0.898 (0.000)




<IPython.core.display.Javascript object>

### Conv1D 5

1. TIMESTEPS: 1

In [56]:
set_seeds()
set_global_determinism()
timesteps = 1
model_early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
dataset, plant_specific = prepare_dataset_helper(timesteps=timesteps)
x_train = dataset["x_train"]
y_train = dataset["y_train"]
x_test = dataset["x_test"]
y_test = dataset["y_test"]

params = get_conv1d_params(
    timesteps=timesteps,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
    kernel_size=timesteps,
    pool_size=timesteps,
)


start = time.time()
model, scores = train_and_evaluate_model(Conv1D_5, dataset, estimator_params=params)
end = time.time()
print("Minutes Elapsed: ", (end - start) / 60)

Minutes Elapsed:  0.5995062788327535


<IPython.core.display.Javascript object>

In [57]:
scores_dict = print_results_and_get_scores_dict(scores, plant_specific)
append_results(scores_dict)
model_index += 1

Overall Score

******
[TRAIN]
******
RMSE: 4.580 (0.000)
MAE: 3.914 (0.000)
MAPE: 0.085 (0.000)
R2: 0.362 (0.000)


******
[TEST]
******
RMSE: 3.827 (0.000)
MAE: 3.097 (0.000)
MAPE: 0.067 (0.000)
R2: 0.576 (0.000)


Scores per plant

Plant partner_iv

******
[TEST]
******
RMSE: 4.644 (0.000)
MAE: 3.834 (0.000)
MAPE: 0.076 (0.000)
R2: 0.428 (0.000)


Plant partner_ii

******
[TEST]
******
RMSE: 2.077 (0.000)
MAE: 1.687 (0.000)
MAPE: 0.042 (0.000)
R2: 0.460 (0.000)


Plant partner_i

******
[TEST]
******
RMSE: 4.289 (0.000)
MAE: 3.764 (0.000)
MAPE: 0.082 (0.000)
R2: 0.259 (0.000)




<IPython.core.display.Javascript object>

### Conv1D 5

1. TIMESTEPS: 7

In [58]:
set_seeds()
set_global_determinism()
timesteps = 7
model_early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
dataset, plant_specific = prepare_dataset_helper(timesteps=timesteps)
x_train = dataset["x_train"]
y_train = dataset["y_train"]
x_test = dataset["x_test"]
y_test = dataset["y_test"]

params = get_conv1d_params(
    timesteps=timesteps,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
    kernel_size=timesteps,
    pool_size=timesteps,
)


start = time.time()
model, scores = train_and_evaluate_model(Conv1D_5, dataset, estimator_params=params)
end = time.time()
print("Minutes Elapsed: ", (end - start) / 60)

Minutes Elapsed:  1.1145887653032938


<IPython.core.display.Javascript object>

In [59]:
scores_dict = print_results_and_get_scores_dict(scores, plant_specific)
append_results(scores_dict)
model_index += 1

Overall Score

******
[TRAIN]
******
RMSE: 2.802 (0.000)
MAE: 2.132 (0.000)
MAPE: 0.047 (0.000)
R2: 0.761 (0.000)


******
[TEST]
******
RMSE: 3.091 (0.000)
MAE: 2.427 (0.000)
MAPE: 0.056 (0.000)
R2: 0.722 (0.000)


Scores per plant

Plant partner_iv

******
[TEST]
******
RMSE: 3.973 (0.000)
MAE: 3.317 (0.000)
MAPE: 0.067 (0.000)
R2: 0.556 (0.000)


Plant partner_ii

******
[TEST]
******
RMSE: 3.522 (0.000)
MAE: 2.993 (0.000)
MAPE: 0.078 (0.000)
R2: -0.660 (0.000)


Plant partner_i

******
[TEST]
******
RMSE: 1.620 (0.000)
MAE: 1.303 (0.000)
MAPE: 0.030 (0.000)
R2: 0.893 (0.000)




<IPython.core.display.Javascript object>

### Conv1D 5

1. TIMESTEPS: 14

In [60]:
set_seeds()
set_global_determinism()
timesteps = 14
model_early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
dataset, plant_specific = prepare_dataset_helper(timesteps=timesteps)
x_train = dataset["x_train"]
y_train = dataset["y_train"]
x_test = dataset["x_test"]
y_test = dataset["y_test"]

params = get_conv1d_params(
    timesteps=timesteps,
    callbacks=None,
    validation_split=0.0,
    verbose=0,
    kernel_size=timesteps,
    pool_size=timesteps,
)


start = time.time()
model, scores = train_and_evaluate_model(Conv1D_5, dataset, estimator_params=params)
end = time.time()
print("Minutes Elapsed: ", (end - start) / 60)

Minutes Elapsed:  1.9876990079879762


<IPython.core.display.Javascript object>

In [61]:
scores_dict = print_results_and_get_scores_dict(scores, plant_specific)
append_results(scores_dict)
model_index += 1

Overall Score

******
[TRAIN]
******
RMSE: 2.923 (0.000)
MAE: 2.214 (0.000)
MAPE: 0.048 (0.000)
R2: 0.739 (0.000)


******
[TEST]
******
RMSE: 3.325 (0.000)
MAE: 2.371 (0.000)
MAPE: 0.053 (0.000)
R2: 0.674 (0.000)


Scores per plant

Plant partner_iv

******
[TEST]
******
RMSE: 5.321 (0.000)
MAE: 4.303 (0.000)
MAPE: 0.086 (0.000)
R2: 0.210 (0.000)


Plant partner_ii

******
[TEST]
******
RMSE: 2.709 (0.000)
MAE: 2.186 (0.000)
MAPE: 0.057 (0.000)
R2: -0.020 (0.000)


Plant partner_i

******
[TEST]
******
RMSE: 1.473 (0.000)
MAE: 1.184 (0.000)
MAPE: 0.027 (0.000)
R2: 0.911 (0.000)




<IPython.core.display.Javascript object>

# Saving the results

In [62]:
path = f"../../../../../../../reports/results/global_models/inn/all_cements/pre_training/full/"
filename = f"conv1d_results_full_{index_to_save}.csv"


pd.concat(results_to_save).to_csv(
    path_or_buf=path + filename,
    mode="w",
    index=False,
    header=True,
)

<IPython.core.display.Javascript object>

In [63]:
from sklearn.preprocessing import StandardScaler

<IPython.core.display.Javascript object>

In [64]:
columns_to_standardize = ["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]

<IPython.core.display.Javascript object>

In [65]:
ddf = pd.concat(results_to_save).reset_index(drop=True)
ddf_copy = ddf.copy()
ddf_copy = ddf_copy[ddf_copy["Plant"].eq("INN")]
scaler = StandardScaler()
dddf_copy = ddf_copy.copy()
dddf_copy[columns_to_standardize] = scaler.fit_transform(
    ddf_copy[columns_to_standardize]
).copy()

ddf_copy["SCPM"] = (
    dddf_copy[["RMSE Test", "MAE Test", "MAPE Test"]].sum(axis=1) - dddf_copy["R2 Test"]
)
min_row = ddf_copy[ddf_copy["SCPM"].eq(ddf_copy["SCPM"].min())]
ddf.merge(min_row[["Timesteps", "Model"]], on=("Timesteps", "Model"), how="inner")

Unnamed: 0,Category,Company,Plant,Features,Data Shape,Timesteps,Model,Model Params,Scaler,Scaler Params,...,Cross Validation,Cross Validation Params,RMSE Train,MAE Train,MAPE Train,R2 Train,RMSE Test,MAE Test,MAPE Test,R2 Test
0,Global Model,INN,INN,Chemical + Mineralogical + Properties CS Less,"(3180, 16)",14,Conv1D_12,,Standard Scaler,,...,Out of time,"{""train_size"": 0.8, ""test_size"": 0.2}",2.892892,2.241293,0.048997,0.744772,2.960812,2.261228,0.052215,0.741595
1,Global Model,INN,partner_iv,Chemical + Mineralogical + Properties CS Less,"(3180, 16)",14,Conv1D_12,,Standard Scaler,,...,Out of time,"{""train_size"": 0.8, ""test_size"": 0.2}",2.892892,2.241293,0.048997,0.744772,3.900988,3.114726,0.061771,0.575577
2,Global Model,INN,partner_ii,Chemical + Mineralogical + Properties CS Less,"(3180, 16)",14,Conv1D_12,,Standard Scaler,,...,Out of time,"{""train_size"": 0.8, ""test_size"": 0.2}",2.892892,2.241293,0.048997,0.744772,3.306791,2.758895,0.071979,-0.519926
3,Global Model,INN,partner_i,Chemical + Mineralogical + Properties CS Less,"(3180, 16)",14,Conv1D_12,,Standard Scaler,,...,Out of time,"{""train_size"": 0.8, ""test_size"": 0.2}",2.892892,2.241293,0.048997,0.744772,1.574013,1.253223,0.029078,0.897882


<IPython.core.display.Javascript object>