In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Database Reading and Manipulation
import pandas as pd

# Linear Algebra
import numpy as np

# Plotting
import matplotlib.pyplot as plt

# Processing results
import json

# Model Selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

# Custom modules
## Model selection
from src.cross_validation.blocking_time_series_split import BlockingTimeSeriesSplit

## Function to print scores
from src.utils.print_scores import print_scores

## Function to calculate score regression metrics
from src.utils.score_regression_metrics import score_regression_metrics

## Function to fill the results metric dict
from src.utils.fill_results_dict import fill_results_dict

# Modeling
from sklearn.linear_model import LinearRegression

# Preprocessing - Data standardization
from sklearn.preprocessing import StandardScaler

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

# Pipeline
from sklearn.pipeline import Pipeline

# Data imputation
from sklearn.impute import SimpleImputer

<IPython.core.display.Javascript object>

# Functions and definitions

In [3]:
def score_model(model, x, y, n_splits=10, n_repeats=10):
    """Cross-validate ``model`` with repeated K-fold and return the raw fold scores.

    Args:
        model: Estimator (or pipeline) handed to ``cross_val_score``.
        x: Feature matrix.
        y: Target values.
        n_splits (int): Number of folds per repetition.
        n_repeats (int): Number of K-fold repetitions.

    Returns:
        The array returned by ``cross_val_score`` for the
        ``"neg_root_mean_squared_error"`` scorer (one value per fold and repeat).
    """
    # The notebook-level SEED keeps the fold assignment reproducible.
    folds = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=SEED)
    return cross_val_score(
        model, x, y, scoring="neg_root_mean_squared_error", cv=folds, n_jobs=-1
    )

<IPython.core.display.Javascript object>

In [4]:
def plot_predictions(
    linreg_model, df, index, x, y, scaler=None, date_column="Data/Hora"
):
    """Plot the observed target series together with the model's predictions.

    Args:
        linreg_model: Fitted model exposing ``predict``.
        df (pd.DataFrame): Frame holding the timestamp column ``date_column``.
        index (int): Position where the training period ends; a vertical line
            is drawn there and predictions are plotted from this position on.
        x: Features to predict on (same row order as ``df``).
        y: Observed target values (same row order as ``df``).
        scaler: Optional fitted transformer applied to ``x`` before predicting.
            When omitted, a notebook-level ``scaler`` variable is used if one
            exists (the behaviour of the original implementation); otherwise
            ``x`` is passed to the model unchanged, which suits a fitted
            ``Pipeline`` that scales internally.
        date_column (str): Name of the timestamp column in ``df``.
    """
    if scaler is None:
        # Backward compatibility: earlier versions silently relied on a global.
        scaler = globals().get("scaler")
    features = x if scaler is None else scaler.transform(x)

    timestamps = pd.to_datetime(df[date_column].values)
    test_series = pd.DataFrame({"ccs28": y}, index=timestamps)
    pred_series = pd.DataFrame(
        {"ccs28-pred": linreg_model.predict(features)}, index=timestamps
    )

    fig, ax = plt.subplots(1, 1, sharex=True, sharey=True, figsize=(15, 7))

    test_series.plot(ax=ax)
    ax.axvline(test_series.index[index], color="r")  # end of train dataset
    pred_series[index:].plot(ax=ax)
    ax.grid(which="both")
    ax.legend(
        ["train and test series", "end of train series", "predicted"], loc="upper left"
    )
    ax.set_ylabel("Compressive Strength - MPa", labelpad=20, fontsize=15)
    plt.show()

<IPython.core.display.Javascript object>

In [5]:
def plot_scores_box_plot(scores, repeats, n_splits):
    """Draw one box per cross-validation repeat from a flat array of fold scores.

    Args:
        scores (np.ndarray): Flat array with ``repeats * n_splits`` scores.
            Assumed to be ordered repeat by repeat, as produced with a
            ``RepeatedKFold`` splitter -- TODO confirm for other splitters.
        repeats (int): Number of repetitions (number of boxes).
        n_splits (int): Number of folds per repetition (points per box).
    """
    # reshape gives rows = repeats, columns = folds. boxplot draws one box per
    # COLUMN, so transpose to get one box per repeat, matching the labels below.
    scores_by_repeat = scores.reshape((repeats, n_splits)).T

    plt.figure(figsize=(15, 8))
    plt.boxplot(
        scores_by_repeat,
        labels=[str(r) for r in range(1, repeats + 1)],
        showmeans=True,
    )
    plt.ylabel("RMSE", labelpad=20, fontsize=15)
    plt.xlabel("Repeats", labelpad=20, fontsize=15)
    plt.show()

<IPython.core.display.Javascript object>

## Helper functions for blocked time series cross validation

### Train test split

In [6]:
def split_by_periods(x, y, train_period, test_period):
    """Cut ``x``/``y`` into consecutive, non-overlapping train/test blocks by row count.

    Each block spans ``train_period + test_period`` rows: the first
    ``train_period`` rows form the training part, the rest the test part.
    Trailing rows that do not fill a whole block are dropped.

    Returns:
        list[dict]: One dict per block with copies under the keys
        'x_train', 'y_train', 'x_test' and 'y_test'.
    """
    block_length = train_period + test_period
    n_blocks = x.shape[0] // block_length

    datasets = []
    for block_number in range(n_blocks):
        train_start = block_number * block_length
        train_stop = train_start + train_period
        test_stop = train_stop + test_period
        datasets.append(
            {
                "x_train": x[train_start:train_stop].copy(),
                "y_train": y[train_start:train_stop].copy(),
                "x_test": x[train_stop:test_stop].copy(),
                "y_test": y[train_stop:test_stop].copy(),
            }
        )
    return datasets

<IPython.core.display.Javascript object>

In [7]:
def split_by_dates(x, y, train_period, test_period, dates):
    """Cut ``x``/``y`` into consecutive train/test blocks measured in unique dates.

    A block covers ``train_period + test_period`` distinct date values (in the
    order they first appear in ``dates``): the first ``train_period`` of them
    select the training rows, the remaining ones the test rows. Trailing dates
    that do not fill a whole block are dropped.

    Args:
        x (pd.DataFrame): Features; rows are selected with ``x.loc``.
        y: Target values, indexed with the same labels as ``dates``
            (presumably a default 0..n-1 index -- verify against the caller).
        train_period (int): Number of unique dates in each training window.
        test_period (int): Number of unique dates in each test window.
        dates (pd.Series): Date of every row; presumably sorted ascending,
            since windows are selected with ``>=`` / ``<`` comparisons.

    Returns:
        list[dict]: One dict per block with copies under the keys
        'x_train', 'y_train', 'x_test' and 'y_test'.
    """
    dates = dates[: x.shape[0]].copy()
    dates_unique = dates.unique()
    n_dates = dates_unique.shape[0]
    block_length = train_period + test_period

    def window_index(first, last):
        """Index labels of rows dated in [dates_unique[first], dates_unique[last])."""
        if first >= n_dates:
            return dates.index[:0]
        in_window = dates >= dates_unique[first]
        # When the window ends exactly at the last known date there is no
        # "next" date to use as an exclusive bound: leave the window open
        # instead of indexing past the end of dates_unique (IndexError).
        if last < n_dates:
            in_window = in_window & (dates < dates_unique[last])
        return dates[in_window].index

    datasets = []
    for start in range(0, (n_dates // block_length) * block_length, block_length):
        train_end = start + train_period
        idx_train = window_index(start, train_end)
        idx_test = window_index(train_end, train_end + test_period)
        datasets.append(
            {
                "x_train": x.loc[idx_train].copy(),
                "y_train": y[idx_train].copy(),
                "x_test": x.loc[idx_test].copy(),
                "y_test": y[idx_test].copy(),
            }
        )

    return datasets

<IPython.core.display.Javascript object>

In [8]:
def train_test_split_blocked_ts(x, y, train_period, test_period, dates=None):
    """
    Build blocked train/test datasets, by dates when given, otherwise by row count.

    Args:
        x (pd.DataFrame): Input features.
        y (np.ndarray): Target values.
        train_period (int): Length of each training window.
        test_period (int): Length of each testing window.
        dates (pd.Series): Optional date of every row. When provided the
            windows are measured in unique dates, otherwise in rows.

    Returns:
        List[dict]: A list of dictionaries, each containing 'x_train', 'y_train', 'x_test', and 'y_test'.
    """
    if dates is not None:
        return split_by_dates(x, y, train_period, test_period, dates)
    return split_by_periods(x, y, train_period, test_period)

<IPython.core.display.Javascript object>

### Data preprocessing

In [9]:
def impute_data(dataset, imputer=None, imputer_params=None):
    """Return a copy of ``dataset`` whose features have been imputed.

    The imputer is fitted on ``dataset["x_train"]`` only and then applied to
    ``dataset["x_test"]``. The input dict is left untouched, so the same raw
    dataset can safely be preprocessed more than once.

    Args:
        dataset (dict): Holds at least 'x_train' and 'x_test'.
        imputer: Imputer class (not an instance) exposing ``fit_transform`` and
            ``transform``; ``None`` disables imputation.
        imputer_params (dict): Optional keyword arguments for ``imputer``.

    Returns:
        dict: Shallow copy of ``dataset`` with 'x_train' and 'x_test' replaced.
    """
    x_train = dataset["x_train"]
    x_test = dataset["x_test"]

    if imputer is not None:
        fitted = imputer() if imputer_params is None else imputer(**imputer_params)
        x_train = fitted.fit_transform(x_train)
        x_test = fitted.transform(x_test)

    # Shallow copy: other entries (targets, ...) are shared, not duplicated.
    imputed = dict(dataset)
    imputed["x_train"] = x_train
    imputed["x_test"] = x_test
    return imputed

<IPython.core.display.Javascript object>

In [10]:
def transform_data(dataset, transformer=None):
    """Return a copy of ``dataset`` whose features have been scaled/transformed.

    The transformer is fitted on ``dataset["x_train"]`` only and then applied
    to ``dataset["x_test"]``. The input dict is left untouched, so the same raw
    dataset can safely be preprocessed more than once.

    Args:
        dataset (dict): Holds at least 'x_train' and 'x_test'.
        transformer: Transformer class (not an instance) exposing
            ``fit_transform`` and ``transform``; ``None`` disables the step.

    Returns:
        dict: Shallow copy of ``dataset`` with 'x_train' and 'x_test' replaced.
    """
    x_train = dataset["x_train"]
    x_test = dataset["x_test"]

    if transformer is not None:
        fitted = transformer()
        x_train = fitted.fit_transform(x_train)
        x_test = fitted.transform(x_test)

    # Shallow copy: other entries (targets, ...) are shared, not duplicated.
    transformed = dict(dataset)
    transformed["x_train"] = x_train
    transformed["x_test"] = x_test
    return transformed

<IPython.core.display.Javascript object>

In [11]:
def preprocess_data(dataset, transformer=None, imputer=None, imputer_params=None):
    """Impute the dataset's features first, then transform them, and return the result."""
    imputed = impute_data(dataset, imputer, imputer_params)
    return transform_data(imputed, transformer)

<IPython.core.display.Javascript object>

### Train and evaluate the model

In [12]:
def train_and_evaluate_model(Estimator, dataset, estimator_params=None):
    """
    Fit a fresh estimator on one train/test block and score its predictions.

    Helper for the blocked time series cross validation function.

    Args:
        Estimator: Model class (not an instance) exposing ``fit`` and ``predict``.
        dataset (dict): Holds 'x_train', 'y_train', 'x_test' and 'y_test'.
        estimator_params (dict): Optional keyword arguments for ``Estimator``.

    Returns:
        Whatever ``score_regression_metrics`` returns for the train and test
        targets and predictions.
    """
    x_train, y_train, x_test, y_test = (
        dataset[key] for key in ("x_train", "y_train", "x_test", "y_test")
    )

    if estimator_params is None:
        model = Estimator()
    else:
        model = Estimator(**estimator_params)
    model.fit(x_train, y_train)

    # Arguments are evaluated left to right: train predictions, then test predictions.
    return score_regression_metrics(
        y_train, model.predict(x_train), y_test, model.predict(x_test)
    )

<IPython.core.display.Javascript object>

### Blocking time series cross validation

In [13]:
def repeated_blocking_time_series(
    Estimator,
    Transform,
    Imputer,
    x,
    y,
    train_period,
    test_period,
    dates=None,
    repeats=10,
    estimator_params=None,
    imputer_params=None,
):
    """
    Perform repeated cross-validation with blocked time series data.

    The data is cut once into consecutive train/test blocks; every repetition
    then preprocesses each raw block, fits a fresh estimator on it and
    collects the regression metrics.

    Args:
        Estimator: Machine learning model class.
        Transform: Data transformation class (or None to skip scaling).
        Imputer: Data imputation class (or None to skip imputation).
        x: Input features.
        y: Target values.
        train_period: Length of the training period.
        test_period: Length of the testing period.
        dates: Optional date information; when given, periods are measured in
            unique dates instead of rows.
        repeats: Number of repetitions.
        estimator_params: Parameters for the model.
        imputer_params: Parameters for data imputation.

    Returns:
        list: One dictionary per repetition, mapping each metric name to the
        list of values obtained on the successive blocks.

    Raises:
        ValueError: If the data is too short to form a single train/test block.
    """
    # Splitting the data into train/test sets
    datasets = train_test_split_blocked_ts(x, y, train_period, test_period, dates)
    if not datasets:
        raise ValueError(
            "Not enough data for a single block of "
            f"train_period={train_period} + test_period={test_period}."
        )

    results = []
    for _ in range(repeats):
        scores = []

        for dataset in datasets:
            # Preprocess a shallow copy: the raw block must survive for the next
            # repetition, otherwise already imputed/scaled data is processed again.
            prepared = preprocess_data(
                dict(dataset), Transform, Imputer, imputer_params
            )
            scores.append(train_and_evaluate_model(Estimator, prepared, estimator_params))

        # Regroup the per-block score dicts into one list of values per metric
        results.append(
            {key: [block_scores[key] for block_scores in scores] for key in scores[0]}
        )
    return results

<IPython.core.display.Javascript object>

In [14]:
# Seed shared by the randomized splitters of this notebook.
SEED = 47

# scikit-learn scorer name -> short label used when reporting results.
METRICS_DICT = {
    "neg_root_mean_squared_error": "RMSE",
    "neg_mean_absolute_error": "MAE",
    "neg_mean_absolute_percentage_error": "MAPE",
    "r2": "R2",
}

# Scorer names in reporting order (dicts preserve insertion order).
METRICS = tuple(METRICS_DICT)

<IPython.core.display.Javascript object>

## Defining a dataframe structure to save the results

In [15]:
# Accumulates one results frame per cross-validation scheme.
results_to_save = []

# Template row describing an experiment; each experiment copies it and fills
# in the cross-validation details and the data shape.
results_dict = {
    "Category": "Local Model",
    "Company": "partner_i",
    "Features": "Chemical + Mineralogical + Physical",
    "Data Shape": None,
    "Timesteps": None,
    "Model": "Linear Regression",
    "Model Params": None,
    "Scaler": "Standard Scaler",
    "Scaler Params": None,
    "Imputer": "Median",
    "Imputer Params": None,
    "Cross Validation": None,
    "Cross Validation Params": np.nan,
}
# Metric placeholders: all train metrics first, then all test metrics.
results_dict.update(
    {
        f"{metric} {subset}": np.nan
        for subset in ("Train", "Test")
        for metric in ("RMSE", "MAE", "MAPE", "R2")
    }
)

<IPython.core.display.Javascript object>

# Reading the dataset

In [16]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../../../../../../data/processed/partner_i-Oficial/cement-shipping.csv")

<IPython.core.display.Javascript object>

## Defining Features

In this set of experiments we use all available features

In [17]:
# Work on a copy without the "Cement_Type" column (the original comment calls
# it a one-hot encoding variable); every other column is kept.
df_copy = df.drop(columns=["Cement_Type"]).copy()

<IPython.core.display.Javascript object>

# 1. Linear Regression

<h2>1. Dataset: df_copy</h2> <br>In this dataset all features are used.

In [18]:
# Target values. Read (not popped) so df_copy is not mutated and the cell can
# be re-run without a KeyError.
y = df_copy["CS28"].values
# Features: every remaining column except the target and the date.
x = df_copy.drop(["CS28", "Date"], axis=1)
# Row dates, used by the date-based blocked splits.
dates = df["Date"].copy()

<IPython.core.display.Javascript object>

## 1.1 Repeated KFold Cross validation

<b>Dataset shape:</b> (1234, 37)<br>
<b>Repeats:</b> 3<br>
<b>Splits:</b>5<br>
    1. 5 folds of 246 samples each
    2. 80% train (988 samples each fold)
    3. 20% test (246 samples each fold)
<b>Total:</b> 15 models<br>

In [19]:
# Repeated K-fold baseline: n_splits folds, reshuffled `repeats` times.
repeats = 3
n_splits = 5
# Imputation and scaling are pipeline steps, so cross_validate fits them on
# the training part of every split only.
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)
cv = RepeatedKFold(n_splits=n_splits, n_repeats=repeats, random_state=SEED)
scores = cross_validate(
    pipeline,
    x,
    y,
    scoring=METRICS,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)
# The "neg_*" scorers report negated errors, hence the negative values printed.
print("Repeated Cross Validation:")
print(f"Repeats: {repeats}")
print(f"n_splits: {n_splits}")
print()
print_scores(scores, METRICS, METRICS_DICT)

Repeated Cross Validation:
Repeats: 3
n_splits: 5

******
[TRAIN]
******
RMSE: -0.935 (0.008)
MAE: -0.728 (0.009)
MAPE: -0.017 (0.000)
R2: 0.966 (0.001)


******
[TEST]
******
RMSE: -0.977 (0.036)
MAE: -0.757 (0.033)
MAPE: -0.018 (0.001)
R2: 0.962 (0.003)




<IPython.core.display.Javascript object>

In [20]:
# Record this experiment as a row of the results table.
results_dict_copy = results_dict.copy()
results_dict_copy["Cross Validation"] = "Repeated KFold"
# Serialize the parameters actually used, so the saved metadata cannot drift
# from the code (currently yields '{"N_Splits": 5, "Repeats": 3}').
results_dict_copy["Cross Validation Params"] = json.dumps(
    {"N_Splits": n_splits, "Repeats": repeats}
)
results_dict_copy["Data Shape"] = x.shape
df_results = fill_results_dict(results_dict_copy, scores)
results_to_save.append(df_results)

<IPython.core.display.Javascript object>

In [21]:
# Mean and std of the test metrics for this experiment. Note: "std" here is
# pandas' default sample std (ddof=1), whereas the later summary tables use
# ddof=0, so the std columns differ slightly between tables.
df_results.groupby(["Features", "Model", "Cross Validation"])[
    ["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]
].agg(["mean", "std"]).reset_index()

Unnamed: 0_level_0,Features,Model,Cross Validation,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical + Mineralogical + Physical,Linear Regression,Repeated KFold,0.976822,0.037492,0.756527,0.034231,0.017623,0.000823,0.962264,0.003121


<IPython.core.display.Javascript object>

## 1.2. Blocking Time Series Cross Validation

<b>Dataset shape:</b> (1234, 37)<br>
<b>Splits:</b>5<br>    
    1. 5 folds of 246 samples
    2. 80% train (987 samples each fold)
    3. 20% test (246 samples each fold)
<b>Total:</b> 5 models<br>

In [22]:
n_splits = 5
train_size = 0.8

# Imputation and scaling are pipeline steps, so cross_validate fits them on
# the training part of every split only.
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)
cv = BlockingTimeSeriesSplit(n_splits=n_splits, train_size=train_size)
scores = cross_validate(
    pipeline,
    x,
    y,
    scoring=METRICS,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)
print("Blocking Time Series Split:")
# This scheme runs once; the notebook-level `repeats` variable belongs to the
# Repeated KFold experiment and must not be reported here.
print("Repeats: 1")
print(f"n_splits: {n_splits}")
print()
print_scores(scores, METRICS, METRICS_DICT)

results_dict_copy = results_dict.copy()
results_dict_copy["Cross Validation"] = "Blocking Time Series Split"
# Serialize the parameters actually used (currently yields
# '{"N_Splits": 5, "Repeats": 1, "train_size": 0.8}').
results_dict_copy["Cross Validation Params"] = json.dumps(
    {"N_Splits": n_splits, "Repeats": 1, "train_size": train_size}
)
results_dict_copy["Data Shape"] = x.shape
df_results = fill_results_dict(results_dict_copy, scores)
results_to_save.append(df_results)

Blocking Time Series Split:
Repeats: 3
n_splits: 5

******
[TRAIN]
******
RMSE: -0.805 (0.047)
MAE: -0.633 (0.039)
MAPE: -0.015 (0.001)
R2: 0.974 (0.003)


******
[TEST]
******
RMSE: -1.081 (0.099)
MAE: -0.858 (0.061)
MAPE: -0.020 (0.002)
R2: 0.951 (0.011)




<IPython.core.display.Javascript object>

In [23]:
# Summary of the test metrics per cross-validation scheme so far.
# dropna=False keeps the groups whose "Timesteps" value is missing; the lambda
# computes the population std (ddof=0) and its "<lambda_0>" column is renamed "std".
pd.concat(results_to_save).reset_index().groupby(
    ["Features", "Model", "Cross Validation", "Timesteps"], dropna=False
)[["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]].agg(
    ["mean", lambda series: pd.Series(series.std(ddof=0), name="std")]
).reset_index().rename(
    columns={"<lambda_0>": "std"}
)

Unnamed: 0_level_0,Features,Model,Cross Validation,Timesteps,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical + Mineralogical + Physical,Linear Regression,Blocking Time Series Split,,1.080901,0.098729,0.858103,0.061362,0.019927,0.001509,0.950736,0.01111
1,Chemical + Mineralogical + Physical,Linear Regression,Repeated KFold,,0.976822,0.036221,0.756527,0.033071,0.017623,0.000795,0.962264,0.003015


<IPython.core.display.Javascript object>

## 1.3. Time Series Split Cross Validation

The training set has size i * n_samples // (n_splits + 1) + n_samples % (n_splits + 1) in the i th split, with a test set of size n_samples//(n_splits + 1) by default, where n_samples is the number of samples.


<b>Dataset shape:</b> (1234, 37)<br>
<b>Splits:</b> 5<br>    
    1. Train: 5 folds of 209, 414, 619, 824, 1029 samples each fold
    2. Test: 205 samples each fold
<b>Total:</b> 5 models<br>

In [24]:
n_splits = 5
gap = 0
# Imputation and scaling are pipeline steps, so cross_validate fits them on
# the training part of every split only.
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)
# Expanding-window split: each training set contains all rows before its test fold.
cv = TimeSeriesSplit(gap=gap, max_train_size=None, n_splits=n_splits, test_size=None)

scores = cross_validate(
    pipeline,
    x,
    y,
    scoring=METRICS,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)
print("Time Series Split:")
# This scheme runs once; the notebook-level `repeats` variable belongs to the
# Repeated KFold experiment and must not be reported here.
print("Repeats: 1")
print(f"n_splits: {n_splits}")
print()
print_scores(scores, METRICS, METRICS_DICT)

Time Series Split:
Repeats: 3
n_splits: 5

******
[TRAIN]
******
RMSE: -0.840 (0.072)
MAE: -0.654 (0.056)
MAPE: -0.015 (0.001)
R2: 0.971 (0.004)


******
[TEST]
******
RMSE: -1.081 (0.031)
MAE: -0.847 (0.036)
MAPE: -0.020 (0.001)
R2: 0.954 (0.004)




<IPython.core.display.Javascript object>

In [25]:
# Record this experiment as a row of the results table.
results_dict_copy = results_dict.copy()
results_dict_copy["Cross Validation"] = "Time Series Split"
# Serialize the parameters actually used, so the saved metadata cannot drift
# from the code (currently yields '{"N_Splits": 5, "Repeats": 1, "Gap": 0}').
results_dict_copy["Cross Validation Params"] = json.dumps(
    {"N_Splits": n_splits, "Repeats": 1, "Gap": gap}
)
results_dict_copy["Data Shape"] = x.shape
df_results = fill_results_dict(results_dict_copy, scores)
results_to_save.append(df_results)

<IPython.core.display.Javascript object>

In [26]:
# Summary of the test metrics per cross-validation scheme so far; the lambda
# computes the population std (ddof=0) and its "<lambda_0>" column is renamed "std".
pd.concat(results_to_save).groupby(["Features", "Model", "Cross Validation"])[
    ["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]
].agg(
    ["mean", lambda series: pd.Series(series.std(ddof=0), name="std")]
).reset_index().rename(
    columns={"<lambda_0>": "std"}
)

Unnamed: 0_level_0,Features,Model,Cross Validation,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical + Mineralogical + Physical,Linear Regression,Blocking Time Series Split,1.080901,0.098729,0.858103,0.061362,0.019927,0.001509,0.950736,0.01111
1,Chemical + Mineralogical + Physical,Linear Regression,Repeated KFold,0.976822,0.036221,0.756527,0.033071,0.017623,0.000795,0.962264,0.003015
2,Chemical + Mineralogical + Physical,Linear Regression,Time Series Split,1.081374,0.030586,0.847352,0.035983,0.019657,0.000929,0.954137,0.003698


<IPython.core.display.Javascript object>

## 1.4. Out of time Split Cross Validation

<b>Dataset shape:</b> (1234, 37)<br>
<b>Train size: 80%</b><br>
<b>Test  size: 20%</b>


<b>Splits:</b> 2<br>    
    1. Train: 987
    2. Test: 247
<b>Total:</b> 1 model<br>

In [27]:
test_size = 0.2

# shuffle=False keeps the row order: the first 80% of the rows train the model
# and the last 20% test it (random_state has no effect without shuffling).
# NOTE(review): this is an out-of-time split only if the rows are in
# chronological order -- confirm against the dataset.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=test_size, random_state=SEED, shuffle=False
)
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)

# This pipeline is fitted directly (not cloned by a CV helper); the coefficient
# cells further down inspect this fitted instance.
pipeline.fit(x_train, y_train)

y_train_pred = pipeline.predict(x_train)
y_test_pred = pipeline.predict(x_test)

scores = score_regression_metrics(y_train, y_train_pred, y_test, y_test_pred)
print_scores(scores, METRICS, METRICS_DICT)

******
[TRAIN]
******
RMSE: 0.915 (0.000)
MAE: 0.713 (0.000)
MAPE: 0.017 (0.000)
R2: 0.967 (0.000)


******
[TEST]
******
RMSE: 1.065 (0.000)
MAE: 0.836 (0.000)
MAPE: 0.019 (0.000)
R2: 0.954 (0.000)




<IPython.core.display.Javascript object>

In [28]:
# Record this experiment as a row of the results table.
results_dict_copy = results_dict.copy()
results_dict_copy["Cross Validation"] = "Out of time Split"
# Serialize the parameter actually used, so the saved metadata cannot drift
# from the code (currently yields '{"Test Size": 0.2}').
results_dict_copy["Cross Validation Params"] = json.dumps({"Test Size": test_size})
results_dict_copy["Data Shape"] = x.shape
# A single split yields scalar metrics; wrap each one in a list so it has the
# same shape as the per-fold score lists of the other schemes.
df_results = fill_results_dict(
    results_dict_copy, {key: [value] for key, value in scores.items()}
)
results_to_save.append(df_results)

<IPython.core.display.Javascript object>

In [29]:
# Summary of the test metrics per cross-validation scheme so far; the lambda
# computes the population std (ddof=0) and its "<lambda_0>" column is renamed "std".
pd.concat(results_to_save).groupby(["Features", "Model", "Cross Validation"])[
    ["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]
].agg(
    ["mean", lambda series: pd.Series(series.std(ddof=0), name="std")]
).reset_index().rename(
    columns={"<lambda_0>": "std"}
)

Unnamed: 0_level_0,Features,Model,Cross Validation,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical + Mineralogical + Physical,Linear Regression,Blocking Time Series Split,1.080901,0.098729,0.858103,0.061362,0.019927,0.001509,0.950736,0.01111
1,Chemical + Mineralogical + Physical,Linear Regression,Out of time Split,1.065179,0.0,0.835774,0.0,0.019295,0.0,0.954163,0.0
2,Chemical + Mineralogical + Physical,Linear Regression,Repeated KFold,0.976822,0.036221,0.756527,0.033071,0.017623,0.000795,0.962264,0.003015
3,Chemical + Mineralogical + Physical,Linear Regression,Time Series Split,1.081374,0.030586,0.847352,0.035983,0.019657,0.000929,0.954137,0.003698


<IPython.core.display.Javascript object>

## 1.5 Blocking Time Series - Period Specific

### 1.5.1 Train 3 months and predicts one week every year

In [30]:
# `dates` is passed, so both periods are counted in unique dates.
# Only the first repetition is reported below, so a single repetition is
# enough (the sibling experiments also use repeats=1).
scores = repeated_blocking_time_series(
    LinearRegression,
    StandardScaler,
    SimpleImputer,
    x,
    y,
    train_period=90,
    test_period=7,
    repeats=1,
    imputer_params={"strategy": "median"},
    dates=dates,
)
print_scores(scores[0], METRICS, METRICS_DICT)

# results_dict_copy = results_dict.copy()
# results_dict_copy["Cross Validation"] = "Blocking Time Series"
# results_dict_copy[
#     "Cross Validation Params"
# ] = '{"Train Period": 90, "Test Period": 7, "Method": "Dates"}'
# results_dict_copy["Data Shape"] = x.shape
# df_results = fill_results_dict(results_dict_copy, scores[0])
# results_to_save.append(df_results)

******
[TRAIN]
******
RMSE: 0.758 (0.057)
MAE: 0.595 (0.054)
MAPE: 0.014 (0.001)
R2: 0.978 (0.005)


******
[TEST]
******
RMSE: 1.013 (0.148)
MAE: 0.795 (0.124)
MAPE: 0.018 (0.003)
R2: 0.957 (0.013)




<IPython.core.display.Javascript object>

### 1.5.2 Train 3 months and predicts one month every year

In [31]:
# `dates` is passed, so both periods are counted in unique dates:
# 90 dates of training followed by 30 dates of testing per block.
scores = repeated_blocking_time_series(
    LinearRegression,
    StandardScaler,
    SimpleImputer,
    x,
    y,
    train_period=90,
    test_period=30,
    repeats=1,
    imputer_params={"strategy": "median"},
    dates=dates,
)
# print_scores(scores[0], METRICS, METRICS_DICT)

# results_dict_copy = results_dict.copy()
# results_dict_copy["Cross Validation"] = "Blocking Time Series"
# results_dict_copy[
#     "Cross Validation Params"
# ] = '{"Train Period": 90, "Test Period": 30, "Method": "Dates"}'
# results_dict_copy["Data Shape"] = x.shape
# df_results = fill_results_dict(results_dict_copy, scores[0])
# results_to_save.append(df_results)

<IPython.core.display.Javascript object>

### 1.5.3 Train 6 months and predicts one week every year

In [32]:
# `dates` is passed, so both periods are counted in unique dates:
# 180 dates of training followed by 7 dates of testing per block.
scores = repeated_blocking_time_series(
    LinearRegression,
    StandardScaler,
    SimpleImputer,
    x,
    y,
    train_period=180,
    test_period=7,
    repeats=1,
    imputer_params={"strategy": "median"},
    dates=dates,
)
print_scores(scores[0], METRICS, METRICS_DICT)

# results_dict_copy = results_dict.copy()
# results_dict_copy["Cross Validation"] = "Blocking Time Series"
# results_dict_copy[
#     "Cross Validation Params"
# ] = '{"Train Period": 180, "Test Period": 7, "Method": "Dates"}'
# results_dict_copy["Data Shape"] = x.shape
# df_results = fill_results_dict(results_dict_copy, scores[0])
# results_to_save.append(df_results)

******
[TRAIN]
******
RMSE: 0.849 (0.056)
MAE: 0.669 (0.043)
MAPE: 0.016 (0.001)
R2: 0.972 (0.005)


******
[TEST]
******
RMSE: 0.872 (0.092)
MAE: 0.713 (0.096)
MAPE: 0.017 (0.002)
R2: 0.953 (0.024)




<IPython.core.display.Javascript object>

### 1.5.4 Train 6 months and predicts one month every year

In [33]:
# `dates` is passed, so both periods are counted in unique dates:
# 180 dates of training followed by 30 dates of testing per block.
scores = repeated_blocking_time_series(
    LinearRegression,
    StandardScaler,
    SimpleImputer,
    x,
    y,
    train_period=180,
    test_period=30,
    repeats=1,
    imputer_params={"strategy": "median"},
    dates=dates,
)
print_scores(scores[0], METRICS, METRICS_DICT)

# results_dict_copy = results_dict.copy()
# results_dict_copy["Cross Validation"] = "Blocking Time Series"
# results_dict_copy[
#     "Cross Validation Params"
# ] = '{"Train Period": 180, "Test Period": 30, "Method": "Dates"}'
# results_dict_copy["Data Shape"] = x.shape
# df_results = fill_results_dict(results_dict_copy, scores[0])
# results_to_save.append(df_results)

******
[TRAIN]
******
RMSE: 0.872 (0.046)
MAE: 0.686 (0.038)
MAPE: 0.016 (0.001)
R2: 0.970 (0.003)


******
[TEST]
******
RMSE: 1.167 (0.145)
MAE: 0.895 (0.137)
MAPE: 0.021 (0.004)
R2: 0.943 (0.020)




<IPython.core.display.Javascript object>

### 1.5.5 Train one year and predicts one month every year

In [34]:
# No `dates` argument: the periods are counted in rows (samples), not in
# dates -- 365 training rows followed by 30 test rows per block.
scores = repeated_blocking_time_series(
    LinearRegression,
    StandardScaler,
    SimpleImputer,
    x,
    y,
    train_period=365,
    test_period=30,
    repeats=1,
    imputer_params={"strategy": "median"},
)
print_scores(scores[0], METRICS, METRICS_DICT)

# results_dict_copy = results_dict.copy()
# results_dict_copy["Cross Validation"] = "Blocking Time Series"
# results_dict_copy[
#     "Cross Validation Params"
# ] = '{"Train Period": 365, "Test Period": 30, "Method": "Points"}'
# results_dict_copy["Data Shape"] = x.shape
# df_results = fill_results_dict(results_dict_copy, scores[0])
# results_to_save.append(df_results)

******
[TRAIN]
******
RMSE: 0.850 (0.035)
MAE: 0.670 (0.034)
MAPE: 0.016 (0.001)
R2: 0.971 (0.002)


******
[TEST]
******
RMSE: 1.242 (0.252)
MAE: 0.930 (0.129)
MAPE: 0.022 (0.003)
R2: 0.940 (0.014)




<IPython.core.display.Javascript object>

### 1.5.6 Train 1 year and predicts three months every year

In [35]:
# No `dates` argument: the periods are counted in rows (samples), not in
# dates -- 365 training rows followed by 90 test rows per block.
scores = repeated_blocking_time_series(
    LinearRegression,
    StandardScaler,
    SimpleImputer,
    x,
    y,
    train_period=365,
    test_period=90,
    repeats=1,
    imputer_params={"strategy": "median"},
)
# print_scores(scores[0], METRICS, METRICS_DICT)
# results_dict_copy = results_dict.copy()
# results_dict_copy["Cross Validation"] = "Blocking Time Series"
# results_dict_copy[
#     "Cross Validation Params"
# ] = '{"Train Period": 365, "Test Period": 90, "Method": "Points"}'
# results_dict_copy["Data Shape"] = x.shape
# df_results = fill_results_dict(results_dict_copy, scores[0])
# results_to_save.append(df_results)

<IPython.core.display.Javascript object>

### 1.5.7 Train 1 year and six months and predicts one month every year

In [36]:
# No `dates` argument: the periods are counted in rows (samples), not in
# dates -- 545 training rows followed by 30 test rows per block.
scores = repeated_blocking_time_series(
    LinearRegression,
    StandardScaler,
    SimpleImputer,
    x,
    y,
    train_period=545,
    test_period=30,
    repeats=1,
    imputer_params={"strategy": "median"},
)
print_scores(scores[0], METRICS, METRICS_DICT)

# results_dict_copy = results_dict.copy()
# results_dict_copy["Cross Validation"] = "Blocking Time Series"
# results_dict_copy[
#     "Cross Validation Params"
# ] = '{"Train Period": 545, "Test Period": 30, "Method": "Points"}'
# results_dict_copy["Data Shape"] = x.shape
# df_results = fill_results_dict(results_dict_copy, scores[0])
# results_to_save.append(df_results)

******
[TRAIN]
******
RMSE: 0.873 (0.027)
MAE: 0.681 (0.028)
MAPE: 0.016 (0.001)
R2: 0.970 (0.000)


******
[TEST]
******
RMSE: 1.012 (0.006)
MAE: 0.838 (0.003)
MAPE: 0.019 (0.000)
R2: 0.957 (0.001)




<IPython.core.display.Javascript object>

### 1.5.8 Train 1 year and six months and predicts two months every year

In [37]:
# No `dates` argument: the periods are counted in rows (samples), not in
# dates -- 545 training rows followed by 60 test rows per block.
scores = repeated_blocking_time_series(
    LinearRegression,
    StandardScaler,
    SimpleImputer,
    x,
    y,
    train_period=545,
    test_period=60,
    repeats=1,
    imputer_params={"strategy": "median"},
)
print_scores(scores[0], METRICS, METRICS_DICT)

# results_dict_copy = results_dict.copy()
# results_dict_copy["Cross Validation"] = "Blocking Time Series"
# results_dict_copy[
#     "Cross Validation Params"
# ] = '{"Train Period": 545, "Test Period": 60, "Method": "Points"}'
# results_dict_copy["Data Shape"] = x.shape
# df_results = fill_results_dict(results_dict_copy, scores[0])
# results_to_save.append(df_results)

******
[TRAIN]
******
RMSE: 0.873 (0.028)
MAE: 0.684 (0.030)
MAPE: 0.016 (0.001)
R2: 0.970 (0.000)


******
[TEST]
******
RMSE: 1.131 (0.160)
MAE: 0.912 (0.124)
MAPE: 0.022 (0.003)
R2: 0.944 (0.016)




<IPython.core.display.Javascript object>

In [38]:
# Coefficients of the pipeline fitted in the out-of-time split section (the
# blocked experiments above build their own models and do not touch `pipeline`).
# The estimator sits after the StandardScaler step, so these apply to scaled features.
pipeline.named_steps["estimator"].coef_

array([ 1.05631171e-02, -1.93030073e-01, -8.05123790e-02,  1.84922397e-01,
       -2.30710150e-01,  1.32149765e-02, -3.74914558e-02,  1.87370667e-01,
       -4.29424664e-02,  4.24676206e-02, -1.07875171e-01,  1.09664969e-01,
       -9.35198315e-02, -1.29303909e-01,  3.05851559e-01,  5.35168089e-01,
       -3.56343933e-01, -1.30829310e-01, -3.93450626e-03, -8.16338264e-02,
       -7.92129844e-02,  2.00241958e-01,  1.17794299e-01, -1.22566637e-02,
        2.27392325e-01,  5.11618441e-04,  1.03172066e-01,  3.35933827e-02,
        1.68433167e-01, -9.21143676e-02, -8.19429964e-02,  1.56762091e-02,
        6.44898364e-02,  1.51598548e+00,  2.82292060e+00,  1.75447932e-01,
       -7.48179066e-01])

<IPython.core.display.Javascript object>

In [39]:
# Intercept of the same fitted linear model.
pipeline.named_steps["estimator"].intercept_

43.398173464561

<IPython.core.display.Javascript object>

In [40]:
# Names of the features whose coefficient is exactly zero.
coeff = pipeline.named_steps["estimator"].coef_
np.array(x.columns)[coeff == 0]

array([], dtype=object)

<IPython.core.display.Javascript object>

In [41]:
# Features that remain after dropping those with a zero coefficient.
x.drop(np.array(x.columns)[coeff == 0], axis=1).columns

Index(['CaO', 'MgO', 'Na2O', 'Al2O3', 'SiO2', 'SO3', 'K2O', 'Fe2O3',
       'Loss on Ignition', 'Insoluble Residue', 'Total C3S', 'Alpha C2S',
       'Beta C2S', 'Gamma C2S', 'C4AF', 'C3A', 'Cubic C3A', 'Orthorhombic C3A',
       'Free CaO', 'Portlandite', 'Periclase', 'Arcanite', 'Aphthitalite',
       'Gypsum', 'Bassanite', 'Anhydrite', 'Calcite', 'Dolomite', 'Quartz',
       'Blaine', 'Initial setting time', 'Final setting time', 'Density',
       'CS3', 'CS7', '#200', '#325'],
      dtype='object')

<IPython.core.display.Javascript object>

In [42]:
# Number of features with a non-zero coefficient.
x.drop(np.array(x.columns)[coeff == 0], axis=1).columns.shape

(37,)

<IPython.core.display.Javascript object>

In [43]:
# Total number of features, for comparison with the count above.
x.columns.shape

(37,)

<IPython.core.display.Javascript object>

In [44]:
# One-row frame mapping each feature name to its coefficient
# (coefficients are paired with x.columns by position).
coeffs = pd.DataFrame(
    {col: [c] for col, c in zip(x.columns, coeff)}, index=["Coefficients"]
)

<IPython.core.display.Javascript object>

In [45]:
# Features ranked from the largest to the smallest coefficient; the green
# shading is mapped over the fixed range vmin=1 .. vmax=5.
coeffs.T["Coefficients"].sort_values(ascending=False).to_frame(
    name="Coefficients"
).style.background_gradient(axis=None, vmin=1, vmax=5, cmap="Greens")

Unnamed: 0,Coefficients
CS7,2.822921
CS3,1.515985
C3A,0.535168
C4AF,0.305852
Bassanite,0.227392
Arcanite,0.200242
Fe2O3,0.187371
Al2O3,0.184922
#200,0.175448
Quartz,0.168433


<IPython.core.display.Javascript object>

In [46]:
# Mean and population standard deviation (ddof=0) of each test metric for
# every feature set / model / cross-validation configuration.
# The lambda's result column comes back labelled "<lambda_0>", hence the rename.
(
    pd.concat(results_to_save)
    .groupby(["Features", "Model", "Cross Validation", "Cross Validation Params"])[
        ["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]
    ]
    .agg(["mean", lambda values: pd.Series(values.std(ddof=0), name="std")])
    .reset_index()
    .rename(columns={"<lambda_0>": "std"})
)

Unnamed: 0_level_0,Features,Model,Cross Validation,Cross Validation Params,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical + Mineralogical + Physical,Linear Regression,Blocking Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""train_size"": 0.8}",1.080901,0.098729,0.858103,0.061362,0.019927,0.001509,0.950736,0.01111
1,Chemical + Mineralogical + Physical,Linear Regression,Out of time Split,"{""Test Size"": 0.2}",1.065179,0.0,0.835774,0.0,0.019295,0.0,0.954163,0.0
2,Chemical + Mineralogical + Physical,Linear Regression,Repeated KFold,"{""N_Splits"": 5, ""Repeats"": 3}",0.976822,0.036221,0.756527,0.033071,0.017623,0.000795,0.962264,0.003015
3,Chemical + Mineralogical + Physical,Linear Regression,Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""Gap"": 0}",1.081374,0.030586,0.847352,0.035983,0.019657,0.000929,0.954137,0.003698


<IPython.core.display.Javascript object>

# Saving the results DataFrame

## Saving the full dataframe

In [47]:
# Write the concatenated results to CSV (header row, no index column),
# overwriting any existing file at that location.
path = "../../../../../../reports/results/local_models/partner_i-oficial/all_cements/full/"
filename = "linear_regression_results_full_7.csv"

pd.concat(results_to_save).to_csv(
    f"{path}{filename}", mode="w", index=False, header=True
)

<IPython.core.display.Javascript object>

## Saving the grouped dataframe

In [48]:
# Columns that together identify one experiment configuration.
cols_groupby = [
    "Category",
    "Company",
    "Data Shape",
    "Timesteps",
    "Features",
    "Model",
    "Cross Validation",
    "Cross Validation Params",
]

# Train metrics first, then test metrics: the columns to aggregate.
cols_agg = [
    "RMSE Train",
    "MAE Train",
    "MAPE Train",
    "R2 Train",
    "RMSE Test",
    "MAE Test",
    "MAPE Test",
    "R2 Test",
]

# Mean and population standard deviation (ddof=0) of every metric per
# configuration. dropna=False keeps groups whose key columns hold NaN.
# The lambda's result column comes back labelled "<lambda_0>", hence the rename.
df_results_to_save = (
    pd.concat(results_to_save)
    .groupby(cols_groupby, dropna=False)[cols_agg]
    .agg(["mean", lambda values: pd.Series(values.std(ddof=0), name="std")])
    .reset_index()
    .rename(columns={"<lambda_0>": "std"})
)

# Destination of the grouped summary; an existing file is overwritten.
path = "../../../../../../reports/results/local_models/partner_i-oficial/all_cements/grouped/"
filename = "linear_regression_results_grouped_7.csv"

df_results_to_save.to_csv(
    f"{path}{filename}", mode="w", index=False, header=True
)

<IPython.core.display.Javascript object>

In [49]:
# Display the aggregated (mean / std) results frame that was written to CSV.
df_results_to_save

Unnamed: 0_level_0,Category,Company,Data Shape,Timesteps,Features,Model,Cross Validation,Cross Validation Params,RMSE Train,RMSE Train,...,R2 Train,R2 Train,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
0,Local Model,partner_i,"(1226, 37)",,Chemical + Mineralogical + Physical,Linear Regression,Blocking Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""train_size"": 0.8}",0.805031,0.046589,...,0.974466,0.003214,1.080901,0.098729,0.858103,0.061362,0.019927,0.001509,0.950736,0.01111
1,Local Model,partner_i,"(1226, 37)",,Chemical + Mineralogical + Physical,Linear Regression,Out of time Split,"{""Test Size"": 0.2}",0.915126,0.0,...,0.967235,0.0,1.065179,0.0,0.835774,0.0,0.019295,0.0,0.954163,0.0
2,Local Model,partner_i,"(1226, 37)",,Chemical + Mineralogical + Physical,Linear Regression,Repeated KFold,"{""N_Splits"": 5, ""Repeats"": 3}",0.935457,0.008227,...,0.965605,0.00063,0.976822,0.036221,0.756527,0.033071,0.017623,0.000795,0.962264,0.003015
3,Local Model,partner_i,"(1226, 37)",,Chemical + Mineralogical + Physical,Linear Regression,Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""Gap"": 0}",0.840417,0.07191,...,0.97098,0.004039,1.081374,0.030586,0.847352,0.035983,0.019657,0.000929,0.954137,0.003698


<IPython.core.display.Javascript object>

In [50]:
# Load a saved grouped-results CSV using its two header rows, then blank out
# any second-level column label that contains "Unnamed".
# NOTE(review): this reads `..._grouped_1.csv`, whereas the grouped results
# saved earlier in this notebook go to `..._grouped_7.csv` — confirm that
# inspecting the other file is intentional.
c = pd.read_csv(
    "../../../../../../reports/results/local_models/partner_i-oficial/all_cements/grouped/linear_regression_results_grouped_1.csv",
    header=[0, 1],
).rename(columns=lambda label: label if "Unnamed" not in label else "", level=1)

<IPython.core.display.Javascript object>

In [51]:
# Column dtypes and non-null counts of the frame read back from CSV.
c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   (Category, )                 4 non-null      object 
 1   (Company, )                  4 non-null      object 
 2   (Data Shape, )               4 non-null      object 
 3   (Timesteps, )                0 non-null      float64
 4   (Features, )                 4 non-null      object 
 5   (Model, )                    4 non-null      object 
 6   (Cross Validation, )         4 non-null      object 
 7   (Cross Validation Params, )  4 non-null      object 
 8   (RMSE Train, mean)           4 non-null      float64
 9   (RMSE Train, std)            4 non-null      float64
 10  (MAE Train, mean)            4 non-null      float64
 11  (MAE Train, std)             4 non-null      float64
 12  (MAPE Train, mean)           4 non-null      float64
 13  (MAPE Train, std)       

<IPython.core.display.Javascript object>

In [52]:
# Same summary for the in-memory aggregated results
# (presumably to compare its layout with `c.info()` — confirm).
df_results_to_save.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   (Category, )                 4 non-null      object 
 1   (Company, )                  4 non-null      object 
 2   (Data Shape, )               4 non-null      object 
 3   (Timesteps, )                0 non-null      float64
 4   (Features, )                 4 non-null      object 
 5   (Model, )                    4 non-null      object 
 6   (Cross Validation, )         4 non-null      object 
 7   (Cross Validation Params, )  4 non-null      object 
 8   (RMSE Train, mean)           4 non-null      float64
 9   (RMSE Train, std)            4 non-null      float64
 10  (MAE Train, mean)            4 non-null      float64
 11  (MAE Train, std)             4 non-null      float64
 12  (MAPE Train, mean)           4 non-null      float64
 13  (MAPE Train, std)       

<IPython.core.display.Javascript object>

In [53]:
# Rows whose "Cross Validation Params" text contains "Date".
df_results_to_save.loc[
    df_results_to_save["Cross Validation Params"].str.contains("Date")
]

Unnamed: 0_level_0,Category,Company,Data Shape,Timesteps,Features,Model,Cross Validation,Cross Validation Params,RMSE Train,RMSE Train,...,R2 Train,R2 Train,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std


<IPython.core.display.Javascript object>

In [54]:
# Display the grouped results loaded from `linear_regression_results_grouped_1.csv`.
c

Unnamed: 0_level_0,Category,Company,Data Shape,Timesteps,Features,Model,Cross Validation,Cross Validation Params,RMSE Train,RMSE Train,...,R2 Train,R2 Train,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
0,Local Model,partner_i,"(1234, 42)",,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Blocking Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""train_size"": 0.8}",0.700636,0.049022,...,0.980729,0.001604,0.949926,0.143246,0.733407,0.106088,0.017101,0.002711,0.96104,0.015278
1,Local Model,partner_i,"(1234, 42)",,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Out of time Split,"{""Test Size"": 0.2}",0.815584,0.0,...,0.973914,0.0,1.212233,0.0,0.900702,0.0,0.020311,0.0,0.9405,0.0
2,Local Model,partner_i,"(1234, 42)",,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Repeated KFold,"{""N_Splits"": 5, ""Repeats"": 3}",0.855466,0.005967,...,0.971171,0.000459,0.965048,0.217141,0.718911,0.04166,0.016602,0.000845,0.96172,0.020438
3,Local Model,partner_i,"(1234, 42)",,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""Gap"": 0}",0.738757,0.058813,...,0.977559,0.002862,1.158192,0.309342,0.807439,0.128733,0.018491,0.002644,0.945201,0.026837


<IPython.core.display.Javascript object>