In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Database Reading and Manipulation
import pandas as pd

# Linear Algebra
import numpy as np

# Plotting
import matplotlib.pyplot as plt

# Processing results
import json

# Model Selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

# Custom modules
## Model selection
from src.cross_validation.blocking_time_series_split import BlockingTimeSeriesSplit

## Function to print scores
from src.utils.print_scores import print_scores

## Function to calculate score regression metrics
from src.utils.score_regression_metrics import score_regression_metrics

## Function to fill the results metric dict
from src.utils.fill_results_dict import fill_results_dict

# Modeling
from sklearn.linear_model import LinearRegression

# Preprocessing - Data standardization
from sklearn.preprocessing import StandardScaler

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

# Pipeline
from sklearn.pipeline import Pipeline

# Data imputation
from sklearn.impute import SimpleImputer

<IPython.core.display.Javascript object>

# Functions and definitions

## Helper functions for blocked time series cross validation

### Train test split

In [3]:
def split_by_periods(x, y, train_period, test_period):
    datasets = []
    i = 0
    max_samples = x.shape[0]

    for _ in range(0, max_samples // (train_period + test_period)):
        # Splitting the data into train/test sets
        x_train = x[i : i + train_period].copy()
        y_train = y[i : i + train_period].copy()
        x_test = x[i + train_period : i + train_period + test_period].copy()
        y_test = y[i + train_period : i + train_period + test_period].copy()

        datasets.append(
            {
                "x_train": x_train,
                "y_train": y_train,
                "x_test": x_test,
                "y_test": y_test,
            }
        )
        # Increments the index for the next period of time
        i += train_period + test_period

    return datasets

<IPython.core.display.Javascript object>

In [4]:
def split_by_dates(x, y, train_period, test_period, dates):
    datasets = []
    dates = dates[: x.shape[0]].copy()
    dates_unique = dates.copy().unique()
    i = 0
    max_samples = dates_unique.shape[0]

    for _ in range(0, max_samples // (train_period + test_period)):
        # Splitting the Train Set
        start_date_train = dates_unique[i]
        end_date_train = dates_unique[i + train_period]
        idx_train = dates[(dates >= start_date_train) & (dates < end_date_train)].index
        x_train = x.loc[idx_train].copy()
        y_train = y[idx_train].copy()

        # Splitting the Test Set
        start_date_test = dates_unique[i + train_period]
        end_date_test = dates_unique[i + train_period + test_period]
        idx_test = dates[(dates >= start_date_test) & (dates < end_date_test)].index
        x_test = x.loc[idx_test].copy()
        y_test = y[idx_test].copy()

        datasets.append(
            {
                "x_train": x_train,
                "y_train": y_train,
                "x_test": x_test,
                "y_test": y_test,
            }
        )

        i += train_period + test_period

    return datasets

<IPython.core.display.Javascript object>

In [5]:
def train_test_split_blocked_ts(x, y, train_period, test_period, dates=None):
    """
    Split the input data into train-test datasets based on train and test periods.

    Args:
        x (pd.DataFrame): Input features.
        y (np.Array): Target values.
        train_period (int): Length of the training period.
        test_period (int): Length of the testing period.
        dates (pd.Series): Optional date information.

    Returns:
        List[dict]: A list of dictionaries, each containing 'x_train', 'y_train', 'x_test', and 'y_test'.
    """
    if dates is None:
        return split_by_periods(x, y, train_period, test_period)
    else:
        return split_by_dates(x, y, train_period, test_period, dates)

<IPython.core.display.Javascript object>

### Data preprocessing

In [6]:
def impute_data(dataset, imputer=None, imputer_params=None):
    x_train = dataset["x_train"]
    x_test = dataset["x_test"]

    # Apply imputation to the data
    if imputer is not None:
        imputer = imputer() if imputer_params is None else imputer(**imputer_params)
        x_train = imputer.fit_transform(x_train)
        x_test = imputer.transform(x_test)

    dataset["x_train"] = x_train
    dataset["x_test"] = x_test

    return dataset

<IPython.core.display.Javascript object>

In [7]:
def transform_data(dataset, transformer=None):
    x_train = dataset["x_train"]
    x_test = dataset["x_test"]

    # Apply data normalization/standardization to the data
    if transformer is not None:
        scaler = transformer()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

    dataset["x_train"] = x_train
    dataset["x_test"] = x_test

    return dataset

<IPython.core.display.Javascript object>

In [8]:
def preprocess_data(dataset, transformer=None, imputer=None, imputer_params=None):
    dataset = impute_data(dataset, imputer, imputer_params)
    dataset = transform_data(dataset, transformer)
    return dataset

<IPython.core.display.Javascript object>

### Train and evaluate the model

In [9]:
def train_and_evaluate_model(Estimator, dataset, estimator_params=None):
    """
    Purpose: Helper function to be used in conjunction with
    blocked time_series cross validation function
    """
    x_train = dataset["x_train"]
    y_train = dataset["y_train"]
    x_test = dataset["x_test"]
    y_test = dataset["y_test"]

    # Instantiate the model
    model = Estimator() if estimator_params is None else Estimator(**estimator_params)

    # Fitting the model
    model.fit(x_train, y_train)

    # Making predictions on train/test sets
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Return regression metrics
    return score_regression_metrics(y_train, y_train_pred, y_test, y_test_pred)

<IPython.core.display.Javascript object>

### Blocking time series cross validation

In [10]:
def repeated_blocking_time_series(
    Estimator,
    Transform,
    Imputer,
    x,
    y,
    train_period,
    test_period,
    dates=None,
    repeats=10,
    estimator_params=None,
    imputer_params=None,
):
    """
    Perform repeated cross-validation with blocked time series data.

    Args:
        Estimator: Machine learning model class.
        Transform: Data transformation method.
        Imputer: Data imputation method.
        x: Input features.
        y: Target values.
        train_period: Length of the training period.
        test_period: Length of the testing period.
        dates: Optional date information.
        repeats: Number of repetitions.
        estimator_params: Parameters for the model.
        imputer_params: Parameters for data imputation.

    Returns:
        list: List of dictionaries containing evaluation metrics for each repetition.
    """

    results = []
    max_samples = x.shape[0]

    # Splitting the data into train/test sets
    datasets = train_test_split_blocked_ts(x, y, train_period, test_period, dates)

    for _ in range(repeats):
        scores = []

        for dataset in datasets:
            dataset = preprocess_data(dataset, Transform, Imputer, imputer_params)
            score = train_and_evaluate_model(Estimator, dataset, estimator_params)
            scores.append(score)

        # After every iteration metrics results are appended together
        scores_final = {key: [] for key, _ in scores[0].items()}
        for scores_dict in scores:
            for key, value in scores_dict.items():
                scores_final[key] += [value]
        results.append(scores_final)
    return results

<IPython.core.display.Javascript object>

In [11]:
SEED = 47
METRICS = (
    "neg_root_mean_squared_error",
    "neg_mean_absolute_error",
    "neg_mean_absolute_percentage_error",
    "r2",
)
METRICS_DICT = {
    "neg_root_mean_squared_error": "RMSE",
    "neg_mean_absolute_error": "MAE",
    "neg_mean_absolute_percentage_error": "MAPE",
    "r2": "R2",
}
CEM_TYPE = ["cem_type_CEM B", "cem_type_CEM C"]

<IPython.core.display.Javascript object>

## Defining a dataframe structure to save the results

In [12]:
results_to_save = []

results_dict = {
    "Category": "Local Model",
    "Company": "partner_iv",
    "Features": "Chemical + Mineralogical + Physical + One-Hot",
    "Data Shape": None,
    "Timesteps": None,
    "Model": "Linear Regression",
    "Model Params": None,
    "Scaler": "Standard Scaler",
    "Scaler Params": None,
    "Imputer": "Median",
    "Imputer Params": None,
    "Cross Validation": None,
    "Cross Validation Params": np.nan,
    "RMSE Train": np.nan,
    "MAE Train": np.nan,
    "MAPE Train": np.nan,
    "R2 Train": np.nan,
    "RMSE Test": np.nan,
    "MAE Test": np.nan,
    "MAPE Test": np.nan,
    "R2 Test": np.nan,
}

<IPython.core.display.Javascript object>

# Reading the dataset

In [13]:
df = pd.read_csv("../../../../../data/processed/partner_iv/cement-shipping.csv")

<IPython.core.display.Javascript object>

## Defining Features

In this set of experiments we use all available features

In [14]:
df_copy = df.copy()
df_copy[CEM_TYPE] = df_copy[CEM_TYPE].astype(int)

<IPython.core.display.Javascript object>

# 1. Linear Regression

<h2>1. Dataset: df_copy</h2> <br>In this dataset all features are used.

In [15]:
y = df_copy.pop("CS28").values
x = df_copy.drop(["Date"], axis=1)
dates = df["Date"].copy()

<IPython.core.display.Javascript object>

## 1.1 Repeated KFold Cross validation

<b>Dataset shape:</b> (1234, 37)<br>
<b>Repeats:</b>10<br>
<b>Splits:</b>5<br>
    1. 5 folds of 246 samples each
    2. 80% train (988 samples each fold)
    3. 20% test (246 samples each fold)
<b>Total:</b> 15 models<br>

In [16]:
repeats = 3
n_splits = 5
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)
cv = RepeatedKFold(n_splits=n_splits, n_repeats=repeats, random_state=SEED)
scores = cross_validate(
    pipeline,
    x,
    y,
    scoring=METRICS,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)
print("Repeated Cross Validation:")
print(f"Repeats: {repeats}")
print(f"n_splits: {n_splits}")
print()
print_scores(scores, METRICS, METRICS_DICT)

Repeated Cross Validation:
Repeats: 3
n_splits: 5

******
[TRAIN]
******
RMSE: -2.069 (0.018)
MAE: -1.615 (0.017)
MAPE: -0.033 (0.000)
R2: 0.902 (0.002)


******
[TEST]
******
RMSE: -2.182 (0.084)
MAE: -1.704 (0.059)
MAPE: -0.035 (0.001)
R2: 0.890 (0.008)




<IPython.core.display.Javascript object>

In [17]:
results_dict_copy = results_dict.copy()
results_dict_copy["Cross Validation"] = "Repeated KFold"
results_dict_copy["Cross Validation Params"] = '{"N_Splits": 5, "Repeats": 3}'
results_dict_copy["Data Shape"] = x.shape
df_results = fill_results_dict(results_dict_copy, scores)
results_to_save.append(df_results)

<IPython.core.display.Javascript object>

In [18]:
df_results.groupby(["Features", "Model", "Cross Validation"])[
    ["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]
].agg(["mean", "std"]).reset_index()

Unnamed: 0_level_0,Features,Model,Cross Validation,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Repeated KFold,2.182339,0.086562,1.704186,0.061194,0.034894,0.001336,0.890398,0.008305


<IPython.core.display.Javascript object>

## 1.2. Blocking Time Series Cross Validation

<b>Dataset shape:</b> (1234, 38)<br>
<b>Splits:</b>5<br>    
    1. 5 folds of 246 samples
    2. 80% train (987 samples each fold)
    3. 20% test (246 samples each fold)
<b>Total:</b> 5 models<br>

In [19]:
n_splits = 5
train_size = 0.8

pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)
cv = BlockingTimeSeriesSplit(n_splits=n_splits, train_size=train_size)
scores = cross_validate(
    pipeline,
    x,
    y,
    scoring=METRICS,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)
print("Blocking Time Series Split:")
print(f"Repeats: {repeats}")
print(f"n_splits: {n_splits}")
print()
print_scores(scores, METRICS, METRICS_DICT)

results_dict_copy = results_dict.copy()
results_dict_copy["Cross Validation"] = "Blocking Time Series Split"
results_dict_copy[
    "Cross Validation Params"
] = '{"N_Splits": 5, "Repeats": 1, "train_size": 0.8}'
results_dict_copy["Data Shape"] = x.shape
df_results = fill_results_dict(results_dict_copy, scores)
results_to_save.append(df_results)

Blocking Time Series Split:
Repeats: 3
n_splits: 5

******
[TRAIN]
******
RMSE: -1.573 (0.208)
MAE: -1.253 (0.154)
MAPE: -0.026 (0.003)
R2: 0.941 (0.016)


******
[TEST]
******
RMSE: -2.734 (0.904)
MAE: -2.249 (0.697)
MAPE: -0.045 (0.013)
R2: 0.785 (0.134)




<IPython.core.display.Javascript object>

In [20]:
pd.concat(results_to_save).reset_index().groupby(
    ["Features", "Model", "Cross Validation", "Timesteps"], dropna=False
)[["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]].agg(
    ["mean", lambda series: pd.Series(series.std(ddof=0), name="std")]
).reset_index().rename(
    columns={"<lambda_0>": "std"}
)

Unnamed: 0_level_0,Features,Model,Cross Validation,Timesteps,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Blocking Time Series Split,,2.733887,0.903833,2.249005,0.696663,0.045426,0.012936,0.7846,0.133717
1,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Repeated KFold,,2.182339,0.083627,1.704186,0.059119,0.034894,0.001291,0.890398,0.008023


<IPython.core.display.Javascript object>

## 1.3. Time Series Split Cross Validation

The training set has size i * n_samples // (n_splits + 1) + n_samples % (n_splits + 1) in the i th split, with a test set of size n_samples//(n_splits + 1) by default, where n_samples is the number of samples.


<b>Dataset shape:</b> (1234, 38)<br>
<b>Splits:</b>10<br>    
    1. Train: 10 folds of 114, 226, 338, 450, 562, 675, 787, 899, 1011, 1123 samples each fold
    2. Test: 112 samples each fold
<b>Total:</b> 10 models<br>

In [21]:
n_splits = 5
gap = 0
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)
cv = TimeSeriesSplit(gap=gap, max_train_size=None, n_splits=n_splits, test_size=None)

scores = cross_validate(
    pipeline,
    x,
    y,
    scoring=METRICS,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)
print("Time Series Split:")
print(f"Repeats: {repeats}")
print(f"n_splits: {n_splits}")
print()
print_scores(scores, METRICS, METRICS_DICT)

Time Series Split:
Repeats: 3
n_splits: 5

******
[TRAIN]
******
RMSE: -1.964 (0.075)
MAE: -1.531 (0.064)
MAPE: -0.031 (0.001)
R2: 0.910 (0.007)


******
[TEST]
******
RMSE: -3.066 (1.005)
MAE: -2.382 (0.738)
MAPE: -0.049 (0.017)
R2: 0.742 (0.212)




<IPython.core.display.Javascript object>

In [22]:
results_dict_copy = results_dict.copy()
results_dict_copy["Cross Validation"] = "Time Series Split"
results_dict_copy["Cross Validation Params"] = '{"N_Splits": 5, "Repeats": 1, "Gap": 0}'
results_dict_copy["Data Shape"] = x.shape
df_results = fill_results_dict(results_dict_copy, scores)
results_to_save.append(df_results)

<IPython.core.display.Javascript object>

In [23]:
pd.concat(results_to_save).groupby(["Features", "Model", "Cross Validation"])[
    ["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]
].agg(
    ["mean", lambda series: pd.Series(series.std(ddof=0), name="std")]
).reset_index().rename(
    columns={"<lambda_0>": "std"}
)

Unnamed: 0_level_0,Features,Model,Cross Validation,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Blocking Time Series Split,2.733887,0.903833,2.249005,0.696663,0.045426,0.012936,0.7846,0.133717
1,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Repeated KFold,2.182339,0.083627,1.704186,0.059119,0.034894,0.001291,0.890398,0.008023
2,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Time Series Split,3.066276,1.005002,2.382105,0.738085,0.049062,0.01655,0.741535,0.211923


<IPython.core.display.Javascript object>

## 1.4. Out of time Split Cross Validation

<b>Dataset shape:</b> (1234, 38)<br>
<b>Train size: 80%</b><br>
<b>Test  size: 20%</b>


<b>Splits:</b> 2<br>    
    1. Train: 987
    2. Test: 247
<b>Total:</b> 1 model<br>

In [24]:
test_size = 0.2

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=test_size, random_state=SEED, shuffle=False
)
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)

pipeline.fit(x_train, y_train)

y_train_pred = pipeline.predict(x_train)
y_test_pred = pipeline.predict(x_test)

scores = score_regression_metrics(y_train, y_train_pred, y_test, y_test_pred)
print_scores(scores, METRICS, METRICS_DICT)

******
[TRAIN]
******
RMSE: 2.029 (0.000)
MAE: 1.590 (0.000)
MAPE: 0.032 (0.000)
R2: 0.908 (0.000)


******
[TEST]
******
RMSE: 2.525 (0.000)
MAE: 1.956 (0.000)
MAPE: 0.040 (0.000)
R2: 0.837 (0.000)




<IPython.core.display.Javascript object>

In [25]:
results_dict_copy = results_dict.copy()
results_dict_copy["Cross Validation"] = "Out of time Split"
results_dict_copy["Cross Validation Params"] = '{"Test Size": 0.2}'
results_dict_copy["Data Shape"] = x.shape
df_results = fill_results_dict(
    results_dict_copy, {key: [value] for key, value in scores.items()}
)
results_to_save.append(df_results)

<IPython.core.display.Javascript object>

In [26]:
pd.concat(results_to_save).groupby(["Features", "Model", "Cross Validation"])[
    ["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]
].agg(
    ["mean", lambda series: pd.Series(series.std(ddof=0), name="std")]
).reset_index().rename(
    columns={"<lambda_0>": "std"}
)

Unnamed: 0_level_0,Features,Model,Cross Validation,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Blocking Time Series Split,2.733887,0.903833,2.249005,0.696663,0.045426,0.012936,0.7846,0.133717
1,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Out of time Split,2.524797,0.0,1.956191,0.0,0.03985,0.0,0.837051,0.0
2,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Repeated KFold,2.182339,0.083627,1.704186,0.059119,0.034894,0.001291,0.890398,0.008023
3,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Time Series Split,3.066276,1.005002,2.382105,0.738085,0.049062,0.01655,0.741535,0.211923


<IPython.core.display.Javascript object>

In [27]:
pipeline.named_steps["estimator"].coef_

array([ 4.09128986e-02, -6.92421701e-01, -1.57074257e-01, -9.57693620e-02,
        5.50711405e-02,  1.60188676e-01,  1.18857034e-01,  2.43278981e+00,
       -4.10373213e-01,  1.89402886e-01, -1.05226455e+01, -6.34972843e+00,
        6.28391384e+01, -1.04593240e+00,  2.13334979e-03,  1.59209071e+01,
       -1.73391751e+00, -3.09450340e+00,  8.97391973e+00,  1.07155616e+01,
        6.24687991e+00,  6.03962319e+00, -2.01060700e+00,  1.13597367e+00,
        3.37721669e+00,  5.98680124e-01,  6.11179148e-01,  2.54598330e-01,
        3.00154778e+00,  1.39755903e+00,  2.03661477e+00,  5.96778680e+01,
        3.20113877e+00, -8.15739684e-01, -6.53789298e-01, -2.64252481e+00])

<IPython.core.display.Javascript object>

In [28]:
pipeline.named_steps["estimator"].intercept_

49.22584856396868

<IPython.core.display.Javascript object>

In [29]:
coeff = pipeline.named_steps["estimator"].coef_
np.array(x.columns)[coeff == 0]

array([], dtype=object)

<IPython.core.display.Javascript object>

In [30]:
x.drop(np.array(x.columns)[coeff == 0], axis=1).columns

Index(['SO3', 'Cl-', 'Blaine', 'ph2oimm', 'Initial Setting Time', 'Soundness',
       'flow', 'CS2', 'residuo 24 micron', 'R_wp', 'Alite_M3 C3S M3',
       'Alite_M1 C3S M1', 'Alite_Sum C3S tot', 'Ratio_M1 (rapporto M1/M3)',
       'C3S_CS (taglia dei cristalliti C3S)', 'Belite_beta', 'C3A cub',
       'C3A_ortho', 'C3A tot', 'C4AF', 'CaO', 'Ca(OH)2', 'Calce libera',
       'Periclasio (MgO)', 'Quartz', 'K2SO4', 'Langbeinite – MgK2(SO4)2',
       'Aphthitalite – (K,Na)3(SO4)2', 'Gesso', 'Emiidrato', 'Anidrite',
       'Calcite – CaCO3', 'SO3_XRD', 'CO2_XRD', 'cem_type_CEM B',
       'cem_type_CEM C'],
      dtype='object')

<IPython.core.display.Javascript object>

In [31]:
x.drop(np.array(x.columns)[coeff == 0], axis=1).columns.shape

(36,)

<IPython.core.display.Javascript object>

In [32]:
x.columns.shape

(36,)

<IPython.core.display.Javascript object>

In [33]:
coeffs = pd.DataFrame(
    {col: [c] for col, c in zip(x.columns, coeff)}, index=["Coefficients"]
)

<IPython.core.display.Javascript object>

In [34]:
coeffs.T["Coefficients"].sort_values(ascending=False).to_frame(
    name="Coefficients"
).style.background_gradient(axis=None, vmin=1, vmax=5, cmap="Greens")

Unnamed: 0,Coefficients
Alite_Sum C3S tot,62.839138
Calcite – CaCO3,59.677868
Belite_beta,15.920907
C4AF,10.715562
C3A tot,8.97392
CaO,6.24688
Ca(OH)2,6.039623
Quartz,3.377217
SO3_XRD,3.201139
Gesso,3.001548


<IPython.core.display.Javascript object>

In [35]:
pd.concat(results_to_save).groupby(
    ["Features", "Model", "Cross Validation", "Cross Validation Params"]
)[["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]].agg(
    ["mean", lambda series: pd.Series(series.std(ddof=0), name="std")]
).reset_index().rename(
    columns={"<lambda_0>": "std"}
)

Unnamed: 0_level_0,Features,Model,Cross Validation,Cross Validation Params,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Blocking Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""train_size"": 0.8}",2.733887,0.903833,2.249005,0.696663,0.045426,0.012936,0.7846,0.133717
1,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Out of time Split,"{""Test Size"": 0.2}",2.524797,0.0,1.956191,0.0,0.03985,0.0,0.837051,0.0
2,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Repeated KFold,"{""N_Splits"": 5, ""Repeats"": 3}",2.182339,0.083627,1.704186,0.059119,0.034894,0.001291,0.890398,0.008023
3,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""Gap"": 0}",3.066276,1.005002,2.382105,0.738085,0.049062,0.01655,0.741535,0.211923


<IPython.core.display.Javascript object>

# Saving the results Dataframe

## Saving the full dataframe

In [36]:
path = "../../../../../reports/results/local_models/partner_iv/all_cements/full/"
filename = "linear_regression_results_full_1.csv"

pd.concat(results_to_save).to_csv(
    path_or_buf=path + filename,
    mode="w",
    index=False,
    header=True,
)

<IPython.core.display.Javascript object>

## Saving the grouped dataframe

In [37]:
cols_groupby = [
    "Category",
    "Company",
    "Data Shape",
    "Timesteps",
    "Features",
    "Model",
    "Cross Validation",
    "Cross Validation Params",
]

cols_agg = ["RMSE Train", "MAE Train", "MAPE Train", "R2 Train"] + [
    "RMSE Test",
    "MAE Test",
    "MAPE Test",
    "R2 Test",
]

path = "../../../../../reports/results/local_models/partner_iv/all_cements/grouped/"
filename = "linear_regression_results_grouped_1.csv"


df_results_to_save = (
    pd.concat(results_to_save)
    .groupby(cols_groupby, dropna=False)[cols_agg]
    .agg(["mean", lambda series: pd.Series(series.std(ddof=0), name="std")])
    .reset_index()
    .rename(columns={"<lambda_0>": "std"})
)

df_results_to_save.to_csv(
    path_or_buf=path + filename,
    mode="w",
    index=False,
    header=True,
)

<IPython.core.display.Javascript object>

In [38]:
df_results_to_save

Unnamed: 0_level_0,Category,Company,Data Shape,Timesteps,Features,Model,Cross Validation,Cross Validation Params,RMSE Train,RMSE Train,...,R2 Train,R2 Train,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
0,Local Model,partner_iv,"(958, 36)",,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Blocking Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""train_size"": 0.8}",1.573188,0.207793,...,0.940546,0.016257,2.733887,0.903833,2.249005,0.696663,0.045426,0.012936,0.7846,0.133717
1,Local Model,partner_iv,"(958, 36)",,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Out of time Split,"{""Test Size"": 0.2}",2.028541,0.0,...,0.90803,0.0,2.524797,0.0,1.956191,0.0,0.03985,0.0,0.837051,0.0
2,Local Model,partner_iv,"(958, 36)",,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Repeated KFold,"{""N_Splits"": 5, ""Repeats"": 3}",2.068694,0.018082,...,0.902138,0.001902,2.182339,0.083627,1.704186,0.059119,0.034894,0.001291,0.890398,0.008023
3,Local Model,partner_iv,"(958, 36)",,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""Gap"": 0}",1.96354,0.075271,...,0.909961,0.006894,3.066276,1.005002,2.382105,0.738085,0.049062,0.01655,0.741535,0.211923


<IPython.core.display.Javascript object>

In [39]:
c = pd.read_csv(
    "../../../../../reports/results/local_models/partner_i-oficial/all_cements/grouped/linear_regression_results_grouped_1.csv",
    header=[0, 1],
).rename(columns=lambda x: "" if "Unnamed" in x else x, level=1)

<IPython.core.display.Javascript object>

In [40]:
c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   (Category, )                 4 non-null      object 
 1   (Company, )                  4 non-null      object 
 2   (Data Shape, )               4 non-null      object 
 3   (Timesteps, )                0 non-null      float64
 4   (Features, )                 4 non-null      object 
 5   (Model, )                    4 non-null      object 
 6   (Cross Validation, )         4 non-null      object 
 7   (Cross Validation Params, )  4 non-null      object 
 8   (RMSE Train, mean)           4 non-null      float64
 9   (RMSE Train, std)            4 non-null      float64
 10  (MAE Train, mean)            4 non-null      float64
 11  (MAE Train, std)             4 non-null      float64
 12  (MAPE Train, mean)           4 non-null      float64
 13  (MAPE Train, std)       

<IPython.core.display.Javascript object>

In [41]:
df_results_to_save.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   (Category, )                 4 non-null      object 
 1   (Company, )                  4 non-null      object 
 2   (Data Shape, )               4 non-null      object 
 3   (Timesteps, )                0 non-null      float64
 4   (Features, )                 4 non-null      object 
 5   (Model, )                    4 non-null      object 
 6   (Cross Validation, )         4 non-null      object 
 7   (Cross Validation Params, )  4 non-null      object 
 8   (RMSE Train, mean)           4 non-null      float64
 9   (RMSE Train, std)            4 non-null      float64
 10  (MAE Train, mean)            4 non-null      float64
 11  (MAE Train, std)             4 non-null      float64
 12  (MAPE Train, mean)           4 non-null      float64
 13  (MAPE Train, std)       

<IPython.core.display.Javascript object>

In [42]:
df_results_to_save[df_results_to_save["Cross Validation Params"].str.contains("Date")]

Unnamed: 0_level_0,Category,Company,Data Shape,Timesteps,Features,Model,Cross Validation,Cross Validation Params,RMSE Train,RMSE Train,...,R2 Train,R2 Train,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std


<IPython.core.display.Javascript object>

In [43]:
c

Unnamed: 0_level_0,Category,Company,Data Shape,Timesteps,Features,Model,Cross Validation,Cross Validation Params,RMSE Train,RMSE Train,...,R2 Train,R2 Train,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
0,Local Model,partner_i,"(1234, 42)",,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Blocking Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""train_size"": 0.8}",0.700636,0.049022,...,0.980729,0.001604,0.949926,0.143246,0.733407,0.106088,0.017101,0.002711,0.96104,0.015278
1,Local Model,partner_i,"(1234, 42)",,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Out of time Split,"{""Test Size"": 0.2}",0.815584,0.0,...,0.973914,0.0,1.212233,0.0,0.900702,0.0,0.020311,0.0,0.9405,0.0
2,Local Model,partner_i,"(1234, 42)",,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Repeated KFold,"{""N_Splits"": 5, ""Repeats"": 3}",0.855466,0.005967,...,0.971171,0.000459,0.965048,0.217141,0.718911,0.04166,0.016602,0.000845,0.96172,0.020438
3,Local Model,partner_i,"(1234, 42)",,Chemical + Mineralogical + Physical + One-Hot,Linear Regression,Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""Gap"": 0}",0.738757,0.058813,...,0.977559,0.002862,1.158192,0.309342,0.807439,0.128733,0.018491,0.002644,0.945201,0.026837


<IPython.core.display.Javascript object>