In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Database Reading and Manipulation
import pandas as pd

# Linear Algebra
import numpy as np

# Plotting
import matplotlib.pyplot as plt

# Processing results
import json

# Model Selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

# Custom modules
## Model selection
from src.cross_validation.blocking_time_series_split import BlockingTimeSeriesSplit

## Function to print scores
from src.utils.print_scores import print_scores

## Function to calculate score regression metrics
from src.utils.score_regression_metrics import score_regression_metrics

## Function to fill the results metric dict
from src.utils.fill_results_dict import fill_results_dict

# Modeling
from sklearn.linear_model import LinearRegression

# Preprocessing - Data standardization
from sklearn.preprocessing import StandardScaler

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

# Pipeline
from sklearn.pipeline import Pipeline

# Data imputation
from sklearn.impute import SimpleImputer

<IPython.core.display.Javascript object>

# Functions and definitions

## Helper functions for blocked time series cross validation

### Train test split

In [3]:
def split_by_periods(x, y, train_period, test_period):
    datasets = []
    i = 0
    max_samples = x.shape[0]

    for _ in range(0, max_samples // (train_period + test_period)):
        # Splitting the data into train/test sets
        x_train = x[i : i + train_period].copy()
        y_train = y[i : i + train_period].copy()
        x_test = x[i + train_period : i + train_period + test_period].copy()
        y_test = y[i + train_period : i + train_period + test_period].copy()

        datasets.append(
            {
                "x_train": x_train,
                "y_train": y_train,
                "x_test": x_test,
                "y_test": y_test,
            }
        )
        # Increments the index for the next period of time
        i += train_period + test_period

    return datasets

<IPython.core.display.Javascript object>

In [4]:
def split_by_dates(x, y, train_period, test_period, dates):
    datasets = []
    dates = dates[: x.shape[0]].copy()
    dates_unique = dates.copy().unique()
    i = 0
    max_samples = dates_unique.shape[0]

    for _ in range(0, max_samples // (train_period + test_period)):
        # Splitting the Train Set
        start_date_train = dates_unique[i]
        end_date_train = dates_unique[i + train_period]
        idx_train = dates[(dates >= start_date_train) & (dates < end_date_train)].index
        x_train = x.loc[idx_train].copy()
        y_train = y[idx_train].copy()

        # Splitting the Test Set
        start_date_test = dates_unique[i + train_period]
        end_date_test = dates_unique[i + train_period + test_period]
        idx_test = dates[(dates >= start_date_test) & (dates < end_date_test)].index
        x_test = x.loc[idx_test].copy()
        y_test = y[idx_test].copy()

        datasets.append(
            {
                "x_train": x_train,
                "y_train": y_train,
                "x_test": x_test,
                "y_test": y_test,
            }
        )

        i += train_period + test_period

    return datasets

<IPython.core.display.Javascript object>

In [5]:
def train_test_split_blocked_ts(x, y, train_period, test_period, dates=None):
    """
    Split the input data into train-test datasets based on train and test periods.

    Args:
        x (pd.DataFrame): Input features.
        y (np.Array): Target values.
        train_period (int): Length of the training period.
        test_period (int): Length of the testing period.
        dates (pd.Series): Optional date information.

    Returns:
        List[dict]: A list of dictionaries, each containing 'x_train', 'y_train', 'x_test', and 'y_test'.
    """
    if dates is None:
        return split_by_periods(x, y, train_period, test_period)
    else:
        return split_by_dates(x, y, train_period, test_period, dates)

<IPython.core.display.Javascript object>

### Data preprocessing

In [6]:
def impute_data(dataset, imputer=None, imputer_params=None):
    x_train = dataset["x_train"]
    x_test = dataset["x_test"]

    # Apply imputation to the data
    if imputer is not None:
        imputer = imputer() if imputer_params is None else imputer(**imputer_params)
        x_train = imputer.fit_transform(x_train)
        x_test = imputer.transform(x_test)

    dataset["x_train"] = x_train
    dataset["x_test"] = x_test

    return dataset

<IPython.core.display.Javascript object>

In [7]:
def transform_data(dataset, transformer=None):
    x_train = dataset["x_train"]
    x_test = dataset["x_test"]

    # Apply data normalization/standardization to the data
    if transformer is not None:
        scaler = transformer()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

    dataset["x_train"] = x_train
    dataset["x_test"] = x_test

    return dataset

<IPython.core.display.Javascript object>

In [8]:
def preprocess_data(dataset, transformer=None, imputer=None, imputer_params=None):
    dataset = impute_data(dataset, imputer, imputer_params)
    dataset = transform_data(dataset, transformer)
    return dataset

<IPython.core.display.Javascript object>

### Train and evaluate the model

In [9]:
def train_and_evaluate_model(Estimator, dataset, estimator_params=None):
    """
    Purpose: Helper function to be used in conjunction with
    blocked time_series cross validation function
    """
    x_train = dataset["x_train"]
    y_train = dataset["y_train"]
    x_test = dataset["x_test"]
    y_test = dataset["y_test"]

    # Instantiate the model
    model = Estimator() if estimator_params is None else Estimator(**estimator_params)

    # Fitting the model
    model.fit(x_train, y_train)

    # Making predictions on train/test sets
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Return regression metrics
    return score_regression_metrics(y_train, y_train_pred, y_test, y_test_pred)

<IPython.core.display.Javascript object>

### Blocking time series cross validation

In [10]:
def repeated_blocking_time_series(
    Estimator,
    Transform,
    Imputer,
    x,
    y,
    train_period,
    test_period,
    dates=None,
    repeats=10,
    estimator_params=None,
    imputer_params=None,
):
    """
    Perform repeated cross-validation with blocked time series data.

    Args:
        Estimator: Machine learning model class.
        Transform: Data transformation method.
        Imputer: Data imputation method.
        x: Input features.
        y: Target values.
        train_period: Length of the training period.
        test_period: Length of the testing period.
        dates: Optional date information.
        repeats: Number of repetitions.
        estimator_params: Parameters for the model.
        imputer_params: Parameters for data imputation.

    Returns:
        list: List of dictionaries containing evaluation metrics for each repetition.
    """

    results = []
    max_samples = x.shape[0]

    # Splitting the data into train/test sets
    datasets = train_test_split_blocked_ts(x, y, train_period, test_period, dates)

    for _ in range(repeats):
        scores = []

        for dataset in datasets:
            dataset = preprocess_data(dataset, Transform, Imputer, imputer_params)
            score = train_and_evaluate_model(Estimator, dataset, estimator_params)
            scores.append(score)

        # After every iteration metrics results are appended together
        scores_final = {key: [] for key, _ in scores[0].items()}
        for scores_dict in scores:
            for key, value in scores_dict.items():
                scores_final[key] += [value]
        results.append(scores_final)
    return results

<IPython.core.display.Javascript object>

In [11]:
SEED = 47
METRICS = (
    "neg_root_mean_squared_error",
    "neg_mean_absolute_error",
    "neg_mean_absolute_percentage_error",
    "r2",
)
METRICS_DICT = {
    "neg_root_mean_squared_error": "RMSE",
    "neg_mean_absolute_error": "MAE",
    "neg_mean_absolute_percentage_error": "MAPE",
    "r2": "R2",
}

<IPython.core.display.Javascript object>

## Defining a dataframe structure to save the results

In [12]:
results_to_save = []

results_dict = {
    "Category": "Local Model",
    "Company": "partner_iv",
    "Features": "Chemical + Mineralogical + Physical",
    "Data Shape": None,
    "Timesteps": None,
    "Model": "Linear Regression",
    "Model Params": None,
    "Scaler": "Standard Scaler",
    "Scaler Params": None,
    "Imputer": "Median",
    "Imputer Params": None,
    "Cross Validation": None,
    "Cross Validation Params": np.nan,
    "RMSE Train": np.nan,
    "MAE Train": np.nan,
    "MAPE Train": np.nan,
    "R2 Train": np.nan,
    "RMSE Test": np.nan,
    "MAE Test": np.nan,
    "MAPE Test": np.nan,
    "R2 Test": np.nan,
}

<IPython.core.display.Javascript object>

# Reading the dataset

In [13]:
df = pd.read_csv("../../../../../data/processed/partner_iv/cem_a.csv")

<IPython.core.display.Javascript object>

## Defining Features

In this set of experiments we use all available features

In [14]:
df_copy = df.copy()

<IPython.core.display.Javascript object>

# 1. Linear Regression

<h2>1. Dataset: df_copy</h2> <br>In this dataset all features are used.

In [15]:
y = df_copy.pop("CS28").values
x = df_copy.drop(["Date"], axis=1)
dates = df["Date"].copy()

<IPython.core.display.Javascript object>

## 1.1 Repeated KFold Cross validation

<b>Dataset shape:</b> (1234, 37)<br>
<b>Repeats:</b>10<br>
<b>Splits:</b>5<br>
    1. 5 folds of 246 samples each
    2. 80% train (988 samples each fold)
    3. 20% test (246 samples each fold)
<b>Total:</b> 15 models<br>

In [16]:
repeats = 3
n_splits = 5
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)
cv = RepeatedKFold(n_splits=n_splits, n_repeats=repeats, random_state=SEED)
scores = cross_validate(
    pipeline,
    x,
    y,
    scoring=METRICS,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)
print("Repeated Cross Validation:")
print(f"Repeats: {repeats}")
print(f"n_splits: {n_splits}")
print()
print_scores(scores, METRICS, METRICS_DICT)

Repeated Cross Validation:
Repeats: 3
n_splits: 5

******
[TRAIN]
******
RMSE: -2.243 (0.061)
MAE: -1.726 (0.051)
MAPE: -0.032 (0.001)
R2: 0.561 (0.015)


******
[TEST]
******
RMSE: -2.729 (0.340)
MAE: -2.085 (0.231)
MAPE: -0.039 (0.004)
R2: 0.328 (0.091)




<IPython.core.display.Javascript object>

In [17]:
results_dict_copy = results_dict.copy()
results_dict_copy["Cross Validation"] = "Repeated KFold"
results_dict_copy["Cross Validation Params"] = '{"N_Splits": 5, "Repeats": 3}'
results_dict_copy["Data Shape"] = x.shape
df_results = fill_results_dict(results_dict_copy, scores)
results_to_save.append(df_results)

<IPython.core.display.Javascript object>

In [18]:
df_results.groupby(["Features", "Model", "Cross Validation"])[
    ["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]
].agg(["mean", "std"]).reset_index()

Unnamed: 0_level_0,Features,Model,Cross Validation,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical + Mineralogical + Physical,Linear Regression,Repeated KFold,2.728761,0.352095,2.085127,0.239025,0.038534,0.004532,0.32783,0.093726


<IPython.core.display.Javascript object>

## 1.2. Blocking Time Series Cross Validation

<b>Dataset shape:</b> (1234, 38)<br>
<b>Splits:</b>5<br>    
    1. 5 folds of 246 samples
    2. 80% train (987 samples each fold)
    3. 20% test (246 samples each fold)
<b>Total:</b> 5 models<br>

In [19]:
n_splits = 5
train_size = 0.8

pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)
cv = BlockingTimeSeriesSplit(n_splits=n_splits, train_size=train_size)
scores = cross_validate(
    pipeline,
    x,
    y,
    scoring=METRICS,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)
print("Blocking Time Series Split:")
print(f"Repeats: {repeats}")
print(f"n_splits: {n_splits}")
print()
print_scores(scores, METRICS, METRICS_DICT)

results_dict_copy = results_dict.copy()
results_dict_copy["Cross Validation"] = "Blocking Time Series Split"
results_dict_copy[
    "Cross Validation Params"
] = '{"N_Splits": 5, "Repeats": 1, "train_size": 0.8}'
results_dict_copy["Data Shape"] = x.shape
df_results = fill_results_dict(results_dict_copy, scores)
results_to_save.append(df_results)

Blocking Time Series Split:
Repeats: 3
n_splits: 5

******
[TRAIN]
******
RMSE: -1.043 (0.257)
MAE: -0.826 (0.190)
MAPE: -0.015 (0.003)
R2: 0.866 (0.040)


******
[TEST]
******
RMSE: -5.659 (3.853)
MAE: -3.804 (1.500)
MAPE: -0.070 (0.027)
R2: -8.452 (12.203)




<IPython.core.display.Javascript object>

In [20]:
pd.concat(results_to_save).reset_index().groupby(
    ["Features", "Model", "Cross Validation", "Timesteps"], dropna=False
)[["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]].agg(
    ["mean", lambda series: pd.Series(series.std(ddof=0), name="std")]
).reset_index().rename(
    columns={"<lambda_0>": "std"}
)

Unnamed: 0_level_0,Features,Model,Cross Validation,Timesteps,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical + Mineralogical + Physical,Linear Regression,Blocking Time Series Split,,5.659491,3.853378,3.804222,1.500112,0.070151,0.026969,-8.451632,12.202596
1,Chemical + Mineralogical + Physical,Linear Regression,Repeated KFold,,2.728761,0.340156,2.085127,0.23092,0.038534,0.004378,0.32783,0.090548


<IPython.core.display.Javascript object>

## 1.3. Time Series Split Cross Validation

The training set has size i * n_samples // (n_splits + 1) + n_samples % (n_splits + 1) in the i th split, with a test set of size n_samples//(n_splits + 1) by default, where n_samples is the number of samples.


<b>Dataset shape:</b> (1234, 38)<br>
<b>Splits:</b>10<br>    
    1. Train: 10 folds of 114, 226, 338, 450, 562, 675, 787, 899, 1011, 1123 samples each fold
    2. Test: 112 samples each fold
<b>Total:</b> 10 models<br>

In [21]:
n_splits = 5
gap = 0
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)
cv = TimeSeriesSplit(gap=gap, max_train_size=None, n_splits=n_splits, test_size=None)

scores = cross_validate(
    pipeline,
    x,
    y,
    scoring=METRICS,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)
print("Time Series Split:")
print(f"Repeats: {repeats}")
print(f"n_splits: {n_splits}")
print()
print_scores(scores, METRICS, METRICS_DICT)

Time Series Split:
Repeats: 3
n_splits: 5

******
[TRAIN]
******
RMSE: -1.924 (0.251)
MAE: -1.479 (0.208)
MAPE: -0.027 (0.004)
R2: 0.714 (0.072)


******
[TEST]
******
RMSE: -4.541 (2.313)
MAE: -3.670 (1.897)
MAPE: -0.068 (0.038)
R2: -3.059 (4.274)




<IPython.core.display.Javascript object>

In [22]:
results_dict_copy = results_dict.copy()
results_dict_copy["Cross Validation"] = "Time Series Split"
results_dict_copy["Cross Validation Params"] = '{"N_Splits": 5, "Repeats": 1, "Gap": 0}'
results_dict_copy["Data Shape"] = x.shape
df_results = fill_results_dict(results_dict_copy, scores)
results_to_save.append(df_results)

<IPython.core.display.Javascript object>

In [23]:
pd.concat(results_to_save).groupby(["Features", "Model", "Cross Validation"])[
    ["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]
].agg(
    ["mean", lambda series: pd.Series(series.std(ddof=0), name="std")]
).reset_index().rename(
    columns={"<lambda_0>": "std"}
)

Unnamed: 0_level_0,Features,Model,Cross Validation,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical + Mineralogical + Physical,Linear Regression,Blocking Time Series Split,5.659491,3.853378,3.804222,1.500112,0.070151,0.026969,-8.451632,12.202596
1,Chemical + Mineralogical + Physical,Linear Regression,Repeated KFold,2.728761,0.340156,2.085127,0.23092,0.038534,0.004378,0.32783,0.090548
2,Chemical + Mineralogical + Physical,Linear Regression,Time Series Split,4.541076,2.313115,3.669531,1.897113,0.068358,0.038069,-3.058626,4.274213


<IPython.core.display.Javascript object>

## 1.4. Out of time Split Cross Validation

<b>Dataset shape:</b> (1234, 38)<br>
<b>Train size: 80%</b><br>
<b>Test  size: 20%</b>


<b>Splits:</b> 2<br>    
    1. Train: 987
    2. Test: 247
<b>Total:</b> 1 model<br>

In [24]:
test_size = 0.2

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=test_size, random_state=SEED, shuffle=False
)
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)

pipeline.fit(x_train, y_train)

y_train_pred = pipeline.predict(x_train)
y_test_pred = pipeline.predict(x_test)

scores = score_regression_metrics(y_train, y_train_pred, y_test, y_test_pred)
print_scores(scores, METRICS, METRICS_DICT)

******
[TRAIN]
******
RMSE: 2.125 (0.000)
MAE: 1.654 (0.000)
MAPE: 0.030 (0.000)
R2: 0.643 (0.000)


******
[TEST]
******
RMSE: 3.733 (0.000)
MAE: 2.998 (0.000)
MAPE: 0.056 (0.000)
R2: -1.549 (0.000)




<IPython.core.display.Javascript object>

In [25]:
results_dict_copy = results_dict.copy()
results_dict_copy["Cross Validation"] = "Out of time Split"
results_dict_copy["Cross Validation Params"] = '{"Test Size": 0.2}'
results_dict_copy["Data Shape"] = x.shape
df_results = fill_results_dict(
    results_dict_copy, {key: [value] for key, value in scores.items()}
)
results_to_save.append(df_results)

<IPython.core.display.Javascript object>

In [26]:
pd.concat(results_to_save).groupby(["Features", "Model", "Cross Validation"])[
    ["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]
].agg(
    ["mean", lambda series: pd.Series(series.std(ddof=0), name="std")]
).reset_index().rename(
    columns={"<lambda_0>": "std"}
)

Unnamed: 0_level_0,Features,Model,Cross Validation,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical + Mineralogical + Physical,Linear Regression,Blocking Time Series Split,5.659491,3.853378,3.804222,1.500112,0.070151,0.026969,-8.451632,12.202596
1,Chemical + Mineralogical + Physical,Linear Regression,Out of time Split,3.733446,0.0,2.998258,0.0,0.056316,0.0,-1.548647,0.0
2,Chemical + Mineralogical + Physical,Linear Regression,Repeated KFold,2.728761,0.340156,2.085127,0.23092,0.038534,0.004378,0.32783,0.090548
3,Chemical + Mineralogical + Physical,Linear Regression,Time Series Split,4.541076,2.313115,3.669531,1.897113,0.068358,0.038069,-3.058626,4.274213


<IPython.core.display.Javascript object>

In [27]:
pipeline.named_steps["estimator"].coef_

array([ -0.34066861,  -0.22437281,   0.12896355,  -0.1711108 ,
         0.48776943,  -0.06595203,   0.42299061,   1.41963871,
         0.11733101,   0.58998088,  -9.13395306,  -9.95531354,
        24.39607969,   4.12676674,   0.07578516,  18.59832378,
        -1.0306487 ,  -1.4506876 ,   8.15217852,   5.93588979,
        -6.26802205,  -8.95819986,  14.06567126,   1.20825224,
         0.76528306,   0.38518347,   0.18942258,  -0.82658701,
         1.69479262,   0.73813327,   1.11476409,  26.5376726 ,
         3.30925706, -11.72985249])

<IPython.core.display.Javascript object>

In [28]:
pipeline.named_steps["estimator"].intercept_

54.5710317460318

<IPython.core.display.Javascript object>

In [29]:
coeff = pipeline.named_steps["estimator"].coef_
np.array(x.columns)[coeff == 0]

array([], dtype=object)

<IPython.core.display.Javascript object>

In [30]:
coeffs = pd.DataFrame(
    {col: [c] for col, c in zip(x.columns, coeff)}, index=["Coefficients"]
)

<IPython.core.display.Javascript object>

In [31]:
coeffs.T["Coefficients"].sort_values(ascending=False).to_frame(
    name="Coefficients"
).style.background_gradient(axis=None, vmin=1, vmax=5, cmap="Greens")

Unnamed: 0,Coefficients
Calcite – CaCO3,26.537673
Alite_Sum C3S tot,24.39608
Belite_beta,18.598324
Calce libera,14.065671
C3A tot,8.152179
C4AF,5.93589
Ratio_M1 (rapporto M1/M3),4.126767
SO3_XRD,3.309257
Gesso,1.694793
CS2,1.419639


<IPython.core.display.Javascript object>

In [32]:
pd.concat(results_to_save).groupby(
    ["Features", "Model", "Cross Validation", "Cross Validation Params"]
)[["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]].agg(
    ["mean", lambda series: pd.Series(series.std(ddof=0), name="std")]
).reset_index().rename(
    columns={"<lambda_0>": "std"}
)

Unnamed: 0_level_0,Features,Model,Cross Validation,Cross Validation Params,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical + Mineralogical + Physical,Linear Regression,Blocking Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""train_size"": 0.8}",5.659491,3.853378,3.804222,1.500112,0.070151,0.026969,-8.451632,12.202596
1,Chemical + Mineralogical + Physical,Linear Regression,Out of time Split,"{""Test Size"": 0.2}",3.733446,0.0,2.998258,0.0,0.056316,0.0,-1.548647,0.0
2,Chemical + Mineralogical + Physical,Linear Regression,Repeated KFold,"{""N_Splits"": 5, ""Repeats"": 3}",2.728761,0.340156,2.085127,0.23092,0.038534,0.004378,0.32783,0.090548
3,Chemical + Mineralogical + Physical,Linear Regression,Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""Gap"": 0}",4.541076,2.313115,3.669531,1.897113,0.068358,0.038069,-3.058626,4.274213


<IPython.core.display.Javascript object>

# Saving the results Dataframe

## Saving the full dataframe

In [33]:
path = "../../../../../reports/results/local_models/partner_iv/cem_a/full/"
filename = "linear_regression_results_full_1.csv"

pd.concat(results_to_save).to_csv(
    path_or_buf=path + filename,
    mode="w",
    index=False,
    header=True,
)

<IPython.core.display.Javascript object>

## Saving the grouped dataframe

In [34]:
cols_groupby = [
    "Category",
    "Company",
    "Data Shape",
    "Timesteps",
    "Features",
    "Model",
    "Cross Validation",
    "Cross Validation Params",
]

cols_agg = ["RMSE Train", "MAE Train", "MAPE Train", "R2 Train"] + [
    "RMSE Test",
    "MAE Test",
    "MAPE Test",
    "R2 Test",
]

path = "../../../../../reports/results/local_models/partner_iv/cem_a/grouped/"
filename = "linear_regression_results_grouped_1.csv"


df_results_to_save = (
    pd.concat(results_to_save)
    .groupby(cols_groupby, dropna=False)[cols_agg]
    .agg(["mean", lambda series: pd.Series(series.std(ddof=0), name="std")])
    .reset_index()
    .rename(columns={"<lambda_0>": "std"})
)

df_results_to_save.to_csv(
    path_or_buf=path + filename,
    mode="w",
    index=False,
    header=True,
)

<IPython.core.display.Javascript object>

In [35]:
df_results_to_save

Unnamed: 0_level_0,Category,Company,Data Shape,Timesteps,Features,Model,Cross Validation,Cross Validation Params,RMSE Train,RMSE Train,...,R2 Train,R2 Train,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
0,Local Model,partner_iv,"(315, 34)",,Chemical + Mineralogical + Physical,Linear Regression,Blocking Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""train_size"": 0.8}",1.042904,0.25717,...,0.865997,0.039917,5.659491,3.853378,3.804222,1.500112,0.070151,0.026969,-8.451632,12.202596
1,Local Model,partner_iv,"(315, 34)",,Chemical + Mineralogical + Physical,Linear Regression,Out of time Split,"{""Test Size"": 0.2}",2.124593,0.0,...,0.643197,0.0,3.733446,0.0,2.998258,0.0,0.056316,0.0,-1.548647,0.0
2,Local Model,partner_iv,"(315, 34)",,Chemical + Mineralogical + Physical,Linear Regression,Repeated KFold,"{""N_Splits"": 5, ""Repeats"": 3}",2.242974,0.061282,...,0.560898,0.015033,2.728761,0.340156,2.085127,0.23092,0.038534,0.004378,0.32783,0.090548
3,Local Model,partner_iv,"(315, 34)",,Chemical + Mineralogical + Physical,Linear Regression,Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""Gap"": 0}",1.923623,0.251431,...,0.714136,0.071899,4.541076,2.313115,3.669531,1.897113,0.068358,0.038069,-3.058626,4.274213


<IPython.core.display.Javascript object>