In [1]:
# Load the nb_black IPython extension for this kernel session
# (presumably auto-formats each executed cell with Black — confirm against the nb_black docs).
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# OS Features
import os

# Database Reading and Manipulation
import pandas as pd

# Linear Algebra
import numpy as np

# Plotting
import matplotlib.pyplot as plt

# Processing results
import json

# Model Selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

# Custom modules
## Model selection
from src.cross_validation.blocking_time_series_split import BlockingTimeSeriesSplit

## Function to print scores
from src.utils.print_scores import print_scores

## Function to calculate score regression metrics
from src.utils.score_regression_metrics import score_regression_metrics

## Function to fill the results metric dict
from src.utils.fill_results_dict import fill_results_dict

# Modeling
from sklearn.linear_model import LinearRegression

# Preprocessing - Data standardization
from sklearn.preprocessing import StandardScaler

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

# Pipeline
from sklearn.pipeline import Pipeline

# Data imputation
from sklearn.impute import SimpleImputer

<IPython.core.display.Javascript object>


# Functions and definitions

In [3]:
# Seed handed to the randomised splitters so reruns give the same folds.
SEED = 47

# scikit-learn scorer name -> short label used when printing and saving results.
METRICS_DICT = dict(
    zip(
        (
            "neg_root_mean_squared_error",
            "neg_mean_absolute_error",
            "neg_mean_absolute_percentage_error",
            "r2",
        ),
        ("RMSE", "MAE", "MAPE", "R2"),
    )
)

# Scorer names in reporting order; this is what gets passed as `scoring=`.
METRICS = tuple(METRICS_DICT)

<IPython.core.display.Javascript object>

## Defining a dataframe structure to save the results

In [4]:
# Accumulates one results frame per cross-validation scheme run below.
results_to_save = []

# Template row: each experiment copies it and fills in the CV-specific fields.
results_dict = {
    # --- experiment identification ---
    "Category": "Local Model",
    "Company": "209",
    "Plant": "E",
    "Features": "Chemical",
    "Data Shape": None,
    "Timesteps": None,
    # --- pipeline description ---
    "Model": "Linear Regression",
    "Model Params": None,
    "Scaler": "Standard Scaler",
    "Scaler Params": None,
    "Imputer": "Median",
    "Imputer Params": None,
    # --- validation scheme ---
    "Cross Validation": None,
    "Cross Validation Params": np.nan,
    # --- metric placeholders: "<METRIC> Train" first, then "<METRIC> Test" ---
    **{
        f"{metric} {split}": np.nan
        for split in ("Train", "Test")
        for metric in ("RMSE", "MAE", "MAPE", "R2")
    },
}

<IPython.core.display.Javascript object>

# Reading the dataset

In [5]:
# Raw processed dataset for company 209, plant "e" (path is relative to the notebook's working directory).
df = pd.read_csv(
    filepath_or_buffer="../../../../../../data/processed/209/e.csv",
)

<IPython.core.display.Javascript object>

## Defining Features

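In this set of experiments we use all available features except `Cement_Type`, `Blaine`, `Initial setting time`, `Final setting time`, `CS3` and `CS7`, which are dropped below.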

In [6]:
# Working frame for modelling: the raw frame minus the columns excluded from
# this experiment. `drop` returns a new DataFrame, so `df` itself is untouched.
df_copy = df.drop(
    columns=[
        "Cement_Type",
        "Blaine",
        "Final setting time",
        "Initial setting time",
        "CS3",
        "CS7",
    ]
)

<IPython.core.display.Javascript object>

In [7]:
# Display the raw dataframe (it still contains the columns dropped from df_copy) for a visual check.
df

Unnamed: 0,Date,C4AF,C3A,Cubic C3A,Free CaO,Portlandite,Periclase,Aphthitalite,Langbeinite,Calcite,...,SO3,Loss on Ignition,Insoluble Residue,Blaine,Initial setting time,Final setting time,CS3,CS7,CS28,Cement_Type
0,2021-01-05,,,,,,,,,,...,3.46,4.35,1.76,4190.0,150.0,210.0,32.900000,38.800000,47.050000,CP II-E-40
1,2021-01-06,7.48,2.61,2.61,1.45,1.05,1.31,0.65,0.27,5.55,...,3.68,4.41,1.54,4200.0,145.0,205.0,31.900000,37.000000,48.070000,CP II-E-40
2,2021-01-07,8.28,2.15,2.04,1.15,0.40,1.87,0.34,0.48,6.83,...,3.42,4.25,1.32,4130.0,150.0,215.0,31.800000,37.000000,48.390000,CP II-E-40
3,2021-01-08,7.75,2.64,2.52,1.46,1.22,1.16,0.80,0.17,5.79,...,3.83,4.23,2.29,4000.0,150.0,210.0,31.100000,36.100000,48.260000,CP II-E-40
4,2021-01-09,7.70,2.23,2.20,1.75,1.32,1.42,0.63,0.32,6.38,...,3.76,4.47,1.91,4180.0,145.0,205.0,31.400000,37.600000,48.020000,CP II-E-40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
933,2023-07-14,,,,,,,,,,...,3.06,4.03,1.81,4240.0,235.0,305.0,28.549999,36.509998,47.439999,CP II-E-40
934,2023-07-17,,,,,,,,,,...,3.32,3.80,,4200.0,205.0,280.0,28.250000,35.029999,45.560001,CP II-E-40
935,2023-07-20,,,,,,,,,,...,3.46,4.09,,4373.0,205.0,265.0,30.230000,37.639999,45.959999,CP II-E-40
936,2023-07-21,,,,,,,,,,...,2.54,4.08,,4396.0,210.0,280.0,29.639999,37.180000,47.419998,CP II-E-40


<IPython.core.display.Javascript object>

# 1. Linear Regression

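<h2>1. Dataset: df_copy</h2> <br>In this dataset every remaining column of <code>df_copy</code> is used as a feature, except <code>Date</code> and the target <code>CS28</code>.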

In [8]:
# Target: 28-day column "CS28". It is read from the raw frame rather than popped
# out of `df_copy`, so this cell can be re-run without raising a KeyError.
y = df["CS28"].values

# Remove the target from the working frame. `errors="ignore"` makes this a no-op
# on re-runs; later cells still see `df_copy` without the "CS28" column.
df_copy = df_copy.drop(columns=["CS28"], errors="ignore")

# Feature matrix: everything left except the timestamp column.
x = df_copy.drop(["Date"], axis=1)

# Dates kept alongside the features, from the raw frame.
dates = df["Date"].copy()

<IPython.core.display.Javascript object>

## 1.1 Repeated KFold Cross validation

<b>Dataset shape:</b> (594, 38)<br>
<b>Repeats:</b> 3<br>
<b>Splits:</b> 5<br>
    1. 5 folds per repeat
    2. 80% train in each fold
    3. 20% test in each fold
<b>Total:</b> 15 models<br>

In [9]:
repeats = 3
n_splits = 5

# Median imputation -> standardisation -> ordinary least squares.
pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)

# Shuffled K-fold repeated `repeats` times, seeded for reproducibility.
cv = RepeatedKFold(n_repeats=repeats, n_splits=n_splits, random_state=SEED)
scores = cross_validate(
    estimator=pipeline,
    X=x,
    y=y,
    cv=cv,
    scoring=METRICS,
    return_train_score=True,
    n_jobs=-1,
)

print(
    "Repeated Cross Validation:",
    f"Repeats: {repeats}",
    f"n_splits: {n_splits}",
    "",
    sep="\n",
)
print_scores(scores, METRICS, METRICS_DICT)

# Template row overridden with this experiment's validation details.
results_dict_copy = {
    **results_dict,
    "Cross Validation": "Repeated KFold",
    "Cross Validation Params": '{"N_Splits": 5, "Repeats": 3}',
    "Data Shape": x.shape,
}
df_results = fill_results_dict(results_dict_copy, scores)
results_to_save.append(df_results)

Repeated Cross Validation:
Repeats: 3
n_splits: 5

******
[TRAIN]
******
RMSE: -2.038 (0.042)
MAE: -1.493 (0.031)
MAPE: -0.034 (0.001)
R2: 0.548 (0.015)


******
[TEST]
******
RMSE: -2.068 (0.165)
MAE: -1.522 (0.123)
MAPE: -0.035 (0.003)
R2: 0.528 (0.059)




<IPython.core.display.Javascript object>

## 1.2. Blocking Time Series Cross Validation

<b>Dataset shape:</b> (594, 38)<br>
<b>Splits:</b>5<br>    
    1. 5 folds of 118 samples
    2. 80% train in each fold (`train_size = 0.8`)
    3. 20% test in each fold
<b>Total:</b> 5 models<br>

In [10]:
n_splits = 5
train_size = 0.8
# This scheme is run once. Set explicitly so the printed value no longer
# inherits the stale `repeats = 3` left over from the Repeated KFold cell.
repeats = 1

# Median imputation -> standardisation -> ordinary least squares.
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)
# Project splitter; presumably yields non-overlapping, time-ordered blocks with
# the first `train_size` share of each used for training — confirm in
# src.cross_validation.blocking_time_series_split.
cv = BlockingTimeSeriesSplit(n_splits=n_splits, train_size=train_size)
scores = cross_validate(
    pipeline,
    x,
    y,
    scoring=METRICS,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)
print("Blocking Time Series Split:")
print(f"Repeats: {repeats}")
print(f"n_splits: {n_splits}")
print()
print_scores(scores, METRICS, METRICS_DICT)

results_dict_copy = results_dict.copy()
results_dict_copy["Cross Validation"] = "Blocking Time Series Split"
# Built from the variables actually used above so the saved params cannot drift
# from the run; with the current values this serialises to
# '{"N_Splits": 5, "Repeats": 1, "train_size": 0.8}'.
results_dict_copy["Cross Validation Params"] = json.dumps(
    {"N_Splits": n_splits, "Repeats": repeats, "train_size": train_size}
)
results_dict_copy["Data Shape"] = x.shape
df_results = fill_results_dict(results_dict_copy, scores)
results_to_save.append(df_results)

Blocking Time Series Split:
Repeats: 3
n_splits: 5

******
[TRAIN]
******
RMSE: -1.721 (0.424)
MAE: -1.337 (0.291)
MAPE: -0.031 (0.007)
R2: 0.509 (0.202)


******
[TEST]
******
RMSE: -1.936 (0.505)
MAE: -1.514 (0.301)
MAPE: -0.035 (0.009)
R2: -0.432 (1.014)




<IPython.core.display.Javascript object>

## 1.3. Time Series Split Cross Validation

The training set has size i * n_samples // (n_splits + 1) + n_samples % (n_splits + 1) in the i th split, with a test set of size n_samples//(n_splits + 1) by default, where n_samples is the number of samples.


<b>Dataset shape:</b> (594, 38)<br>
<b>Splits:</b> 5<br>    
    1. Train: 5 folds of expanding size (i * n_samples // 6 + n_samples % 6 samples in the i-th fold)
    2. Test: n_samples // 6 samples each fold
<b>Total:</b> 5 models<br>

In [11]:
n_splits = 5
gap = 0
# This scheme is run once. Set explicitly so the printed value no longer
# inherits a stale `repeats` from an earlier cell.
repeats = 1

# Median imputation -> standardisation -> ordinary least squares.
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)
# Expanding-window splitter; rows of `x` are assumed to be in chronological
# order already (no sort is performed in this notebook) — TODO confirm.
cv = TimeSeriesSplit(gap=gap, max_train_size=None, n_splits=n_splits, test_size=None)

scores = cross_validate(
    pipeline,
    x,
    y,
    scoring=METRICS,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)
print("Time Series Split:")
print(f"Repeats: {repeats}")
print(f"n_splits: {n_splits}")
print()
print_scores(scores, METRICS, METRICS_DICT)

results_dict_copy = results_dict.copy()
results_dict_copy["Cross Validation"] = "Time Series Split"
# Built from the variables actually used above so the saved params cannot drift
# from the run; with the current values this serialises to
# '{"N_Splits": 5, "Repeats": 1, "Gap": 0}'.
results_dict_copy["Cross Validation Params"] = json.dumps(
    {"N_Splits": n_splits, "Repeats": repeats, "Gap": gap}
)
results_dict_copy["Data Shape"] = x.shape
df_results = fill_results_dict(results_dict_copy, scores)
results_to_save.append(df_results)

Time Series Split:
Repeats: 3
n_splits: 5

******
[TRAIN]
******
RMSE: -1.982 (0.301)
MAE: -1.416 (0.172)
MAPE: -0.032 (0.005)
R2: 0.443 (0.196)


******
[TEST]
******
RMSE: -2.667 (0.637)
MAE: -2.064 (0.383)
MAPE: -0.048 (0.010)
R2: -0.081 (0.605)




<IPython.core.display.Javascript object>

## 1.4. Out of time Split Cross Validation

<b>Dataset shape:</b> (594, 38)<br>
<b>Train size: 80%</b><br>
<b>Test  size: 20%</b>


<b>Splits:</b> 2<br>    
    1. Train: 475
    2. Test: 118
<b>Total:</b> 1 model<br>

In [12]:
test_size = 0.2

# Unshuffled hold-out: the leading rows are used for training, the trailing
# `test_size` share for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    shuffle=False,
    test_size=test_size,
    random_state=SEED,
)

# Median imputation -> standardisation -> ordinary least squares.
pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("transformer", StandardScaler()),
        ("estimator", LinearRegression()),
    ]
)
pipeline.fit(x_train, y_train)

y_train_pred, y_test_pred = pipeline.predict(x_train), pipeline.predict(x_test)

scores = score_regression_metrics(y_train, y_train_pred, y_test, y_test_pred)
print_scores(scores, METRICS, METRICS_DICT)

# Template row overridden with this experiment's validation details.
results_dict_copy = {
    **results_dict,
    "Cross Validation": "Out of time Split",
    "Cross Validation Params": '{"Test Size": 0.2}',
    "Data Shape": x.shape,
}
# Each single score is wrapped in a one-element list before being handed to
# the same helper the cross-validated cells use.
df_results = fill_results_dict(
    results_dict_copy,
    {metric_name: [metric_value] for metric_name, metric_value in scores.items()},
)
results_to_save.append(df_results)

******
[TRAIN]
******
RMSE: 2.064 (0.000)
MAE: 1.454 (0.000)
MAPE: 0.033 (0.000)
R2: 0.597 (0.000)


******
[TEST]
******
RMSE: 2.173 (0.000)
MAE: 1.780 (0.000)
MAPE: 0.041 (0.000)
R2: -0.358 (0.000)




<IPython.core.display.Javascript object>

In [13]:
# Mean and population std (ddof=0) of the test metrics for each validation scheme.
# The std aggregator is an anonymous function, so pandas labels its column
# "<lambda_0>"; the final rename turns that label into "std".
(
    pd.concat(results_to_save)
    .groupby(["Features", "Model", "Cross Validation", "Cross Validation Params"])[
        ["RMSE Test", "MAE Test", "MAPE Test", "R2 Test"]
    ]
    .agg(["mean", lambda series: pd.Series(series.std(ddof=0), name="std")])
    .reset_index()
    .rename(columns={"<lambda_0>": "std"})
)

Unnamed: 0_level_0,Features,Model,Cross Validation,Cross Validation Params,RMSE Test,RMSE Test,MAE Test,MAE Test,MAPE Test,MAPE Test,R2 Test,R2 Test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mean,std,mean,std,mean,std,mean,std
0,Chemical,Linear Regression,Blocking Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""train_size"": 0.8}",1.936475,0.505357,1.514238,0.300819,0.034953,0.009027,-0.431602,1.014382
1,Chemical,Linear Regression,Out of time Split,"{""Test Size"": 0.2}",2.172937,0.0,1.779986,0.0,0.04053,0.0,-0.357916,0.0
2,Chemical,Linear Regression,Repeated KFold,"{""N_Splits"": 5, ""Repeats"": 3}",2.068107,0.164756,1.521895,0.123066,0.035105,0.003024,0.528384,0.058515
3,Chemical,Linear Regression,Time Series Split,"{""N_Splits"": 5, ""Repeats"": 1, ""Gap"": 0}",2.667125,0.636976,2.06375,0.383081,0.04849,0.010253,-0.080928,0.604513


<IPython.core.display.Javascript object>

# Saving the results Dataframe

In [14]:
# Numeric suffix interpolated into the names of the result CSV files written by the cells that follow.
index_to_save = 2

<IPython.core.display.Javascript object>

In [15]:
path = "../../../../../../reports/results/local_models/209/e/full/"
filename = f"linear_regression_results_full_{index_to_save}.csv"

# Write every collected results frame, concatenated, to one CSV
# (an existing file of the same name is overwritten).
pd.concat(results_to_save).to_csv(
    f"{path}{filename}",
    index=False,
    header=True,
    mode="w",
)

<IPython.core.display.Javascript object>

## Saving the grouped dataframe

In [16]:
# Columns that identify one experiment configuration.
cols_groupby = [
    "Category",
    "Company",
    "Data Shape",
    "Timesteps",
    "Features",
    "Model",
    "Cross Validation",
    "Cross Validation Params",
]

# Metric columns to aggregate: the four train metrics followed by the four test metrics.
cols_agg = [
    f"{metric} {split}"
    for split in ("Train", "Test")
    for metric in ("RMSE", "MAE", "MAPE", "R2")
]

path = "../../../../../../reports/results/local_models/209/e/grouped/"
filename = f"linear_regression_results_grouped_{index_to_save}.csv"

# Mean and population std (ddof=0) per configuration. `dropna=False` keeps the
# groups whose key columns are empty (e.g. "Timesteps" is None in the template).
# The anonymous std aggregator is labelled "<lambda_0>" by pandas, hence the rename.
df_results_to_save = pd.concat(results_to_save).groupby(cols_groupby, dropna=False)[
    cols_agg
].agg(["mean", lambda series: pd.Series(series.std(ddof=0), name="std")])
df_results_to_save = df_results_to_save.reset_index().rename(
    columns={"<lambda_0>": "std"}
)

df_results_to_save.to_csv(
    f"{path}{filename}",
    index=False,
    header=True,
    mode="w",
)

<IPython.core.display.Javascript object>

In [17]:
# Coefficients of the most recently fitted `pipeline` (the out-of-time split
# model). They are on the standardised feature scale because a StandardScaler
# precedes the estimator.
#
# The labels come from `x.columns`, the feature frame the model was trained on,
# instead of being rebuilt from `df_copy`. Rebuilding only produced the right
# columns because an earlier cell had removed the target from `df_copy` in place.
#
# NOTE(review): vmin=1 / vmax=5 lie outside the coefficient range in the output
# captured with this notebook, so the gradient will look flat — confirm whether
# these bounds are intentional.
(
    pd.Series(
        pipeline.named_steps["estimator"].coef_,
        index=x.columns,
    )
    .to_frame(name="Coefficients")
    .sort_values(by="Coefficients")
    .style.background_gradient(axis=None, vmin=1, vmax=5, cmap="Greens")
)

Unnamed: 0,Coefficients
Cubic C3A,-0.826917
Calcite,-0.636524
Portlandite,-0.389146
Insoluble Residue,-0.223521
Aphthitalite,-0.083544
Langbeinite,-0.026313
SO3,0.094887
Periclase,0.122016
Quartz,0.124438
Loss on Ignition,0.211938


<IPython.core.display.Javascript object>