In [1]:
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from xgboost import XGBRegressor
from xgboost import plot_importance

In [2]:
def evaluate_model(model, x, y):
    """Return the root mean squared error (RMSE) of ``model`` on ``(x, y)``.

    Parameters
    ----------
    model
        Fitted estimator exposing a ``predict`` method.
    x
        Feature matrix handed to ``model.predict``.
    y
        Ground-truth target values compared against the predictions.

    Returns
    -------
    The square root of the mean squared error between ``y`` and the
    predictions.
    """
    y_pred = model.predict(x)
    # ``mean_squared_error(..., squared=False)`` is deprecated since
    # scikit-learn 1.4 and scheduled for removal in 1.6. Taking the square
    # root explicitly yields the same value for a single target column and
    # works on every scikit-learn version.
    # NOTE(review): for multi-output targets this is sqrt(mean MSE) rather than
    # the mean of per-output RMSEs -- the notebook only uses a single target.
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    return rmse

In [3]:
def get_xgboost_model():
    """Build the untrained XGBoost regressor used by every experiment.

    The random state is read from the module-level ``seed`` variable at call
    time, so the configuration cell that defines ``seed`` must have run before
    this function is called.

    Returns
    -------
    XGBRegressor
        A new, unfitted regressor with the notebook's fixed configuration.
    """
    model = XGBRegressor(
        # "reg:linear" is the deprecated alias of "reg:squarederror"; the
        # objective itself (squared-error regression) is unchanged.
        objective="reg:squarederror",
        random_state=seed,
        # NOTE(review): newer XGBoost releases prefer device="cuda" with
        # tree_method="hist" over these two settings -- confirm the installed
        # version before changing them.
        tree_method="gpu_hist",
        predictor="gpu_predictor",
        verbosity=0,
    )
    return model

In [4]:
# Seed shared by the model factory and every CV splitter in this notebook.
seed = 47

# Suppress all warnings for the rest of the session.
warnings.filterwarnings(action="ignore")

In [5]:
# The three interim CSVs share the same directory and file-name suffix; only
# the prefix (which strength columns are included) differs.
interim_dir = "../../../../data/interim/"
csv_suffix = "dados-historicos-partner_i-cement-CPIIE40.csv"

df_r3d_and_r7d = pd.read_csv(
    interim_dir + "r3d_and_r7d-no-resampling-with-fillna-" + csv_suffix
)
df_r3d_only = pd.read_csv(
    interim_dir + "r3d_only-no-resampling-with-fillna-" + csv_suffix
)
df_no_r3d_r7d = pd.read_csv(
    interim_dir + "no-r3d-r7d-no-resampling-with-fillna-" + csv_suffix
)

## Feature Selection

### In this notebook we remove variables that could potentially cause data leakage. We therefore run experiments with the following variables removed:

#### IP - Initial setting time
#### FP - Final setting time


#### BL - Blaine specific surface (experiments are run both with and without this variable)

# XGBoost Gradient

In [6]:
# Features: every column except the date/time stamp, the target itself and
# the two setting-time variables (IP, FP).
x = df_no_r3d_r7d.drop(columns=["Data/Hora", "R28D", "IP", "FP"])
# Target: the R28D column.
y = df_no_r3d_r7d["R28D"]

# Grid Search Hyperparameters - XGBoost

<b>Dataset:</b> df_no_r3d_r7d: In this dataset the R3D and R7D variables are not considered.

<b>Feature Selection:</b> The IP and FP variables are discarded.

In [7]:
# Hyperparameter grid explored by the grid search (5760 combinations).
# Reference:
# https://www.kaggle.com/prashant111/a-guide-on-xgboost-hyperparameters-tuning#2.-XGBoost-hyperparameters-
params = dict(
    n_estimators=[10, 50, 100, 200, 300],
    max_depth=[2, 3, 5, 10, 15, 20],
    subsample=[0.8, 0.9, 1.0],
    eta=[0.11, 0.12],
    colsample_bytree=[0.3, 0.4],
    min_child_weight=[1, 2, 3, 5],
    gamma=[0.001, 0.1, 1, 2],
)

<h3>GridCV 1</h3>

<b>Dataset: df_no_r3d_r7d:</b> In this dataset the R3D and R7D variables are not considered.

In [8]:
# Base estimator for the search: the same configuration used by every
# experiment in this notebook.
# The earlier version of this cell also passed early_stopping_rounds=200.
# GridSearchCV.fit is called without an eval_set, so early stopping can never
# take effect: older XGBoost releases silently ignore the argument and newer
# ones are expected to reject fitting without a validation set. It is
# therefore omitted.
model = get_xgboost_model()

# Single pass of 5-fold CV per candidate; `seed` fixes the fold assignment.
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=seed)
search = GridSearchCV(
    model, param_grid=params, scoring="neg_root_mean_squared_error", n_jobs=-1, cv=cv
)

# perf_counter is monotonic, so the elapsed time is not affected by system
# clock adjustments during this long-running search.
start = time.perf_counter()

result = search.fit(x, y)

end = time.perf_counter()
print("\nMinutes Elapsed: ", (end - start) / 60)

# best_score_ is the negated RMSE (higher is better for scikit-learn scorers).
print("Best Score: %s" % result.best_score_)
print("Best Hyperparameters: %s" % result.best_params_)


Minutes Elapsed:  126.04459615151087
Best Score: -1.7524398242410926
Best Hyperparameters: {'colsample_bytree': 0.3, 'eta': 0.12, 'gamma': 1, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 50, 'subsample': 0.8}


# Repeated KFold Cross validation

## 1.1 Dataset: no-r3d-r7d-no-resampling-with-fillna

<h3>Experiment 1</h3>

<h4> Repeated KFold Cross Validation - df_no_r3d_r7d</h4>

<b>Dataset: df_no_r3d_r7d:</b> In this dataset the R3D and R7D variables are not considered.

<b>Feature Selection:</b> The IP and FP variables are discarded.

In [10]:
# Experiment 1 inputs: drop the date/time stamp, the target and the two
# setting-time variables (IP, FP); BL is kept as a feature.
x = df_no_r3d_r7d.drop(columns=["Data/Hora", "R28D", "IP", "FP"])
y = df_no_r3d_r7d["R28D"]

In [11]:
# 10-fold cross-validation repeated 10 times -> 100 RMSE estimates.
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=seed)
model = get_xgboost_model()
scores = cross_val_score(
    estimator=model,
    X=x,
    y=y,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
)
# The scorer yields negated RMSE values; report their magnitudes.
scores = np.abs(scores)
rmse_mean, rmse_std = scores.mean(), scores.std()
print("RMSE: %.3f (%.3f)" % (rmse_mean, rmse_std))

RMSE: 1.868 (0.238)


<h3>Experiment 2</h3>


<h4> Repeated KFold Cross Validation - df_no_r3d_r7d</h4>

<b>Dataset: df_no_r3d_r7d:</b> In this dataset the R3D and R7D variables are not considered.

<b>Feature Selection:</b> The IP, FP and BL variables are discarded.

In [12]:
# Experiment 2 inputs: as experiment 1, but the Blaine specific surface (BL)
# is dropped as well.
x = df_no_r3d_r7d.drop(columns=["Data/Hora", "R28D", "IP", "FP", "BL"])
y = df_no_r3d_r7d["R28D"]

In [13]:
# 10-fold cross-validation repeated 10 times -> 100 RMSE estimates.
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=seed)
model = get_xgboost_model()
scores = cross_val_score(
    estimator=model,
    X=x,
    y=y,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
)
# The scorer yields negated RMSE values; report their magnitudes.
scores = np.abs(scores)
rmse_mean, rmse_std = scores.mean(), scores.std()
print("RMSE: %.3f (%.3f)" % (rmse_mean, rmse_std))

RMSE: 1.958 (0.271)


## 1.2 Dataset: df_r3d_only

<h3>Experiment 3</h3>
<h4> Repeated KFold Cross Validation - df_r3d-only</h4>

<b>Dataset: df_r3d-only:</b> In this dataset the R7D variable is not considered.

<b>Feature Selection:</b> The IP and FP variables are discarded.

In [14]:
# Experiment 3 inputs (R3D dataset): drop the date/time stamp, the target and
# the two setting-time variables (IP, FP); BL is kept as a feature.
x = df_r3d_only.drop(columns=["Data/Hora", "R28D", "IP", "FP"])
y = df_r3d_only["R28D"]

In [15]:
# 10-fold cross-validation repeated 10 times -> 100 RMSE estimates.
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=seed)
model = get_xgboost_model()
scores = cross_val_score(
    estimator=model,
    X=x,
    y=y,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
)
# The scorer yields negated RMSE values; report their magnitudes.
scores = np.abs(scores)
rmse_mean, rmse_std = scores.mean(), scores.std()
print("RMSE: %.3f (%.3f)" % (rmse_mean, rmse_std))

RMSE: 1.822 (0.235)


<h3>Experiment 4</h3>

<h4> Repeated KFold Cross Validation - df_r3d-only</h4>

<b>Dataset: df_r3d-only:</b> In this dataset the R7D variable is not considered.

<b>Feature Selection:</b> The IP, FP and BL variables are discarded.

In [16]:
# Experiment 4 inputs (R3D dataset): as experiment 3, but the Blaine specific
# surface (BL) is dropped as well.
x = df_r3d_only.drop(columns=["Data/Hora", "R28D", "IP", "FP", "BL"])
y = df_r3d_only["R28D"]

In [17]:
# 10-fold cross-validation repeated 10 times -> 100 RMSE estimates.
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=seed)
model = get_xgboost_model()
scores = cross_val_score(
    estimator=model,
    X=x,
    y=y,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
)
# The scorer yields negated RMSE values; report their magnitudes.
scores = np.abs(scores)
rmse_mean, rmse_std = scores.mean(), scores.std()
print("RMSE: %.3f (%.3f)" % (rmse_mean, rmse_std))

RMSE: 1.873 (0.256)


## 1.3 Dataset: df_r3d_and_r7d

<h3>Experiment 5</h3>
<h4> Repeated KFold Cross Validation - df_r3d-and-r7d</h4>

<b>Dataset: df_r3d_and_r7d:</b> In this dataset both R3D and R7D variables are considered.

<b>Feature Selection:</b> The IP and FP variables are discarded.

In [20]:
# Experiment 5 inputs (R3D + R7D dataset): drop the date/time stamp, the
# target and the two setting-time variables (IP, FP); BL is kept as a feature.
x = df_r3d_and_r7d.drop(columns=["Data/Hora", "R28D", "IP", "FP"])
y = df_r3d_and_r7d["R28D"]

In [21]:
# 10-fold cross-validation repeated 10 times -> 100 RMSE estimates.
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=seed)
model = get_xgboost_model()
scores = cross_val_score(
    estimator=model,
    X=x,
    y=y,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
)
# The scorer yields negated RMSE values; report their magnitudes.
scores = np.abs(scores)
rmse_mean, rmse_std = scores.mean(), scores.std()
print("RMSE: %.3f (%.3f)" % (rmse_mean, rmse_std))

RMSE: 1.709 (0.229)


<h3>Experiment 6</h3>
<h4> Repeated KFold Cross Validation - df_r3d-and-r7d</h4>

<b>Dataset: df_r3d_and_r7d:</b> In this dataset both R3D and R7D variables are considered.

<b>Feature Selection:</b> The IP, FP and BL variables are discarded.

In [22]:
# Inputs for the BL-free run on the R3D + R7D dataset: drop the date/time
# stamp, the target, the setting-time variables (IP, FP) and the Blaine
# specific surface (BL).
x = df_r3d_and_r7d.drop(columns=["Data/Hora", "R28D", "IP", "FP", "BL"])
y = df_r3d_and_r7d["R28D"]

In [23]:
# 10-fold cross-validation repeated 10 times -> 100 RMSE estimates.
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=seed)
model = get_xgboost_model()
scores = cross_val_score(
    estimator=model,
    X=x,
    y=y,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
)
# The scorer yields negated RMSE values; report their magnitudes.
scores = np.abs(scores)
rmse_mean, rmse_std = scores.mean(), scores.std()
print("RMSE: %.3f (%.3f)" % (rmse_mean, rmse_std))

RMSE: 1.728 (0.243)
