In [1]:
import pandas as pd
import os
import nibabel as nib
import pydicom as dicom
import h5py
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm_notebook
import wandb
from skimage.transform import resize
from sklearn.metrics import roc_auc_score
from loguru import logger
import random
from catboost import CatBoostClassifier
import time
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
from boruta import BorutaPy
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures, SmartCorrelatedSelection, SelectByShuffling
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns

In [144]:
import matplotlib as mpl

mpl.use("pgf")

mpl.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
})

mpl.rcParams['font.weight'] = "bold"

mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.spines.top'] = False

mpl.rcParams['axes.titlesize'] = 24
mpl.rcParams['axes.labelsize'] = 14

mpl.rcParams['axes.linewidth'] = 3

mpl.rcParams['axes.labelcolor'] = "gray"
mpl.rcParams['axes.edgecolor'] = "gainsboro"

mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['lines.markersize'] = 10

mpl.rcParams['xtick.color'] = "darkgray" #"#494949ff"
mpl.rcParams['ytick.color'] = "darkgray" #"#494949ff"

mpl.rcParams['xtick.labelsize'] = 10
mpl.rcParams['ytick.labelsize'] = 10

In [3]:
# torch.manual_seed(383)
random.seed(383)
np.random.seed(383)

# Read data

In [4]:
df = pd.read_csv("Data/MGMT_FEATURES.csv")
# df = pd.read_csv("Data/MGMT_FEATURES_REAL_VALUES_ONLY.csv")
df.head()

Unnamed: 0,original_shape_Elongation_999,original_shape_Flatness_999,original_shape_LeastAxisLength_999,original_shape_MajorAxisLength_999,original_shape_Maximum2DDiameterColumn_999,original_shape_Maximum2DDiameterRow_999,original_shape_Maximum2DDiameterSlice_999,original_shape_Maximum3DDiameter_999,original_shape_MeshVolume_999,original_shape_MinorAxisLength_999,...,original_glszm_ZoneEntropy_999,original_glszm_ZonePercentage_999,original_glszm_ZoneVariance_999,original_ngtdm_Busyness_999,original_ngtdm_Coarseness_999,original_ngtdm_Complexity_999,original_ngtdm_Contrast_999,original_ngtdm_Strength_999,id,target
0,0.671941,0.596258,35.765461,59.983238,49.244289,73.756356,78.549348,81.289606,57215.583333,40.305199,...,7.412929,0.224012,2712.620134,2.032407,0.000198,4369.255898,0.05409,0.519513,BraTS2021_00000,1
1,0.786725,0.739887,59.3398,80.201127,88.543774,103.04368,101.316336,103.484298,190441.125,63.0962,...,7.229311,0.095564,76767.243664,4.320882,8.5e-05,3479.717131,0.016914,0.477147,BraTS2021_00002,1
2,0.81368,0.698377,44.600645,63.863267,62.64982,72.56032,71.84706,73.851202,99159.916667,51.964271,...,7.316799,0.204607,3104.867601,1.664162,0.000166,5939.801978,0.034956,0.578801,BraTS2021_00003,0
3,0.680784,0.646685,48.804745,75.469119,72.498276,81.049368,91.285267,92.956979,124691.833333,51.378194,...,7.85808,0.236675,1556.531377,1.434415,0.000167,6820.657571,0.132865,0.511573,BraTS2021_00005,1
4,0.810884,0.792596,54.372037,68.599966,74.1485,75.82216,80.05623,80.907354,137812.125,55.626638,...,7.634529,0.21102,2682.742794,1.698148,0.000149,5970.246455,0.054046,0.409836,BraTS2021_00006,1


In [5]:
df = df.dropna().reset_index(drop=True)

In [6]:
train = pd.read_csv("Data/TRAIN.csv")
# val = pd.read_csv("Data/VAL.csv")
test = pd.read_csv("Data/TEST.csv")

In [7]:
df_train = df[df["id"].isin(train["files"].to_list())].sample(frac=1, random_state=0).reset_index(drop=True)
# df_val = df[df["id"].isin(val["files"].to_list())].sample(frac=1, random_state=0).reset_index(drop=True)
df_test = df[df["id"].isin(test["files"].to_list())].sample(frac=1, random_state=0).reset_index(drop=True)

In [8]:
df_train.head()

Unnamed: 0,original_shape_Elongation_999,original_shape_Flatness_999,original_shape_LeastAxisLength_999,original_shape_MajorAxisLength_999,original_shape_Maximum2DDiameterColumn_999,original_shape_Maximum2DDiameterRow_999,original_shape_Maximum2DDiameterSlice_999,original_shape_Maximum3DDiameter_999,original_shape_MeshVolume_999,original_shape_MinorAxisLength_999,...,original_glszm_ZoneEntropy_999,original_glszm_ZonePercentage_999,original_glszm_ZoneVariance_999,original_ngtdm_Busyness_999,original_ngtdm_Coarseness_999,original_ngtdm_Complexity_999,original_ngtdm_Contrast_999,original_ngtdm_Strength_999,id,target
0,0.792231,0.609113,38.701883,63.53806,64.845971,90.801982,73.824115,104.780723,57830.25,50.336846,...,7.276526,0.09905,10101.522225,3.181045,0.000383,891.575236,0.038163,0.355945,BraTS2021_00305,1
1,0.660675,0.581067,43.428165,74.738686,59.211485,89.269256,84.095184,89.425947,109124.625,49.378015,...,7.074585,0.109578,51222.945948,2.513356,0.000165,1453.22089,0.024449,0.230353,BraTS2021_00768,1
2,0.759326,0.722963,45.722051,63.242551,65.391131,71.693793,70.710678,74.471471,91401.416667,48.021688,...,6.977608,0.03282,127619.005126,4.513787,0.000273,411.821295,0.019406,0.164174,BraTS2021_00120,1
3,0.724236,0.618222,47.966566,77.587963,90.553851,98.005102,101.212647,107.447662,102218.125,56.191983,...,7.463089,0.119597,12538.805801,3.326229,0.00019,1365.964092,0.03381,0.217483,BraTS2021_00395,0
4,0.66468,0.624986,48.090662,76.946741,93.085982,65.802736,92.195445,94.074439,89080.166667,51.144952,...,6.984675,0.144062,22877.469858,3.98102,0.000116,3894.733353,0.015735,0.626619,BraTS2021_00021,0


In [9]:
x_train = df_train.drop(columns=["id", "target"])
y_train = df_train["target"]

x_test = df_test.drop(columns=["id", "target"])
y_test = df_test["target"]

In [10]:
x = x_train.copy()
y = y_train.copy()

## LogisticRegression

## Parâmetros *default*

In [11]:
search_space = {}

In [12]:
MP = make_pipeline(
    StandardScaler(), 
    LogisticRegression(max_iter=10000)
)

In [13]:
%%capture
GRD_LR = RandomizedSearchCV(MP, 
                            search_space, 
                            n_iter=1, 
                            cv=10, 
                            scoring="roc_auc", 
                            verbose=3, 
                            return_train_score=True, 
                            n_jobs=-1)
GRD_LR.fit(x, y)

In [14]:
pd.DataFrame(GRD_LR.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]].head()

Unnamed: 0,params,mean_test_score,mean_train_score,rank_test_score
0,{},0.569522,0.755186,1


In [15]:
LR = make_pipeline(
    StandardScaler(), 
    LogisticRegression(max_iter=10000)
)
LR.fit(x, y)

auc_test = roc_auc_score(y_test, LR.predict_proba(x_test)[:, 1])

print(f"AUC test: {100*auc_test:.2f}%.")

AUC test: 65.62%.


## Validação *bootstrap* 

In [16]:
def eval_model(model, x, y):
    
    x = x.copy()
    y = y.copy()    
    
    y_pred = model.predict_proba(x)[:, 1]
    
    return roc_auc_score(y, y_pred)

In [17]:
def run_validations(model, x, y, n_simulations=100):
    
    auc_train_array = np.zeros(n_simulations)
    auc_val_array = np.zeros(n_simulations)
    
    parameters_list = []
    
    for rs in range(n_simulations):
        
        x_train, x_val, y_train, y_val = train_test_split(x,
                                                         y,
                                                         test_size=0.25,
                                                         random_state=rs,
                                                         stratify=y)
        
        model.fit(x_train, y_train)
        
        auc_train = eval_model(model, x_train, y_train)
        auc_val = eval_model(model, x_val, y_val)
        
        auc_train_array[rs] = auc_train
        auc_val_array[rs] = auc_val
                
    df_evaluation = pd.DataFrame(data={
        "auc_train": auc_train_array,
        "auc_val": auc_val_array,
        "random_state": range(n_simulations)
    }).sort_values(by="auc_val", ascending=False).reset_index(drop=True)

    return df_evaluation
 

In [18]:
def sample_hyperparameters_logistic_regression(x, y, n_rounds, n_simulations=100):
    
    results_list = []
    
    for n_round in range(n_rounds):
    
        search_space = {
                "C": np.random.choice(np.logspace(-6, 2)),
                "penalty": np.random.choice(["l2"]),
                "max_iter": 10000
         }
        
        MP = make_pipeline(
                DropConstantFeatures(), 
                DropDuplicateFeatures(), 
                SmartCorrelatedSelection(selection_method="variance"),
                StandardScaler(), 
                LogisticRegression(**search_space)
            )
        
        results = run_validations(MP, x, y, n_simulations)

        results["parameters"] = str(search_space)
        results["round"] = n_round
        results_list.append(results)
        
    df_results = pd.concat(results_list).reset_index(drop=True)
    
    return df_results

def validate_logistic_regression_in_test_set(results, x, y, x_test, y_test):
    
    mean_auc_val = results.groupby("round")["auc_val"].quantile(0.5).reset_index().sort_values(by="auc_val", ascending=False).reset_index(drop=True)
    
    best_round = 2 #mean_auc_val["round"].tolist()[0]
        
    parameters = eval(results[results["round"] == best_round]["parameters"].tolist()[0])
    
    model = make_pipeline(
                DropConstantFeatures(), 
                DropDuplicateFeatures(), 
                SmartCorrelatedSelection(selection_method="variance"),
                StandardScaler(), 
                LogisticRegression(**parameters)
            )
    
    auc_train_array = np.zeros(100)
    auc_val_array = np.zeros(100)
    auc_test_array = np.zeros(100)
    
    for rs in range(100):
    
        x_train, x_val, y_train, y_val = train_test_split(x,
                                                     y,
                                                     test_size=0.25,
                                                     random_state=rs,
                                                     stratify=y)

        model.fit(x_train, y_train)

        auc_train = eval_model(model, x_train, y_train)
        auc_val = eval_model(model, x_val, y_val)
        auc_test = eval_model(model, x_test, y_test)
        
        auc_train_array[rs] = auc_train
        auc_val_array[rs] = auc_val
        auc_test_array[rs] = auc_test
        
    print(f"AUC in train: {100*auc_train:.2f}")
    print(f"AUC in validation: {100*auc_val:.2f}")
    print(f"AUC in test: {100*auc_test:.2f}")
    
    return pd.DataFrame(data={
        "auc_train": auc_train_array,
        "auc_val": auc_val_array,
        "auc_test": auc_test_array,
    }), model, y_test, model.predict_proba(x_test)[:, 1] 

In [19]:
%%capture
df_results_lr = sample_hyperparameters_logistic_regression(x, y, 10, 100)

In [20]:
df_results_lr.head()

Unnamed: 0,auc_train,auc_val,random_state,parameters,round
0,0.631762,0.676511,21,"{'C': 0.005689866029018293, 'penalty': 'l2', '...",0
1,0.641382,0.655907,27,"{'C': 0.005689866029018293, 'penalty': 'l2', '...",0
2,0.640275,0.648352,89,"{'C': 0.005689866029018293, 'penalty': 'l2', '...",0
3,0.619813,0.643544,56,"{'C': 0.005689866029018293, 'penalty': 'l2', '...",0
4,0.642565,0.64011,55,"{'C': 0.005689866029018293, 'penalty': 'l2', '...",0


In [21]:
df_results_lr.groupby("round").agg({
    "auc_val": lambda x: np.quantile(x, 0.5),
    "auc_train": lambda x: np.quantile(x, 0.5)
}).sort_values(by="auc_val", ascending=False).head()

Unnamed: 0_level_0,auc_val,auc_train
round,Unnamed: 1_level_1,Unnamed: 2_level_1
2,0.580529,0.684157
3,0.577266,0.664287
9,0.575378,0.693415
0,0.565934,0.652205
4,0.56044,0.702901


In [22]:
def set_size(width_pt, fraction=1, subplots=(1, 1)):
    """Set figure dimensions to sit nicely in our document.

    Parameters
    ----------
    width_pt: float
            Document width in points
    fraction: float, optional
            Fraction of the width which you wish the figure to occupy
    subplots: array-like, optional
            The number of rows and columns of subplots.
    Returns
    -------
    fig_dim: tuple
            Dimensions of figure in inches
    """
    # Width of figure (in pts)
    fig_width_pt = width_pt * fraction
    # Convert from pt to inches
    inches_per_pt = 1 / 72.27

    # Golden ratio to set aesthetic figure height
    golden_ratio = (5**.5 - 1) / 2

    # Figure width in inches
    fig_width_in = fig_width_pt * inches_per_pt
    # Figure height in inches
    fig_height_in = fig_width_in * golden_ratio * (subplots[0] / subplots[1])

    return (fig_width_in, fig_height_in)

In [23]:
# plt.figure(figsize=set_size(545))

fig, ax = plt.subplots(1, 1, figsize=set_size(445))


# fig = sns.histplot(df_results_lr.query("round == 3"), x="auc_val", hue="round")

ax.hist(100*df_results_lr.query("round == 2")["auc_train"])
ax.hist(100*df_results_lr.query("round == 2")["auc_val"])
    
plt.xlabel("ROC-AUC")

plt.ylabel("Quantidade")

plt.legend(["Treino", "Validação"])
    
# plt.show()

# plt.savefig("FIG_LR.pgf")

<matplotlib.legend.Legend at 0x7fca8cfcae50>

In [24]:
df_test_lr, model_lr, y_test_lr, y_pred_lr = validate_logistic_regression_in_test_set(df_results_lr, x, y, x_test, y_test)

AUC in train: 68.62
AUC in validation: 59.75
AUC in test: 64.13


In [25]:
df_test_lr.std()

auc_train    0.013244
auc_val      0.041070
auc_test     0.021022
dtype: float64

In [26]:
df_test_lr["auc_train"].plot.hist()
df_test_lr["auc_val"].plot.hist()
df_test_lr["auc_test"].plot.hist()

plt.legend()

<matplotlib.legend.Legend at 0x7fca9402c6d0>

## *Random search*

In [27]:
search_space = {
    "logisticregression__C": [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.5, 1, 2, 3, 4, 5, 10, 15, 20, 25],
    "logisticregression__penalty": ["l1", "l2", "elasticnet", None]
}

In [28]:
MP = make_pipeline(
    DropConstantFeatures(), 
    DropDuplicateFeatures(), 
    SmartCorrelatedSelection(selection_method="variance"),
    StandardScaler(), 
    LogisticRegression(max_iter=10000))

In [29]:
# %%capture
# GRD_LR = RandomizedSearchCV(MP, 
#                             search_space, 
#                             n_iter=500, 
#                             cv=10, 
#                             scoring="roc_auc", 
#                             verbose=3, 
#                             return_train_score=True, 
#                             n_jobs=-1)
# GRD_LR.fit(x, y)

In [30]:
# input_cols = x.columns

# selected_cols = GRD_LR.best_estimator_[:-2].transform(x).columns

# print(f"Input features: {len(input_cols)}.")
# print(f"Output features: {len(selected_cols)}.")
# print(f"Removed features: {len(input_cols) - len(selected_cols)}.")

In [31]:
# pd.DataFrame(GRD_LR.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]].head()

In [32]:
# results = pd.DataFrame(GRD_LR.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]]
# results["score_diff"] = np.abs(results["mean_train_score"] - results["mean_test_score"])

# best_idx = results[results["score_diff"] < 0.25].sort_values(by="mean_test_score", ascending=False).index[0]

# results[results["score_diff"] < 0.25].sort_values(by="mean_test_score", ascending=False).head()

In [33]:
# params = pd.DataFrame(GRD_LR.cv_results_).iloc[best_idx]["params"]
# params = {k.split("__")[-1]: params[k] for k in params}
# params["max_iter"] = 10000

# LR = make_pipeline(
#     DropConstantFeatures(), 
#     DropDuplicateFeatures(), 
#     SmartCorrelatedSelection(selection_method="variance"),
#     StandardScaler(), 
#     LogisticRegression(**params)
# )

# LR.fit(x, y)

# auc_test = roc_auc_score(y_test, LR.predict_proba(x_test)[:, 1])

# print(f"AUC test: {100*auc_test:.2f}%.")

## CatBoost

## Parâmetros *default*

In [34]:
search_space = {}

In [35]:
MP = make_pipeline(
    CatBoostClassifier(verbose=False)
)

In [36]:
# %%capture
# GRD_CB = RandomizedSearchCV(MP, 
#                    search_space, 
#                    n_iter=50,
#                    scoring="roc_auc", 
#                    cv=10, 
#                    verbose=3, 
#                    return_train_score=True, 
#                    n_jobs=-1)
# GRD_CB.fit(x, y)

In [37]:
# results = pd.DataFrame(GRD_CB.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]]
# results["score_diff"] = np.abs(results["mean_train_score"] - results["mean_test_score"])

# results.head()

In [38]:
# params = pd.DataFrame(GRD_CB.cv_results_).iloc[0]["params"]
# params = {k.split("__")[-1]: params[k] for k in params}
# params["verbose"] = False

# CB = CatBoostClassifier(**params)
# CB.fit(x, y)

# auc_test = roc_auc_score(y_test, CB.predict_proba(x_test)[:, 1])

# print(f"AUC test: {100*auc_test:.2f}%.")

## Validação *bootstrap* 

In [39]:
# def sample_hyperparameters_catboost(x, y, n_rounds, n_simulations=100):
    
#     results_list = []
    
#     for n_round in range(n_rounds):

#         search_space = {
#             "depth": np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
#             "n_estimators": np.random.choice([5, 10, 15, 20, 25, 50, 75, 100, 200, 300, 400, 500]),
#             "l2_leaf_reg": np.random.choice([1, 2, 5, 10, 15, 20, 25, 50, 100, 250, 500]),
#             "learning_rate": np.random.choice(list(np.logspace(-6, 0)) + [None]),
#             "auto_class_weights": np.random.choice([None, "Balanced", "SqrtBalanced"]),
#             "boosting_type": np.random.choice(["Ordered", "Plain"]),
#             "verbose": False
#         }
        
#         MP = make_pipeline(
#                 DropConstantFeatures(), 
#                 DropDuplicateFeatures(), 
#                 SmartCorrelatedSelection(selection_method="variance"),
#                 CatBoostClassifier(**search_space)
#             )
        
#         results = run_validations(MP, x, y, n_simulations)

#         results["parameters"] = str(search_space)
#         results["round"] = n_round
#         results_list.append(results)
        
#     df_results = pd.concat(results_list).reset_index(drop=True)
    
#     return df_results

# def validate_catboost_in_test_set(results, x, y, x_test, y_test):
    
#     mean_auc_val = results.groupby("round")["auc_val"].mean().reset_index().sort_values(by="auc_val", ascending=False).reset_index(drop=True)
    
#     best_round = mean_auc_val["round"].tolist()[0]
        
#     parameters = eval(results[results["round"] == best_round]["parameters"].tolist()[0])
    
#     model = make_pipeline(
#                 DropConstantFeatures(), 
#                 DropDuplicateFeatures(), 
#                 SmartCorrelatedSelection(selection_method="variance"),
#                 CatBoostClassifier(**parameters)
#             )
    
#     x_train, x_val, y_train, y_val = train_test_split(x,
#                                                  y,
#                                                  test_size=0.25,
#                                                  random_state=0,
#                                                  stratify=y)

#     model.fit(x_train, y_train)
    
#     auc_train = eval_model(model, x_train, y_train)
#     auc_val = eval_model(model, x_val, y_val)
#     auc_test = eval_model(model, x_test, y_test)
        
#     print(f"AUC in train: {100*auc_train:.2f}")
#     print(f"AUC in validation: {100*auc_val:.2f}")
#     print(f"AUC in test: {100*auc_test:.2f}")

In [40]:
# %%capture
# df_results_cb = sample_hyperparameters_catboost(x, y, 10, 100)

In [41]:
# plt.figure(figsize=(25, 7.5))

# sns.histplot(df_results_cb, x="auc_val", hue="round")
    
# plt.show()

In [42]:
# df_results_cb.groupby("round").agg({
#     "auc_val": "mean",
#     "auc_train": "mean"
# }).sort_values(by="auc_val", ascending=False).head()

In [43]:
# validate_catboost_in_test_set(df_results_cb, x, y, x_test, y_test)

## *Random search*

In [44]:
# search_space = {
#     "catboostclassifier__depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
# #     "early_stopping_rounds": [5, 10, 15, 20],
#     "catboostclassifier__n_estimators": [5, 10, 15, 20, 25, 50, 75, 100, 200, 300, 400, 500],
#     "catboostclassifier__l2_leaf_reg": [1, 2, 5, 10, 15, 20, 25, 50, 100, 250, 500],
#     "catboostclassifier__learning_rate": list(np.logspace(-6, 0)) + [None],
#     "catboostclassifier__auto_class_weights": [None, "Balanced", "SqrtBalanced"],
#     "catboostclassifier__boosting_type": ["Ordered", "Plain"],
# #     "catboostclassifier__loss_function": ["AUC", "Logloss"],
# #     "catboostclassifier__min_data_in_leaf": [5, 10, 15, 20, 25, 30, 35, 40, 45, 100, 150, 200, None]
# }

In [45]:
# MP = make_pipeline(
#     DropConstantFeatures(), 
#     DropDuplicateFeatures(), 
#     SmartCorrelatedSelection(selection_method="variance"),
# #     SelectByShuffling(estimator=LogisticRegression(max_iter=10000), scoring="roc_auc", cv=3),
#     CatBoostClassifier(verbose=False)
# )

In [46]:
# %%capture
# GRD_CB = RandomizedSearchCV(MP, 
#                    search_space, 
#                    n_iter=100,
#                    scoring="roc_auc", 
#                    cv=10, 
#                    verbose=3, 
#                    return_train_score=True, 
#                    n_jobs=-1)
# GRD_CB.fit(x, y)

In [47]:
# results = pd.DataFrame(GRD_CB.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]]
# results["score_diff"] = np.abs(results["mean_train_score"] - results["mean_test_score"])

# best_idx = results[results["score_diff"] < 0.25].sort_values(by="mean_test_score", ascending=False).index[0]

# results[results["score_diff"] < 0.25].sort_values(by="mean_test_score", ascending=False).head()

In [48]:
# input_cols = x.columns

# selected_cols = GRD_CB.best_estimator_[:-1].transform(x).columns

# print(f"Input features: {len(input_cols)}.")
# print(f"Output features: {len(selected_cols)}.")
# print(f"Removed features: {len(input_cols) - len(selected_cols)}.")

In [49]:
# params = pd.DataFrame(GRD_CB.cv_results_).iloc[best_idx]["params"]
# params = {k.split("__")[-1]: params[k] for k in params}
# params["verbose"] = False

# CB = make_pipeline(
#                 DropConstantFeatures(), 
#                 DropDuplicateFeatures(), 
#                 SmartCorrelatedSelection(selection_method="variance"),
#                 CatBoostClassifier(**params)
#             )
# CB.fit(x, y)

# auc_test = roc_auc_score(y_test, CB.predict_proba(x_test)[:, 1])

# print(f"AUC test: {100*auc_test:.2f}%.")

# GradientBoostingClassifier

In [50]:
search_space = {}

In [51]:
from sklearn.ensemble import GradientBoostingClassifier

In [52]:
MP = make_pipeline(
    GradientBoostingClassifier()
)

In [53]:
%%capture
GRD_GB = RandomizedSearchCV(MP, 
                   search_space, 
                   n_iter=1,
                   scoring="roc_auc", 
                   cv=10, 
                   verbose=3, 
                   return_train_score=True, 
                   n_jobs=-1)
GRD_GB.fit(x, y)

In [54]:
results = pd.DataFrame(GRD_GB.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]]
results["score_diff"] = np.abs(results["mean_train_score"] - results["mean_test_score"])

results.head()

Unnamed: 0,params,mean_test_score,mean_train_score,rank_test_score,score_diff
0,{},0.546257,1.0,1,0.453743


In [55]:
GB = GradientBoostingClassifier()
GB.fit(x, y)

auc_test = roc_auc_score(y_test, GB.predict_proba(x_test)[:, 1])

print(f"AUC test: {100*auc_test:.2f}%.")

AUC test: 56.75%.


In [157]:
def sample_hyperparameters_gradient_boosting(x, y, n_rounds, n_simulations=100):
    
    results_list = []
    
    for n_round in range(n_rounds):

        search_space = {
            "max_depth": np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9]),
            "min_samples_split": np.random.choice([2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40]),
            "min_samples_leaf": np.random.choice([2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40]),    
            "n_estimators": np.random.choice([5, 10, 15, 20, 25, 50, 75, 100, 200, 300]),
            "max_features": np.random.choice(["auto", "sqrt", "log2"]),
            "validation_fraction": 0.25,
            "n_iter_no_change": 5            
        }
        
        MP = make_pipeline(
                DropConstantFeatures(), 
                DropDuplicateFeatures(), 
                SmartCorrelatedSelection(selection_method="variance"),
                GradientBoostingClassifier(**search_space)
            )
        
        results = run_validations(MP, x, y, n_simulations)

        results["parameters"] = str(search_space)
        results["round"] = n_round
        results_list.append(results)
        
    df_results = pd.concat(results_list).reset_index(drop=True)
    
    return df_results

    
def validate_gradient_boosting_in_test_set(results, x, y, x_test, y_test):
    
    mean_auc_val = results.groupby("round")["auc_val"].quantile(0.5).reset_index().sort_values(by="auc_val", ascending=False).reset_index(drop=True)
    
    best_round = 14 #mean_auc_val["round"].tolist()[0]
        
    parameters = eval(results[results["round"] == best_round]["parameters"].tolist()[0])
    
    model = make_pipeline(
                DropConstantFeatures(), 
                DropDuplicateFeatures(), 
                SmartCorrelatedSelection(selection_method="variance"),
                GradientBoostingClassifier(**parameters)
            )
    
    auc_train_array = np.zeros(100)
    auc_val_array = np.zeros(100)
    auc_test_array = np.zeros(100)
    
    for rs in range(100):
    
        x_train, x_val, y_train, y_val = train_test_split(x,
                                                     y,
                                                     test_size=0.25,
                                                     random_state=rs,
                                                     stratify=y)

        model.fit(x_train, y_train)

        auc_train = eval_model(model, x_train, y_train)
        auc_val = eval_model(model, x_val, y_val)
        auc_test = eval_model(model, x_test, y_test)
        
        auc_train_array[rs] = auc_train
        auc_val_array[rs] = auc_val
        auc_test_array[rs] = auc_test
        
    print(f"AUC in train: {100*auc_train:.2f}")
    print(f"AUC in validation: {100*auc_val:.2f}")
    print(f"AUC in test: {100*auc_test:.2f}")
    
    return pd.DataFrame(data={
        "auc_train": auc_train_array,
        "auc_val": auc_val_array,
        "auc_test": auc_test_array,
    }), model, y_test, model.predict_proba(x_test)[:, 1] 

In [57]:
%%capture
df_results_gb = sample_hyperparameters_gradient_boosting(x, y, 25, 1)

In [58]:
df_results_gb.groupby("round").agg({
    "auc_val": "median",
    "auc_train": "median"
}).sort_values(by="auc_val", ascending=False).head(10)

Unnamed: 0_level_0,auc_val,auc_train
round,Unnamed: 1_level_1,Unnamed: 2_level_1
4,0.649725,0.939149
0,0.618475,0.799236
18,0.61092,0.950181
9,0.610062,0.799389
15,0.606456,0.880664
14,0.575378,0.777629
10,0.571085,0.884844
6,0.569712,0.826513
13,0.563015,0.794713
22,0.554087,0.902882


In [59]:
from matplotlib import rc
rc("text", usetex=False)

In [60]:
mpl.use("pgf")

mpl.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
})

In [199]:
# plt.figure(figsize=set_size(545))

fig, ax = plt.subplots(1, 1, figsize=set_size(445))


# fig = sns.histplot(df_results_lr.query("round == 3"), x="auc_val", hue="round")

ax.hist(100*df_results_gb.query("round == 14")["auc_train"])
ax.hist(100*df_results_gb.query("round == 14")["auc_val"])
    
plt.xlabel("ROC-AUC")

plt.ylabel("Quantidade")

plt.legend(["Treino", "Validação"])
    
plt.show()

plt.savefig("FIG_GB.pgf")

  fig, ax = plt.subplots(1, 1, figsize=set_size(445))


In [158]:
df_test_gb, model_gb, y_test_gb, y_pred_gb = validate_gradient_boosting_in_test_set(df_results_gb, x, y, x_test, y_test)

AUC in train: 72.94
AUC in validation: 58.34
AUC in test: 53.55


In [159]:
df_test_gb.std()

auc_train    0.035013
auc_val      0.043822
auc_test     0.045776
dtype: float64

In [64]:
search_space = {
    "gradientboostingclassifier__max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    "gradientboostingclassifier__min_samples_split": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40],
    "gradientboostingclassifier__min_samples_leaf": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40],    
    "gradientboostingclassifier__n_estimators": [5, 10, 15, 20, 25, 50, 75, 100, 200, 300],
    "gradientboostingclassifier__max_features": ["auto", "sqrt", "log2"],
    
}

In [65]:
# MP = make_pipeline(
#     DropConstantFeatures(), 
#     DropDuplicateFeatures(), 
#     SmartCorrelatedSelection(selection_method="variance"),
# #     SelectByShuffling(estimator=LogisticRegression(max_iter=10000), scoring="roc_auc", cv=3),
#     GradientBoostingClassifier(validation_fraction=0.25, n_iter_no_change=5)
# )

In [66]:
# %%capture
# GRD_GB = RandomizedSearchCV(MP, 
#                    search_space, 
#                    n_iter=20,
#                    scoring="roc_auc", 
#                    cv=10, 
#                    verbose=3, 
#                    return_train_score=True, 
#                    n_jobs=-1)
# GRD_GB.fit(x, y)

In [67]:
# results = pd.DataFrame(GRD_GB.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]]
# results["score_diff"] = np.abs(results["mean_train_score"] - results["mean_test_score"])

# best_idx = results[results["score_diff"] < 0.25].sort_values(by="mean_test_score", ascending=False).index[0]

# results[results["score_diff"] < 0.25].sort_values(by="mean_test_score", ascending=False).head()

In [68]:
# input_cols = x.columns

# selected_cols = GRD_GB.best_estimator_[:-1].transform(x).columns

# print(f"Input features: {len(input_cols)}.")
# print(f"Output features: {len(selected_cols)}.")
# print(f"Removed features: {len(input_cols) - len(selected_cols)}.")

In [69]:
# params = pd.DataFrame(GRD_GB.cv_results_).iloc[best_idx]["params"]
# params = {k.split("__")[-1]: params[k] for k in params}
# params["validation_fraction"] = 0.25
# params["n_iter_no_change"] = 5

# GBC = make_pipeline(
#                 DropConstantFeatures(), 
#                 DropDuplicateFeatures(), 
#                 SmartCorrelatedSelection(selection_method="variance"),
#                 GradientBoostingClassifier(**params)
#             )
# GBC.fit(x, y)

# auc_test = roc_auc_score(y_test, GBC.predict_proba(x_test)[:, 1])

# print(f"AUC test: {100*auc_test:.2f}%.")

# RandomForests

## Parâmetros *default*

In [70]:
search_space = {}

In [71]:
MP = make_pipeline(
    RandomForestClassifier()
)

In [72]:
%%capture
GRD_RF = RandomizedSearchCV(MP, 
                   search_space, 
                   n_iter=1,
                   scoring="roc_auc", 
                   cv=10, 
                   verbose=3, 
                   return_train_score=True, 
                   n_jobs=-1)
GRD_RF.fit(x, y)

In [73]:
results = pd.DataFrame(GRD_RF.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]]
results["score_diff"] = np.abs(results["mean_train_score"] - results["mean_test_score"])

results.head()

Unnamed: 0,params,mean_test_score,mean_train_score,rank_test_score,score_diff
0,{},0.571607,1.0,1,0.428393


In [74]:
RF = RandomForestClassifier()
RF.fit(x, y)

auc_test = roc_auc_score(y_test, RF.predict_proba(x_test)[:, 1])

print(f"AUC test: {100*auc_test:.2f}%.")

AUC test: 55.28%.


## Validação *bootstrap* 

In [75]:
def sample_hyperparameters_random_forest(x, y, n_rounds, n_simulations=100):
    
    results_list = []
    
    for n_round in range(n_rounds):

        search_space = {
            "max_depth": np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9]),
            "min_samples_split": np.random.choice([2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40]),
            "min_samples_leaf": np.random.choice([2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40]),    
            "n_estimators": np.random.choice([5, 10, 15, 20, 25, 50, 75, 100, 200, 300]),
            "max_features": np.random.choice(["auto", "sqrt", "log2"]),
        }
        
        MP = make_pipeline(
                DropConstantFeatures(), 
                DropDuplicateFeatures(), 
                SmartCorrelatedSelection(selection_method="variance"),
                RandomForestClassifier(**search_space)
            )
        
        results = run_validations(MP, x, y, n_simulations)

        results["parameters"] = str(search_space)
        results["round"] = n_round
        results_list.append(results)
        
    df_results = pd.concat(results_list).reset_index(drop=True)
    
    return df_results
    
def validate_random_forest_in_test_set(results, x, y, x_test, y_test):
    
    mean_auc_val = results.groupby("round")["auc_val"].quantile(0.5).reset_index().sort_values(by="auc_val", ascending=False).reset_index(drop=True)
    
    best_round = 1 #mean_auc_val["round"].tolist()[0]
        
    parameters = eval(results[results["round"] == best_round]["parameters"].tolist()[0])
    
    model = make_pipeline(
                DropConstantFeatures(), 
                DropDuplicateFeatures(), 
                SmartCorrelatedSelection(selection_method="variance"),
                RandomForestClassifier(**parameters)
            )
    
    auc_train_array = np.zeros(100)
    auc_val_array = np.zeros(100)
    auc_test_array = np.zeros(100)
    
    for rs in range(100):
    
        x_train, x_val, y_train, y_val = train_test_split(x,
                                                     y,
                                                     test_size=0.25,
                                                     random_state=rs,
                                                     stratify=y)

        model.fit(x_train, y_train)

        auc_train = eval_model(model, x_train, y_train)
        auc_val = eval_model(model, x_val, y_val)
        auc_test = eval_model(model, x_test, y_test)
        
        auc_train_array[rs] = auc_train
        auc_val_array[rs] = auc_val
        auc_test_array[rs] = auc_test
        
    print(f"AUC in train: {100*auc_train:.2f}")
    print(f"AUC in validation: {100*auc_val:.2f}")
    print(f"AUC in test: {100*auc_test:.2f}")
    
    return pd.DataFrame(data={
        "auc_train": auc_train_array,
        "auc_val": auc_val_array,
        "auc_test": auc_test_array,
    }), model, y_test, model.predict_proba(x_test)[:, 1] 

In [76]:
%%capture
df_results_rf = sample_hyperparameters_random_forest(x, y, 25, 100)

In [77]:
df_results_rf.head()

Unnamed: 0,auc_train,auc_val,random_state,parameters,round
0,0.775282,0.689045,97,"{'max_depth': 3, 'min_samples_split': 8, 'min_...",0
1,0.760584,0.645776,54,"{'max_depth': 3, 'min_samples_split': 8, 'min_...",0
2,0.751727,0.637363,12,"{'max_depth': 3, 'min_samples_split': 8, 'min_...",0
3,0.757759,0.630323,67,"{'max_depth': 3, 'min_samples_split': 8, 'min_...",0
4,0.764383,0.61693,58,"{'max_depth': 3, 'min_samples_split': 8, 'min_...",0


In [78]:
df_results_rf.groupby("round").agg({
    "auc_val": "mean",
    "auc_train": "mean"
}).sort_values(by="auc_val", ascending=False).head(25)

Unnamed: 0_level_0,auc_val,auc_train
round,Unnamed: 1_level_1,Unnamed: 2_level_1
22,0.564083,0.975735
8,0.561487,0.944592
15,0.561027,0.986703
20,0.558255,0.966585
3,0.55795,0.871699
12,0.557874,0.911183
24,0.557356,0.90389
11,0.557059,0.871974
23,0.556497,0.850525
19,0.555021,0.876013


In [208]:
# # plt.figure(figsize=set_size(545))

# fig, ax = plt.subplots(1, 1, figsize=set_size(445))


# # fig = sns.histplot(df_results_lr.query("round == 3"), x="auc_val", hue="round")

# ax.hist(100*df_results_rf.query("round == 1")["auc_train"])
# ax.hist(100*df_results_rf.query("round == 1")["auc_val"])
    
# plt.xlabel("ROC-AUC")

# plt.ylabel("Quantidade")

# plt.legend(["Treino", "Validação"])
    
# plt.show()

# plt.savefig("FIG_RF.pgf")

In [209]:
# fig

In [80]:
df_test_rf, model_rf, y_test_rf, y_pred_rf = validate_random_forest_in_test_set(df_results_rf, x, y, x_test, y_test)

AUC in train: 75.00
AUC in validation: 57.86
AUC in test: 53.24


In [81]:
df_test_rf.std()

auc_train    0.013899
auc_val      0.047020
auc_test     0.046990
dtype: float64

## *Random search*

In [82]:
# search_space = {
#     "randomforestclassifier__max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9],
#     "randomforestclassifier__min_samples_split": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40],
#     "randomforestclassifier__min_samples_leaf": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40],    
#     "randomforestclassifier__n_estimators": [5, 10, 15, 20, 25, 50, 75, 100, 200, 300],
#     "randomforestclassifier__max_features": ["auto", "sqrt", "log2"],
# }

In [83]:
# MP = make_pipeline(
#     DropConstantFeatures(), 
#     DropDuplicateFeatures(), 
#     SmartCorrelatedSelection(selection_method="variance"),
# #     SelectByShuffling(estimator=LogisticRegression(max_iter=10000), scoring="roc_auc", cv=3),
#     RandomForestClassifier()
# )

In [84]:
# %%capture
# GRD_RF = RandomizedSearchCV(MP, 
#                    search_space, 
#                    n_iter=100,
#                    scoring="roc_auc", 
#                    cv=10, 
#                    verbose=3, 
#                    return_train_score=True, 
#                    n_jobs=-1)
# GRD_RF.fit(x, y)

In [85]:
# results = pd.DataFrame(GRD_RF.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]]
# results["score_diff"] = np.abs(results["mean_train_score"] - results["mean_test_score"])

# best_idx = results[results["score_diff"] < 0.25].sort_values(by="mean_test_score", ascending=False).index[0]

# results[results["score_diff"] < 0.25].sort_values(by="mean_test_score", ascending=False).head()

In [86]:
# input_cols = x.columns

# selected_cols = GRD_RF.best_estimator_[:-1].transform(x).columns

# print(f"Input features: {len(input_cols)}.")
# print(f"Output features: {len(selected_cols)}.")
# print(f"Removed features: {len(input_cols) - len(selected_cols)}.")

In [87]:
# params = pd.DataFrame(GRD_RF.cv_results_).iloc[best_idx]["params"]
# params = {k.split("__")[-1]: params[k] for k in params}
# # params["verbose"] = False

# RF = make_pipeline(
#                 DropConstantFeatures(), 
#                 DropDuplicateFeatures(), 
#                 SmartCorrelatedSelection(selection_method="variance"),
#                 RandomForestClassifier(**params)
#             )
# RF.fit(x, y)

# auc_test = roc_auc_score(y_test, RF.predict_proba(x_test)[:, 1])

# print(f"AUC test: {100*auc_test:.2f}%.")

# DecisionTree

## Parâmetros *default*

In [88]:
search_space = {}

In [89]:
# MP = make_pipeline(
#     DecisionTreeClassifier()
# )

In [90]:
# %%capture
# GRD_DT = RandomizedSearchCV(MP, 
#                    search_space, 
#                    n_iter=1,
#                    scoring="roc_auc", 
#                    cv=10, 
#                    verbose=3, 
#                    return_train_score=True, 
#                    n_jobs=-1)
# GRD_DT.fit(x, y)

In [91]:
# results = pd.DataFrame(GRD_DT.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]]
# results["score_diff"] = np.abs(results["mean_train_score"] - results["mean_test_score"])

# results.head()

In [92]:
# DT = DecisionTreeClassifier()
# DT.fit(x, y)

# auc_test = roc_auc_score(y_test, DT.predict_proba(x_test)[:, 1])

# print(f"AUC test: {100*auc_test:.2f}%.")

## Validação *bootstrap* 

In [93]:
# def sample_hyperparameters_decision_tree(x, y, n_rounds, n_simulations=100):
    
#     results_list = []
    
#     for n_round in range(n_rounds):

#         search_space = {
#                 "max_depth": np.random.choice([1, 2, 3, 4, 5, 6]),
#                 "min_samples_split": np.random.choice([2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40]),
#                 "min_samples_leaf": np.random.choice([2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40]),    
#                 "max_features": np.random.choice(["sqrt", "log2"]),
#                 "random_state": np.random.choice([0, 1, 2, 3, 4, 5, 6]),
#                 "criterion": np.random.choice(["gini", "entropy"])
#         }
        
#         MP = make_pipeline(
#                 DropConstantFeatures(), 
#                 DropDuplicateFeatures(), 
#                 SmartCorrelatedSelection(selection_method="variance"),
#                 DecisionTreeClassifier(**search_space)
#             )
        
#         results = run_validations(MP, x, y, n_simulations)

#         results["parameters"] = str(search_space)
#         results["round"] = n_round
#         results_list.append(results)
        
#     df_results = pd.concat(results_list).reset_index(drop=True)
    
#     return df_results

# def validate_decision_tree_in_test_set(results, x, y, x_test, y_test):
    
#     mean_auc_val = results.groupby("round")["auc_val"].median().reset_index().sort_values(by="auc_val", ascending=False).reset_index(drop=True)
    
#     best_round = mean_auc_val["round"].tolist()[0]
        
#     parameters = eval(results[results["round"] == best_round]["parameters"].tolist()[0])
    
#     model = make_pipeline(
#                 DropConstantFeatures(), 
#                 DropDuplicateFeatures(), 
#                 SmartCorrelatedSelection(selection_method="variance"),
#                 DecisionTreeClassifier(**parameters)
#             )
    
#     x_train, x_val, y_train, y_val = train_test_split(x,
#                                                  y,
#                                                  test_size=0.25,
#                                                  random_state=0,
#                                                  stratify=y)

#     model.fit(x_train, y_train)
    
#     auc_train = eval_model(model, x_train, y_train)
#     auc_val = eval_model(model, x_val, y_val)
#     auc_test = eval_model(model, x_test, y_test)
        
#     print(f"AUC in train: {100*auc_train:.2f}")
#     print(f"AUC in validation: {100*auc_val:.2f}")
#     print(f"AUC in test: {100*auc_test:.2f}")

In [94]:
# %%capture
# df_results_dt = sample_hyperparameters_decision_tree(x, y, 50, 100)

In [95]:
# plt.figure(figsize=(25, 7.5))

# sns.histplot(df_results_dt, x="auc_val", hue="round")
    
# plt.show()

In [96]:
# df_results_dt.groupby("round").agg({
#     "auc_val": "median",
#     "auc_train": "median"
# }).sort_values(by="auc_val", ascending=False).head()

In [97]:
# validate_decision_tree_in_test_set(df_results_dt, x, y, x_test, y_test)

## *Random search*

In [98]:
# search_space = {
#     "decisiontreeclassifier__max_depth": [1, 2, 3, 4, 5, 6],
#     "decisiontreeclassifier__min_samples_split": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40],
#     "decisiontreeclassifier__min_samples_leaf": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 30, 40],    
#     "decisiontreeclassifier__max_features": ["auto", "sqrt", "log2"],
#     "decisiontreeclassifier__random_state": [0, 1, 2, 3, 4, 5, 6],
#     "decisiontreeclassifier__criterion": ["gini", "entropy"]
# }

In [99]:
# MP = make_pipeline(
#     DropConstantFeatures(), 
#     DropDuplicateFeatures(), 
#     SmartCorrelatedSelection(selection_method="variance"),
# #     SelectByShuffling(estimator=LogisticRegression(max_iter=10000), scoring="roc_auc", cv=3),
#     DecisionTreeClassifier()
# )

In [100]:
# %%capture
# GRD_DT = RandomizedSearchCV(MP, 
#                    search_space, 
#                    n_iter=100,
#                    scoring="roc_auc", 
#                    cv=10, 
#                    verbose=3, 
#                    return_train_score=True, 
#                    n_jobs=-1)
# GRD_DT.fit(x, y)

In [101]:
# results = pd.DataFrame(GRD_DT.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]]
# results["score_diff"] = np.abs(results["mean_train_score"] - results["mean_test_score"])

# best_idx = results[results["score_diff"] < 0.25].sort_values(by="mean_test_score", ascending=False).index[0]

# results[results["score_diff"] < 0.25].sort_values(by="mean_test_score", ascending=False).head()

In [102]:
# input_cols = x.columns

# selected_cols = GRD_DT.best_estimator_[:-1].transform(x).columns

# print(f"Input features: {len(input_cols)}.")
# print(f"Output features: {len(selected_cols)}.")
# print(f"Removed features: {len(input_cols) - len(selected_cols)}.")

In [103]:
# params = pd.DataFrame(GRD_DT.cv_results_).iloc[best_idx]["params"]
# params = {k.split("__")[-1]: params[k] for k in params}
# # params["verbose"] = False

# DT = make_pipeline(
#     DropConstantFeatures(), 
#     DropDuplicateFeatures(), 
#     SmartCorrelatedSelection(selection_method="variance"),
# #     SelectByShuffling(estimator=LogisticRegression(max_iter=10000), scoring="roc_auc", cv=3),
#     DecisionTreeClassifier(**params)
# )
# DT.fit(x, y)

# auc_test = roc_auc_score(y_test, DT.predict_proba(x_test)[:, 1])

# print(f"AUC test: {100*auc_test:.2f}%.")

# *Ensemble*

In [104]:
# y_test_pred_1 =  LR.predict_proba(x_test)[:, 1]

# y_test_pred_2 = RF.predict_proba(x_test)[:, 1]

# # y_test_pred_3 = DT.predict_proba(x_test)[:, 1]

# y_pred = (y_test_pred_1 + y_test_pred_2)/2

# auc_test = roc_auc_score(y_test, y_pred)

# print(f"AUC test: {100*auc_test:.2f}%.")

# LightGBM

## Parâmetros *default*

In [105]:
search_space = {}

In [106]:
# MP = make_pipeline(
#     LGBMClassifier()
# )

In [107]:
# %%capture
# GRD_LGB = RandomizedSearchCV(MP, 
#                    search_space, 
#                    n_iter=1,
#                    scoring="roc_auc", 
#                    cv=10, 
#                    verbose=3, 
#                    return_train_score=True, 
#                    n_jobs=-1)
# GRD_LGB.fit(x, y)

In [108]:
# results = pd.DataFrame(GRD_LGB.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]]
# results["score_diff"] = np.abs(results["mean_train_score"] - results["mean_test_score"])

# results.head()

In [109]:
# LGB = LGBMClassifier()
# LGB.fit(x, y)

# auc_test = roc_auc_score(y_test, LGB.predict_proba(x_test)[:, 1])

# print(f"AUC test: {100*auc_test:.2f}%.")

## Validação *bootstrap* 

In [110]:
# def sample_hyperparameters_lgbm(x, y, n_rounds, n_simulations=100):
    
#     results_list = []
    
#     for n_round in range(n_rounds):

#         search_space = {
# #             "boosting_type": np.random.choice(["gbdt", "dart", "goss", "rf"]),
#             "max_depth": np.random.choice([2, 3, 4, 5, 6, 7, 8, 9, 10]),
#             "n_estimators": np.random.choice([5, 10, 15, 20, 25, 50, 75, 100, 200, 300]),
#             "reg_alpha": np.random.choice([0, 1e-3, 1e-2, 1e-1, 1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]),
#             "reg_lambda": np.random.choice([0, 1e-3, 1e-2, 1e-1, 1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]),
#         }
        
#         MP = make_pipeline(
#                 DropConstantFeatures(), 
#                 DropDuplicateFeatures(), 
#                 SmartCorrelatedSelection(selection_method="variance"),
#                 LGBMClassifier(**search_space)
#             )
        
#         results = run_validations(MP, x, y, n_simulations)

#         results["parameters"] = str(search_space)
#         results["round"] = n_round
#         results_list.append(results)
        
#     df_results = pd.concat(results_list).reset_index(drop=True)
    
#     return df_results

# def validate_lgbm_in_test_set(results, x, y, x_test, y_test):
    
#     mean_auc_val = results.groupby("round")["auc_val"].mean().reset_index().sort_values(by="auc_val", ascending=False).reset_index(drop=True)
    
#     best_round = mean_auc_val["round"].tolist()[0]
        
#     parameters = eval(results[results["round"] == best_round]["parameters"].tolist()[0])
    
#     model = make_pipeline(
#                 DropConstantFeatures(), 
#                 DropDuplicateFeatures(), 
#                 SmartCorrelatedSelection(selection_method="variance"),
#                 LGBMClassifier(**parameters)
#             )
    
#     x_train, x_val, y_train, y_val = train_test_split(x,
#                                                  y,
#                                                  test_size=0.25,
#                                                  random_state=0,
#                                                  stratify=y)

#     model.fit(x_train, y_train)
    
#     auc_train = eval_model(model, x_train, y_train)
#     auc_val = eval_model(model, x_val, y_val)
#     auc_test = eval_model(model, x_test, y_test)
        
#     print(f"AUC in train: {100*auc_train:.2f}")
#     print(f"AUC in validation: {100*auc_val:.2f}")
#     print(f"AUC in test: {100*auc_test:.2f}")

In [111]:
# %%capture
# df_results_lgbm = sample_hyperparameters_lgbm(x, y, 25, 100)

In [112]:
# plt.figure(figsize=(25, 7.5))

# sns.histplot(df_results_lgbm, x="auc_val", hue="round")
    
# plt.show()

In [113]:
# df_results_lgbm.groupby("round").agg({
#     "auc_val": "mean",
#     "auc_train": "mean"
# }).sort_values(by="auc_val", ascending=False).head()

In [114]:
# validate_lgbm_in_test_set(df_results_lgbm, x, y, x_test, y_test)

## *Random search*

In [115]:
# search_space = {
#     "lgbmclassifier__boosting_type": ["gbdt", "dart", "goss", "rf"],
#     "lgbmclassifier__max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10],
#     "lgbmclassifier__n_estimators": [5, 10, 15, 20, 25, 50, 75, 100, 200, 300],
#     "lgbmclassifier__reg_alpha": [0, 1e-3, 1e-2, 1e-1, 1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110],
#     "lgbmclassifier__reg_lambda": [0, 1e-3, 1e-2, 1e-1, 1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110],
# }

In [116]:
# MP = make_pipeline(
#     DropConstantFeatures(), 
#     DropDuplicateFeatures(), 
#     SmartCorrelatedSelection(selection_method="variance"),
# #     SelectByShuffling(estimator=LogisticRegression(max_iter=10000), scoring="roc_auc", cv=3),
#     LGBMClassifier()
# )

In [117]:
# %%capture
# GRD_LGB = RandomizedSearchCV(MP, 
#                    search_space, 
#                    n_iter=50,
#                    scoring="roc_auc", 
#                    cv=10, 
#                    verbose=3, 
#                    return_train_score=True, 
#                    n_jobs=-1)
# GRD_LGB.fit(x, y)

In [118]:
# results = pd.DataFrame(GRD_LGB.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]]
# results["score_diff"] = np.abs(results["mean_train_score"] - results["mean_test_score"])

# best_idx = results[results["score_diff"] < 0.25].sort_values(by="mean_test_score", ascending=False).index[0]

# results[results["score_diff"] < 0.25].sort_values(by="mean_test_score", ascending=False).head()

In [119]:
# input_cols = x.columns

# selected_cols = GRD_LGB.best_estimator_[:-1].transform(x).columns

# print(f"Input features: {len(input_cols)}.")
# print(f"Output features: {len(selected_cols)}.")
# print(f"Removed features: {len(input_cols) - len(selected_cols)}.")

In [120]:
# params = pd.DataFrame(GRD_LGB.cv_results_).iloc[best_idx]["params"]
# params = {k.split("__")[-1]: params[k] for k in params}
# # params["verbose"] = False

# LGB = make_pipeline(
#     DropConstantFeatures(), 
#     DropDuplicateFeatures(), 
#     SmartCorrelatedSelection(selection_method="variance"),
# #     SelectByShuffling(estimator=LogisticRegression(max_iter=10000), scoring="roc_auc", cv=3),
#     LGBMClassifier(**params)
# )
# LGB.fit(x, y)

# auc_test = roc_auc_score(y_test, LGB.predict_proba(x_test)[:, 1])

# print(f"AUC test: {100*auc_test:.2f}%.")

# XGBoost

## Parâmetros *default*

In [121]:
search_space = {}

In [122]:
# MP = make_pipeline(
#     XGBClassifier()
# )

In [123]:
# %%capture
# GRD_XGB = RandomizedSearchCV(MP, 
#                    search_space, 
#                    n_iter=1,
#                    scoring="roc_auc", 
#                    cv=10, 
#                    verbose=3, 
#                    return_train_score=True, 
#                    n_jobs=-1)
# GRD_XGB.fit(x, y)

In [124]:
# results = pd.DataFrame(GRD_XGB.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]]
# results["score_diff"] = np.abs(results["mean_train_score"] - results["mean_test_score"])

# results.head()

In [125]:
# XGB = XGBClassifier()
# XGB.fit(x, y)

# auc_test = roc_auc_score(y_test, XGB.predict_proba(x_test)[:, 1])

# print(f"AUC test: {100*auc_test:.2f}%.")

## Validação *bootstrap* 

In [126]:
# def sample_hyperparameters_xgboost(x, y, n_rounds, n_simulations=100):
    
#     results_list = []
    
#     for n_round in range(n_rounds):

#         search_space = {
#             "min_child_weight": np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
#             "gamma": np.random.choice([0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 6]),
#             "subsample": np.random.choice([0, 0.2, 0.4, 0.8, 1]),
#             "colsample_by_tree": np.random.choice([0, 0.2, 0.4, 0.6, 0.8, 1]),
#             "max_depth": np.random.choice([10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None]),
#             "n_estimators": np.random.choice(range(100, 2000, 100)),
#         }
        
#         MP = make_pipeline(
#                 DropConstantFeatures(), 
#                 DropDuplicateFeatures(), 
#                 SmartCorrelatedSelection(selection_method="variance"),
#                 XGBClassifier(**search_space)
#             )
        
#         results = run_validations(MP, x, y, n_simulations)

#         results["parameters"] = str(search_space)
#         results["round"] = n_round
#         results_list.append(results)
        
#     df_results = pd.concat(results_list).reset_index(drop=True)
    
#     return df_results

# def validate_xgboost_in_test_set(results, x, y, x_test, y_test):
    
#     mean_auc_val = results.groupby("round")["auc_val"].mean().reset_index().sort_values(by="auc_val", ascending=False).reset_index(drop=True)
    
#     best_round = mean_auc_val["round"].tolist()[0]
        
#     parameters = eval(results[results["round"] == best_round]["parameters"].tolist()[0])
    
#     model = make_pipeline(
#                 DropConstantFeatures(), 
#                 DropDuplicateFeatures(), 
#                 SmartCorrelatedSelection(selection_method="variance"),
#                 XGBClassifier(**parameters)
#             )
    
#     x_train, x_val, y_train, y_val = train_test_split(x,
#                                                  y,
#                                                  test_size=0.25,
#                                                  random_state=0,
#                                                  stratify=y)

#     model.fit(x_train, y_train)
    
#     auc_train = eval_model(model, x_train, y_train)
#     auc_val = eval_model(model, x_val, y_val)
#     auc_test = eval_model(model, x_test, y_test)
        
#     print(f"AUC in train: {100*auc_train:.2f}")
#     print(f"AUC in validation: {100*auc_val:.2f}")
#     print(f"AUC in test: {100*auc_test:.2f}")

In [127]:
# %%capture
# df_results_xgb = sample_hyperparameters_xgboost(x, y, 5, 100)

In [128]:
# plt.figure(figsize=(25, 7.5))

# sns.histplot(df_results_xgb, x="auc_val", hue="round")
    
# plt.show()

In [129]:
# df_results_xgb.groupby("round").agg({
#     "auc_val": "mean",
#     "auc_train": "mean"
# }).sort_values(by="auc_val", ascending=False).head()

In [130]:
# validate_xgboost_in_test_set(df_results_xgb, x, y, x_test, y_test)

## *Random search*

In [131]:
# search_space = {
#     "xgbclassifier__min_child_weight": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
#     "xgbclassifier__gamma": [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 6],
#     "xgbclassifier__subsample": [0, 0.2, 0.4, 0.8, 1],
#     "xgbclassifier__colsample_by_tree": [0, 0.2, 0.4, 0.6, 0.8, 1],
#     "xgbclassifier__max_depth": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
#     "xgbclassifier__n_estimators": range(100, 2000, 100),
# }

In [132]:
# MP = make_pipeline(
#     DropConstantFeatures(), 
#     DropDuplicateFeatures(), 
#     SmartCorrelatedSelection(selection_method="variance"),
# #     SelectByShuffling(estimator=LogisticRegression(max_iter=10000), scoring="roc_auc", cv=3),
#     XGBClassifier()
# )

In [133]:
# %%capture
# GRD_XGB = RandomizedSearchCV(MP, 
#                    search_space, 
#                    n_iter=100,
#                    scoring="roc_auc", 
#                    cv=10, 
#                    verbose=3, 
#                    return_train_score=True, 
#                    n_jobs=-1)
# GRD_XGB.fit(x, y)

In [134]:
# results = pd.DataFrame(GRD_XGB.cv_results_).sort_values(by="rank_test_score")[["params", "mean_test_score", "mean_train_score", "rank_test_score"]]
# results["score_diff"] = np.abs(results["mean_train_score"] - results["mean_test_score"])

# best_idx = results[results["score_diff"] < 0.25].sort_values(by="mean_test_score", ascending=False).index[0]

# results[results["score_diff"] < 0.25].sort_values(by="mean_test_score", ascending=False).head()

In [135]:
# input_cols = x.columns

# selected_cols = GRD_XGB.best_estimator_[:-1].transform(x).columns

# print(f"Input features: {len(input_cols)}.")
# print(f"Output features: {len(selected_cols)}.")
# print(f"Removed features: {len(input_cols) - len(selected_cols)}.")

In [136]:
# params = pd.DataFrame(GRD_DT.cv_results_).iloc[best_idx]["params"]
# params = {k.split("__")[-1]: params[k] for k in params}
# # params["verbose"] = False

# MP = make_pipeline(
#     DropConstantFeatures(), 
#     DropDuplicateFeatures(), 
#     SmartCorrelatedSelection(selection_method="variance"),
# #     SelectByShuffling(estimator=LogisticRegression(max_iter=10000), scoring="roc_auc", cv=3),
#     XGBClassifier(**params)
# )
# XGB.fit(x, y)

# auc_test = roc_auc_score(y_test, XGB.predict_proba(x_test)[:, 1])

# print(f"AUC test: {100*auc_test:.2f}%.")