<div align="center">
<h1>Stage 7: Hyperparameter Tuning</h1>
by Hongnan Gao
<br>
</div>

## Dependencies and Configuration

In [None]:
%%capture
!pip install -q wandb
# !pip install -q shap
!pip install -q mlxtend==0.19.0
!pip install -q statsmodels==0.13.1
# !pip install gcloud == 0.18.3

In [None]:
import wandb
wandb.login()



True

In [None]:
import copy
import csv
import logging
import os
import random
from dataclasses import asdict, dataclass, field
from functools import wraps
from pathlib import Path
from time import time
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import matplotlib.pyplot as plt
import mlxtend
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import dump, load
from mlxtend.evaluate import bias_variance_decomp, paired_ttest_5x2cv
from scipy import stats
from sklearn import (base, decomposition, dummy, ensemble, feature_selection,
                     linear_model, metrics, model_selection, neighbors,
                     pipeline, preprocessing, svm, tree)
from statsmodels.regression.linear_model import OLS

## Utils and Configurations

In [None]:
@dataclass
class config:
    """Central configuration for the breast-cancer-wisconsin tuning notebook.

    Holds the data locations, the cross-validation settings, the column
    roles and a few plotting constants used by the rest of the notebook.
    """

    # Remote CSV locations for the raw, processed and fold-assigned data.
    raw_data: str = "https://storage.googleapis.com/reighns/reighns_ml_projects/docs/supervised_learning/classification/breast-cancer-wisconsin/data/raw/data.csv"
    processed_data: str = "https://storage.googleapis.com/reighns/reighns_ml_projects/docs/supervised_learning/classification/breast-cancer-wisconsin/data/processed/processed.csv"
    df_folds: str = "https://storage.googleapis.com/reighns/reighns_ml_projects/docs/supervised_learning/classification/breast-cancer-wisconsin/data/processed/df_folds.csv"
    # Splitting / reproducibility settings.
    train_size: float = 0.9
    seed: int = 1992
    num_folds: int = 5
    cv_schema: str = "StratifiedKFold"
    # Read by `prepare_y` to decide whether the target is flattened to 1d.
    classification_type: str = "binary"
    
    # Mutable defaults go through `default_factory` so instances do not share lists.
    target_col: List[str] = field(default_factory = lambda: ["diagnosis"])
    unwanted_cols : List[str] =  field(default_factory = lambda: ["id", "Unnamed: 32"])
    
    # Plotting
    colors : List[str] =field(default_factory = lambda: ["#fe4a49", "#2ab7ca", "#fed766", "#59981A"])
    # No annotation, so this is a plain class attribute rather than a dataclass
    # field: it is evaluated once when the class body runs and is not part of
    # `to_dict()`.
    # NOTE(review): 'mako_r' looks like a seaborn-registered colormap - confirm
    # seaborn is imported before this class is defined.
    cmap_reversed = plt.cm.get_cmap('mako_r')

    def to_dict(self) -> Dict[str, Any]:
        """Convert the config object to a dictionary.

        Only dataclass fields (the annotated attributes) are included.

        Returns:
            Dict: The config object as a dictionary.
        """

        return asdict(self)


    
#     spot_checking_boxplot = "../data/images/spot_checking_boxplot.png"
#     oof_confusion_matrix = "../data/images/oof_confusion_matrix.png"
#     final_train_confusion_matrix = "../data/images/final_train_confusion_matrix.png"
#     precision_recall_threshold_plot = "../data/images/precision_recall_threshold_plot.png"
#     roc_plot = "../data/images/roc_plot.png"
#     feature_importance = "../data/images/feature_importance.png"

In [None]:
def set_seeds(seed: int = 1234) -> None:
    """Seed the NumPy and Python global random number generators.

    Args:
        seed (int): Value handed to both generators. Defaults to 1234.
    """
    for seed_generator in (np.random.seed, random.seed):
        seed_generator(seed)
    
def init_logger(log_file: str = "info.log") -> logging.Logger:
    """Return this module's logger, writing INFO+ records to the console and a file.

    The function is safe to call repeatedly (for example when a notebook cell
    is re-run): handlers attached by an earlier call are removed and closed
    first, so each record is emitted once per destination instead of once per
    call.

    Args:
        log_file (str): Path of the log file. Defaults to "info.log".

    Returns:
        logging.Logger: The configured logger named after this module.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    # `getLogger` returns the same object on every call, so handlers from a
    # previous call are still attached; drop them to avoid duplicated lines
    # and leaked file handles.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    formatter = logging.Formatter(
        "%(asctime)s - %(message)s", datefmt="%Y-%m-%d,%H:%M:%S"
    )
    for handler in (
        logging.StreamHandler(),
        logging.FileHandler(filename=log_file),
    ):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger

In [None]:
# Utils functions that we need

def variance_inflation_factor(exog, idx_kept, vif_idx):
    """Return the variance inflation factor of a single column.

    The column at ``vif_idx`` is regressed with OLS on the other columns
    listed in ``idx_kept`` (no constant column is added here), and the VIF is
    ``1 / (1 - R^2)`` of that regression.

    Args:
        exog (np.ndarray): Observations
        idx_kept (List[int]): Indices of features to consider
        vif_idx (int): Index of feature for which to compute VIF

    Returns:
        float: VIF for the selected feature
    """
    design = np.asarray(exog)

    # Every retained column except the one being explained acts as a regressor.
    regressor_cols = [j for j in idx_kept if j != vif_idx]
    response = design[:, vif_idx]
    regressors = design[:, regressor_cols]

    fitted = OLS(response, regressors).fit()
    return 1. / (1. - fitted.rsquared)

class ReduceVIF(base.BaseEstimator, base.TransformerMixin):
    """Transformer that iteratively drops the feature with the largest VIF.

    The base of the class structure is implemented in https://www.kaggle.com/ffisegydd/sklearn-multicollinearity-class;
    it was heavily modified such that it can take in numpy arrays and implements the fit and transform methods.

    Args:
        thresh: A feature is dropped only while the largest VIF among the
            remaining features is strictly greater than this value.
        max_drop: Maximum number of features that may be dropped.

    Attributes:
        column_indices_kept_ (List[int]): Positional indices of the columns
            retained by the last ``fit``.
        feature_names_kept_ (Optional[List]): Names of the retained columns
            when ``fit`` received a DataFrame, otherwise ``None``.
    """

    def __init__(self, thresh=10, max_drop=20):
        self.thresh = thresh
        self.max_drop = max_drop
        self.column_indices_kept_ = []
        self.feature_names_kept_ = None

    def reset(self):
        """Resets the state of predictor columns after each fold."""

        self.column_indices_kept_ = []
        self.feature_names_kept_ = None

    def fit(self, X, y=None):
        """Select the columns to keep by running the VIF elimination on ``X``.

        Args:
            X (Union[np.ndarray, pd.DataFrame]): Training observations.
            y: Ignored; present for pipeline compatibility. Defaults to None.

        Returns:
            ReduceVIF: The fitted transformer.
        """
        self.column_indices_kept_, self.feature_names_kept_ = self.calculate_vif(X)

        return self

    def transform(self, X, y=None):
        """Keep only the columns selected during ``fit``.

        Args:
            X (Union[np.ndarray, pd.DataFrame]): Observations with the same
                column layout as the data passed to ``fit``.
            y: Ignored. Defaults to None.

        Returns:
            np.ndarray: ``X`` restricted to ``column_indices_kept_``.
        """
        # np.asarray makes positional column indexing work for DataFrames too.
        return np.asarray(X)[:, self.column_indices_kept_]

    def calculate_vif(self, X: Union[np.ndarray, pd.DataFrame]):
        """Eliminate features one at a time until all VIFs are within ``thresh``.

        On every round the VIF of each remaining column is computed against
        the other remaining columns; the column with the largest VIF is
        dropped if that VIF exceeds ``self.thresh``. The loop stops when no
        VIF exceeds the threshold, when ``self.max_drop`` columns have been
        dropped, or when fewer than two columns remain.

        Args:
            X (Union[np.ndarray, pd.DataFrame]): Observations, one column per feature.

        Returns:
            Tuple[List[int], Optional[List]]: Positional indices of the kept
            columns, and their names if ``X`` is a DataFrame (else ``None``).
        """
        # Convert once instead of on every VIF computation.
        data = np.asarray(X)
        column_indices_kept = list(range(data.shape[1]))
        n_dropped = 0

        # A VIF needs at least one other column to regress on, and at most
        # `max_drop` columns may be removed.
        while len(column_indices_kept) > 1 and n_dropped < self.max_drop:
            max_vif, max_vif_col = None, None

            for col in column_indices_kept:
                vif = variance_inflation_factor(data, column_indices_kept, col)

                if max_vif is None or vif > max_vif:
                    max_vif, max_vif_col = vif, col

            # Written as `not >` so a NaN VIF also ends the elimination.
            if not max_vif > self.thresh:
                break

            column_indices_kept.remove(max_vif_col)
            n_dropped += 1

        # Derive the names from the final indices so the two always agree.
        feature_names = None
        if isinstance(X, pd.DataFrame):
            feature_names = [X.columns[idx] for idx in column_indices_kept]

        return column_indices_kept, feature_names


def prepare_y(y: np.ndarray) -> np.ndarray:
    """Shape the target array the way the classifier expects it.

    For binary classification (as declared by ``config.classification_type``)
    the array is flattened to 1d; otherwise it is returned untouched.

    Args:
        y (np.ndarray): Target variable.

    Returns:
        np.ndarray: Transformed Target variable.
    """
    if config.classification_type != "binary":
        return y
    return y.ravel()

In [None]:
# Rebind the name `config` from the dataclass to an instance of it. After this
# line the class is no longer reachable under that name, so running this
# statement a second time would try to call the instance.
config = config()

# Plain-dict snapshot of the dataclass fields, used as the wandb run config.
basic_config: Dict = config.to_dict()
# train_config: Dict = Train().to_dict()

global_config: Dict = dict(basic=basic_config)

# We can log multiple dict under global_config - in wandb UI, it will show as basic. and train. to show which dict it is referring to.
run = wandb.init(project="bcw", name="classification", config=global_config)

In [None]:
# set logger (console + "info.log" by default)
logger = init_logger()

# set seeding for reproducibility; set_seeds returns None, hence the throwaway name
_ = set_seeds(seed = config.seed)

# read the fold-assigned data from the URL stored in the config
df_folds = pd.read_csv(config.df_folds)

In [None]:
# Assign predictors and target accordingly
# NOTE(review): assumes the last two columns of df_folds are non-predictors
# (presumably the target and the fold id) - confirm against the CSV.
predictor_cols = df_folds.columns.to_list()[:-2]
target_col = config.target_col

## Model Selection: Hyperparameter Tuning with GridSearchCV

!!! success "Hyperparameter Tuning"
    We have done a quick spot checking on algorithms and realized that `LogisticRegression` is doing well for this task. For this purpose, I will just perform hyperparameter tuning on this single algorithm. However, in practice and if resources are allowed, I will also tune other models such as `RandomForest()`, or gradient boosting algorithms such as `XGBoost`, as I believe they will perform no worse than our Logistic Regression model given the right hyperparameters.


---

!!! info "Grid Search is the Gwei?"
    Meh! We will use an old-fashioned way to search for hyperparameters, which is brute force method. The time complexity of Grid Search is high and if you have many hyperparameters to tune, I recommend trying out <b>Random Grid Search</b> or libraries like <b>Optuna</b> that uses Bayesian Optimization.

---

!!! note "TODO"
    Try to code up your own `GridSearchCV` to have maximum flexibility.

### Make Finetuning Pipeline

The following `make_finetuning_pipeline` does exactly the same thing as `make_pipeline` earlier. The only difference is that we can pass in a flexible list of steps to the pipeline from outside.

In [None]:
def make_finetuning_pipeline(
    model: Callable, steps: List[Tuple[str, Callable]]
) -> pipeline.Pipeline:
    """Build a pipeline made of the given preprocessing steps followed by the model.

    Args:
        model (Callable): A model with default parameters; it becomes the
            final step, registered under the name "model".
        steps (List[Tuple[str, Callable]]): (name, transformer) pairs placed
            before the model, in the given order.

    Returns:
        Pipeline: Returns a pipeline that can be used for finetuning.
    """
    final_step = ("model", model)
    all_steps = list(steps) + [final_step]
    return pipeline.Pipeline(all_steps)

# TODO: Make a class to hold pipelines?
# class MakePipeline:
    
#     def __init__(self, estimator: Callable, steps: List[Callable]):
#         pass
    
#     def spot_checking_pipeline():
#         pass
    
#     def fine_tuning_pipeline():
#         pass

In [None]:
# Preprocessing steps placed in front of every model that is tuned; the step
# names are the prefixes to use when these steps are themselves tuned.
finetuning_pipeline_steps = [
    # standardization
    ("standardize", preprocessing.StandardScaler()),
    # reduce VIF
    ("remove_multicollinearity", ReduceVIF(thresh=10))
]

### Search Space

Run our hyperparameter search with cross-validation. For example, our `param_grid` has $2 \times 10 = 20$ combinations, and our cross validation has 5 folds, then there will be a total of 100 fits.

---

Below details the pseudo code of what happens under the hood:

- Define $G$ as the set of combinations of hyperparameters. Define the number of splits to be $K$.
- For each set of hyperparameters $g \in G$:
    - For each fold $j \in \{1, \ldots, K\}$:
        - Set $F_{\text{train}}=\bigcup\limits_{i\neq j}^{K} F_{i}$
        - Set $F_{\text{val}} = F_{j}$ as the validation set
        - Perform Standard Scaling on $F_{\text{train}}$ and find the mean and std
        - Perform VIF recursively on $F_{\text{train}}$ and find the selected features
        - Transform $F_{\text{val}}$ using the mean and std found using $F_{\text{train}}$
        - Transform $F_{\text{val}}$ to have only the selected features from $F_{\text{train}}$
        - Train and fit on $F_{\text{train}}$
        - Evaluate the fitted parameters on $F_{\text{val}}$ to obtain the fold score $\mathcal{M}_{j}$
    - Average $\mathcal{M}_{1}, \ldots, \mathcal{M}_{K}$ to obtain the score $\mathcal{M}$ of $g$


In [None]:
@dataclass
class ModelForTuning:
    """Pairs an estimator with the hyperparameter grid to search for it."""

    # Estimator instance that becomes the "model" step of the tuning pipeline.
    model: Callable
    # Search space; keys are prefixed with "model__" in the grids defined in
    # this notebook so that they address the pipeline's "model" step.
    param_grid: Dict

Define our search space for the hyperparameters:

```python
logistic_r_param_grid = dict(model__penalty=["l1", "l2"],
              model__C=np.logspace(-4, 4, 10))
```

We conveniently use `dataclass` to act as a medium so we can pass in model and param_grid independently for each model. We then collate them into a list of `ModelForTuning` object.

In [None]:
# Candidate models and their search spaces. Every estimator is seeded with
# `config.seed` so that repeated runs of the search give the same results.
models_list = [
    ModelForTuning(
        model=linear_model.LogisticRegression(
            solver="saga",
            random_state=config.seed,
            max_iter=10000,
            n_jobs=-1,
            fit_intercept=True,
        ),
        param_grid=dict(
            model__penalty=["l1", "l2"],
            model__C=np.logspace(-4, 4, 10),
        ),
    ),
    ModelForTuning(
        model=tree.DecisionTreeClassifier(random_state=config.seed),
        param_grid=dict(
            model__max_depth=[2, 3, 5, 10, 20],
            model__min_samples_leaf=[5, 10, 20, 50, 100],
            model__criterion=["gini", "entropy"],
        ),
    ),
    ModelForTuning(
        # subsample < 1 makes boosting stochastic, so the seed matters here.
        model=ensemble.GradientBoostingClassifier(
            n_estimators=100, random_state=config.seed
        ),
        param_grid=dict(
            model__max_depth=[3, 6],
            model__learning_rate=[0.1, 0.05],
            model__subsample=[
                1.0,
                0.5,
            ],
        ),
    ),
]

In [None]:
def optimize_models(
    models_list: List[ModelForTuning],
    X_train: np.ndarray,
    y_train: np.ndarray,
    scorer: Union[str, Callable],
    steps: List[Tuple[str, Callable]],
    cv: Any = 5,
) -> List[Callable]:
    """Optimize models in models_list using X_train and y_train.
    We are using GridSearchCV to find the best parameters for each model.
    Consider using Optuna for hyperparameter optimization (or wandb for hyperparameter optimization).

    Args:
        models_list (List[ModelForTuning]): List of models to optimize.
        X_train (np.ndarray): X_train data.
        y_train (np.ndarray): y_train data.
        scorer (Union[str, Callable]): Passed to GridSearchCV as ``scoring``.
        steps (List[Tuple[str, Callable]]): Preprocessing steps placed in
            front of each model in its pipeline.
        cv (Any): Passed to GridSearchCV as ``cv``. Defaults to 5. Pass a
            splitter object (e.g. a seeded, shuffled ``StratifiedKFold``) to
            control exactly how the folds are drawn.

    Returns:
        grids (List[Callable]): The fitted GridSearchCV objects, in the same
            order as ``models_list``.
    """
    # @ TODO: make a scoring list to pass in so we can evaluate multiple metrics.
    grids = [
        model_selection.GridSearchCV(
            make_finetuning_pipeline(model.model, steps),
            param_grid=model.param_grid,
            cv=cv,
            # refit=True retrains the best configuration on all of X_train.
            refit=True,
            verbose=1,
            scoring=scorer,
            n_jobs=-1,
        )
        for model in models_list
    ]

    for grid in grids:
        grid.fit(X_train, y_train)

    return grids

In [None]:
# Built-in scorer name handed to GridSearchCV as `scoring`.
roc_auc_scorer = "roc_auc_ovr" 
# Unsure why this gives much lower score - to investigate
# NOTE(review): a likely cause is that make_scorer, without asking for
# probabilities/decision values, scores the hard `predict` labels - confirm.
# metrics.make_scorer(metrics.roc_auc_score, average="macro", multi_class='ovr')

In [None]:
# Pull the predictors and target out as numpy arrays; target_col is a list, so
# the target comes out 2d and is reshaped by prepare_y.
X_train, y_train = df_folds[predictor_cols].values, df_folds[target_col].values
y_train = prepare_y(y_train)
# One fitted GridSearchCV per entry of models_list, in the same order.
grids = optimize_models(models_list, X_train, y_train, scorer=roc_auc_scorer, steps=finetuning_pipeline_steps)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [None]:
# The above optimize code is equivalent to the below, for better readability
# pipeline_logistic = make_finetuning_pipeline(
#     linear_model.LogisticRegression(
#         solver="saga", random_state=config.seed, max_iter=10000, n_jobs=None, fit_intercept=True
#     ), steps=steps
# )

# param_grid = dict(
#     model__penalty=["l1", "l2"],
#     model__C=np.logspace(-4, 4, 10),
# )

# grid = model_selection.GridSearchCV(pipeline_logistic, param_grid=param_grid, cv=5, refit=True, verbose=3, scoring = "roc_auc")
# _ = grid.fit(X_train, y_train)

We can save our results in a dataframe, we will also look at the top performing hyperparameter by querying the below:

```python
grid_cv_df = pd.DataFrame(grid.cv_results_)
grid_cv_df.loc[grid_cv_df['rank_test_score']==1]
```

In [None]:
# For example, we can see Logistic Regression's GridSearchCV
# results like this (grids[0] is the first entry of models_list).
grid_cv_df = pd.DataFrame(grids[0].cv_results_)
# Show only the top-ranked hyperparameter combination(s).
display(grid_cv_df.loc[grid_cv_df['rank_test_score']==1])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,param_model__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,0.931891,0.058794,0.002457,0.000387,0.359381,l1,"{'model__C': 0.3593813663804626, 'model__penal...",0.997997,0.995547,0.997944,0.990132,0.995477,0.995419,0.002863,1


In [None]:
def return_grid_df(
    grids: List[model_selection.GridSearchCV],
) -> Tuple[List[pd.DataFrame], List[model_selection.GridSearchCV]]:
    """Summarise each fitted grid search as a dataframe with shortened names.

    For every grid, the cross-validation results are sorted by
    ``mean_test_score`` (best first) and reduced to the searched parameters,
    the mean/std/rank of the test score and a ``model_name`` column. Column
    names lose everything up to and including their last ``"__"``, e.g.
    ``param_model__C`` becomes ``C``.

    Args:
        grids (List[model_selection.GridSearchCV]): A list of GridSearchCV models that are tuned.

    Returns:
        grid_df, grids (Tuple[List[pd.DataFrame], List[model_selection.GridSearchCV]]):
            One summary dataframe per grid, in the same order as ``grids``,
            together with the unchanged ``grids`` list.
    """
    score_columns = ["mean_test_score", "std_test_score", "rank_test_score"]

    def shorten_param(param_name):
        # With no "__" present, rsplit returns the whole name unchanged.
        return param_name.rsplit("__", 1)[-1]

    grid_df = []
    for grid in grids:
        model_name = type(grid.estimator["model"]).__name__
        # cv_results_ stores each searched parameter under "param_<name>".
        param_columns = [f"param_{name}" for name in grid.param_grid]

        cv_results = (
            pd.DataFrame(grid.cv_results_)
            .sort_values("mean_test_score", ascending=False)[
                param_columns + score_columns
            ]
            .rename(shorten_param, axis=1)
            .assign(model_name=model_name)
        )
        grid_df.append(cv_results)

    return grid_df, grids

In [None]:
# grid_df and grids should necessarily be in the same sequence:
# grid_df[i] summarises grids[i]. In this case, the first index of both
# should be logistic regression.
grid_df, grids = return_grid_df(grids)

In [None]:
# Report the winning configuration of every tuned model.
for model_df, grid in zip(grid_df, grids):
    # Each summary frame is sorted best-first, so row 0 is the top candidate;
    # the double brackets keep it as a one-row DataFrame for display.
    best_hyperparams_df = model_df.iloc[[0]]
    model_name = best_hyperparams_df.model_name.unique()[0]
    logger.info(f"Best hyperparameters found for {model_name} is as follows:\n{grid.best_params_}")
    display(best_hyperparams_df)
    print()

2021-11-16,09:19:36 - Best hyperparameters found for LogisticRegression is as follows:
{'model__C': 0.3593813663804626, 'model__penalty': 'l1'}
2021-11-16,09:19:36 - Best hyperparameters found for LogisticRegression is as follows:
{'model__C': 0.3593813663804626, 'model__penalty': 'l1'}


Unnamed: 0,penalty,C,mean_test_score,std_test_score,rank_test_score,model_name
8,l1,0.359381,0.995419,0.002863,1,LogisticRegression


2021-11-16,09:19:36 - Best hyperparameters found for DecisionTreeClassifier is as follows:
{'model__criterion': 'entropy', 'model__max_depth': 10, 'model__min_samples_leaf': 10}
2021-11-16,09:19:36 - Best hyperparameters found for DecisionTreeClassifier is as follows:
{'model__criterion': 'entropy', 'model__max_depth': 10, 'model__min_samples_leaf': 10}





Unnamed: 0,max_depth,min_samples_leaf,criterion,mean_test_score,std_test_score,rank_test_score,model_name
41,10,10,entropy,0.954515,0.015913,1,DecisionTreeClassifier


2021-11-16,09:19:37 - Best hyperparameters found for GradientBoostingClassifier is as follows:
{'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__subsample': 0.5}
2021-11-16,09:19:37 - Best hyperparameters found for GradientBoostingClassifier is as follows:
{'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__subsample': 0.5}





Unnamed: 0,max_depth,learning_rate,subsample,mean_test_score,std_test_score,rank_test_score,model_name
1,3,0.1,0.5,0.991031,0.005869,1,GradientBoostingClassifier





!!! success
    Our best performing set of hyperparameters for Logistic Regression `{'model__C': 0.3593813663804626, 'model__penalty': 'l1'}` gives rise to a mean cross validation score of $0.995419$, which is higher than the score of the model with default hyperparameters, $0.995$, by a small margin. This is not too surprising for Logistic Regression since there aren't many things to tune, so we should not expect major improvements. For the Decision Tree, however, the score has increased from 0.907 to around 0.95, quite a big jump with tuned params.

!!! danger "DANGERRRRRRRRRRRRR"
    I am being a bit hand wavy in terms of comparison here, I assumed THAT `GridSearchCV` used the exact same splitting strategy (yes it uses `StratifiedKFold` here) with the exact **SEED/RANDOM_STATE**, which I cannot promise as of now. Thus, a different splitting will, unfortunately, result in different results, although, I don't expect by a huge margin - so I think it is a no-go to compare like this.
    We can probably pass in a cv function into `GridSearchCV` to ensure seeding. 
    This also highlights a problem that even K-fold splitting does not guarantee the reduction in variance. 

!!! success "Room for Improvement"
    Apart from the other methods to search for the optimal hyperparameters, we can also include the preprocessing steps as tunable hyperparameters. More specifically, in our `ReduceVIF()` step, we hard coded two manual criteria at which the algorithm will stop: when no remaining feature has a VIF above the threshold of 10, or when the number of features removed hits 20. We can include both in the search space so we do not need to worry about how many features to remove!

## Model Persistence (Saving Models)

[Model Persistence](https://scikit-learn.org/stable/modules/model_persistence.html)

We save our models using `joblib` and we can load it back any time. 

!!! note
    Save it to wandb or GCP storage to store models for better consistency.

In [None]:
model_path = "/content/"

def save_model(grids: List[Callable], path: str):
    """Persist every fitted grid search locally and upload it to the wandb run.

    Each grid is written with joblib to ``<path>/<ModelClass>_grid.joblib``,
    where ``ModelClass`` is the class name of the "model" step of the grid's
    best estimator, and the same file is then registered with ``wandb.save``.

    Args:
        grids (List[Callable]): Fitted GridSearchCV objects (they must expose
            ``best_estimator_``).
        path (str): Directory to write the files into; created if missing.
    """
    save_dir = Path(path)
    save_dir.mkdir(parents=True, exist_ok=True)

    for grid in grids:
        model_name = grid.best_estimator_["model"].__class__.__name__
        path_to_save = save_dir / f"{model_name}_grid.joblib"
        # Dump to local path
        dump(grid, path_to_save)
        # Dump to wandb cloud. base_path makes the uploaded name relative to
        # save_dir, i.e. just "<ModelClass>_grid.joblib".
        wandb.save(str(path_to_save), base_path=str(save_dir))

Save the model!

### Wandb

We first see how we save and load using wandb.

In [None]:
# Write one "<ModelClass>_grid.joblib" per tuned model under model_path.
save_model(grids, model_path)



In [None]:
# File name under which save_model stored the logistic regression grid.
logistic_path = "LogisticRegression_grid.joblib"

In [None]:
# Fetch the saved logistic regression grid file back from wandb.
# NOTE(review): no run path is given, so this presumably restores from the
# currently active run - confirm against the wandb.restore documentation.
best_model = wandb.restore(logistic_path)


# Use the "name" attribute of the returned object wherever a file name is
# expected, e.g. when loading it with joblib:
# load(best_model.name)

### Joblib

We see how we use `joblib` to save and load.

Load the model, and we can test it now if our loaded models is predicting correctly!

In [None]:
# Read the locally saved grid search object back from disk.
logistic_grid = load("/content/LogisticRegression_grid.joblib")

Great it seems to work!

### Sanity Check

!!! note
    We just make sure our loaded weight from path is the same as the one we trained. We can easily compare predictions (or coefficients) by the following.

In [None]:
# Compare the predictions element-wise. Comparing `.all()` of each array would
# only compare one reduced boolean per model, not the predictions themselves.
reference_predictions = grids[0].predict(X_train)
all(
    np.array_equal(candidate.predict(X_train), reference_predictions)
    for candidate in (load(best_model.name), logistic_grid)
)

True

In [None]:
# The in-memory grid, the joblib copy and the wandb copy should yield exactly
# the same training ROC AUC; column 1 of predict_proba is used as the score.
metrics.roc_auc_score(
    y_train, logistic_grid.predict_proba(X_train)[:, 1]
) == metrics.roc_auc_score(
    y_train, grids[0].predict_proba(X_train)[:, 1]
) == metrics.roc_auc_score(
    y_train, load(best_model.name).predict_proba(X_train)[:, 1]
)


True

Seems like the save and load method works perfectly.

!!! warning
    Do not call this directly.
    ```python
    grids[0].best_estimator_["model"].predict(X_train)
    ```

    This is because `grids[0].best_estimator_["model"]` is only referring to the Logistic Regression Model WITHOUT the pipeline (preprocessing) steps. And hence will raise error if the preprocessing steps has feature selection. But the main idea is, be careful when using the above.

In [None]:
# grids[0].best_estimator_["model"].predict(X_train)

## Retrain using Hyperparameters


!!! info "Retraining Methods"
    From the discussion[^cpmp], my doubts are cleared. Quoting verbatim from the discussion, we have:

K-folds cross validation was devised as a way to assess model performance using training data. A great paper on this from Sebastian Raschka is a must read https://arxiv.org/abs/1811.12808. You use K-folds cv to tune you model, then retrain on all training data with best hyperparamters found.

However, once you have run K-fold cv, you get $K$ trained models. Kagglers quickly found that ensembling these models was giving good results at zero computation cost, rather than having to retrain a model on full data. It soon became a very common practice.

---

!!! note "Takeaway"
    For small-medium datasets, after finding the best hyperparameters $G$, we use $G$ in our model $h$ to train on the whole dataset $\mathcal{X}$ again to get the fitted parameters of $h$. You then use the newly fitted parameters to evaluate on the **Test Set**.
    For large and computationally expensive datasets, when you have finished your K-folds, say 5 folds, you get 5 "different" models, $h_{i}, i \in \{1, 2, 3, 4, 5\}$. What you can do is save the weights (in classical ML, weights refer to the fitted parameters) and evaluate each of the five models on the test set. You then get 5 different sets of test predictions, and a common practice is to take a simple mean of these 5 sets of predictions.



[^cpmp]: https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/275883

### Retrain on K-Folds

TODO: This should be easy for me as I dabbled more in Kaggle comp and are more familiar with this methodology.

### Retrain on the whole training set

A common practice after the hyperparameter tuning phase is to retrain the model on the whole dataset $X_{\text{train}}$ where we will get the estimator's coefficients obtained from the retraining. This is actually already done as the scikit-learn's `GridSearchCV` has a parameter `refit`; if we select it to be true, then after the model selection process is done (i.e. getting the best hyperparameters after cross validation with grid search), the grid search object will retrain on the whole $X_{\text{train}}$ with the best hyperparameters internally, and return us back an object in which we can call `predict` etc.

!!! warning "Paranoia Alert"

    However, to be extra careful, we can retrain manually using the best hyperparameters and check if scikit-learn is true to its documentation. We will just reconstruct the pipeline using the grid's best hyperparameters. We will then test if the retrained model's coefficients coincide with the grid's best estimator's coefficients. If their difference is 0, this means they are trained under the same circumstances and we can be sure that the refit parameter is behaving true to its words.

    ```python
    grid_best_hyperparams = grid.best_params_
    print(grid_best_hyperparams) ->
    {'model__C': 0.3593813663804626, 'model__penalty': 'l1'}
    ```

In [None]:
# Rebuild the tuned logistic regression pipeline by hand and check that it
# reproduces the coefficients of the estimator GridSearchCV refitted itself.
logistic_grid = grids[0]
# Read the winning values from the grid rather than copying them by hand, so
# this check stays valid if the search space or the data changes.
best_params = logistic_grid.best_params_

retrain_logistic_pipeline = pipeline.Pipeline(
    [
        ("standardize", preprocessing.StandardScaler()),
        ("remove_multicollinearity", ReduceVIF(thresh=10)),
        (
            "model",
            linear_model.LogisticRegression(
                C=best_params["model__C"],
                max_iter=10000,
                random_state=config.seed,
                solver="saga",
                penalty=best_params["model__penalty"],
            ),
        ),
    ]
)

_ = retrain_logistic_pipeline.fit(X_train, y_train)
coef_diff = (
    retrain_logistic_pipeline["model"].coef_
    - logistic_grid.best_estimator_["model"].coef_
)

print("...")
# Same data, seed and hyperparameters, so the coefficients must match exactly.
assert np.all(coef_diff == 0)
logger.info("Retraining Assertion Passed!")

2021-11-16,09:19:38 - Retraining Assertion Passed!
2021-11-16,09:19:38 - Retraining Assertion Passed!


...
