In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import set_config

# Notebook-wide display settings: show up to 85 DataFrame columns, use a
# paper-context seaborn theme with grid lines, and render scikit-learn
# estimators as diagrams.
pd.options.display.max_columns = 85
sns.set_theme(
    context="paper",
    style="ticks",
    font_scale=1.5,
    rc={"axes.grid": True},
)
set_config(display="diagram")

# AdaBoost: Scoring by Nested Cross-Validation

### Load the data

In [None]:
# Load the unscaled data set (first CSV column is the index) and discard
# every row that contains a NaN.
df = pd.read_csv("../data/new_abnormal_writeout_noscale.data.csv", index_col=0).dropna()

# The two leading feature columns come first in the feature matrix; every
# remaining non-response column is collected in `conf`.
lead_cols = ["occ_total_sum", "oldest_phylostratum"]
conf = df.drop(columns=["response", *lead_cols])

# Feature table: leading columns at positions 0 and 1, then the `conf` columns.
features_df = pd.concat([df[lead_cols], conf], axis=1)

# Plain NumPy views for scikit-learn
X = features_df.to_numpy()
y = df["response"].to_numpy()

features_df

*** 
## Nested CV on AdaBoost

### The Model and its Parameter Space

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

# Confounder PCA.
# X is laid out by the data-loading cell as: column 0 = occ_total_sum,
# column 1 = oldest_phylostratum, columns 2.. = the remaining (confounder)
# columns. The first two are passed through untouched; PCA is applied only
# to the rest.
confpca = ColumnTransformer([
    ("ots+of", "passthrough", [0, 1]),
    ("conf", PCA(), slice(2, X.shape[1])),
])

# Parameter grid.
# Keys are prefixed with the pipeline step name; the AdaBoost step is called
# "gb", so its hyper-parameters are addressed as "gb__<param>".
main_params = {
    "gb__learning_rate": [0.5, 1, 1.5],
    "gb__n_estimators": [50, 100, 200],
}

# Two alternative settings for the "pca" step:
#  - pca_on:  keep the confounder PCA and tune its n_components
#             (None = keep all components; a float in (0, 1) is interpreted by
#             scikit-learn's PCA as a target fraction of explained variance).
#  - pca_off: replace the whole step with "passthrough" (no PCA at all).
pca_on = {"pca": [confpca], "pca__conf__n_components": [None, 0.01, 0.95]}
pca_off = {"pca": ["passthrough"]}

# A list of dicts makes GridSearchCV search each sub-grid separately, so the
# PCA-specific parameter is only explored when the PCA step is active.
param_grid = [{**main_params, **pca_on}, {**main_params, **pca_off}]

# Define the model to be tuned.
# random_state is pinned on the classifier itself: the grid search in the
# nested-CV cell runs with n_jobs=4, and the global np.random.seed set there is
# not a reliable seed for fits executed in parallel worker processes. Fixing
# the estimator's own seed makes the boosted trees reproducible across runs.
adab_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", confpca),
    ("gb", AdaBoostClassifier(random_state=3)),
])

adab_clf

### Nested CV

In [None]:
from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, RepeatedKFold
from sklearn.utils._testing import ignore_warnings
from sklearn.model_selection import train_test_split


# --- Nested cross-validation set-up ---
# Outer folds estimate generalisation performance; inner folds are used by the
# grid search to pick hyper-parameters on each outer-training split.
np.random.seed(3)
model = adab_clf
k_outer, k_inner = 10, 3
cv_outer = KFold(n_splits=k_outer, shuffle=True, random_state=1)
cv_inner = KFold(n_splits=k_inner, shuffle=True, random_state=3)

# Per-outer-fold outputs (read by the summary cells further down)
roc_results = []
found_params = []

print(f"Performing nested-cv with {k_outer} outer-folds and {k_inner} inner-folds.\n")
print("OUTER CV | BEST OF INNER CV | CHOSEN PARAMS")

for train_ix, test_ix in cv_outer.split(X):

    # Outer split: rows used for tuning/fitting vs. rows held out for scoring
    X_tr, y_tr = X[train_ix], y[train_ix]
    X_te, y_te = X[test_ix], y[test_ix]

    # Inner loop: exhaustive search over param_grid, scored by ROC-AUC
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring="roc_auc",
        cv=cv_inner,
        n_jobs=4,
    )
    result = search.fit(X_tr, y_tr)

    # best_estimator_ is the winning configuration taken from the fitted search
    best_model = result.best_estimator_

    # Score the held-out fold using the predicted probability of the class in
    # column 1 of predict_proba
    yhat = best_model.predict_proba(X_te)[:, 1]
    roc_auc = roc_auc_score(y_te, yhat)

    # Keep the fold's score and the parameters that produced it
    roc_results.append(roc_auc)
    found_params.append(result.best_params_)

    # Progress line: outer score, best inner-CV score, chosen parameters
    print(f">roc-auc={roc_auc:.3f}, est={result.best_score_:.3f}, params={result.best_params_}")

# Summary over all outer folds
print(f"ROC-AUC: {np.mean(roc_results):.3f} (std = {np.std(roc_results):.3f})")

In [None]:
# One row per outer fold: the held-out ROC-AUC followed by the
# hyper-parameters selected by the inner search for that fold.
ncv_df = pd.DataFrame({"roc_auc": roc_results}).join(pd.DataFrame(found_params))
ncv_df

In [None]:
# Mean held-out ROC-AUC across the outer folds
ncv_df.loc[:, "roc_auc"].mean()

In [None]:
# Persist the per-fold nested-CV results.
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
# NOTE(review): the input data is read from "../data/" while this writes to
# "./data/", and the file is prefixed "gb_" although the model is AdaBoost --
# confirm the intended output location and name.
ncv_out = Path("./data/gb_ncv.csv")
ncv_out.parent.mkdir(parents=True, exist_ok=True)
ncv_df.to_csv(ncv_out)