In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import set_config

# Notebook-wide display settings.

# Allow up to 85 columns when a DataFrame is rendered.
pd.set_option("display.max_columns", 85)

# Seaborn theme for all figures: "paper" context, tick style, grid lines on.
sns.set_theme(
    context="paper",
    style="ticks",
    font_scale=1.5,
    rc={"axes.grid": True},
)

# Render scikit-learn estimators/pipelines as diagrams when displayed.
set_config(display="diagram")

# Random Forest: Scoring by Nested Cross-Validation

### Load data

In [None]:
# Load the unscaled data table (first CSV column is the row index) and
# discard every row that contains a missing value.
df = pd.read_csv("../data/new_abnormal_writeout_noscale.data.csv", index_col=0).dropna()

# Everything except the label and the two leading features is kept as `conf`.
conf = df.drop(columns=["response", "occ_total_sum", "oldest_phylostratum"])

# Feature table: the two leading features first, then all `conf` columns.
# The column order matters: the ColumnTransformer defined later addresses
# columns 0 and 1 by position and sends the remainder through PCA.
features_df = pd.concat(
    [df[["occ_total_sum", "oldest_phylostratum"]], conf],
    axis=1,
)

# Plain arrays for scikit-learn
y = df["response"].to_numpy()
X = features_df.to_numpy()

features_df

### Custom Scoring: Area Under the Precision-Recall Curve

In [16]:
from sklearn.metrics import auc, make_scorer, precision_recall_curve

def auprc(y_true, y_scores, **kwargs):
    """Return the area under the precision-recall curve.

    Parameters
    ----------
    y_true : array-like
        True binary labels, forwarded to ``precision_recall_curve``.
    y_scores : array-like
        Continuous scores for the positive class (higher = more positive).
    **kwargs
        Accepted for call-signature compatibility and ignored.

    Returns
    -------
    float
        Area under the curve with recall on the x-axis and precision on the
        y-axis, integrated by ``sklearn.metrics.auc``.

    Notes
    -----
    Remember to use make_scorer(auprc, needs_proba=True,) when feeding to
    SKL's GSCV (newer scikit-learn releases spell this
    ``response_method="predict_proba"``).
    """
    pr_curve = precision_recall_curve(y_true, y_scores)
    precision_values, recall_values = pr_curve[0], pr_curve[1]
    # x = recall, y = precision
    return auc(recall_values, precision_values)

*** 
## Nested CV on RF

### The Model and its Parameter Space

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Confounder PCA as ColumnTransformer.
# Columns 0 and 1 of X are passed through untouched; every remaining column
# (index 2 onwards) is projected with PCA.
confpca = ColumnTransformer([
    ("ots+of", "passthrough", [0, 1]),
    ("conf", PCA(), slice(2, X.shape[1])),
])

# Random-forest hyperparameters searched in every grid variant.
# `max_features` is not searched; it is fixed to "sqrt" on the estimator below.
main_params = {
    "rf__min_samples_leaf": [1, 5, 20],
    "rf__min_samples_split": [2, 5, 20],
    "rf__max_depth": [None, 5, 10, 20],
    "rf__n_estimators": [100, 300, 1000],
}

# Two grid variants: with the confounder PCA step (all components, or a 0.95
# explained-variance target), and with the "pca" step replaced by passthrough.
pca_on = {"pca": [confpca], "pca__conf__n_components": [None, 0.95]}
pca_off = {"pca": ["passthrough"]}

param_grid = [{**main_params, **pca_on}, {**main_params, **pca_off}]

# Model as Pipeline: scale -> (optional) confounder PCA -> balanced random forest.
# random_state is pinned on the forest itself: a global np.random.seed() in the
# notebook process does not control estimators fitted inside GridSearchCV's
# parallel worker processes, so without it results differ from run to run.
# n_jobs=1 keeps each forest single-threaded; parallelism comes from the search.
rf_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", confpca),
    ("rf", BalancedRandomForestClassifier(n_jobs=1, max_features="sqrt", random_state=42)),
])

rf_clf

### Nested CV

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, KFold, RepeatedKFold
from joblib import dump, load

# configure the cross-validation procedure
# KFold(shuffle=True) is created without random_state, so the fold assignment
# is drawn from NumPy's global RNG seeded here.
np.random.seed(42)
model = rf_clf
k_outer = 10
k_inner = 3
cv_outer = KFold(n_splits=k_outer, shuffle=True)
cv_inner = KFold(n_splits=k_inner, shuffle=True)

# To store results (one entry per outer fold)
roc_results = []   # ROC-AUC on the held-out outer fold
prc_results = []   # PR-AUC on the held-out outer fold
found_params = []  # best hyperparameters picked by the inner search
est_score = []     # inner-CV score of those hyperparameters

print(f"Performing nested-cv with {k_outer} outer-folds and {k_inner} inner-folds.\n")
print("OUTER CV | BEST OF INNER CV | CHOSEN PARAMS")

for train_ix, test_ix in cv_outer.split(X):

    # split data
    X_tr, X_te = X[train_ix, :], X[test_ix, :]
    y_tr, y_te = y[train_ix], y[test_ix]

    # define and execute the search (inner CV runs on the outer-train part only)
    search = GridSearchCV(estimator=model, param_grid=param_grid, scoring="roc_auc", cv=cv_inner, n_jobs=-1)
    result = search.fit(X_tr, y_tr)

    # get the best performing model fit on the whole training set
    best_model = result.best_estimator_

    # Score the hold-out fold with positive-class probabilities.
    # predict_log_proba must not be used here: a probability of exactly 0 becomes
    # -inf, and the metric functions reject non-finite scores. Both AUCs depend
    # only on the ranking of the scores, so plain probabilities give the same values.
    yhat = best_model.predict_proba(X_te)[:, 1]

    # evaluate the model
    roc_auc = roc_auc_score(y_te, yhat)
    prc_auc = auprc(y_te, yhat)

    # store the result
    roc_results.append(roc_auc)
    prc_results.append(prc_auc)
    found_params.append(result.best_params_)
    est_score.append(result.best_score_)

    # report progress
    print(">roc-auc=%.3f, est=%.3f, params=%s" % (roc_auc, result.best_score_, result.best_params_))


# summarize the estimated performance of the model
print("\nROC-AUC: %.4f (std = %.4f)" % (np.mean(roc_results), np.std(roc_results)))

### Results

In [None]:
from math import isnan
# Per-fold results table: outer-fold test scores, the inner-CV estimate of the
# selected configuration, and the selected hyperparameters.
ncv_df = pd.DataFrame({
    "roc_auc": roc_results,
    "pr_auc": prc_results,  # the list filled by the nested-CV loop is `prc_results`
    "est": est_score,
})

# Write hyperparameters whose chosen value is None (e.g. rf__max_depth,
# pca__conf__n_components) as the string 'None', so they stay distinguishable
# from parameters absent in that fold (PCA switched off -> NaN after concat).
# Non-None values such as 0.95 are kept unchanged.
chosen_params = [
    {name: ("None" if value is None else value) for name, value in params.items()}
    for params in found_params
]

ncv_df = pd.concat([ncv_df, pd.DataFrame(chosen_params)], axis=1)
ncv_df

In [None]:
# Mean ROC-AUC over the rows of the in-memory results table.
ncv_df["roc_auc"].mean()

In [None]:
from pathlib import Path

# Persist the results table (row index included). The target directory is
# created first so the cell also works on a fresh checkout.
results_path = Path("./results/rf_ncv.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
ncv_df.to_csv(results_path)

In [None]:
# Reload the saved results. The file was written with its row index as the
# first column, so read it back as the index instead of letting it appear as
# a spurious "Unnamed: 0" data column.
ncv_df = pd.read_csv("./results/rf_ncv.csv", index_col=0)
ncv_df

In [None]:
# Mean ROC-AUC, recomputed from whatever `ncv_df` currently holds
# (the table reloaded from CSV when cells are run in order).
ncv_df["roc_auc"].mean()

In [None]:
# Spread of ROC-AUC across rows. Note: pandas' Series.std() defaults to the
# sample standard deviation (ddof=1), whereas the np.std() summary printed by
# the nested-CV cell uses ddof=0, so the two numbers differ slightly.
ncv_df["roc_auc"].std()