In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import copy
import shap
import pickle
import os

from scipy import linalg
from scipy.special import expit
from scipy import stats
from tqdm import tqdm
from matplotlib import cm
from sklearn.base import TransformerMixin, ClassifierMixin
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, 
                              RandomForestRegressor, GradientBoostingRegressor)
from sklearn.model_selection import KFold
from sklearn.metrics import (roc_auc_score, f1_score, precision_score, recall_score, 
                             RocCurveDisplay, PrecisionRecallDisplay, 
                             mean_squared_error)
from pandas.api.types import CategoricalDtype

In [2]:
# Import modelling functions
from ensemble_functions import *
from weighting_functions import *
from explainer_functions import *

## Load dataset

### Imputed using MICE

In [3]:
# Choose with dataset to load (rpy2 or pure Python)
imputed_file = "imputed.pickle"
boston_path = "../../data/toy-dataset/boston-processed/"
biopsy_path = "../../data/toy-dataset/biopsy-processed/"

# Place to store results
results_path = "../../results/metrics/"
if not os.path.exists(results_path):
    os.mkdir(results_path)
figures_path = "../../results/figures/rf_models_python/"
if not os.path.exists(figures_path):
    os.mkdir(figures_path)
model_iter_name = "_pythonimputed_stratifiedkfold"

# Load imputed dataset
with open(boston_path + imputed_file, "rb") as handle:
    boston_imputed = pickle.load(handle)

with open(biopsy_path + imputed_file, "rb") as handle:
    biopsy_imputed = pickle.load(handle)

### Complete case data

In [4]:
# Set directory for complete case data
boston_cc_path = "../../data/toy-dataset/boston-complete-case/"
biopsy_cc_path = "../../data/toy-dataset/biopsy-complete-case/"

# Load complete case data
props = [10, 20, 30, 40, 50]
boston_cc = {}
biopsy_cc = {}
for p in props:
    boston_cc[p] = pd.read_csv(boston_cc_path + "boston_{}.csv".format(p))
    biopsy_cc[p] = pd.read_csv(biopsy_cc_path + "biopsy_{}.csv".format(p))

## Modelling on `boston` dataset (regression)

### Data preparation

In [5]:
# Determine covariates and outcome variables
bostonyvar = "medv"
bostonXvars = ["crim", "zn", "indus", "rm", "age", "dis", "tax", "ptratio", "black", 
               "lstat", "chas", "nox"]

# Selected features for model explanation
selected_boston = ["nox", "rm"]

### General model setup (random forest)

In [6]:
# Cross-validation folds
n_splits = 5

# Set random seed
SEED = 2023

## For now, we are using the default setup from sklearn
basemdl = RandomForestRegressor()
basemdlname = "rf"

In [7]:
# Placeholder table to store performance metrics (RMSE)
boston_perf = pd.DataFrame(np.zeros((len(props), 3)), index=props, 
                           columns=["CC", "Ensemble", "Weighting"])

### Complete case data

#### Cross-validation

In [8]:
# Iterate over all versions of the data
for p in tqdm(props):
    # Separate indep and outcome variables
    X = boston_cc[p][bostonXvars]
    y = boston_cc[p][bostonyvar]
    
    # Initialise k-fold object
    kf = KFold(n_splits=n_splits, random_state=SEED, shuffle=True)
    
    # Placeholder for predictions to calculate performance metrics
    preds = []
    
    # Iterate over the folds
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        # Get train and test data
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]
        
        # Train regressor on training data
        rf = copy.deepcopy(basemdl).fit(X_train, y_train)
        
        # Predict on test data and store predictions
        pred_ = rf.predict(X_test)
        preds.append(pd.DataFrame({"true": y_test, "pred": pred_}))
    
    # Aggregate predictions and compute RMSE
    preds = pd.concat(preds)
    rmse = mean_squared_error(preds["true"], preds["pred"], squared=False)
    boston_perf.loc[p, "CC"] = rmse

100%|█████████████████████████████████████████████| 5/5 [00:04<00:00,  1.09it/s]


#### Full model training and explanation

In [9]:
%matplotlib agg
# Iterate over all versions of the data
for p in props:
    # Separate indep and outcome variables
    X = boston_cc[p][bostonXvars]
    y = boston_cc[p][bostonyvar]
    
    # Train RF regressor
    rf = copy.deepcopy(basemdl).fit(X, y)
    
    # Create output path for model explanation
    expl_path = figures_path + "boston_cc_{}/".format(p)
    if not os.path.exists(expl_path):
        os.mkdir(expl_path)
    
    # Model explanation
    background_data = shap.maskers.Independent(X, max_samples=100)
    pred_fn = lambda x: rf.predict(x)
    expl = shap.Explainer(pred_fn, background_data)
    
    # Calculate SHAP values
    shapvals = expl(X)
    
    # Create SHAP dependence plots for selected features
    for c in selected_boston:
        f1, ax1 = plt.subplots(figsize=(5, 5), ncols=1, nrows=1)
        # shap.dependence_plot(c, shap_values=shapvals, features=X, show=False, ax=ax1)
        shap.plots.scatter(shapvals[:, c], show=False, ax=ax1)
        f1.tight_layout()
        f1.savefig(expl_path + "dependence_{}.pdf".format(c))
    
    # Create beeswarm plot
    plt.clf()
    _ = shap.plots.beeswarm(shapvals, show=False)
    plt.tight_layout()
    plt.savefig(expl_path + "beeswarm.pdf")

plt.clf()

Permutation explainer: 468it [03:48,  1.98it/s]                                 
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Permutation explainer: 412it [03:26,  1.90it/s]                                 
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Permutation explainer: 366it [02:33,  2.25it/s]                                 
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Permutation explainer: 322it [02:27,  2.01it/s]                                 
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Permutation explainer: 263it [01:57,  2.03it/s]                                 
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored


### Ensemble approach

#### Cross-validation

In [10]:
# Placeholder for all preds just in case things go wrong
all_preds = []

# Iterate over all versions of the data
for p in tqdm(props):
    # Prepare ensemble data
    X, y = PrepareEnsembleData(boston_imputed[p], bostonyvar, covars=bostonXvars)
    
    # Construct base model
    # basemdl = RandomForestRegressor()
    
    # Run K-fold CV
    metrics, preds = KFoldEnsemble(n_splits, X, y, 
                                   boston_imputed[p]["missingflag"].any(axis=1), 
                                   basemdl, classifier=False, random_state=SEED)
    
    # Store metrics
    all_preds.append(preds)
    boston_perf.loc[p, "Ensemble"] = metrics["RMSE"]

100%|█████████████████████████████████████████████| 5/5 [01:50<00:00, 22.01s/it]


#### Full model training and explanation

In [11]:
%matplotlib agg
# Iterate over all versions of the data
for p in props:
    # Prepare ensemble data
    X, y = PrepareEnsembleData(boston_imputed[p], bostonyvar, covars=bostonXvars)
    
    # Construct ensemble model
    # basemdl = RandomForestRegressor()
    ensemblerf = EnsembleRegressor(basemdl).fit(X, y)
    
    # Create output path for model explanation
    expl_path = figures_path + "boston_ensemble_{}/".format(p)
    if not os.path.exists(expl_path):
        os.mkdir(expl_path)
    
    # Get complete case data for model explanation
    mflag = boston_imputed[p]["missingflag"].any(axis=1)
    Xobs = X[0][bostonXvars][~mflag]
    
    # Model explanation
    background_data = shap.maskers.Independent(Xobs, max_samples=100)
    pred_fn = lambda x: ensemblerf.predict(x)
    expl = shap.Explainer(pred_fn, background_data)
    
    # Calculate SHAP values
    shapvals = expl(Xobs)
    
    # Create SHAP dependence plots for selected features
    for c in selected_boston:
        f1, ax1 = plt.subplots(figsize=(5, 5), ncols=1, nrows=1)
        # shap.dependence_plot(c, shap_values=shapvals, features=Xobs, show=False, ax=ax1)
        shap.plots.scatter(shapvals[:, c], show=False, ax=ax1)
        f1.tight_layout()
        f1.savefig(expl_path + "dependence_{}.pdf".format(c))
    
    # Create beeswarm plot
    plt.clf()
    _ = shap.plots.beeswarm(shapvals, show=False)
    plt.tight_layout()
    plt.savefig(expl_path + "beeswarm.pdf")

plt.clf()

Permutation explainer: 468it [1:02:05,  8.00s/it]                               
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Permutation explainer: 412it [52:36,  7.70s/it]                                 
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Permutation explainer: 366it [46:42,  7.70s/it]                                 
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Permutation explainer: 322it [41:02,  7.69s/it]                                 
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Permutation explainer: 263it [33:27,  7.69s/it]                                 
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored


### Weighting approach

#### Cross-validation

In [12]:
# Placeholder for all preds just in case things go wrong
all_preds = []

# Iterate over all versions of the data
for p in tqdm(props):
    # Construct base model
    # basemdl = RandomForestRegressor()
    
    # Run K-fold CV
    metrics, preds = KFoldWeighted(n_splits, boston_imputed[p], bostonyvar, 
                                   basemdl, classifier=False, random_state=SEED, 
                                   covars=bostonXvars)
    
    # Store metrics
    all_preds.append(preds)
    boston_perf.loc[p, "Weighting"] = metrics["RMSE"]

100%|█████████████████████████████████████████████| 5/5 [00:11<00:00,  2.27s/it]


#### Full model training and explanation

In [13]:
%matplotlib agg
# Iterate over all versions of the data
for p in props:
    # Prepare weighted data
    X, y, w = PrepareWeightedData(boston_imputed[p], bostonyvar, covars=bostonXvars)
    
    # Construct weighted model
    weightedrf = copy.deepcopy(basemdl).fit(X, y, sample_weight=w)
    
    # Create output path for model explanation
    expl_path = figures_path + "boston_weighting_{}/".format(p)
    if not os.path.exists(expl_path):
        os.mkdir(expl_path)
    
    # Get complete case data for model explanation
    mflag = boston_imputed[p]["missingflag"].any(axis=1)
    Xobs = boston_imputed[p]["imp"][0][bostonXvars][~mflag]
    
    # Model explanation
    background_data = shap.maskers.Independent(Xobs, max_samples=100)
    pred_fn = lambda x: weightedrf.predict(x)
    expl = shap.Explainer(pred_fn, background_data)
    
    # Calculate SHAP values
    shapvals = expl(Xobs)
    
    # Create SHAP dependence plots for selected features
    for c in selected_boston:
        f1, ax1 = plt.subplots(figsize=(5, 5), ncols=1, nrows=1)
        # shap.dependence_plot(c, shap_values=shapvals, features=Xobs, show=False, ax=ax1)
        shap.plots.scatter(shapvals[:, c], show=False, ax=ax1)
        f1.tight_layout()
        f1.savefig(expl_path + "dependence_{}.pdf".format(c))
    
    # Create beeswarm plot
    plt.clf()
    _ = shap.plots.beeswarm(shapvals, show=False)
    plt.tight_layout()
    plt.savefig(expl_path + "beeswarm.pdf")

plt.clf()

Permutation explainer: 468it [03:05,  2.38it/s]                                 
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Permutation explainer: 412it [02:46,  2.33it/s]                                 
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Permutation explainer: 366it [02:29,  2.29it/s]                                 
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Permutation explainer: 322it [02:12,  2.24it/s]                                 
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Permutation explainer: 263it [01:47,  2.20it/s]                                 
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored


### Summary

In [14]:
# Print all RMSEs
print(boston_perf)

          CC  Ensemble  Weighting
10  3.172781  3.024555   3.144469
20  3.171484  2.987382   3.139145
30  3.355522  3.107355   3.277915
40  3.257154  3.066913   3.360226
50  3.797095  3.142602   3.514352


In [15]:
# Save all RMSEs in CSV
boston_perf.to_csv(results_path + "boston_rmse_{}{}.csv".format(basemdlname, model_iter_name))

## Modelling on `biopsy` dataset (classification)

### Data preparation

In [16]:
# Determine covariates and outcome variables
biopsyyvar = "class_malignant"
biopsyXvars = ["V1", "V2", "V3", "V4", "V5", "V7", "V8", "V9"]

# Selected features for model explanation
selected_biopsy = ["V1"]

### General model setup (random forest)

In [17]:
# Cross-validation folds
n_splits = 5

# Set random seed
SEED = 2023

# Probability cutoff to compute metrics such as F1 score
pred_cutoff = 0.5

## For now, we are using the default setup from sklearn
basemdl = RandomForestClassifier()
basemdlname = "rf"

In [18]:
# Placeholder table to store performance metrics (AUROC and F1)
biopsy_auroc = pd.DataFrame(np.zeros((len(props), 3)), index=props, 
                            columns=["CC", "Ensemble", "Weighting"])
biopsy_f1 = pd.DataFrame(np.zeros((len(props), 3)), index=props, 
                         columns=["CC", "Ensemble", "Weighting"])

### Complete case data

#### Cross-validation

In [19]:
# Iterate over all versions of the data
for p in tqdm(props):
    # Separate indep and outcome variables
    X = biopsy_cc[p][biopsyXvars]
    y = biopsy_cc[p][biopsyyvar]
    
    # Initialise k-fold object
    kf = KFold(n_splits=n_splits, random_state=SEED, shuffle=True)
    
    # Placeholder for predictions to calculate performance metrics
    preds = []
    
    # Iterate over the folds
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        # Get train and test data
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]
        
        # Train classifier on training data
        rf = copy.deepcopy(basemdl).fit(X_train, y_train)
        
        # Predict on test data and store predictions
        pred_ = rf.predict_proba(X_test)[:, 1]
        preds.append(pd.DataFrame({"true": y_test, "pred": pred_}))
    
    # Aggregate predictions, compute AUROC and F1 score
    preds = pd.concat(preds)
    preds["pred_labels"] = preds["pred"] > pred_cutoff
    biopsy_auroc.loc[p, "CC"] = roc_auc_score(preds["true"], preds["pred"])
    biopsy_f1.loc[p, "CC"] = f1_score(preds["true"], preds["pred_labels"])

100%|█████████████████████████████████████████████| 5/5 [00:02<00:00,  1.87it/s]


#### Full model training and explanation

In [20]:
%matplotlib agg
# Iterate over all versions of the data
for p in props:
    # Separate indep and outcome variables
    X = biopsy_cc[p][biopsyXvars]
    y = biopsy_cc[p][biopsyyvar]
    
    # Train RF classifier
    rf = copy.deepcopy(basemdl).fit(X, y)
    
    # Create output path for model explanation
    expl_path = figures_path + "biopsy_cc_{}/".format(p)
    if not os.path.exists(expl_path):
        os.mkdir(expl_path)
    
    # Model explanation
    background_data = shap.maskers.Independent(X, max_samples=100)
    pred_fn = lambda x: rf.predict_proba(x)[:, 1]
    expl = shap.Explainer(pred_fn, background_data, link=shap.links.logit)
    
    # Calculate SHAP values
    shapvals = expl(X)
    
    # Create SHAP dependence plots for selected features
    for c in selected_biopsy:
        f1, ax1 = plt.subplots(figsize=(5, 5), ncols=1, nrows=1)
        # shap.dependence_plot(c, shap_values=shapvals, features=X, show=False, ax=ax1)
        shap.plots.scatter(shapvals[:, c], show=False, ax=ax1)
        f1.tight_layout()
        f1.savefig(expl_path + "dependence_{}.pdf".format(c))
    
    # Create beeswarm plot
    plt.clf()
    _ = shap.plots.beeswarm(shapvals, show=False)
    plt.tight_layout()
    plt.savefig(expl_path + "beeswarm.pdf")

plt.clf()

Exact explainer: 643it [00:44, 11.45it/s]                                       
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Exact explainer: 584it [00:37, 11.46it/s]                                       
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Exact explainer: 511it [00:29, 11.61it/s]                                       
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Exact explainer: 433it [00:21, 10.98it/s]                                       
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Exact explainer: 339it [00:14,  7.81it/s]                                       
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored


### Ensemble approach

#### Cross-validation

In [21]:
# Placeholder for all preds just in case things go wrong
all_preds = []

# Iterate over all versions of the data
for p in tqdm(props):
    # Prepare ensemble data
    X, y = PrepareEnsembleData(biopsy_imputed[p], biopsyyvar, covars=biopsyXvars)
    
    # Construct base model
    # basemdl = RandomForestClassifier()
    
    # Run K-fold CV
    metrics, preds = KFoldEnsemble(n_splits, X, y, 
                                   biopsy_imputed[p]["missingflag"].any(axis=1), 
                                   basemdl, classifier=True, random_state=SEED)
    
    # Store metrics
    all_preds.append(preds)
    biopsy_auroc.loc[p, "Ensemble"] = metrics["AUROC"]
    biopsy_f1.loc[p, "Ensemble"] = metrics["F1"]

100%|█████████████████████████████████████████████| 5/5 [00:53<00:00, 10.72s/it]


#### Full model training and explanation

In [22]:
%matplotlib agg
# Iterate over all versions of the data
for p in props:
    # Prepare ensemble data
    X, y = PrepareEnsembleData(biopsy_imputed[p], biopsyyvar, covars=biopsyXvars)
    
    # Construct ensemble model
    # basemdl = RandomForestClassifier()
    ensemblerf = EnsembleClassifier(basemdl).fit(X, y)
    
    # Create output path for model explanation
    expl_path = figures_path + "biopsy_ensemble_{}/".format(p)
    if not os.path.exists(expl_path):
        os.mkdir(expl_path)
    
    # Get complete case data for model explanation
    mflag = biopsy_imputed[p]["missingflag"].any(axis=1)
    Xobs = X[0][biopsyXvars][~mflag]
    
    # Model explanation
    background_data = shap.maskers.Independent(Xobs, max_samples=100)
    pred_fn = lambda x: ensemblerf.predict_proba(x)[:, 1]
    expl = shap.Explainer(pred_fn, background_data, link=shap.links.logit)
    
    # Calculate SHAP values
    shapvals = expl(Xobs)
    
    # Create SHAP dependence plots for selected features
    for c in selected_biopsy:
        f1, ax1 = plt.subplots(figsize=(5, 5), ncols=1, nrows=1)
        # shap.dependence_plot(c, shap_values=shapvals, features=Xobs, show=False, ax=ax1)
        shap.plots.scatter(shapvals[:, c], show=False, ax=ax1)
        f1.tight_layout()
        f1.savefig(expl_path + "dependence_{}.pdf".format(c))
    
    # Create beeswarm plot
    plt.clf()
    _ = shap.plots.beeswarm(shapvals, show=False)
    plt.tight_layout()
    plt.savefig(expl_path + "beeswarm.pdf")

plt.clf()

Exact explainer: 643it [14:04,  1.33s/it]                                       
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Exact explainer: 584it [12:09,  1.27s/it]                                       
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Exact explainer: 511it [09:44,  1.16s/it]                                       
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Exact explainer: 433it [07:30,  1.07s/it]                                       
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Exact explainer: 339it [05:19,  1.02it/s]                                       
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored


### Weighting approach

#### Cross-validation

In [23]:
# Placeholder for all preds just in case things go wrong
all_preds = []

# Iterate over all versions of the data
for p in tqdm(props):
    # Construct base model
    # basemdl = RandomForestClassifier()
    
    # Run K-fold CV
    metrics, preds = KFoldWeighted(n_splits, biopsy_imputed[p], biopsyyvar, 
                                   basemdl, classifier=True, random_state=SEED, 
                                   covars=biopsyXvars)
    
    # Store metrics
    all_preds.append(preds)
    biopsy_auroc.loc[p, "Weighting"] = metrics["AUROC"]
    biopsy_f1.loc[p, "Weighting"] = metrics["F1"]

100%|█████████████████████████████████████████████| 5/5 [00:04<00:00,  1.03it/s]


#### Full model training and explanation

In [24]:
%matplotlib agg
# Iterate over all versions of the data
for p in props:
    # Prepare weighted data
    X, y, w = PrepareWeightedData(biopsy_imputed[p], biopsyyvar, covars=biopsyXvars)
    
    # Construct weighted model
    weightedrf = copy.deepcopy(basemdl).fit(X, y, sample_weight=w)
    
    # Create output path for model explanation
    expl_path = figures_path + "biopsy_weighting_{}/".format(p)
    if not os.path.exists(expl_path):
        os.mkdir(expl_path)
    
    # Get complete case data for model explanation
    mflag = biopsy_imputed[p]["missingflag"].any(axis=1)
    Xobs = biopsy_imputed[p]["imp"][0][biopsyXvars][~mflag]
    
    # Model explanation
    background_data = shap.maskers.Independent(Xobs, max_samples=100)
    pred_fn = lambda x: weightedrf.predict_proba(x)[:, 1]
    expl = shap.Explainer(pred_fn, background_data, link=shap.links.logit)
    
    # Calculate SHAP values
    shapvals = expl(Xobs)
    
    # Create SHAP dependence plots for selected features
    for c in selected_biopsy:
        f1, ax1 = plt.subplots(figsize=(5, 5), ncols=1, nrows=1)
        # shap.dependence_plot(c, shap_values=shapvals, features=Xobs, show=False, ax=ax1)
        shap.plots.scatter(shapvals[:, c], show=False, ax=ax1)
        f1.tight_layout()
        f1.savefig(expl_path + "dependence_{}.pdf".format(c))
    
    # Create beeswarm plot
    plt.clf()
    _ = shap.plots.beeswarm(shapvals, show=False)
    plt.tight_layout()
    plt.savefig(expl_path + "beeswarm.pdf")

plt.clf()

Exact explainer: 643it [00:45, 11.13it/s]                                       
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Exact explainer: 584it [00:39, 11.27it/s]                                       
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Exact explainer: 511it [00:31, 11.10it/s]                                       
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Exact explainer: 433it [00:26, 10.41it/s]                                       
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Exact explainer: 339it [00:18,  8.70it/s]                                       
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored


### Summary

In [25]:
# Print all AUROCs
print(biopsy_auroc)

          CC  Ensemble  Weighting
10  0.984677  0.986793   0.986227
20  0.984088  0.981155   0.979900
30  0.982799  0.982822   0.981836
40  0.991333  0.991499   0.987452
50  0.994379  0.997022   0.997655


In [26]:
# Print all F1 scores
print(biopsy_f1)

          CC  Ensemble  Weighting
10  0.923469  0.944724   0.942643
20  0.907895  0.911475   0.915584
30  0.894737  0.923729   0.906780
40  0.872483  0.896104   0.904459
50  0.909091  0.967742   0.957447


In [27]:
# Save all metrics in CSV
biopsy_auroc.to_csv(results_path + "biopsy_auroc_{}{}.csv".format(basemdlname, model_iter_name))
biopsy_f1.to_csv(results_path + "biopsy_f1_{}{}.csv".format(basemdlname, model_iter_name))