In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import copy
import shap
import pickle
import os

from scipy import linalg
from scipy.special import expit
from scipy import stats
from tqdm import tqdm
from matplotlib import cm
from sklearn.base import TransformerMixin, ClassifierMixin
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, 
                              RandomForestRegressor, GradientBoostingRegressor)
from sklearn.model_selection import KFold
from sklearn.metrics import (roc_auc_score, f1_score, precision_score, recall_score, 
                             RocCurveDisplay, PrecisionRecallDisplay, 
                             mean_squared_error)
from pandas.api.types import CategoricalDtype

In [None]:
# Import modelling functions
from ensemble_functions import *
from weighting_functions import *
from explainer_functions import *

## Load dataset

### Imputed using MICE

In [None]:
# Choose which dataset to load (rpy2 or pure Python)
imputed_file = "imputed.pickle"
boston_path = "../../data/toy-dataset/boston-processed/"
biopsy_path = "../../data/toy-dataset/biopsy-processed/"

# Place to store results. makedirs creates any missing parent directory as
# well (os.mkdir raises FileNotFoundError when "../../results" is absent) and
# exist_ok=True makes re-running this cell a no-op.
results_path = "../../results/metrics/"
os.makedirs(results_path, exist_ok=True)

# Read the pickled imputed datasets for both toy problems; each object is
# later indexed by the entries of `props`.
# NOTE(review): pickle.load can execute arbitrary code, so these paths should
# only ever point at files produced by this project.
with open(f"{boston_path}{imputed_file}", "rb") as boston_fh:
    boston_imputed = pickle.load(boston_fh)

with open(f"{biopsy_path}{imputed_file}", "rb") as biopsy_fh:
    biopsy_imputed = pickle.load(biopsy_fh)

### Complete case data

In [None]:
# Set directory for complete case data
boston_cc_path = "../../data/toy-dataset/boston-complete-case/"
biopsy_cc_path = "../../data/toy-dataset/biopsy-complete-case/"

# Versions of the data to load; each value is the numeric suffix of a CSV name
props = [10, 20, 30, 40, 50]

# Load complete case data: one DataFrame per version, keyed by that value
boston_cc = {}
biopsy_cc = {}
for p in props:
    boston_cc[p] = pd.read_csv(boston_cc_path + "boston_{}.csv".format(p))
    # Read biopsy files from the biopsy directory (this line previously used
    # boston_cc_path, leaving biopsy_cc_path unused)
    biopsy_cc[p] = pd.read_csv(biopsy_cc_path + "biopsy_{}.csv".format(p))

## Modelling on `boston` dataset (regression)

### Data preparation

In [None]:
# Outcome column predicted by every boston model
bostonyvar = "medv"

# Covariate columns fed to every boston model
bostonXvars = [
    "crim",
    "zn",
    "indus",
    "rm",
    "age",
    "dis",
    "tax",
    "ptratio",
    "black",
    "lstat",
    "chas",
    "nox",
]

### General model setup (random forest)

In [None]:
# Seed passed as random_state to the cross-validation splits below
SEED = 2023

# Number of folds used in every cross-validation run
n_splits = 5

## For now, we are using the default setup from sklearn

In [None]:
# RMSE results table: one row per entry of `props`, one column per modelling
# approach. Cells start at 0.0 and are filled in by the cross-validation cells.
boston_perf = pd.DataFrame(0.0, index=props, columns=["CC", "Ensemble", "Weighting"])

### Complete case data

#### Cross-validation

In [None]:
# Cross-validated RMSE of a random forest on each complete-case version
for p in tqdm(props):
    # Separate indep and outcome variables
    X = boston_cc[p][bostonXvars]
    y = boston_cc[p][bostonyvar]

    # Initialise k-fold object (shuffled, fixed seed so folds are repeatable)
    kf = KFold(n_splits=n_splits, random_state=SEED, shuffle=True)

    # Out-of-fold predictions, pooled across folds before scoring
    preds = []

    # Iterate over the folds
    for train_index, test_index in kf.split(X):
        # Get train and test data
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # Train regressor on training data; seed the forest as well, otherwise
        # the reported RMSE changes from run to run despite the seeded folds
        rf = RandomForestRegressor(random_state=SEED).fit(X_train, y_train)

        # Predict on test data and store predictions
        pred_ = rf.predict(X_test)
        preds.append(pd.DataFrame({"true": y_test, "pred": pred_}))

    # Aggregate predictions and compute RMSE. The square root is taken
    # explicitly because the `squared=False` argument was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    preds = pd.concat(preds)
    rmse = np.sqrt(mean_squared_error(preds["true"], preds["pred"]))
    boston_perf.loc[p, "CC"] = rmse

#### Full model training and explanation

In [None]:
# Iterate over all versions of the data
for p in props:
    # Separate indep and outcome variables
    X = boston_cc[p][bostonXvars]
    y = boston_cc[p][bostonyvar]
    
    # Train RF regressor
    rf = RandomForestRegressor().fit(X, y)
    
    # Model explanation

### Ensemble approach

#### Cross-validation

In [None]:
# Keep every version's predictions so they can be inspected if a run goes wrong
all_preds = []

# Iterate over all versions of the data
for p in props:
    # Prepare ensemble data
    X, y = PrepareEnsembleData(boston_imputed[p], bostonyvar, covars=bostonXvars)

    # Construct base model. It is seeded here because whether KFoldEnsemble
    # forwards its random_state to the base model is not visible from this
    # notebook (TODO confirm); an unseeded forest makes the RMSE vary per run.
    basemdl = RandomForestRegressor(random_state=SEED)

    # Run K-fold CV
    metrics, preds = KFoldEnsemble(n_splits, X, y, boston_imputed[p]["missingflag"],
                                   basemdl, classifier=False, random_state=SEED)

    # Store metrics
    all_preds.append(preds)
    boston_perf.loc[p, "Ensemble"] = metrics["RMSE"]

#### Full model training and explanation

In [None]:
# Iterate over all versions of the data
for p in props:
    # Prepare ensemble data
    X, y = PrepareEnsembleData(boston_imputed[p], bostonyvar, covars=bostonXvars)
    
    # Construct ensemble model
    basemdl = RandomForestRegressor()
    ensemblerf = EnsembleRegressor(basemdl).fit(X, y)
    
    # Model explanation

### Weighting approach

#### Cross-validation

In [None]:
# Keep every version's predictions so they can be inspected if a run goes wrong
all_preds = []

# Iterate over all versions of the data
for p in props:
    # Construct base model. It is seeded here because whether KFoldWeighted
    # forwards its random_state to the base model is not visible from this
    # notebook (TODO confirm); an unseeded forest makes the RMSE vary per run.
    basemdl = RandomForestRegressor(random_state=SEED)

    # Run K-fold CV
    metrics, preds = KFoldWeighted(n_splits, boston_imputed[p], bostonyvar,
                                   basemdl, classifier=False, random_state=SEED,
                                   covars=bostonXvars)

    # Store metrics
    all_preds.append(preds)
    boston_perf.loc[p, "Weighting"] = metrics["RMSE"]

#### Full model training and explanation

In [None]:
# Iterate over all versions of the data
for p in props:
    # Prepare weighted data
    X, y, w = PrepareWeightedData(boston_imputed[p], bostonyvar, covars=bostonXvars)
    
    # Construct weighted model
    weightedrf = RandomForestRegressor().fit(X, y, sample_weight=w)
    
    # Model explanation

### Summary

In [None]:
# Show all RMSEs. A bare last expression lets the notebook render the
# DataFrame as a table instead of the plain-text output of print().
boston_perf

In [None]:
# Write the RMSE table to the results directory as CSV
boston_perf.to_csv(f"{results_path}boston_rmse_rf.csv")

## Modelling on `biopsy` dataset (classification)

### Data preparation

In [None]:
# Outcome column predicted by every biopsy model
biopsyyvar = "class_malignant"

# Covariate columns fed to every biopsy model
biopsyXvars = [
    "V1",
    "V2",
    "V3",
    "V4",
    "V5",
    "V7",
    "V8",
    "V9",
]

### General model setup (random forest)

In [None]:
# Seed passed as random_state to the cross-validation splits below
SEED = 2023

# Number of folds used in every cross-validation run
n_splits = 5

# Predicted probabilities above this value count as the positive class when
# computing label-based metrics such as the F1 score
pred_cutoff = 0.5

## For now, we are using the default setup from sklearn

In [None]:
# AUROC and F1 results tables: one row per entry of `props`, one column per
# modelling approach. Cells start at 0.0 and are filled in by the
# cross-validation cells.
biopsy_auroc = pd.DataFrame(0.0, index=props, columns=["CC", "Ensemble", "Weighting"])
biopsy_f1 = pd.DataFrame(0.0, index=props, columns=["CC", "Ensemble", "Weighting"])

### Complete case data

#### Cross-validation

In [None]:
# Cross-validated AUROC and F1 of a random forest on each complete-case version
for p in props:
    # Separate indep and outcome variables
    X = biopsy_cc[p][biopsyXvars]
    y = biopsy_cc[p][biopsyyvar]

    # Initialise k-fold object (shuffled, fixed seed so folds are repeatable)
    kf = KFold(n_splits=n_splits, random_state=SEED, shuffle=True)

    # Out-of-fold predictions, pooled across folds before scoring
    preds = []

    # Iterate over the folds
    for train_index, test_index in kf.split(X):
        # Get train and test data
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # Train classifier on training data; seed the forest as well, otherwise
        # the reported metrics change from run to run despite the seeded folds
        rf = RandomForestClassifier(random_state=SEED).fit(X_train, y_train)

        # Predict on test data and store the second predict_proba column
        # (the classifier's second class)
        pred_ = rf.predict_proba(X_test)[:, 1]
        preds.append(pd.DataFrame({"true": y_test, "pred": pred_}))

    # Aggregate predictions, compute AUROC on the probabilities and F1 on the
    # labels obtained by thresholding them at pred_cutoff
    preds = pd.concat(preds)
    preds["pred_labels"] = preds["pred"] > pred_cutoff
    biopsy_auroc.loc[p, "CC"] = roc_auc_score(preds["true"], preds["pred"])
    biopsy_f1.loc[p, "CC"] = f1_score(preds["true"], preds["pred_labels"])

#### Full model training and explanation

In [None]:
# Iterate over all versions of the data
for p in props:
    # Separate indep and outcome variables
    X = biopsy_cc[p][biopsyXvars]
    y = biopsy_cc[p][biopsyyvar]
    
    # Train RF classifier
    rf = RandomForestClassifier().fit(X, y)
    
    # Model explanation

### Ensemble approach

#### Cross-validation

In [None]:
# Keep every version's predictions so they can be inspected if a run goes wrong
all_preds = []

# Iterate over all versions of the data
for p in props:
    # Prepare ensemble data
    X, y = PrepareEnsembleData(biopsy_imputed[p], biopsyyvar, covars=biopsyXvars)

    # Construct base model. It is seeded here because whether KFoldEnsemble
    # forwards its random_state to the base model is not visible from this
    # notebook (TODO confirm); an unseeded forest makes the metrics vary per run.
    basemdl = RandomForestClassifier(random_state=SEED)

    # Run K-fold CV
    metrics, preds = KFoldEnsemble(n_splits, X, y, biopsy_imputed[p]["missingflag"],
                                   basemdl, classifier=True, random_state=SEED)

    # Store metrics
    all_preds.append(preds)
    biopsy_auroc.loc[p, "Ensemble"] = metrics["AUROC"]
    biopsy_f1.loc[p, "Ensemble"] = metrics["F1"]

#### Full model training and explanation

In [None]:
# Iterate over all versions of the data
for p in props:
    # Prepare ensemble data
    X, y = PrepareEnsembleData(biopsy_imputed[p], biopsyyvar, covars=biopsyXvars)
    
    # Construct ensemble model
    basemdl = RandomForestClassifier()
    ensemblerf = EnsembleClassifier(basemdl).fit(X, y)
    
    # Model explanation

### Weighting approach

#### Cross-validation

In [None]:
# Keep every version's predictions so they can be inspected if a run goes wrong
all_preds = []

# Iterate over all versions of the data
for p in props:
    # Construct base model. It is seeded here because whether KFoldWeighted
    # forwards its random_state to the base model is not visible from this
    # notebook (TODO confirm); an unseeded forest makes the metrics vary per run.
    basemdl = RandomForestClassifier(random_state=SEED)

    # Run K-fold CV
    metrics, preds = KFoldWeighted(n_splits, biopsy_imputed[p], biopsyyvar,
                                   basemdl, classifier=True, random_state=SEED,
                                   covars=biopsyXvars)

    # Store metrics
    all_preds.append(preds)
    biopsy_auroc.loc[p, "Weighting"] = metrics["AUROC"]
    biopsy_f1.loc[p, "Weighting"] = metrics["F1"]

#### Full model training and explanation

In [None]:
# Iterate over all versions of the data
for p in props:
    # Prepare weighted data
    X, y, w = PrepareWeightedData(biopsy_imputed[p], biopsyyvar, covars=biopsyXvars)
    
    # Construct weighted model
    weightedrf = RandomForestClassifier().fit(X, y, sample_weight=w)
    
    # Model explanation

### Summary

In [None]:
# Show all AUROCs. A bare last expression lets the notebook render the
# DataFrame as a table instead of the plain-text output of print().
biopsy_auroc

In [None]:
# Show all F1 scores. A bare last expression lets the notebook render the
# DataFrame as a table instead of the plain-text output of print().
biopsy_f1

In [None]:
# Write both biopsy metric tables to the results directory as CSV
biopsy_auroc.to_csv(f"{results_path}biopsy_auroc_rf.csv")
biopsy_f1.to_csv(f"{results_path}biopsy_f1_rf.csv")