In [8]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import copy
import shap

from scipy import linalg
from scipy.special import expit
from scipy import stats
from tqdm import tqdm
from matplotlib import cm
from sklearn.base import TransformerMixin, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold

## Step 1: Load incomplete dataset

## Step 2: Implement multiple imputation with PMM

In [None]:
def ImputeRandomSample(data, seed=None):
    """
    Impute missing values by sampling, with replacement, from the observed values
    of the same variable (univariate imputation).
    This function iterates over all columns in a data frame and imputes as necessary.
    The input data frame is left untouched; a copy is imputed and returned.

    Parameters
    ----------
    data : Pandas DataFrame
        Data frame to be imputed
    seed : int, optional
        Random seed for reproducibility. When given, it re-seeds the global
        NumPy random state (`np.random.seed`)

    Returns
    -------
    Pandas DataFrame
        Imputed data frame

    Raises
    ------
    ValueError
        If a column has missing entries but no observed value to sample from
    """
    # Work on a copy so that the caller's data frame keeps its missing values
    imp = data.copy()

    # Set random seed for reproducibility
    if seed is not None:
        np.random.seed(seed)

    # Iterate over all variables in the data frame
    for c in data.columns:
        # Find rows to be imputed and number of missing observations
        mr = data[c].isna()
        n = int(mr.sum())

        # Nothing to impute for a complete column
        if n == 0:
            continue

        # Collect observed data for sampling
        obs = data[c].dropna().values

        # A fully missing column cannot be imputed from its own values; fail with
        # a message that names the offending column
        if obs.size == 0:
            raise ValueError(
                "Column {} has no observed value to sample from".format(c)
            )

        # Impute using random samples drawn with replacement
        imp.loc[mr, c] = np.random.choice(obs, size=n, replace=True)

    return imp

In [None]:
# The following function performs single (not multiple yet) imputation using PMM
# which will act as the building block for the MICE implementation later on

def ImputePMM(data, missingflag, d=5, k=1e-5, seed=None, targets=None):
    """
    Perform imputation using the predictive mean matching (PMM) method.
    If the target variables are not specified, it assumes that all variables
    are continuous and can be modelled using the Bayesian linear model.

    For each target column, a ridge-stabilised linear model is fitted on the rows
    where the target was observed, a coefficient vector is drawn from its
    (approximate) posterior, and every missing entry receives the observed value
    of a donor picked at random among the `d` observed rows with the closest
    predicted mean.

    Notes
    -----
    * `data` is modified in place (and also returned).
    * `data` must not contain missing values; entries flagged in `missingflag`
      are expected to hold a current imputation (e.g. from `ImputeRandomSample`).
    * The predictors are used as they are: no intercept column is added here.

    Parameters
    ----------
    data : Pandas DataFrame
        Data frame to be imputed
    missingflag : Pandas DataFrame
        A boolean data frame with the exact same dimension as `data`, with an
        indicator that shows if a given observation and variable is missing in
        `data`
    d : int, optional
        Number of donors in the donor set (default = 5). If fewer observed rows
        are available, all of them form the donor set
    k : float, optional
        Ridge parameter for numerical stability (default = 1e-5)
    seed : int, optional
        Random seed (re-seeds the global NumPy random state)
    targets : list, optional
        List of target variables to be imputed (assumed to be continuous)

    Returns
    -------
    Pandas DataFrame
        Imputed data frame (the same object as `data`)

    Raises
    ------
    ValueError
        If `d` is smaller than 1, or a target column has no observed value
    """
    if d < 1:
        raise ValueError("Number of donors must be at least 1, got {}".format(d))

    # Set random seed
    if seed is not None:
        np.random.seed(seed)

    # Iterate over all columns
    if targets is None:
        targets = data.columns
    for c in targets:
        # Separate out variable to be imputed and predictors
        y = data[c]
        yflag = missingflag[c]
        X = data.drop(c, axis=1)

        # Skip if no missing value
        if yflag.sum() == 0:
            continue

        # Separate out observed and missing
        Xobs = X[~yflag].values
        Xmis = X[yflag].values
        yobs = y[~yflag].values
        nobs = len(yobs)
        nmis = Xmis.shape[0]
        if nobs == 0:
            raise ValueError("Column {} has no observed value to donate".format(c))

        # Calculate regression weights (Algorithm 3.1)
        S = Xobs.T @ Xobs
        V = np.linalg.inv(S + k * np.diag(np.diag(S)))
        bhat = V @ Xobs.T @ yobs

        # Calculate noise variance; keep at least one degree of freedom so that
        # the chi-square draw stays valid when there are few observed rows
        df = max(nobs - Xobs.shape[1], 1)
        gdot = np.random.chisquare(df)
        res = yobs - Xobs @ bhat
        sigdot = np.sqrt(res @ res / gdot)

        # Draw beta from the posterior distribution.
        # np.linalg.cholesky returns the lower factor L with L @ L.T == V, so the
        # draw must be L @ z (covariance V); z @ L would have covariance L.T @ L.
        z1 = np.random.normal(size=Xobs.shape[1])
        bdot = bhat + sigdot * (np.linalg.cholesky(V) @ z1)

        # NOTE: the plain Bayesian linear model (not PMM) would instead impute
        # Xmis @ bdot + sigdot * z2 with z2 ~ N(0, 1); PMM borrows observed values.

        # Calculate distances between predicted means: rows are observed cases
        # (predicted with bhat), columns are missing cases (predicted with bdot)
        eta = np.abs(np.subtract.outer(Xobs @ bhat, Xmis @ bdot))

        # Identify donor sets for each missing value (cannot exceed nobs)
        ndonor = min(int(d), nobs)
        donorind = np.argsort(eta, axis=0)[:ndonor, :]

        # Draw random donor for each missing value
        selectedind = np.random.randint(0, ndonor, size=nmis)
        selecteddonorind = donorind[selectedind, np.arange(nmis)]
        yimp = yobs[selecteddonorind]

        # Overwrite data matrix
        data.loc[yflag, c] = yimp

    return data

In [None]:
# Helper function to calculate statistics of imputed data
def getImputedStats(data, missingflag):
    """
    Summarise the imputed entries of every column.

    Parameters
    ----------
    data : Pandas DataFrame
        Imputed data frame
    missingflag : Pandas DataFrame
        Boolean data frame marking the entries that were originally missing

    Returns
    -------
    numpy.ndarray
        Mean of the imputed entries, one element per column of `data`
    numpy.ndarray
        Sample standard deviation (ddof=1) of the imputed entries, one element
        per column of `data`
    """
    means, sds = [], []

    for col in data.columns:
        # Keep only the entries that were flagged as missing for this column
        imputed = data.loc[missingflag[col], col]
        means.append(imputed.mean())
        sds.append(imputed.std(ddof=1))

    return np.array(means, dtype=float), np.array(sds, dtype=float)

# Main function for MICE using PMM
def MICEPMM(data, m=10, maxit=5, d=5, k=1e-5, seed=123):
    """
    Run multivariate imputation by chained equations (MICE) where every variable
    is imputed with predictive mean matching (PMM). All variables are assumed to
    be continuous and to be adequately described by a Bayesian linear model, and
    a fully conditional specification (FCS) is used, i.e. the original MICE
    framework.

    Parameters
    ----------
    data : Pandas DataFrame
        Data frame to be imputed
    m : int, optional
        Number of multiply imputed data to be generated (default = 10)
    maxit : int, optional
        Maximum number of iterations for the MICE algorithm (default = 5)
    d : int, optional
        Number of donors in the donor set (default = 5)
    k : float, optional
        Ridge parameter for numerical stability (default = 1e-5)
    seed : int, optional
        Random seed

    Returns
    -------
    dict
        Python dictionary with the imputed data (`imp`), missing data flag (`missingflag`),
        and the chain statistics (`chainmean` and `chainstd`)
    """
    # Seed the global random state once for the whole run
    if seed is not None:
        np.random.seed(seed)

    # Remember which entries were missing in the original data
    missingflag = data.isna()

    # Start every chain from an independent random-sample imputation
    imp = [ImputeRandomSample(data) for _ in range(m)]

    # Chain statistics are indexed as (variable, chain, iteration); slot 0 holds
    # the statistics of the starting imputations
    n_vars = data.shape[1]
    chainmean = np.empty((n_vars, m, maxit + 1))
    chainstd = np.empty((n_vars, m, maxit + 1))
    for chain, frame in enumerate(imp):
        chainmean[:, chain, 0], chainstd[:, chain, 0] = getImputedStats(frame, missingflag)

    # Run the chained equations
    for it in tqdm(range(1, maxit + 1)):
        for chain in range(m):
            # One PMM sweep over all variables of this chain
            imp[chain] = ImputePMM(imp[chain], missingflag, d=d, k=k)

            # Record the statistics after this sweep
            stats_now = getImputedStats(imp[chain], missingflag)
            chainmean[:, chain, it], chainstd[:, chain, it] = stats_now

    # Bundle the multiply imputed data and the chain statistics
    return {
        "imp": imp,
        "missingflag": missingflag,
        "chainmean": chainmean,
        "chainstd": chainstd
    }

In [None]:
# Helper function to present imputed value across different imputations
def getImputedData(res, colname):
    """
    Collect the imputed values of one variable across all imputed data sets.

    Parameters
    ----------
    res : dict
        Python dictionary returned by `MICEPMM`
    colname : str
        Variable for which the imputed data is to be shown

    Returns
    -------
    Pandas DataFrame
        One row per observation whose `colname` was missing and one column per
        imputed data set, named `colname` followed by the imputation number
    """
    # Rows where the requested variable was originally missing
    rows = res["missingflag"][colname]

    # One series per imputed data set, renamed so that the columns stay distinct
    columns = [
        frame.loc[rows, colname].rename(colname + str(number))
        for number, frame in enumerate(res["imp"])
    ]

    return pd.concat(columns, axis=1)

In [None]:
# Helper function to visualise the chain statistics
def ChainStatsViz(res, maxvar=3):
    """
    Constructs a trace plot of chain statistics based on the `MICEPMM` output

    Parameters
    ----------
    res : dict
        Python dictionary returned by `MICEPMM`
    maxvar : int, optional
        Maximum number of variables to be visualized (default = 3). Note that only
        variables with missing data will be shown

    Returns
    -------
    matplotlib.pyplot.figure
        Figure showing the trace plots (left column: mean, right column: SD,
        one row per variable, one line per imputation chain)

    Raises
    ------
    ValueError
        If `maxvar` is smaller than 1 or no variable has missing data
    """
    if maxvar < 1:
        raise ValueError("maxvar must be at least 1, got {}".format(maxvar))

    # Retrieve chain mean and standard deviation
    chainmean, chainstd = res["chainmean"], res["chainstd"]

    # Everything is derived from `res` so that the function does not depend on
    # a notebook-level `data` variable
    missingflag = res["missingflag"]
    varnames = missingflag.columns

    # Positions (within the columns, hence within the first axis of the chain
    # arrays) of the first maxvar variables with missing data
    missingvarsind = np.flatnonzero((missingflag.sum() > 0).values)[:maxvar]
    nvar = len(missingvarsind)
    if nvar == 0:
        raise ValueError("No variable with missing data to visualize")

    # Get number of imputed data
    m = len(res["imp"])

    # Placeholder for plotting; squeeze=False keeps `axs` two-dimensional even
    # when a single variable is plotted
    fig, axs = plt.subplots(figsize=(8, nvar*3), ncols=2, nrows=nvar,
                            sharex=True, squeeze=False)
    cmap = plt.get_cmap("jet", 10) # If m > 10, it will rotate back to start

    # Generate plot for each variable and imputation
    for i, idx in enumerate(missingvarsind):
        # Plot chain mean (idx selects the variable, i only selects the subplot row)
        for j in range(m):
            axs[i, 0].plot(chainmean[idx, j, :], color=cmap(j % 10), alpha=0.7)
        axs[i, 0].set_title("{}: mean".format(varnames[idx]))

        # Plot chain SD
        for j in range(m):
            axs[i, 1].plot(chainstd[idx, j, :], color=cmap(j % 10), alpha=0.7)
        axs[i, 1].set_title("{}: SD".format(varnames[idx]))

    return fig

In [None]:
"""
(Note from 3.4.4 of Van Buuren)
"Morris et al. (2014) advise to spend efforts on specifying the imputation model correctly, 
rather than expecting predictive mean matching to do the work."

(Note from 3.6.2 of Van Buuren)
The logreg, polr and polyreg methods in mice implement option 5.

(Note from 3.6.3 of Van Buuren)
Imputation of categorical data is more difficult than continuous data. As a rule of thumb, 
in logistic regression we need at least 10 events per predictor in order to get reasonably 
stable estimates of the regression coefficients (Van Belle, 2002, p. 87).

Next step:
1. Implement logreg and polyreg (either bootstrap or using data augmentation)
2. Implement tree-based model with XAI
3. Consider option to extend PMM using GLM as underlying model
"""

In [None]:
# Main function for imputation using logistic regression with bootstrap

def ImputeLogRegBoot(data, missingflag, d=5, k=1e-5, seed=None, targets=None):
    """
    Perform imputation using the logistic regression method with bootstrapping.
    Note that target columns should be specified if not all variables are binary.
    Note that this implementation also assumes that the binary variables have
    been encoded as 0s and 1s.

    For each target column, the rows with an observed target are resampled with
    replacement, a logistic regression (with intercept) is fitted on the
    bootstrap sample, and each missing entry is drawn from a Bernoulli
    distribution with the predicted probability.

    Notes
    -----
    * `data` is modified in place (and also returned).
    * `d` and `k` are accepted so that all imputation functions share the same
      signature; they are not used by this method.

    Parameters
    ----------
    data : Pandas DataFrame
        Data frame to be imputed
    missingflag : Pandas DataFrame
        A boolean data frame with the exact same dimension as `data`, with an
        indicator that shows if a given observation and variable is missing in
        `data`
    d : int, optional
        Unused (kept for a uniform signature)
    k : float, optional
        Unused (kept for a uniform signature)
    seed : int, optional
        Random seed (re-seeds the global NumPy random state)
    targets : list, optional
        Target columns to be imputed

    Returns
    -------
    Pandas DataFrame
        Imputed data frame (the same object as `data`)

    Raises
    ------
    ValueError
        If an observed target column has more than two unique values
    """
    # Set random seed
    if seed is not None:
        np.random.seed(seed)

    # Iterate over all columns
    if targets is None:
        targets = data.columns
    for c in targets:
        # Separate out variable to be imputed and predictors
        y = data[c]
        yflag = missingflag[c]
        X = data.drop(c, axis=1)

        # Skip if no missing value
        if yflag.sum() == 0:
            continue

        # Separate out observed and missing (plain NumPy arrays from here on)
        Xobs = X[~yflag].values
        Xmis = X[yflag].values
        yobs = y[~yflag].values
        nobs = yobs.shape[0]
        nmis = Xmis.shape[0]

        # Stop process if y is not binary
        nunique = len(pd.unique(yobs))
        if nunique > 2:
            raise ValueError("Column {} has {} unique values".format(c, nunique))

        # Resample observed data using bootstrap
        resampled_idx = np.random.choice(np.arange(nobs), size=nobs, replace=True)
        Xobs1 = Xobs[resampled_idx, :]
        yobs1 = yobs[resampled_idx]

        # Train logistic regression model. has_constant="add" forces the intercept
        # column even if a predictor happens to be constant in the (re)sample;
        # otherwise the fitted and the prediction design could differ in width.
        Xobs1 = sm.add_constant(Xobs1, has_constant="add")
        lr_model = sm.Logit(yobs1, Xobs1).fit(disp=0)  # disp=0: no optimizer log

        # Predict probabilities using the fitted model
        pmis = lr_model.predict(sm.add_constant(Xmis, has_constant="add"))

        # Generate imputed binary values based on probabilities
        yimp = (np.random.uniform(size=nmis) < pmis).astype(float)

        # Overwrite data matrix
        data.loc[yflag, c] = yimp

    return data

In [None]:
# Main function for imputation using logistic regression with data augmentation

def ImputeLogRegAugment(data, missingflag, d=5, k=1e-5, seed=None, targets=None):
    """
    Perform imputation using the logistic regression method with data augmentation
    according to White, Daniel, and Royston (2010).
    Note that target columns should be specified if not all variables are binary.
    Note that this implementation also assumes that the binary variables have
    been encoded as 0s and 1s.
    A key difference against R implementation is the use of standard binomial
    instead of quasibinomial.

    For each target column, the data is augmented with a few synthetic records
    (see `_augment_data`), a logistic regression (with intercept) is fitted, a
    coefficient vector is drawn from a normal approximation of its posterior,
    and each missing entry is drawn from a Bernoulli distribution with the
    resulting probability.

    Notes
    -----
    * `data` is modified in place (and also returned).
    * `d` and `k` are accepted so that all imputation functions share the same
      signature; they are not used by this method.

    Parameters
    ----------
    data : Pandas DataFrame
        Data frame to be imputed
    missingflag : Pandas DataFrame
        A boolean data frame with the exact same dimension as `data`, with an
        indicator that shows if a given observation and variable is missing in
        `data`
    d : int, optional
        Unused (kept for a uniform signature)
    k : float, optional
        Unused (kept for a uniform signature)
    seed : int, optional
        Random seed (re-seeds the global NumPy random state)
    targets : list, optional
        Target columns to be imputed

    Returns
    -------
    Pandas DataFrame
        Imputed data frame (the same object as `data`)

    Raises
    ------
    ValueError
        If an observed target column has more than two unique values
    """
    # Set random seed
    if seed is not None:
        np.random.seed(seed)

    # Iterate over all columns
    if targets is None:
        targets = data.columns
    for c in targets:
        # Separate out variable to be imputed and predictors - with their respective
        # flags
        y = data[c]
        yflag = missingflag[c]
        X = data.drop(c, axis=1)
        Xflag = missingflag.drop(c, axis=1)

        # Skip if no missing value
        if yflag.sum() == 0:
            continue

        # Perform data augmentation (augmented records are appended at the end
        # and flagged as observed, so the set of missing rows is unchanged)
        Xaug, yaug, _, yflagaug = _augment_data(X, y, Xflag, yflag)

        # Separate out observed and missing (plain NumPy arrays from here on)
        Xobs = Xaug[~yflagaug].values
        Xmis = Xaug[yflagaug].values
        yobs = yaug[~yflagaug].values
        nmis = Xmis.shape[0]

        # Stop process if y is not binary
        nunique = len(pd.unique(yobs))
        if nunique > 2:
            raise ValueError("Column {} has {} unique values".format(c, nunique))

        # Train logistic regression model. has_constant="add" forces the intercept
        # column even if a predictor happens to be constant.
        Xobs = sm.add_constant(Xobs, has_constant="add")
        lr_model = sm.Logit(yobs, Xobs).fit(disp=0)  # disp=0: no optimizer log

        # Sample betas from (estimated) posterior distribution:
        # beta* = beta_hat + L z, with L the lower Cholesky factor of the
        # estimated covariance and z a vector of independent standard normals.
        # NOTE(review): the default covariance is used; for a binomial model the
        # scale is 1, so this should match R's `cov.unscaled` - confirm.
        cov = np.asarray(lr_model.cov_params())
        cov_sqrt = linalg.cholesky(cov, lower=True)
        betas = np.asarray(lr_model.params)
        beta_star = betas + np.dot(cov_sqrt, np.random.normal(size=len(betas)))

        # Predict probabilities using betas
        pmis = expit(np.dot(sm.add_constant(Xmis, has_constant="add"), beta_star))

        # Generate imputed binary values based on probabilities
        yimp = (np.random.uniform(size=nmis) < pmis).astype(float)

        # Overwrite data matrix
        data.loc[yflag, c] = yimp

    return data

def _augment_data(X, y, Xflag, yflag):
    # Helper function for data augmentation based on White, Daniel, Royston (2010)
    
    # Get number of categories and number of predictors
    p = X.shape[1]
    k = y.dropna().nunique()
    nr = 2 * p * k
    
    # Calculate column wise mean and SD, then construct a matrix
    mu = X.mean()
    sig = X.std(ddof=1) # R implements sample variance by default, but not Python
    mu_mtrx = np.tile(mu.values.reshape(1, -1), (nr, 1))
    sig_mtrx = np.tile(sig.values.reshape(1, -1), (nr, 1))
    
    # Create shift matrix and outcome label vector
    shift_mtrx = linalg.block_diag(*tuple([[[.5], [-.5]]] * p))
    shift_mtrx = np.tile(shift_mtrx, (k, 1))
    ynew = np.repeat(np.arange(k), 2 * p)
    
    # Create augmented data
    aug = mu_mtrx + shift_mtrx * sig_mtrx
    aug = pd.DataFrame(aug, columns=X.columns)
    ynew = pd.Series(ynew)
    augflag = pd.DataFrame(np.zeros(nr, ).astype(bool), columns=X.columns)
    
    # Augment to the original data
    Xaug = pd.concat([X, aug], ignore_index=True)
    yaug = pd.concat([y, ynew], ignore_index=True)
    Xflagaug = pd.concat([Xflag, augflag], ignore_index=True)
    yflagaug = pd.concat([yflag, pd.Series(np.zeros(nr).astype(bool))], ignore_index=True)
    
    return Xaug, yaug, Xflagaug, yflagaug

In [None]:
# Main function for imputation using multinomial regression with data augmentation
# Do this later

In [None]:
# Main function to perform MICE using logistic regression

def MICELogReg(data, m=10, maxit=5, d=5, k=1e-5, seed=123, method="boot"):
    """
    Implement multivariate imputation by chained equations (MICE) using
    logistic regression method. This function assumes that all
    variables are binary and are encoded as 0s and 1s. Two methods of logistic
    regression imputation are supported, one based on bootstrap approach and the
    other based on Bayesian approach (with data augmentation).
    Furthermore, it assumes a fully conditional specification (FCS), which makes
    the original MICE framework.

    Parameters
    ----------
    data : Pandas DataFrame
        Data frame to be imputed
    m : int, optional
        Number of multiply imputed data to be generated (default = 10)
    maxit : int, optional
        Maximum number of iterations for the MICE algorithm (default = 5)
    d : int, optional
        Number of donors in the donor set (default = 5); forwarded to the
        imputation function
    k : float, optional
        Ridge parameter for numerical stability (default = 1e-5); forwarded to
        the imputation function
    seed : int, optional
        Random seed
    method : str, optional
        Logistic regression method, "boot" for bootstrap approach (default,
        `ImputeLogRegBoot`) and "bayes" for Bayesian approach with data
        augmentation (`ImputeLogRegAugment`)

    Returns
    -------
    dict
        Python dictionary with the imputed data (`imp`), missing data flag (`missingflag`),
        and the chain statistics (`chainmean` and `chainstd`)

    Raises
    ------
    ValueError
        If `method` is neither "boot" nor "bayes"
    """
    # Resolve the imputation function up front so that an invalid method fails
    # before any work is done
    imputefunc = {
        "boot": ImputeLogRegBoot,
        "bayes": ImputeLogRegAugment,
    }
    if method not in imputefunc:
        raise ValueError("Invalid method: {}".format(method))
    impute = imputefunc[method]

    # Set random seed
    if seed is not None:
        np.random.seed(seed)

    # Create flags for missing value
    missingflag = data.isna()

    # Make m copies of the data, each initialized using random sample
    imp = [ImputeRandomSample(data) for _ in range(m)]

    # Initialize chain statistics, indexed as (variable, chain, iteration)
    chainmean = np.empty((data.shape[1], m, maxit+1))
    chainstd = np.empty((data.shape[1], m, maxit+1))
    for i in range(m):
        chainmean[:, i, 0], chainstd[:, i, 0] = getImputedStats(imp[i], missingflag)

    # Iterate over maxit
    for j in tqdm(range(maxit)):
        for i in range(m):
            # Impute using the selected method
            imp[i] = impute(imp[i], missingflag, d=d, k=k)

            # Calculate updated chain statistics
            chainmean[:, i, j+1], chainstd[:, i, j+1] = getImputedStats(imp[i], missingflag)

    # Return multiply imputed data and chain statistics
    res = {
        "imp": imp,
        "missingflag": missingflag,
        "chainmean": chainmean,
        "chainstd": chainstd
    }
    return res

In [None]:
# Main function to perform MICE on multiple data types

def MICE(data, targets_cat, targets_num, m=10, maxit=5, d=5, k=1e-5, seed=123, 
         method_cat="boot", method_num="pmm"):
    """
    Implement multivariate imputation by chained equations (MICE). This function
    requires continuous and categorical target variables to be explicitly specified.
    Appropriate imputation method will then be applied to each target variable type.
    Within every iteration, the categorical targets are imputed first, followed
    by the numeric targets.

    Parameters
    ----------
    data : Pandas DataFrame
        Data frame to be imputed
    targets_cat : list
        List of categorical target variables to be imputed. Pass an empty list if
        no categorical variable is to be imputed
    targets_num : list
        List of numeric target variables to be imputed. Pass an empty list if
        no numeric variable is to be imputed
    m : int, optional
        Number of multiply imputed data to be generated (default = 10)
    maxit : int, optional
        Maximum number of iterations for the MICE algorithm (default = 5)
    d : int, optional
        Number of donors in the donor set (default = 5)
    k : float, optional
        Ridge parameter for numerical stability (default = 1e-5)
    seed : int, optional
        Random seed
    method_cat : str, optional
        Imputation method for categorical target variables. Supports "boot" (default,
        `ImputeLogRegBoot`), "bayes" (`ImputeLogRegAugment`), and "pmm" (`ImputePMM`)
    method_num : str, optional
        Imputation method for numeric target variables. Only supports "pmm" (default)

    Returns
    -------
    dict
        Python dictionary with the imputed data (`imp`), missing data flag (`missingflag`),
        and the chain statistics (`chainmean` and `chainstd`)
    """
    # Check validity of imputation methods
    assert method_cat in ["pmm", "boot", "bayes"], "Invalid method : {}".format(method_cat)
    assert method_num in ["pmm"], "Invalid method: {}".format(method_num)

    # Set random seed
    if seed is not None:
        np.random.seed(seed)

    # Create flags for missing value
    missingflag = data.isna()

    # Make m copies of the data, each initialized using random sample
    imp = [ImputeRandomSample(data) for _ in range(m)]

    # Initialize chain statistics, indexed as (variable, chain, iteration)
    chainmean = np.empty((data.shape[1], m, maxit+1))
    chainstd = np.empty((data.shape[1], m, maxit+1))
    for i in range(m):
        chainmean[:, i, 0], chainstd[:, i, 0] = getImputedStats(imp[i], missingflag)

    # Define dictionary to map appropriate functions ("bayes" is the logistic
    # regression with data augmentation)
    imputefunc = {
        "pmm" : ImputePMM,
        "boot": ImputeLogRegBoot,
        "bayes": ImputeLogRegAugment,
    }

    # Iterate over maxit
    for j in tqdm(range(maxit)):
        for i in range(m):
            # Impute using appropriate method for each data type
            if len(targets_cat) > 0:
                imp[i] = imputefunc[method_cat](imp[i], missingflag, d=d, k=k, 
                                                targets=targets_cat)
            if len(targets_num) > 0:
                imp[i] = imputefunc[method_num](imp[i], missingflag, d=d, k=k, 
                                                targets=targets_num)

            # Calculate updated chain statistics
            chainmean[:, i, j+1], chainstd[:, i, j+1] = getImputedStats(imp[i], missingflag)

    # Return multiply imputed data and chain statistics
    res = {
        "imp": imp,
        "missingflag": missingflag,
        "chainmean": chainmean,
        "chainstd": chainstd
    }
    return res

## Step 2B: Apply multiple imputation on the dataset

## Step 3A: Implement two versions of tree-based models (with XAI) for multiply imputed data

### Method 1: Ensemble method

In [None]:
# Helper function to construct dataset for ensemble model training
def PrepareEnsembleData(res, outcome):
    """
    Split every imputed data set produced by MICE into covariates and outcome,
    ready for ensemble model training

    Parameters
    ----------
    res : dict
        Dictionary returned by `MICE()` which should contain the imputed data under `imp`
    outcome : str
        Outcome variable for the ensemble model

    Returns
    -------
    list
        List of multiply imputed covariate matrices
    list
        List of multiply imputed outcome vectors
    """
    imputed_frames = res["imp"]

    # Outcome column of each imputed data set
    y = [frame[outcome] for frame in imputed_frames]

    # Remaining columns of each imputed data set
    X = [frame.drop(outcome, axis=1) for frame in imputed_frames]

    return X, y

# Main class to implement ensemble model (can be RF or any sklearn classifiers basically)
class EnsembleClassifier(ClassifierMixin):
    """
    Ensemble of identically configured classifiers, one per imputed data set.

    `fit` receives lists of covariate matrices and outcome vectors (e.g. from
    `PrepareEnsembleData`) and trains an independent copy of the component model
    on each pair. Predictions are combined by majority vote (`predict`) or by
    averaging the positive-class probability (`predict_proba`).

    The probability-based methods assume a binary outcome whose positive class
    is the second column returned by the component's `predict_proba`.

    Parameters
    ----------
    component : classifier
        Unfitted model exposing `fit`, `predict` and `predict_proba`; it is
        deep-copied for every imputed data set
    **kwargs
        Forwarded to the parent class initializer
    """
    def __init__(self, component, **kwargs):
        super().__init__(**kwargs)
        self.comp_model = component

    def fit(self, X, y):
        """Fit one copy of the component model per (X[i], y[i]) pair."""
        self.m = len(X) # number of ensemble models to be constructed
        # Each component must be its own object: `[model] * m` would repeat a
        # single instance and every fit would overwrite the previous one
        self.components = [copy.deepcopy(self.comp_model) for _ in range(self.m)]
        for i in range(self.m):
            self.components[i].fit(X[i], y[i])
        return self

    def predict(self, X):
        """Return the most common predicted label across the components."""
        predicted = np.zeros((X.shape[0], self.m))
        for i in range(self.m):
            predicted[:, i] = self.components[i].predict(X)
        # ravel() gives a 1-D result whether or not scipy keeps the reduced axis
        return np.asarray(stats.mode(predicted, axis=1).mode).ravel()

    def predict_proba(self, X):
        """Return [P(negative), P(positive)] averaged over the components."""
        predicted = np.zeros((X.shape[0], self.m))
        for i in range(self.m):
            predicted[:, i] = self.components[i].predict_proba(X)[:, 1]
        pos_p = predicted.mean(axis=1).reshape(-1, 1)
        neg_p = 1 - pos_p
        return np.hstack((neg_p, pos_p))

    def predict_log_proba(self, X):
        """Return the element-wise logarithm of `predict_proba`."""
        probs = self.predict_proba(X)
        return np.log(probs)

    def predict_log_odds(self, X):
        """Return log(P(positive) / P(negative)) as a 1-D array."""
        probs = self.predict_proba(X)
        return np.log(probs[:, 1]) - np.log(probs[:, 0])

In [None]:
# Implement SHAP explainer wrapper for classifiers
def CreateSHAPExplainer(model, res, outcome):
    """
    Build a SHAP explainer for the log-odds of a fitted classifier and evaluate
    it on the fully observed rows.

    Parameters
    ----------
    model : classifier
        Fitted model exposing `predict_log_odds` (e.g. `EnsembleClassifier`)
    res : dict
        Dictionary returned by `MICE()` (uses `imp` and `missingflag`)
    outcome : str
        Outcome variable, dropped from the covariates

    Returns
    -------
    shap.Explainer
        Explainer built on (a sample of at most 100 of) the fully observed rows
    shap.Explanation
        SHAP values of all fully observed rows
    """
    # Get observed data only for the background distribution: keep the rows
    # without any missing value. These rows are identical in every imputed data
    # set as long as imputation only overwrites flagged entries, so the first
    # one is used. A NumPy mask is passed because `.iloc` rejects boolean Series.
    complete_rows = ~res["missingflag"].any(axis=1).to_numpy()
    Xobs = res["imp"][0].drop(outcome, axis=1).iloc[complete_rows]
    background_data = shap.maskers.Independent(Xobs, max_samples=100)

    # Construct SHAP explainer
    explainer = shap.Explainer(model.predict_log_odds, background_data)

    # Calculate SHAP values on the entire observed data
    shap_values = explainer(Xobs)

    return explainer, shap_values

# Create wrapper for SHAP or partial dependence plot
def GenerateDependencePlot(model, res, feature, shap_values, shap=False, ax=None,
                           outcome=None):
    """
    Draw either a SHAP dependence (scatter) plot or a partial dependence plot
    of the model log-odds for one feature.

    Parameters
    ----------
    model : classifier
        Fitted model exposing `predict_log_odds`
    res : dict
        Dictionary returned by `MICE()` (uses `imp` and `missingflag`)
    feature : str
        Feature to be plotted
    shap_values : shap.Explanation
        SHAP values, e.g. from `CreateSHAPExplainer` (used when `shap=True`)
    shap : bool, optional
        If True, draw the SHAP dependence plot; otherwise draw the partial
        dependence plot (default)
    ax : matplotlib axes, optional
        Axes to draw on. A new 5x5 figure is created when omitted
    outcome : str, optional
        Outcome variable, dropped from the covariates for the partial
        dependence plot. When omitted, a notebook-level `outcome` variable is
        used if defined (legacy behaviour)

    Returns
    -------
    matplotlib figure or axes
        The newly created figure when `ax` was omitted, otherwise `ax`

    Raises
    ------
    ValueError
        If the partial dependence plot is requested and no outcome is available
    """
    # The boolean parameter `shap` hides the module of the same name inside this
    # function, so the library is imported under a different local name
    import shap as shap_lib

    if not shap:
        if outcome is None:
            # Legacy fallback: the outcome used to be read from a global variable
            outcome = globals().get("outcome")
        if outcome is None:
            raise ValueError("`outcome` is required for the partial dependence plot")

    # Create figure object if needed
    if ax is None:
        fig, ax = plt.subplots(figsize=(5, 5), ncols=1, nrows=1)
    else:
        fig = None

    if shap:
        # Construct SHAP dependence plot
        shap_lib.plots.scatter(
            shap_values[:, feature], show=False, color=shap_values, ax=ax
        )
    else:
        # Get observed data only (rows without any missing value); a NumPy mask
        # is passed because `.iloc` rejects boolean Series
        complete_rows = ~res["missingflag"].any(axis=1).to_numpy()
        Xobs = res["imp"][0].drop(outcome, axis=1).iloc[complete_rows]

        # Construct partial dependence plot
        shap_lib.partial_dependence_plot(
            feature, model.predict_log_odds, Xobs, model_expected_value=True, 
            feature_expected_value=True, show=False, ice=False, ax=ax
        )

    if fig is None:
        return ax
    else:
        return fig

# Create wrapper for beeswarm plot
def GenerateBeeswarmPlot(model, res, feature, shap_values, shap=False, ax=None):
    """
    Draw a SHAP beeswarm plot.

    Parameters
    ----------
    model, res, feature, shap : optional
        Unused; kept so that the plotting wrappers share the same signature
    shap_values : shap.Explanation
        SHAP values, e.g. from `CreateSHAPExplainer`
    ax : matplotlib axes, optional
        Axes made current before plotting. When omitted, the current axes are
        used (created if necessary)

    Returns
    -------
    matplotlib figure
        Figure holding the plot
    matplotlib axes
        Axes holding the plot
    """
    # The boolean parameter `shap` hides the module of the same name inside this
    # function, so the library is imported under a different local name
    import shap as shap_lib

    # Fix the target axes before plotting and return them afterwards instead of
    # relying on the return value of `beeswarm`.
    # NOTE(review): assumes beeswarm draws on the current axes - confirm for the
    # installed shap version.
    if ax is None:
        ax = plt.gca()
    else:
        plt.sca(ax)
    shap_lib.plots.beeswarm(shap_values, show=False)
    fig = ax.figure

    return fig, ax

In [None]:
# Wrapper to implement k-fold cross validation on ensemble data

def KFoldEnsemble(n_splits, X, y, random_state=None):
    """
    Split multiply imputed data into k folds for cross validation.

    The folds are generated once (on the first imputed data set) and the same
    row positions are applied to every imputed data set, so a given observation
    is always on the same side of the split.

    Model building and performance assessment are not implemented yet (see the
    TODO notes below); the function currently returns the split data only.

    Parameters
    ----------
    n_splits : int
        Number of folds
    X : list
        List of multiply imputed covariate matrices (e.g. from `PrepareEnsembleData`)
    y : list
        List of multiply imputed outcome vectors
    random_state : int, optional
        Random state of the shuffled k-fold split

    Returns
    -------
    list
        One tuple `(X_train, X_test, y_train, y_test)` per fold, where each
        element is a list with one entry per imputed data set
    """
    def _rows(obj, idx):
        # Positional row selection for pandas objects and NumPy arrays alike
        return obj.iloc[idx] if hasattr(obj, "iloc") else obj[idx]

    # Generate k-fold object
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    # Get number of multiply imputed data
    m = len(X)

    # Iterate over the folds
    folds = []
    for train_index, test_index in kf.split(X[0]):
        # Construct train and test data
        X_train, X_test, y_train, y_test = [], [], [], []
        for j in range(m):
            X_train.append(_rows(X[j], train_index))
            X_test.append(_rows(X[j], test_index))
            y_train.append(_rows(y[j], train_index))
            y_test.append(_rows(y[j], test_index))

        # TODO: Build model on training data

        # TODO: Assess performance metric on test data

        folds.append((X_train, X_test, y_train, y_test))

    return folds

### Method 2: Weighting method

In [None]:
# Helper function to construct weighted data for weighting method
def PrepareWeightedData(res, outcome):
    """
    Prepare a single stacked data set for the weighting method using the output
    of MICE.

    Rows without any missing value appear once with weight 1. Rows with at least
    one missing value appear once per imputed data set, each with weight 1/m, so
    that every original observation carries a total weight of 1.

    Parameters
    ----------
    res : dict
        Dictionary returned by `MICE()` which should contain the imputed data under
        `imp` and the missing data flags under `missingflag`
    outcome : str
        Outcome variable for the model

    Returns
    -------
    pandas.DataFrame
        Stacked covariate matrix: fully observed rows first, then the imputed
        rows of each imputed data set in turn (index is reset)
    pandas.Series
        Stacked outcome vector in the same row order
    numpy.ndarray
        Weights for all rows, as a column vector of shape (n_rows, 1); use
        `w.ravel()` where a one-dimensional array is required

    Raises
    ------
    ValueError
        If `res["imp"]` is empty
    """
    # Get number of multiply imputed data
    m = len(res["imp"])
    if m == 0:
        raise ValueError("No imputed data found in `res`")

    # Flag rows with at least one missing value. A NumPy mask is used because
    # `.iloc` rejects boolean Series.
    any_missingflag = res["missingflag"].any(axis=1).to_numpy()
    Nobs = int((~any_missingflag).sum())
    Nmis = int(any_missingflag.sum())

    # Placeholder for the dataset and assigned weights
    Xs, ys, ws = [], [], []

    # Iterate over multiply imputed data
    for i, df in enumerate(res["imp"]):
        # Extract covariate and outcome data frames
        tempX = df.drop(outcome, axis=1)
        tempy = df[outcome]

        if i == 0:
            # Fully observed rows are taken once, from the first data set
            Xs.append(tempX.iloc[~any_missingflag])
            ys.append(tempy.iloc[~any_missingflag])
            ws.append(np.ones((Nobs, 1)))

        # Append imputed data only with appropriate weight
        Xs.append(tempX.iloc[any_missingflag])
        ys.append(tempy.iloc[any_missingflag])
        ws.append(np.ones((Nmis, 1)) / m)

    # Merge data together
    X = pd.concat(Xs, ignore_index=True)
    y = pd.concat(ys, ignore_index=True)
    w = np.concatenate(ws)

    return X, y, w

In [None]:
# Wrapper to implement k-fold cross validation on weighted data

## Step 3B: Apply tree-based models on multiply imputed data