## Disclaimer
The experiment was initially conducted with both numerical and categorical variables. After discussions with supervisors, we found it reasonable to restrict it to categorical variables only, due to the problem of RMSE under MAR. This required changes to the code, and as a result many variables may have redundant names.

# Missing value experiment single

Notebook that overhauls the previous version of the missing-value generation.

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

import numpy as np
import pandas as pd

import torch
from lifelines import CoxPHFitter
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sklearn.model_selection import train_test_split
from sksurv.metrics import (
    concordance_index_censored
)
from sksurv.util import Surv

try:
    from utils.utils import *
    from utils.encoders import MultiLabelEncoder
except:
    import sys
    sys.path.append('../')
    from utils.utils import *
    from utils.encoders import MultiLabelEncoder    
    

def mode_imputation(X_train, X_test):
    """
    Impute missing values using mode imputation.
    Note: Only for categorical features.

    The mode of each column is computed from the observed (non-NaN) entries
    of the training data only, and is then used to fill the missing entries
    of both the train and the test matrix. The inputs are not modified.

    Parameters
    ----------
    X_train : torch.DoubleTensor, shape (n_train, d)
        Data matrix with missing values (encoded as NaN).

    X_test : torch.DoubleTensor, shape (n_test, d)
        Data matrix with missing values (encoded as NaN).

    Returns
    ----------
    dict
        'train' : torch.DoubleTensor, shape (n_train, d)
            Train matrix with missing values replaced by the train mode.
        'test' : torch.DoubleTensor, shape (n_test, d)
            Test matrix with missing values replaced by the train mode.
        A column without any observed training value is left untouched
        (its NaNs remain), since no mode can be estimated for it.
    """

    X_imp_train = X_train.clone()
    X_imp_test = X_test.clone()
    mask_train = torch.isnan(X_imp_train)
    mask_test = torch.isnan(X_imp_test)

    for i in range(X_imp_train.shape[1]):
        # Only observed entries may vote for the mode; feeding NaNs into
        # torch.mode makes the result depend on how NaNs are ordered.
        observed = X_imp_train[~mask_train[:, i], i]
        if observed.numel() == 0:
            continue

        mode_value = torch.mode(observed)[0]
        X_imp_train[mask_train[:, i], i] = mode_value
        X_imp_test[mask_test[:, i], i] = mode_value

    return {"train": X_imp_train, "test" : X_imp_test}

def knn_imputation(X_train, X_test, n_neighbors = 2):
    """
    Impute missing values with a K-nearest-neighbours imputer on scaled data.

    Both the StandardScaler and the KNNImputer are fitted on the training
    matrix only; the test matrix is transformed with those fitted objects.
    After imputation the data is mapped back to the original scale and
    rounded to the nearest integer.

    Parameters
    ----------
    X_train : torch.DoubleTensor, shape (n_train, d)
        Data matrix with missing values.

    X_test : torch.DoubleTensor, shape (n_test, d)
        Data matrix with missing values.

    n_neighbors : int
        Number of neighbors to use for imputation.

    Returns
    ----------
    dict
        'train' : torch.DoubleTensor
            Imputed (and rounded) train matrix.
        'test' : torch.DoubleTensor
            Imputed (and rounded) test matrix, using the fit from train.
    """

    train_copy = X_train.clone()
    test_copy = X_test.clone()

    knn = KNNImputer(n_neighbors = n_neighbors)
    standardizer = StandardScaler()

    # Train: fit scaler -> fit imputer -> back to the original scale
    train_scaled = standardizer.fit_transform(train_copy)
    train_filled = standardizer.inverse_transform(knn.fit_transform(train_scaled))

    # Test: reuse the objects fitted on train
    test_scaled = standardizer.transform(test_copy)
    test_filled = standardizer.inverse_transform(knn.transform(test_scaled))

    # Rounding brings the imputed values back onto integer category codes
    return {
        "train": torch.from_numpy(train_filled).round().double(),
        "test" : torch.from_numpy(test_filled).round().double(),
    }
        
def accuracy(X_imp, X_true, mask, column_wise = False):
    """
    Accuracy between imputed variables and ground truth.
    Pytorch/Numpy agnostic

    Only the entries flagged in `mask` are compared.

    Parameters
    ----------
    X_imp : torch.DoubleTensor or np.ndarray, shape (n, d)
        Data with imputed variables.

    X_true : torch.DoubleTensor or np.ndarray, shape (n, d)
        Ground truth.

    mask : torch.BoolTensor or np.ndarray of booleans, shape (n, d)
        Missing value mask (missing if True)

    column_wise : bool
        If True, return one accuracy per column instead of a pooled value.

    Returns
    -------
    acc : torch.DoubleTensor or np.ndarray of floats, shape (d,) if column_wise = True, else float
        accuracy between imputed variables and ground truth.
        If `mask` has no True entry at all, a scalar 1 is returned
        (regardless of `column_wise`). In column-wise mode, a column
        without any masked entry gets accuracy 0.
    """

    is_torch = torch.is_tensor(X_imp)

    if not mask.any():
        # Nothing was imputed; return a scalar of the caller's backend
        if is_torch:
            return torch.tensor(1, dtype=torch.float64)
        return np.float64(1.0)

    if is_torch:
        if not column_wise:
            return (X_imp[mask] == X_true[mask]).double().mean()

        acc = []
        for col in range(X_imp.shape[1]):
            col_mask = mask[:, col]
            if not col_mask.any():
                acc.append(torch.tensor(0, dtype=torch.float64))
            else:
                hits = (X_imp[:, col] == X_true[:, col]).double()
                acc.append(hits[col_mask].mean())
        return torch.stack(acc)

    # NumPy array
    if not column_wise:
        return (X_imp[mask] == X_true[mask]).mean()

    acc = []
    for col in range(X_imp.shape[1]):
        col_mask = mask[:, col]
        if not col_mask.any():
            acc.append(0)
        else:
            # ndarray has no .double(); cast with astype instead
            hits = (X_imp[:, col] == X_true[:, col]).astype(np.float64)
            acc.append(hits[col_mask].mean())
    return np.array(acc)


In [6]:
class SimulationMV():
    def __init__(self, X, duration_col, event_col, cat_colnames):
        """
        Set up the simulation from a complete data set.

        The event and duration columns are taken out of the matrix that
        missing values are simulated on. Every categorical column is assigned
        one of three encodings used for modelling, and a ground-truth Cox PH
        model is fitted on the complete data.

        Parameters
        ----------
        X : pandas.DataFrame
            Complete data matrix
        duration_col : str
            Name of the column containing the durations.
        event_col : str
            Name of the column containing the events.
        cat_colnames : list of str
            List of categorical column names.
        """

        self.duration_col = duration_col
        self.event_col = event_col
        self.categorical_colnames = cat_colnames

        # Categorical columns first, then duration and event
        X = X[cat_colnames + [duration_col, event_col]]

        # Untouched copy, used for the baseline (complete data) evaluation
        self.X_original = X.copy()

        # Duration and event are neither imputed nor evaluated on
        X_cat_init = self._remove_event_and_duration(X).to_numpy()
        n_columns = X_cat_init.shape[1]

        # Decide the encoding per column from the first row's type:
        # non-strings are kept as ordinal, binary strings are one-hot
        # encoded, all remaining string columns are target encoded.
        self.ordinal_idx = []
        self.one_hot_idx = []
        self.target_idx = []
        for col in range(n_columns):
            if not isinstance(X_cat_init[0, col], str):
                bucket = self.ordinal_idx
            elif len(np.unique(X_cat_init[:, col])) == 2:
                bucket = self.one_hot_idx
            else:
                bucket = self.target_idx
            bucket.append(col)

        self.not_ordinal_idx = [col for col in range(n_columns) if col not in self.ordinal_idx]
        self.one_hot_cols = [self.categorical_colnames[col] for col in self.one_hot_idx]
        self.target_enc_cols = [self.categorical_colnames[col] for col in self.target_idx]

        # Torch is unable to handle strings, so categories are (temporarily)
        # mapped to integers before missing values are simulated.
        self.X = self._map_categorical(X_cat_init)

        # Ground-truth Cox PH on the complete data
        ground_truth = self._fit_single_cox_PH(self.X_original, None)
        self.GT_weights, self.GT_concordance = ground_truth.values()

    def _remove_event_and_duration(self, X):
        """
        Remove event and duration columns from the data.
        Re-set the indices for the categorical and numerical columns.
        This is to avoid simulating missing values with these columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Data matrix.
        
        Returns
        -------
        pandas.DataFrame
            Data matrix with event and duration columns removed.
        """

        X = X.drop(columns = [self.duration_col, self.event_col])

        cat_idx = [X.columns.get_loc(col) for col in self.categorical_colnames]
        self.categorical_cols = cat_idx
        return X
    
    def _add_event_and_duration(self, X, duration_col, event_col):
        """
        Add event and duration columns to the data.

        Parameters
        ----------
        X : pandas.DataFrame or np.ndarray
            Data matrix.

        duration_col : pandas.Series
            Duration column.

        event_col : pandas.Series
            Event column.
        
        Returns
        -------
        pandas.DataFrame
            Data matrix with event and duration columns added.
        """

        if isinstance(X, pd.DataFrame):
            # Add columns
            X[self.duration_col] = duration_col
            X[self.event_col] = event_col
        else:
            X = np.concatenate(X, [duration_col, event_col], axis = 1)
        return X

    def _fit_single_cox_PH(self, X_train, X_test, penalizer = 0):
        """ 
        Fit cox PH model to the training data. Evaluates concordance on both train and test data.

        Binary string columns are one-hot encoded, the remaining string
        columns are target encoded against the duration, and all features
        are standardized before fitting.

        Parameters
        ----------
        X_train : pandas.DataFrame
            Training data. Categories are mapped back to strings.
            Must contain the event and duration columns.

        X_test : pandas.DataFrame or None
            Test data. Categories are mapped back to strings.
            If None, `X_train` is treated as the complete data: the model is
            fitted and evaluated on it, and a lifelines model is stored in
            `self.complete_model`.

        penalizer : float
            Regularization strength. Only used when `X_test` is None; with a
            train/test pair an increasing list of alphas is tried instead.
        
        Returns
        -------
        dict
            If `X_test` is None: 'weights' and 'concordance'.
            Otherwise: 'weights', 'concordance_train' and 'concordance_test'.

        Raises
        ------
        RuntimeError
            If, for a train/test pair, the model cannot be fitted with any
            of the candidate regularization strengths.
        """

        me = TargetEncoder(smooth = "auto", target_type = "continuous", random_state=3155135)
        sc = StandardScaler()
        cph = CoxPHSurvivalAnalysis(alpha=penalizer)
        util = Surv()

        if X_test is None:
            # Prepare X and y. This is the complete data.
            # Define proper encodings
            y = X_train[[self.event_col, self.duration_col]]
            X = X_train.drop([self.event_col, self.duration_col], axis=1)
            X = pd.get_dummies(X, columns = self.one_hot_cols, drop_first=True)
            X[self.target_enc_cols] = me.fit_transform(X[self.target_enc_cols], y[self.duration_col])
            # Structured array (event, duration) as expected by scikit-survival
            y = y.to_records(index=False)
            X_sc = sc.fit_transform(X)
            X_sc = pd.DataFrame(X_sc, columns = sc.feature_names_in_)
            cph.fit(X_sc, y)
            weights = cph.coef_
            weights = dict(zip(X_sc.columns, weights))
            preds = cph.predict(X_sc)
            conc = concordance_index_censored(event_indicator = y[self.event_col], event_time = y[self.duration_col], estimate = preds)[0]
            conc = round(conc, 3)

            # Lifelines summary of complete model
            X = pd.get_dummies(X_train, columns = self.one_hot_cols, drop_first=True)
            X[self.target_enc_cols] = me.fit_transform(X[self.target_enc_cols], y[self.duration_col])
            # Temporarily remove y to not scale it
            y = X[[self.event_col, self.duration_col]]
            X = X.drop([self.event_col, self.duration_col], axis=1)
            X_sc = sc.fit_transform(X)
            X_sc = pd.DataFrame(X_sc, columns = sc.feature_names_in_)
            X_sc[[self.event_col, self.duration_col]] = y

            self.complete_model = CoxPHFitter(baseline_estimation_method='breslow', penalizer = penalizer)
            self.complete_model.fit(X_sc, 
                               duration_col = self.duration_col, 
                               event_col= self.event_col,
                               show_progress=False)

            return {'weights': weights, 
                    'concordance': conc}

        # One hot encode binary
        X_train = pd.get_dummies(X_train, columns = self.one_hot_cols, drop_first = True)
        X_test = pd.get_dummies(X_test, columns = self.one_hot_cols, drop_first = True)

        # Ensure all category levels are present
        # (dummy columns seen only in train are added to test as zeros;
        # test is then restricted to, and ordered like, the train columns)
        for column in X_train.columns:
            if column not in X_test.columns:
                X_test[column] = 0
        X_test  = X_test[X_train.columns]

        # Technicality of converting to structured array
        y_train = util.from_arrays(event = X_train[self.event_col], time = X_train[self.duration_col], name_event = self.event_col, name_time = self.duration_col)
        y_test = util.from_arrays(event = X_test[self.event_col], time = X_test[self.duration_col], name_event = self.event_col, name_time = self.duration_col)
        X_train = X_train.drop([self.event_col, self.duration_col], axis=1)
        X_test = X_test.drop([self.event_col, self.duration_col], axis=1)

        # Target encode train and transform test
        X_train[self.target_enc_cols] = me.fit_transform(X_train[self.target_enc_cols], y_train[self.duration_col])
        X_test[self.target_enc_cols] = me.transform(X_test[self.target_enc_cols])

        # Scale X (not event and duration)
        X_train_sc = sc.fit_transform(X_train)
        X_train_sc = pd.DataFrame(X_train_sc, columns = sc.feature_names_in_)
        X_test_sc = sc.transform(X_test)
        X_test_sc = pd.DataFrame(X_test_sc, columns = sc.feature_names_in_)
        
        # For regularizing singular matrices: try increasingly strong
        # penalties and return the first fit that succeeds.
        alpha_values = [0, 10, 100, 1000, 10000]
        last_error = None
        for alpha in alpha_values:
            try:
                cph = CoxPHSurvivalAnalysis(alpha = alpha)
                cph.fit(X_train_sc, y_train)
                preds_train = cph.predict(X_train_sc)
                preds_test = cph.predict(X_test_sc)
                
                conc_train = concordance_index_censored(event_indicator = y_train[self.event_col], event_time = y_train[self.duration_col], estimate = preds_train)[0]
                conc_test = concordance_index_censored(event_indicator = y_test[self.event_col], event_time = y_test[self.duration_col], estimate = preds_test)[0]
                conc_train = round(conc_train, 3)
                conc_test = round(conc_test, 3)
                
                weights = cph.coef_
                weights = dict(zip(X_train.columns, weights))

                return {'weights': weights, 
                'concordance_train': conc_train,
                'concordance_test': conc_test}
            
            except Exception as e:
                print(f"Failed with alpha={alpha}: {e}")
                last_error = e

        # Every candidate failed. Fail loudly here instead of returning None,
        # which callers would only notice later when unpacking `.values()`.
        raise RuntimeError(
            f"Cox PH model could not be fitted with any alpha in {alpha_values}"
        ) from last_error

    def complete_summary(self):
        """ 
        Return a lifelines summary of cox ph of full data.
        This is used because SciKit Survival does not have statistical information such as p-values etc.
        """

        return self.complete_model.print_summary()
    
    def _map_categorical(self, X_cat):
        """
        Temporarily replace the non-ordinal categorical columns by encoded values.

        A fresh MultiLabelEncoder is fitted and stored in `self.multi_le` so
        that the mapping can be reverted later. Columns listed in
        `self.ordinal_idx` are copied over unchanged.

        Parameters
        ----------
        X_cat : np.ndarray
            Categorical data matrix.

        Returns
        -------
        X_cat_enc : np.ndarray
            Encoded categorical data matrix (object dtype, same shape as X_cat).
        """
        self.multi_le = MultiLabelEncoder()
        nominal = self.not_ordinal_idx
        ordinal = self.ordinal_idx

        # Object array so encoded and passed-through columns can coexist
        X_cat_enc = np.empty(shape=X_cat.shape, dtype = "object")
        X_cat_enc[:, nominal] = self.multi_le.fit_transform(X_cat[:, nominal])
        X_cat_enc[:, ordinal] = X_cat[:, ordinal]
        return X_cat_enc

    def _decode_categorical(self, X_cat_enc):
        """
        Decode categorical variables, excluding columns specified in self.ordinal_idx.
        self.ordinal_idx contains indices of columns that should not be decoded because they are ordered.

        Parameters
        ----------
        X_cat_enc : np.ndarray
            Encoded categorical data matrix of integers.
        
        Returns
        -------
        X_cat_dec : np.ndarray
            Decoded categorical data matrix of strings.
        """

        # Initialize an array of the same shape as X_cat_enc to hold decoded data
        X_cat_dec = np.empty(shape=X_cat_enc.shape, dtype = "object")
        decoded_columns = self.multi_le.inverse_transform(X_cat_enc[:, self.not_ordinal_idx])
        X_cat_dec[:, self.not_ordinal_idx] = decoded_columns
        X_cat_dec[:, self.ordinal_idx] = X_cat_enc[:, self.ordinal_idx]
        return X_cat_dec
     
    def _simulate_single_na_dataset(self, p_miss, mecha = "MCAR", opt = None, p_obs = None, q = None,
                                    sample_seed = 135135, column_seed = 115342):
        """
        Generate missing values for specifics missing-data mechanism and proportion of missing values. 

        The mask is simulated on the full encoded matrix `self.X`; afterwards
        the data, the data with missing values, the mask and the
        duration/event columns are split into train and test (70/30) with the
        same shuffling.

        Parameters
        ----------
        p_miss : float
            Proportion of missing values to generate for variables which will have missing values.

        mecha : str, 
                Indicates the missing-data mechanism to be used. "MCAR" by default, "MAR" or "MNAR".
                Any combination of `mecha` and `opt` that is not recognised
                falls back to MCAR.

        opt: str, 
             For mecha = "MNAR", it indicates how the missing-data mechanism is generated: using a logistic regression ("logistic"), 
             quantile censorship ("quantile") or logistic regression for generating a self-masked MNAR mechanism ("selfmasked").

        p_obs : float
                If mecha = "MAR", or mecha = "MNAR" with opt = "logistic" or "quantile", proportion of variables with *no* 
                missing values that will be used for the logistic masking model.

        q : float
            If mecha = "MNAR" and opt = "quantile", quantile level at which the cuts should occur.

        sample_seed : int
            Seed for the random number generator used to generate the missing samples.
            Also used in train test split.

        column_seed : int
            Seed for the random number generator used to generate the missing columns.

        Returns
        ----------
        A dictionary containing (each value is a dict with 'train' and 'test'):
        'X_init': the initial data matrix (double tensors).
        'X_na': the dataset with missing values set to NaN (double tensors).
        'mask': the missing-value mask (bool tensors, True = missing).
        'event_and_duration': arrays of shape (n, 2) holding the duration
            column followed by the event column.
        """

        # Seeding helpers come from utils; presumably they seed the sample /
        # column random generators used by the mask functions.
        set_sample_seed(sample_seed)
        set_column_seed(column_seed)

        # Work on a torch tensor; convert if self.X is a numpy array
        if torch.is_tensor(self.X):
            X = self.X
        else:
            X = torch.from_numpy(self.X.astype(np.float32))
        
        if mecha == "MAR":
            mask = MAR_mask(X, p_miss, p_obs).double()
        elif mecha == "MNAR" and opt == "logistic":
            mask = MNAR_mask_logistic(X, p_miss, p_obs).double()
        elif mecha == "MNAR" and opt == "quantile":
            mask = MNAR_mask_quantiles(X, p_miss, q, 1-p_obs).double()
        elif mecha == "MNAR" and opt == "selfmasked":
            mask = MNAR_self_mask_logistic(X, p_miss).double()
        else:
            mask = MCAR_mask(X, p_miss).double()
        
        X_nas = X.clone()
        X_nas[mask.bool()] = np.nan

        # Perform a train/test split on X_init, and use the same indices to split X_na and mask
        duration_and_event = self.X_original[[self.duration_col, self.event_col]].to_numpy()

        X_init_train, X_init_test, X_nas_train, X_nas_test, mask_train, mask_test, \
        event_and_duration_train, event_and_duration_test = train_test_split(X, X_nas, mask, duration_and_event, test_size=0.3, random_state = sample_seed)

        X_init = {'train': X_init_train.double(), 'test': X_init_test.double()}
        X_nas = {'train': X_nas_train.double(), 'test': X_nas_test.double()}
        mask = {'train': mask_train.bool(), 'test': mask_test.bool()}
        duration_and_event = {'train': event_and_duration_train, 'test': event_and_duration_test}
        
        return {'X_init': X_init, 'X_na': X_nas, 'mask': mask, 'event_and_duration': duration_and_event}
    
    def _simulate_M_na_datasets(self, M, p_miss, mecha = "MCAR", opt = None, p_obs = None, q = None,
                               vary_cols = True, save = False, sample_seed = 135135, column_seed = 115342):
        """
        Function to generate M datasets with missing values.

        Each dataset is produced by `_simulate_single_na_dataset`; the sample
        seed is increased by one after every dataset (and the column seed as
        well if `vary_cols` is True).

        Parameters
        ----------
        M : int
            Number of datasets to generate.
        
        vary_cols: bool
            If True, the column_seed will vary.
            This yields the most variation when simulating MAR because retained columns will vary. 

        save : bool
            Currently unused.

        See other params above (`p_miss`, `mecha`, `opt`, `p_obs`, `q`,
        `sample_seed` and `column_seed` are forwarded to
        `_simulate_single_na_dataset`).

        Returns
        ----------
        A dictionary containing:
        'X_init': dictionary with train and test of shape (n_train, d, M), (n_test, d, M) respectively. 
            Contains the initial data matrix, but split into train and test.
        'X_na': dictionary with train and test of shape (n_train, d, M), (n_test, d, M) respectively.
            Contains the dataset with simulated missing values, but split into train and test.
        'mask': dictionary with train and test of shape (n_train, d, M), (n_test, d, M)
            Contains the mask, but split into train and test.
        'event_and_duration': dictionary with train and test of shape (n_train, 2, M), (n_test, 2, M)
        """

        # Work on a torch tensor; convert if self.X is a numpy array
        if torch.is_tensor(self.X):
            X = self.X
        else:
            X = torch.from_numpy(self.X.astype(np.float32))

        # Set the dimensions used for initiating tensors
        temporary_train, temporary_test = train_test_split(X, test_size=0.3, random_state = sample_seed)
        n_train = temporary_train.shape[0]
        n_test = temporary_test.shape[0]
        d = X.shape[1]

        # Initialize empty tensor for the data with missing values and the mask
        event_and_duration_train = np.empty((n_train, 2, M), dtype = object)
        event_and_duration_test = np.empty((n_test, 2, M), dtype = object)
        
        X_init_tensor_train = torch.empty((n_train, d,  M))
        X_init_tensor_test = torch.empty((n_test, d, M))

        X_na_tensor_train = torch.empty((n_train, d, M))
        X_na_tensor_test = torch.empty((n_test, d, M))

        X_mask_tensor_train = torch.empty((n_train, d, M), dtype = torch.bool)
        X_mask_tensor_test = torch.empty((n_test, d, M), dtype = torch.bool)

        for i in range(M):
            # opt and q must be forwarded, otherwise every MNAR request
            # silently falls back to the MCAR branch of the single simulation.
            X_init, X_na, X_mask, event_and_duration = self._simulate_single_na_dataset(p_miss = p_miss,
                                                                    mecha = mecha,
                                                                    opt = opt,
                                                                    p_obs = p_obs,
                                                                    q = q,
                                                                    sample_seed = sample_seed,
                                                                    column_seed = column_seed).values()
            
            event_and_duration_train[:, :, i] = event_and_duration['train']
            event_and_duration_test[:, :, i] = event_and_duration['test']
            
            X_init_tensor_train[:, :, i] = X_init['train'].double()
            X_init_tensor_test[:, :, i] = X_init['test'].double()

            X_na_tensor_train[:, :, i] = X_na['train'].double()
            X_na_tensor_test[:, :, i] = X_na['test'].double()

            X_mask_tensor_train[:, :, i] = X_mask['train'].bool()
            X_mask_tensor_test[:, :, i] = X_mask['test'].bool()

            # Change seeds
            sample_seed += 1
            if vary_cols:
                column_seed += 1

        X_init = {'train': X_init_tensor_train, 'test': X_init_tensor_test}
        X_na = {'train': X_na_tensor_train, 'test': X_na_tensor_test}
        mask = {'train': X_mask_tensor_train, 'test': X_mask_tensor_test}
        event_and_duration = {'train': event_and_duration_train, 'test': event_and_duration_test}

        return {'X_init': X_init, 'X_na': X_na, 'mask': mask, 'event_and_duration': event_and_duration}
    
    def _impute_baselines(self, X_na_train, X_na_test, multivariate = False, n_neighbors = 2):
        """
        Impute missing values using the baseline approaches.

        Every one of the M simulated datasets (last tensor axis) is imputed
        independently: with mode imputation in the univariate case, with KNN
        imputation in the multivariate case.

        Parameters:
        -----------
        X_na_train : torch.DoubleTensor, shape (n_train, d, M)
            Train data tensor with missing values.

        X_na_test : torch.DoubleTensor, shape (n_test, d, M)
            Test data tensor with missing values.

        multivariate : bool
            If True, use KNN imputation.

        n_neighbors : int
            Number of neighbors to use for KNN imputation.

        Returns:
        --------
        A dictionary containing:
        If univariate:
            'train': {"categorical": {"mode": tensor}} with the mode imputations for train.
            'test': {"categorical": {"mode": tensor}} with the mode imputations for test.
        If multivariate:
            'train': {"knn": tensor} with the KNN imputations for train.
            'test': {"knn": tensor} with the KNN imputations for test.
        """

        if multivariate:
            filled_train = X_na_train.clone()
            filled_test = X_na_test.clone()

            for m in range(X_na_train.shape[2]):
                knn_train, knn_test = knn_imputation(X_train = X_na_train[:, :, m],
                                                     X_test = X_na_test[:, :, m],
                                                     n_neighbors = n_neighbors).values()
                filled_train[:, :, m] = knn_train
                filled_test[:, :, m] = knn_test

            return {"train": {"knn": filled_train}, "test": {"knn": filled_test}}

        # Univariate: all columns are categorical, so only the mode is used
        filled_train = X_na_train.clone()
        filled_test = X_na_test.clone()

        for m in range(X_na_train.shape[2]):
            mode_train, mode_test = mode_imputation(X_train = X_na_train[:, :, m],
                                                    X_test = X_na_test[:, :, m]).values()
            filled_train[:, :, m] = mode_train
            filled_test[:, :, m] = mode_test

        return {"train": {"categorical": {"mode": filled_train}},
                "test": {"categorical": {"mode": filled_test}}}

    def _evaluate_univariate_acc(self, X_imp, X_true, X_mask, column_wise = False):
        """
        Evaluate the accuracy of the univariate (mode) imputation.

        The accuracy is computed per simulated dataset (last axis) and then
        summarised as mean and standard deviation over the M datasets.

        Parameters
        ----------
        X_imp : dict
            Dictionary containing the imputed datasets; the mode imputations
            are read from X_imp["categorical"]["mode"].

        X_true: torch.DoubleTensor, shape (n_train, d, M) or (n_test, d, M)
            Groundtruth data tensor.

        X_mask: torch.BoolTensor, shape (n_train, d, M) or (n_test, d, M)
            Missing value mask tensor.

        column_wise : bool
            If True, mean and std are reported per column.

        Returns
        -------
        acc : dict
            {"categorical": {"mode": (mean, std)}}, both rounded to 4 decimals.
        """
        mode_imputed = X_imp["categorical"]["mode"]
        n_datasets = X_mask.shape[2]

        # One accuracy (scalar or per-column vector) per simulated dataset
        per_dataset = [
            accuracy(X_imp = mode_imputed[:, :, m],
                     X_true = X_true[:, :, m],
                     mask = X_mask[:, :, m],
                     column_wise = column_wise)
            for m in range(n_datasets)
        ]

        # Pool over the datasets
        scores = np.matrix(per_dataset)
        if column_wise:
            pooled = (scores.mean(axis = 0).round(4), scores.std(axis = 0).round(4))
        else:
            pooled = (scores.mean().round(4), scores.std().round(4))

        return {"categorical": {"mode": pooled}}
        
    def _evaluate_multivariate_acc(self, X_imp, X_true, X_mask):
        """
        Evaluate the accuracy of the KNN imputation.

        The pooled accuracy is computed per simulated dataset (last axis) and
        then summarised as mean and standard deviation over the M datasets.

        Parameters
        ----------
        X_imp : dict
            Dictionary containing the imputed datasets under the key "knn",
            shape (n_train, d, M) or (n_test, d, M).

        X_true: torch.DoubleTensor, shape (n_train, d, M) or (n_test, d, M)
            Groundtruth data tensor.

        X_mask: torch.BoolTensor, shape (n_train, d, M) or (n_test, d, M)
            Missing value mask tensor.

        Returns
        -------
        acc : dict
            {"categorical": {"knn": (mean, std)}}, both rounded to 4 decimals.
        """

        knn_imputed = X_imp["knn"]
        n_datasets = X_mask.shape[2]

        # One pooled accuracy per simulated dataset
        per_dataset = [
            accuracy(X_imp = knn_imputed[:, :, m],
                     X_true = X_true[:, :, m],
                     mask = X_mask[:, :, m])
            for m in range(n_datasets)
        ]

        scores = np.matrix(per_dataset)
        pooled = (scores.mean().round(4), scores.std().round(4))
        return {"categorical": {"knn": pooled}}
        
    def _evaluate_univariate_coxPH(self, decoded_imps_train, decoded_imps_test, event_and_duration_train, event_and_duration_test):
        """ 
        Evaluate the univariate cox PH model on the imputed datasets.

        Parameters
        ----------
        decoded_imps_train : dict
            Dictionary containing the imputed datasets. The imputations are numpy tensors of shape (n_train, d, M).
        
        decoded_imps_test : dict
            Dictionary containing the imputed datasets. The imputations are numpy tensors of shape (n_test, d, M).
        
        event_and_duration_train : torch.DoubleTensor, shape (n_train, 2, M)
            Tensor containing the event and duration columns for the train set.

        event_and_duration_test : torch.DoubleTensor, shape (n_test, 2, M)
            Tensor containing the event and duration columns for the test set.

        Returns
        -------
        dict
            Dictionary containing the concordance index (train and test) and bias for the imputed datasets.
        """


        X_imp_mode_train = decoded_imps_train["categorical"]["mode"]
        X_imp_mode_test = decoded_imps_test["categorical"]["mode"]
    
        M = X_imp_mode_train.shape[2]
        c_index_train = {"mode": []}
        c_index_test = {"mode": []}
        bias = {"mode": []}
        
        for i in range(M):
            df_mode_train = pd.DataFrame(X_imp_mode_train[:, :, i], columns = self.categorical_colnames)
            df_mode_test = pd.DataFrame(X_imp_mode_test[:, :, i], columns = self.categorical_colnames)

            df_mode_train = self._add_event_and_duration(df_mode_train, event_and_duration_train[:, 0, i], event_and_duration_train[:, 1, i])
            df_mode_test = self._add_event_and_duration(df_mode_test, event_and_duration_test[:, 0, i], event_and_duration_test[:, 1, i])

            weights_mode, mode_conc_train, mode_conc_test = self._fit_single_cox_PH(df_mode_train, df_mode_test).values()

            c_index_train['mode'].append(mode_conc_train)
            c_index_test['mode'].append(mode_conc_test)

            abs_bias_mode = self._evaluate_bias_weights(weights_mode)
            bias['mode'].append(abs_bias_mode)

        
        # Take the average and std
        for key,value in c_index_train.items():
            c_index_train[key] = (np.mean(value).round(4), np.std(value).round(4))
        for key,value in c_index_test.items():
            c_index_test[key] = (np.mean(value).round(4), np.std(value).round(4))

        for key, value in bias.items():
            column = pd.DataFrame(value)
            mean_and_std = dict(zip(column.columns, zip(round(column.mean(),4), round(column.std(), 4))))
            bias[key] = mean_and_std
        return {'c_index_train': c_index_train, 'c_index_test': c_index_test, 'bias': bias}
    
    def _evaluate_multivariate_coxPH(self, decoded_imps_train, decoded_imps_test, event_and_duration_train, event_and_duration_test):
        """
        Evaluate the multivariate cox PH model on the imputed datasets.

        Parameters
        ----------
        decoded_imps_train : dict
            Dictionary containing the imputed datasets. The imputations are numpy tensors of shape (n_train, d, M).

        decoded_imps_test : dict
            Dictionary containing the imputed datasets. The imputations are numpy tensors of shape (n_test, d, M).

        event_and_duration_train : torch.DoubleTensor, shape (n_train, 2, M)
            Tensor containing the event and duration columns for the train set.

        event_and_duration_test : torch.DoubleTensor, shape (n_test, 2, M)
            Tensor containing the event and duration columns for the test set.

        Returns
        -------
        dict
            Dictionary containing the concordance index (train and test) and bias for the imputed datasets.
        """

        X_imp_cat_train = decoded_imps_train["categorical"]["knn"]
        X_imp_cat_test = decoded_imps_test["categorical"]["knn"]

        M = X_imp_cat_train.shape[2]

        c_index_train = {"knn": []}
        c_index_test = {"knn": []}
        bias = {"knn": []}
        
        for i in range(M):
            df_cat_train = pd.DataFrame(X_imp_cat_train[:, :, i], columns = self.categorical_colnames)
            df_joined_train = self._add_event_and_duration(df_cat_train, event_and_duration_train[:, 0, i], event_and_duration_train[:, 1, i])

            df_cat_test = pd.DataFrame(X_imp_cat_test[:, :, i], columns = self.categorical_colnames)
            df_joined_test = self._add_event_and_duration(df_cat_test, event_and_duration_test[:, 0, i], event_and_duration_test[:, 1, i])

            weights, conc_train, conc_test = self._fit_single_cox_PH(df_joined_train, df_joined_test).values()

            c_index_train['knn'].append(conc_train)
            c_index_test['knn'].append(conc_test)
            bias_weights = self._evaluate_bias_weights(weights)
            bias['knn'].append(bias_weights)

        # Take the average and std
        for key,value in c_index_train.items():
            c_index_train[key] = (np.mean(value).round(4), np.std(value).round(4))
        for key,value in c_index_test.items():
            c_index_test[key] = (np.mean(value).round(4), np.std(value).round(4))

        for key, value in bias.items():
            column = pd.DataFrame(value)
            mean_and_std = dict(zip(column.columns, zip(round(column.mean(),4), round(column.std(), 4))))
            bias[key] = mean_and_std
        return {'c_index_train': c_index_train, 'c_index_test': c_index_test, 'bias': bias}

    def _evaluate_bias_weights(self, weights):
        """
        Takes in weights of the ith dataset in M and calculates the absolute bias for each feature.

        Parameters
        ----------
        weights : dict
            Dictionary containing the weights of the cox PH model.
            Key is the feature name and value is the weight.

        Returns
        -------
        abs_bias : dict
            Dictionary containing the absolute bias for each feature.
        """

        GT_weights = self.GT_weights
        abs_bias = {}
        for key, value in weights.items():
            abs_bias[key] = (value - GT_weights[key])/abs(GT_weights[key])
        return abs_bias
    
    def _CCA(self, X_na_train, X_na_test, event_and_duration_train, event_and_duration_test):
        """
        Function to run CCA.

        Parameters:
        -----------
        X_na_train : np.ndarray, shape (n_train, d, M)
            Train data tensor with missing values.

        X_na_test : np.ndarray, shape (n_test, d, M)
            Test data tensor with missing values.

        event_and_duration_train : torch.DoubleTensor, shape (n_train, 2, M)
            Tensor containing the event and duration columns for the train set.

        event_and_duration_test : torch.DoubleTensor, shape (n_test, 2, M)
            Tensor containing the event and duration columns for the test set.

        Returns:
        --------
        dict
            Dictionary containing the concordance index (train and test) and bias for the imputed datasets.
        """
        
        M = X_na_train.shape[2]
        X_na_cat_train = X_na_train
        X_na_cat_test = X_na_test

        conc_train = []
        conc_test = []
        bias = []

        for i in range(M):
            X_na_cat_train[:, :, i] = self._decode_categorical(X_na_cat_train[:, :, i])
            cat_train = pd.DataFrame(X_na_cat_train[:, :, i], columns = self.categorical_colnames)
            df_train = cat_train
            df_train = self._add_event_and_duration(df_train, event_and_duration_train[:, 0, i], event_and_duration_train[:, 1, i])
            df_train = df_train.dropna()

            X_na_cat_test[:, :, i] = self._decode_categorical(X_na_cat_test[:, :, i])
            cat_test = pd.DataFrame(X_na_cat_test[:, :, i], columns = self.categorical_colnames)
            df_test = cat_test
            df_test = self._add_event_and_duration(df_test, event_and_duration_test[:, 0, i], event_and_duration_test[:, 1, i])
            df_test = df_test.dropna()

            weights, c_index_train, c_index_test = self._fit_single_cox_PH(df_train, df_test).values()
            abs_bias = self._evaluate_bias_weights(weights)

            conc_train.append(c_index_train)
            conc_test.append(c_index_test)
            bias.append(abs_bias)

        # Take the average and std
        conc_avg_train = np.mean(conc_train)
        conc_std_train = np.std(conc_train)
        c_index_train = (conc_avg_train.round(4), conc_std_train.round(4))

        conc_avg_test = np.mean(conc_test)
        conc_std_test = np.std(conc_test)
        c_index_test = (conc_avg_test.round(4), conc_std_test.round(4))
        
        bias_column = pd.DataFrame(bias)
        mean_and_std = dict(zip(bias_column.columns, zip(round(bias_column.mean(),4), round(bias_column.std(), 4))))
        bias = mean_and_std

        return {'c_index_train': c_index_train, 'c_index_test': c_index_test, 'bias': bias}
    
    def _run_CCA(self, M, p_miss, mecha, p_obs = None):
        """
        Simulate M datasets with missing values and evaluate them with CCA.

        Parameters
        ----------
        M : int
            Number of datasets to generate.

        p_miss : float
            Proportion of missing values to generate for variables which will have missing values.

        mecha : str
            Missing-data mechanism, forwarded to ``_simulate_M_na_datasets``
            (e.g. "MCAR", "MAR" or "MNAR").

        p_obs : float
            Proportion of variables to retain if the mechanism is MAR.

        Returns
        -------
        dict
            ``'train'`` and ``'test'`` entries, each holding ``'bias'`` and
            ``'c_index'``. The same bias object is stored under both entries.
        """

        # Only the incomplete data and the survival columns are needed for CCA
        _, X_na, _, event_and_duration = self._simulate_M_na_datasets(
            M = M,
            p_miss = p_miss,
            p_obs = p_obs,
            mecha = mecha,
        ).values()

        # Object arrays so that decoded category labels can be stored in them
        na_train = X_na['train'].numpy().astype(object)
        surv_train = event_and_duration['train']

        na_test = X_na['test'].numpy().astype(object)
        surv_test = event_and_duration['test']

        c_index_train, c_index_test, bias = self._CCA(na_train, na_test, surv_train, surv_test).values()

        return {
            'train': {'bias': bias, 'c_index': c_index_train},
            'test': {'bias': bias, 'c_index': c_index_test},
        }
           
class SimulationMCAR(SimulationMV):
    """
    Subclass of SimulationMV to simulate MCAR missing values.
    """

    def __init__(self, X, duration_col, event_col, cat_colnames):
        """
        Create an MCAR simulation.

        All arguments are forwarded positionally and unchanged to
        ``SimulationMV.__init__``; see the parent class for their meaning.
        """
        super().__init__(X, duration_col, event_col, cat_colnames)
    
    def _simulate_MCAR_dataset(self, M, p_miss, column_wise = False,
                        sample_seed = 135135, column_seed = 115342):
        """
        Simulate M datasets with MCAR missing values. See parent class for more details.

        ``column_wise`` is accepted but not used by this method.

        Returns
        -------
        dict
            The four values produced by ``_simulate_M_na_datasets``, in order,
            under the keys 'X_init', 'X_na', 'mask' and 'event_and_duration'.
        """

        simulated = self._simulate_M_na_datasets(
            M = M,
            p_miss = p_miss,
            mecha = "MCAR",
            sample_seed = sample_seed,
            column_seed = column_seed,
        )
        initial, with_na, mask, surv = simulated.values()

        return {'X_init': initial, 'X_na': with_na, 'mask': mask, 'event_and_duration': surv}
    
    def _run_univariate_imputations(self, M, p_miss):
        """
        Simulate M MCAR datasets, mode-impute them and evaluate the result.

        Parameters
        ----------
        M : int
            Number of datasets to generate.

        p_miss : float
            Proportion of missing values to generate for variables which will have missing values.

        Returns
        -------
        dict
            ``'train'`` and ``'test'`` entries, each holding ``'acc'``,
            ``'c_index'`` and ``'bias'``. The same bias object is stored
            under both entries.
        """

        X_init, X_na, X_mask, event_and_duration = self._simulate_M_na_datasets(
            M = M,
            p_miss = p_miss,
            mecha = 'MCAR',
        ).values()

        parts = (X_init, X_na, X_mask, event_and_duration)
        init_train, na_train, mask_train, surv_train = [part['train'] for part in parts]
        init_test, na_test, mask_test, surv_test = [part['test'] for part in parts]

        # Impute the missing entries
        imputed_train, imputed_test = self._impute_baselines(na_train, na_test).values()

        # Imputation accuracy against the original values
        acc_train = self._evaluate_univariate_acc(X_imp = imputed_train,
                                                  X_true = init_train,
                                                  X_mask = mask_train)
        acc_test = self._evaluate_univariate_acc(X_imp = imputed_test,
                                                 X_true = init_test,
                                                 X_mask = mask_test)

        # Object arrays so the integer codes can be replaced by category labels
        decoded_train = imputed_train["categorical"]["mode"].numpy().astype("object")
        decoded_test = imputed_test["categorical"]["mode"].numpy().astype("object")

        for m in range(M):
            decoded_train[:, :, m] = self._decode_categorical(decoded_train[:, :, m])
            decoded_test[:, :, m] = self._decode_categorical(decoded_test[:, :, m])

        c_index_train, c_index_test, bias = self._evaluate_univariate_coxPH(
            {"categorical": {"mode": decoded_train}},
            {"categorical": {"mode": decoded_test}},
            surv_train,
            surv_test,
        ).values()

        return {
            'train': {'acc': acc_train, 'c_index': c_index_train, 'bias': bias},
            'test': {'acc': acc_test, 'c_index': c_index_test, 'bias': bias},
        }
    
    def _run_multivariate_imputations(self, M, p_miss, n_neighbors = 2):
        """
        Simulate M MCAR datasets, KNN-impute them and evaluate the result.

        Parameters
        ----------
        M : int
            Number of datasets to generate.

        p_miss : float
            Proportion of missing values to generate for variables which will have missing values.

        n_neighbors : int
            Number of neighbors to use for KNN imputation.

        Returns
        -------
        dict
            ``'train'`` and ``'test'`` entries, each holding ``'acc'``,
            ``'c_index'`` and ``'bias'``. The same bias object is stored
            under both entries.
        """

        X_init, X_na, X_mask, event_and_duration = self._simulate_M_na_datasets(
            M = M,
            p_miss = p_miss,
            mecha = 'MCAR',
        ).values()

        parts = (X_init, X_na, X_mask, event_and_duration)
        init_train, na_train, mask_train, surv_train = [part['train'] for part in parts]
        init_test, na_test, mask_test, surv_test = [part['test'] for part in parts]

        # Impute the missing entries
        imputed_train, imputed_test = self._impute_baselines(na_train, na_test, n_neighbors=n_neighbors, multivariate = True).values()

        # Imputation accuracy against the original values
        acc_train = self._evaluate_multivariate_acc(X_imp = imputed_train,
                                                    X_true = init_train,
                                                    X_mask = mask_train)
        acc_test = self._evaluate_multivariate_acc(X_imp = imputed_test,
                                                   X_true = init_test,
                                                   X_mask = mask_test)

        # Object arrays so the integer codes can be replaced by category labels
        decoded_train = imputed_train["knn"].numpy().astype(object)
        decoded_test = imputed_test["knn"].numpy().astype(object)

        for m in range(M):
            decoded_train[:, :, m] = self._decode_categorical(decoded_train[:, :, m])
            decoded_test[:, :, m] = self._decode_categorical(decoded_test[:, :, m])

        c_index_train, c_index_test, bias = self._evaluate_multivariate_coxPH(
            {"categorical": {"knn": decoded_train}},
            {"categorical": {"knn": decoded_test}},
            surv_train,
            surv_test,
        ).values()

        return {
            'train': {'acc': acc_train, 'c_index': c_index_train, 'bias': bias},
            'test': {'acc': acc_test, 'c_index': c_index_test, 'bias': bias},
        }
        
    def _get_univariate_acc(self, acc_in):
        """ 
        Get the univariate accuracy for the imputed datasets.

        Parameters
        ----------
        acc_in : dict
            Dictionary containing the accuracy for the imputed datasets.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the accuracy for the imputed datasets of each p_miss.
        """

        acc_cols = pd.MultiIndex.from_tuples([
                        ('Categorical', 'Mode')
                        ])
        acc_row = []

        for miss in self.p_miss:
            acc = acc_in[self.p_miss.index(miss)]

            mode_mean = acc['categorical']['mode'][0]
            mode_std = acc['categorical']['mode'][1]

            row = [f"{mode_mean:.4f} ± {mode_std:.4f}"]
            acc_row.append(row)
        
        acc_out = pd.DataFrame(acc_row, index = self.p_miss , columns = acc_cols)
        acc_out.index.name = 'p_miss'
        return acc_out
    
    def _get_multivariate_acc(self, acc_in):
        """
        Get the multivariate accuracy for the imputed datasets.

        Parameters
        ----------
        acc_in : dict
            Dictionary containing the accuracy for the imputed datasets.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the accuracy for the imputed datasets of each p_miss.
        """

        acc_cols = pd.MultiIndex.from_tuples([
                        ('Categorical', 'KNN')
                        ])
        acc_row = []

        for miss in self.p_miss:
            acc = acc_in[self.p_miss.index(miss)]
                    
            knn_mean_cat = acc['categorical']['knn'][0]
            knn_std_cat = acc['categorical']['knn'][1]

            row = [f"{knn_mean_cat:.4f} ± {knn_std_cat:.4f}"]
            acc_row.append(row)
        
        acc_out = pd.DataFrame(acc_row, index = self.p_miss , columns = acc_cols)
        acc_out.index.name = 'p_miss'
        return acc_out

    def _get_univariate_conc(self, conc_in):
        """
        Get the univariate concordance index for the imputed datasets.

        Parameters
        ----------
        conc_in : dict
            Dictionary containing the concordance index for the imputed datasets.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the concordance index for the imputed datasets of each p_miss.
        """

        conc_cols = ['mode']
        conc_row = []

        for miss in self.p_miss:
            conc = conc_in[self.p_miss.index(miss)]
 
            mode_mean = conc['mode'][0]
            mode_std = conc['mode'][1]

            row = [f"{mode_mean:.4f} ± {mode_std:.4f}"]
            conc_row.append(row)

        conc_out = pd.DataFrame(conc_row, index = self.p_miss , columns = conc_cols)
        conc_out.index.name = 'p_miss'
        return conc_out

    def _get_multivariate_conc(self, conc_in):
        """
        Get the multivariate concordance index for the imputed datasets.

        Parameters
        ----------
        conc_in : dict
            Dictionary containing the concordance index for the imputed datasets.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the concordance index for the imputed datasets of each p_miss.
        """

        column = ['KNN']
        conc_row = []

        for miss in self.p_miss:
            conc = conc_in[self.p_miss.index(miss)]
            knn_mean = conc['knn'][0]
            knn_std = conc['knn'][1]
            row = [f"{knn_mean} ± {knn_std}"]
            conc_row.append(row)

        conc_out = pd.DataFrame(conc_row, index = self.p_miss , columns = column)
        conc_out.index.name = 'p_miss'
        return conc_out

    def _get_univariate_bias(self):
        """
        Get the univariate bias for the imputed datasets.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the bias for the imputed datasets of each p_miss.
        """

        feature_cols = list(self.bias[0]['mode'].keys())
        bias_cols = pd.MultiIndex.from_tuples([
            ('mode', col) for col in feature_cols
        ])
        bias_row = []
    
        for miss in self.p_miss:
            mode_bias = list(self.bias[self.p_miss.index(miss)]['mode'].values())

            mode_row = []

            for column_index in range(len(mode_bias)):
                mode_bias_mean = mode_bias[column_index][0]
                mode_bias_std = mode_bias[column_index][1]

                mode_row.append(f"{mode_bias_mean:.4f} ± {mode_bias_std:.4f}")
                
            row = mode_row
            
            bias_row.append(row)

        bias = pd.DataFrame(bias_row, index = self.p_miss , columns=bias_cols)
        bias.index.name = 'p_miss'

        return bias

    def _get_multivariate_bias(self):
        """
        Get the multivariate bias for the imputed datasets.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the bias for the imputed datasets of each p_miss.
        """

        feature_cols = list(self.bias[0]['knn'].keys())
        bias_row = []

        for miss in self.p_miss:
            knn_bias = list(self.bias[self.p_miss.index(miss)]['knn'].values())

            knn_row = []
            for column_index in range(len(knn_bias)):
                knn_bias_mean = knn_bias[column_index][0]
                knn_bias_std = knn_bias[column_index][1]
                knn_row.append(f"{knn_bias_mean} ± {knn_bias_std}")

            row = knn_row
            bias_row.append(row)

        bias = pd.DataFrame(bias_row, index = self.p_miss , columns = feature_cols)
        bias.index.name = 'p_miss'
        return bias
    
    def _get_cca_conc(self, conc_in):
        """
        Get the concordance index for the CCA datasets.

        Parameters
        ----------
        conc_in : dict
            Dictionary containing the concordance index for the CCA datasets.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the concordance index for the CCA datasets of each p_miss.
        """

        column = ['CCA']
        conc_row = []

        for miss in self.p_miss:
            conc = conc_in[self.p_miss.index(miss)]
            mean = conc[0]
            std = conc[1]

            row = [f"{mean:.4f} ± {std:.4f}"]
            conc_row.append(row)

        conc_out = pd.DataFrame(conc_row, index = self.p_miss , columns = column)
        conc_out.index.name = 'p_miss'
        return conc_out

    def _get_cca_bias(self):
        """
        Get the bias for the CCA datasets.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the bias for the CCA datasets of each p_miss.
        """

        feature_cols = list(self.bias[0].keys())
        complete_row = []

        for miss in self.p_miss:
            bias = list(self.bias[self.p_miss.index(miss)].values())
            column_row = []

            for column_index in range(len(bias)):
                mean = bias[column_index][0]
                std = bias[column_index][1]
                bias_row = f"{mean} ± {std}"
                column_row.append(bias_row)
            complete_row.append(column_row)
            
        bias = pd.DataFrame(complete_row, index = self.p_miss , columns = feature_cols)
        bias.index.name = 'p_miss'
        return bias

    def simulate_univariate_imputations(self, M, p_miss = [0.1, 0.2, 0.3, 0.4, 0.5]):
        """
        Run a simulation with univariate imputations.

        Parameters
        ----------
        M : int
            Number of datasets to generate.

        p_miss : list
            List of proportions of missing values to generate for variables which will have missing values.

        """

        self.p_miss = p_miss

        self.acc_train = []
        self.acc_test = []
        self.conc_train = []
        self.conc_test = []
        self.bias = []

        # Get mean and std of each p_miss
        for miss in p_miss:
            print(f"Currently simulating for p_miss: {miss}")
            train, test = self._run_univariate_imputations(M = M, p_miss = miss).values()

            acc_train = train['acc']
            conc_train = train['c_index']
            acc_test = test['acc']
            conc_test = test['c_index']
            bias = train['bias']
                    
            self.acc_train.append(acc_train)
            self.acc_test.append(acc_test)
            self.conc_train.append(conc_train)
            self.conc_test.append(conc_test)
            self.bias.append(bias)

    def get_univariate_imputed_results(self):
        """
        Get the results of the univariate imputation simulation.

        Returns
        -------
        dict
            A dictionary containing the accuracy, concordance index and bias for the univariate imputation simulation.
            The dictionary values are dataframes.
        """

        acc_train, acc_test = self._get_univariate_acc(self.acc_train), self._get_univariate_acc(self.acc_test)
        conc_train, conc_test = self._get_univariate_conc(self.conc_train), self._get_univariate_conc(self.conc_test)
        bias = self._get_univariate_bias()

        return {'acc_train': acc_train, 'acc_test': acc_test, 'c_index_train': conc_train, 'c_index_test': conc_test, 'bias': bias}
    
    def simulate_multivariate_imputations(self, M, p_miss = [0.1, 0.2, 0.3, 0.4, 0.5], n_neighbors = 2):
        """
        Run a simulation with multivariate imputations.

        Parameters
        ----------
        M : int
            Number of datasets to generate.

        p_miss : list
            List of proportions of missing values to generate for variables which will have missing values.

        n_neighbors : int
            Number of neighbors to use for KNN imputation.
        """

        self.p_miss = p_miss

        self.acc_train = []
        self.acc_test = []
        self.conc_train = []
        self.conc_test = []
        self.bias = []

        for miss in p_miss:
            print(f"Currently simulating for p_miss: {miss}")
            train, test = self._run_multivariate_imputations(M = M, p_miss = miss, n_neighbors = n_neighbors).values()

            acc_train = train['acc']
            conc_train = train['c_index']
            acc_test = test['acc']
            conc_test = test['c_index']
            bias = train['bias']
                    
            self.acc_train.append(acc_train)
            self.acc_test.append(acc_test)
            self.conc_train.append(conc_train)
            self.conc_test.append(conc_test)
            self.bias.append(bias)

    def get_multivariate_imputed_results(self):
        """ 
        Get the results of the multivariate imputation simulation.
        
        Returns
        -------
        dict
            A dictionary containing the accuracy, concordance index and bias for the multivariate imputation simulation.
            The dictionary values are dataframes.
        """
        acc_train, acc_test = self._get_multivariate_acc(self.acc_train), self._get_multivariate_acc(self.acc_test)
        conc_train, conc_test = self._get_multivariate_conc(self.conc_train), self._get_multivariate_conc(self.conc_test)
        bias = self._get_multivariate_bias()

        return {'acc_train': acc_train, 'acc_test': acc_test, 'c_index_train': conc_train, 'c_index_test': conc_test, 'bias': bias}
            
    def simulate_cca(self, M, p_miss = [0.1, 0.2, 0.3]):
        """
        Run a simulation with CCA.

        Parameters
        ----------
        M : int
            Number of datasets to generate.

        p_miss : list
            List of proportions of missing values to generate for variables which will have missing values.
        """

        self.p_miss = p_miss

        self.conc_train = []
        self.conc_test = []
        self.bias = []

        for miss in p_miss:
            print(f"Currently simulating for p_miss: {miss}")
            train, test = self._run_CCA(M = M, p_miss = miss, mecha = 'MCAR').values()
            conc_train = train['c_index']
            conc_test = test['c_index']
            bias = train['bias']
            
            self.conc_train.append(conc_train)
            self.conc_test.append(conc_test)
            self.bias.append(bias)

    def get_cca_results(self):
        """
        Get the results of the CCA simulation.

        Returns
        -------
        dict
            A dictionary containing the concordance index and the bias for the CCA simulation.
            The dictionary values are dataframes.        
        """
        
        conc_train, conc_test = self._get_cca_conc(self.conc_train), self._get_cca_conc(self.conc_test)
        bias = self._get_cca_bias()

        return {'c_index_train': conc_train, 'c_index_test': conc_test, 'bias': bias}

class SimulationMAR(SimulationMV):
    """
    Subclass of SimulationMV to simulate MAR missing values.
    """
    def __init__(self, X, duration_col, event_col, cat_colnames):
        """
        Forward all arguments unchanged (and positionally) to the parent constructor.

        Parameters
        ----------
        X : presumably a pd.DataFrame (the notebook passes one) - confirm against the parent class.

        duration_col : name of the duration column (the notebook passes 'time').

        event_col : name of the event column (the notebook passes 'status').

        cat_colnames : list of the categorical column names.
        """
        super().__init__(X, duration_col, event_col, cat_colnames)
    
    def _simulate_MAR_dataset(self, M, p_miss, p_obs, column_wise = False,
                        sample_seed = 135135, column_seed = 115342):
            """
            Simulate M datasets with MAR missing values.
            See parent class for more details.
            """

            X_init, X_na, X_mask, event_and_duration = self._simulate_M_na_datasets(M = M,
                                                            p_miss = p_miss,
                                                            mecha = "MAR",
                                                            p_obs = p_obs,
                                                            sample_seed = sample_seed,
                                                            column_seed = column_seed).values()           
            
            return {'X_init': X_init, 'X_na': X_na, 'mask': X_mask, 'event_and_duration': event_and_duration}
    
    def _run_univariate_imputations(self, M, p_miss, p_obs):
            """ 
            Simulate M datasets of missing values and impute them using univariate imputation methods.

            Parameters
            ----------
            M : int
                Number of datasets to generate.
            
            p_miss : float
                Proportion of missing values to generate for variables which will have missing values.

            p_obs : float
                Proportion of variables to not contain missing values.

            Returns
            -------
            dict
                Dictionary containing train and test accuracy, concordance index and bias of the imputed datasets.
            """

            X_init, X_na, X_mask, event_and_duration = self._simulate_M_na_datasets(M = M,
                                                            p_miss = p_miss,
                                                            mecha = 'MAR',
                                                            p_obs = p_obs).values()

            X_init_train = X_init['train']
            X_na_train = X_na['train']
            X_mask_train = X_mask['train']
            event_and_duration_train = event_and_duration['train']           

            X_init_test = X_init['test']
            X_na_test = X_na['test']
            X_mask_test = X_mask['test']
            event_and_duration_test = event_and_duration['test']

            # impute data
            imputations_train, imputation_test = self._impute_baselines(X_na_train, X_na_test).values()

            # Get accuracy
            acc_dict_train = self._evaluate_univariate_acc(X_imp = imputations_train,
                                        X_true = X_init_train,
                                        X_mask = X_mask_train) 
            acc_dict_test = self._evaluate_univariate_acc(X_imp = imputation_test,
                                        X_true = X_init_test,
                                        X_mask = X_mask_test)            


            # Map integer representation back to categories
            mode_imp_train = imputations_train["categorical"]["mode"]
            mode_imp_train = mode_imp_train.numpy()
            mode_imp_train = mode_imp_train.astype("object")

            mode_imp_test = imputation_test["categorical"]["mode"]
            mode_imp_test = mode_imp_test.numpy()
            mode_imp_test = mode_imp_test.astype("object")

            for i in range(M):
                mode_imp_train[:, :, i] = self._decode_categorical(mode_imp_train[:, :, i])
                mode_imp_test[:, :, i] = self._decode_categorical(mode_imp_test[:, :, i])
            

            decoded_imps_train = {"categorical": {"mode": mode_imp_train}}
            decoded_imps_test = {"categorical": {"mode": mode_imp_test}}
            
            c_index_train, c_index_test, bias = self._evaluate_univariate_coxPH(decoded_imps_train, decoded_imps_test, event_and_duration_train, event_and_duration_test).values()
            
            train = {'acc': acc_dict_train, 'c_index': c_index_train, 'bias': bias}
            test = {'acc': acc_dict_test, 'c_index': c_index_test, 'bias': bias}
            
            return {'train': train, 'test': test}
    
    def _run_multivariate_imputations(self, M, p_miss, p_obs, n_neighbors = 2):
        """
        Simulate M datasets of missing values and impute them using multivariate imputation methods.

        Parameters
        ----------
        M : int
            Number of datasets to generate.

        p_miss : float
            Proportion of missing values to generate for variables which will have missing values.

        p_obs : float
            Proportion of variables to retain if the mechanism is MAR.

        n_neighbors : int
            Number of neighbors to use for KNN imputation.

        Returns
        -------
        dict
            Dictionary containing train and test accuracy, concordance index and bias of the imputed datasets.
        """

        X_init, X_na, X_mask, event_and_duration = self._simulate_M_na_datasets(M = M,
                                                            p_miss = p_miss,
                                                            mecha = 'MAR',
                                                            p_obs = p_obs).values()
        X_init_train = X_init['train']
        X_na_train = X_na['train']
        X_mask_train = X_mask['train']
        event_and_duration_train = event_and_duration['train']           

        X_init_test = X_init['test']
        X_na_test = X_na['test']
        X_mask_test = X_mask['test']
        event_and_duration_test = event_and_duration['test']

        # impute data
        imputations_train, imputation_test = self._impute_baselines(X_na_train, X_na_test, n_neighbors=n_neighbors, multivariate = True).values()

        # Get accuracy
        acc_dict_train = self._evaluate_multivariate_acc(X_imp = imputations_train,
                                    X_true = X_init_train,
                                    X_mask = X_mask_train) 
        acc_dict_test = self._evaluate_multivariate_acc(X_imp = imputation_test,
                                    X_true = X_init_test,
                                    X_mask = X_mask_test)    
        
        # decode the categorical variables
        X_imp_train = imputations_train["knn"]
        X_imp_cat_train = X_imp_train
        X_imp_cat_train = X_imp_cat_train.numpy()
        X_imp_cat_train = X_imp_cat_train.astype(object)

        X_imp_test = imputation_test["knn"]
        X_imp_cat_test = X_imp_test
        X_imp_cat_test = X_imp_cat_test.numpy()
        X_imp_cat_test = X_imp_cat_test.astype(object)

        for i in range(M):
            X_imp_cat_train[:, :, i] = self._decode_categorical(X_imp_cat_train[:, :, i])
            X_imp_cat_test[:, :, i] = self._decode_categorical(X_imp_cat_test[:, :, i])
                    
        decoded_imps_train = {"categorical": {"knn": X_imp_cat_train}}
        decoded_imps_test = {"categorical": {"knn": X_imp_cat_test}}

        c_index_train, c_index_test, bias = self._evaluate_multivariate_coxPH(decoded_imps_train, decoded_imps_test, event_and_duration_train, event_and_duration_test).values()
    
        train = {'acc': acc_dict_train, 'c_index': c_index_train, 'bias': bias}
        test = {'acc': acc_dict_test, 'c_index': c_index_test, 'bias': bias}
        return {'train': train, 'test': test}     
    

    def _get_univariate_acc(self, acc_in):
        """
        Get the univariate accuracy for the imputed datasets.
        
        Parameters
        ----------
        acc_in : dict
            Dictionary containing the accuracy for the imputed datasets.
            
        Returns
        -------
        pd.DataFrame
            DataFrame containing the accuracy for the imputed datasets for each p_miss and p_obs.
        """

        acc_cols = pd.MultiIndex.from_tuples([
                ('Categorical', 'Mode')
                ])
        index = ((miss, obs) for miss in self.p_miss for obs in self.p_obs)
        row_index = pd.MultiIndex.from_tuples(index, names=['p_miss', 'p_obs'])
        acc_row = []

        for miss in self.p_miss:
            for obs in self.p_obs:
                acc = acc_in[self.p_miss.index(miss), self.p_obs.index(obs)]

                mode_mean = acc['categorical']['mode'][0]
                mode_std = acc['categorical']['mode'][1]
                row = [f"{mode_mean} ± {mode_std}"]

                acc_row.append(row)

        return pd.DataFrame(acc_row, index = row_index, columns = acc_cols)
    
    def _get_multivariate_acc(self, acc_in):
        """
        Get the multivariate accuracy for the imputed datasets.

        Parameters
        ----------
        acc_in : dict
            Dictionary containing the accuracy for the imputed datasets.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the accuracy for the imputed datasets for each p_miss and p_obs.
        """

        acc_cols = pd.MultiIndex.from_tuples([
                ('Categorical', 'KNN')
                ])
        index = ((miss, obs) for miss in self.p_miss for obs in self.p_obs)
        row_index = pd.MultiIndex.from_tuples(index, names=['p_miss', 'p_obs'])
        acc_row = []

        for miss in self.p_miss:
            for obs in self.p_obs:
                acc = acc_in[self.p_miss.index(miss), self.p_obs.index(obs)]

                knn_mean_cat = acc['categorical']['knn'][0]
                knn_std_cat = acc['categorical']['knn'][1]

                row = [f"{knn_mean_cat} ± {knn_std_cat}"]
                acc_row.append(row)

        return pd.DataFrame(acc_row, index = row_index, columns = acc_cols)

    def _get_univariate_conc(self, conc_in):
        """
        Get the univariate concordance index for the imputed datasets.

        Parameters
        ----------
        conc_in : dict
            Dictionary containing the concordance index for the imputed datasets.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the concordance index for the imputed datasets for each p_miss and p_obs.
        """

        conc_cols = ['mode']
        index = ((miss, obs) for miss in self.p_miss for obs in self.p_obs)
        row_index = pd.MultiIndex.from_tuples(index, names=['p_miss', 'p_obs'])
        conc_row = []

        for miss in self.p_miss:
            for obs in self.p_obs:
                conc = conc_in[self.p_miss.index(miss), self.p_obs.index(obs)]

                mode_mean = conc['mode'][0]
                mode_std = conc['mode'][1]

                row = [f"{mode_mean} ± {mode_std}"]
                conc_row.append(row)
        return pd.DataFrame(conc_row, index = row_index, columns = conc_cols)

    def _get_multivariate_conc(self, conc_in):
        """
        Get the multivariate concordance index for the imputed datasets.

        Parameters
        ----------
        conc_in : dict
            Dictionary containing the concordance index for the imputed datasets.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the concordance index for the imputed datasets for each p_miss and p_obs.
        """

        column = ['KNN']
        index = ((miss, obs) for miss in self.p_miss for obs in self.p_obs)
        row_index = pd.MultiIndex.from_tuples(index, names=['p_miss', 'p_obs'])
        conc_row = []

        for miss in self.p_miss:
            for obs in self.p_obs:
                conc = conc_in[self.p_miss.index(miss), self.p_obs.index(obs)]
                knn_mean = conc['knn'][0]
                knn_std = conc['knn'][1]
                row = [f"{knn_mean} ± {knn_std}"]
                conc_row.append(row)

        return pd.DataFrame(conc_row, index = row_index, columns = column)

    def _get_univariate_bias(self):
        """
        Get the univariate bias for the imputed datasets.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the bias for the imputed datasets for each p_miss and p_obs.
        """

        feature_cols = list(self.bias[0][0]['mode'].keys())
        bias_cols = pd.MultiIndex.from_tuples([
            ('mode', col) for col in feature_cols
        ])
        index = ((miss, obs) for miss in self.p_miss for obs in self.p_obs)
        row_index = pd.MultiIndex.from_tuples(index, names=['p_miss', 'p_obs'])
        bias_row = []

        for miss in self.p_miss:
            for obs in self.p_obs:
                mode_bias = list(self.bias[self.p_miss.index(miss), self.p_obs.index(obs)]['mode'].values())
                mode_row = []
                for column_index in range(len(mode_bias)):
                    mode_bias_mean = mode_bias[column_index][0]
                    mode_bias_std = mode_bias[column_index][1]

                    mode_row.append(f"{mode_bias_mean} ± {mode_bias_std}")
                row = mode_row
                bias_row.append(row)
    
        return pd.DataFrame(bias_row, index = row_index, columns = bias_cols)
    
    def _get_multivariate_bias(self):
        """
        Get the multivariate bias for the imputed datasets.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the bias for the imputed datasets for each p_miss and p_obs.
        """

        feature_cols = list(self.bias[0][0]['knn'].keys())
        index = ((miss, obs) for miss in self.p_miss for obs in self.p_obs)
        row_index = pd.MultiIndex.from_tuples(index, names=['p_miss', 'p_obs'])
        bias_row = []

        for miss in self.p_miss:
            for obs in self.p_obs:
                knn_bias = list(self.bias[self.p_miss.index(miss), self.p_obs.index(obs)]['knn'].values())
                knn_row = []
                for column_index in range(len(knn_bias)):
                    knn_bias_mean = knn_bias[column_index][0]
                    knn_bias_std = knn_bias[column_index][1]
                    knn_row.append(f"{knn_bias_mean} ± {knn_bias_std}")
                bias_row.append(knn_row)
        return pd.DataFrame(bias_row, index = row_index, columns = feature_cols)
    

    def _get_cca_conc(self, conc_in):
        """
        Get the concordance index for the CCA datasets.

        Parameters
        ----------
        conc_in : dict
            Dictionary containing the concordance index for the CCA datasets.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the concordance index for the CCA datasets of each p_miss and p_obs.
        """

        column = ['CCA']
        index = ((miss, obs) for miss in self.p_miss for obs in self.p_obs)
        row_index = pd.MultiIndex.from_tuples(index, names=['p_miss', 'p_obs'])
        conc_row = []

        for miss in self.p_miss:
            for obs in self.p_obs:
                conc = conc_in[self.p_miss.index(miss), self.p_obs.index(obs)]
                mean = conc[0]
                std = conc[1]

                row = [f"{mean} ± {std}"]
                conc_row.append(row)

        conc_out = pd.DataFrame(conc_row, index = row_index, columns = column)
        conc_out.index.name = 'p_miss'
        return conc_out

    def _get_cca_bias(self):
        """
        Get the bias for the CCA datasets.

        Returns
        -------
        pd.DataFrame
            DataFrame containing the bias for the CCA datasets of each p_miss and p_obs.
        """

        feature_cols = list(self.bias[0][0].keys())
        index = ((miss, obs) for miss in self.p_miss for obs in self.p_obs)
        row_index = pd.MultiIndex.from_tuples(index, names=['p_miss', 'p_obs'])
        complete_row = []

        for miss in self.p_miss:
            for obs in self.p_obs:
                bias = list(self.bias[self.p_miss.index(miss), self.p_obs.index(obs)].values())

                column_row = []  
                for column_index in range(len(bias)):                
                    mean = bias[column_index][0]
                    std = bias[column_index][1]
                    bias_row = f"{mean} ± {std}"
                    column_row.append(bias_row)

                complete_row.append(column_row)

        bias = pd.DataFrame(complete_row, index = row_index, columns = feature_cols)
        bias.index.name = 'p_miss'
        return bias
    
    def simulate_univariate_imputations(self, M, p_miss = None, p_obs = None):
        """
        Run a simulation with univariate imputations.

        Every (p_miss, p_obs) combination is passed to ``self._run_univariate_imputations``;
        the results are stored in the object arrays ``self.acc_train``, ``self.acc_test``,
        ``self.conc_train``, ``self.conc_test`` and ``self.bias``, where cell [i, j]
        belongs to the i-th value of ``p_miss`` and the j-th value of ``p_obs``.

        Parameters
        ----------
        M : int
            Number of datasets to generate.

        p_miss : iterable of float, optional
            Proportions of missing values to generate for variables which will have missing values.
            Defaults to [0.1, 0.2, 0.3, 0.4, 0.5].

        p_obs : iterable of float, optional
            Proportions of variables to retain.
            Defaults to [8/9, 7/9, 6/9, 5/9, 4/9].
        """

        # List literals used as defaults would be shared objects; because they are stored
        # on the instance they could be mutated and change the defaults of later calls.
        if p_miss is None:
            p_miss = [0.1, 0.2, 0.3, 0.4, 0.5]
        if p_obs is None:
            p_obs = [8/9, 7/9, 6/9, 5/9, 4/9]

        # Stored as private lists: the result formatters look grid values up with list.index().
        self.p_miss = list(p_miss)
        self.p_obs = list(p_obs)

        grid_shape = (len(self.p_miss), len(self.p_obs))
        self.acc_train = np.empty(grid_shape, dtype=object)
        self.acc_test = np.empty(grid_shape, dtype=object)
        self.conc_train = np.empty(grid_shape, dtype=object)
        self.conc_test = np.empty(grid_shape, dtype=object)
        self.bias = np.empty(grid_shape, dtype=object)

        # Positions come from enumerate rather than a value lookup, so a repeated grid
        # value fills its own cell instead of overwriting the first occurrence.
        for i, miss in enumerate(self.p_miss):
            for j, obs in enumerate(self.p_obs):
                print(f"Currently running for p_miss: {miss} and p_obs: {obs}")
                # The two entries are taken by position, i.e. in the order of the returned dict.
                train, test = self._run_univariate_imputations(M = M, p_miss = miss, p_obs = obs).values()

                self.acc_train[i, j] = train['acc']
                self.acc_test[i, j] = test['acc']
                self.conc_train[i, j] = train['c_index']
                self.conc_test[i, j] = test['c_index']
                # The bias is read from the train entry only.
                self.bias[i, j] = train['bias']

    def get_univariate_imputed_results(self):
        """
        Collect the stored univariate imputation output as formatted tables.

        Returns
        -------
        dict
            'acc_train' / 'acc_test' hold the tables built by ``self._get_univariate_acc``,
            'c_index_train' / 'c_index_test' those built by ``self._get_univariate_conc`` and
            'bias' the table built by ``self._get_univariate_bias``.
        """

        # Dict displays evaluate their values left to right, so the helpers run
        # in the order accuracy, concordance, bias.
        return {
            'acc_train': self._get_univariate_acc(self.acc_train),
            'acc_test': self._get_univariate_acc(self.acc_test),
            'c_index_train': self._get_univariate_conc(self.conc_train),
            'c_index_test': self._get_univariate_conc(self.conc_test),
            'bias': self._get_univariate_bias(),
        }
    
    def simulate_multivariate_imputations(self, M, p_miss = None, p_obs = None, n_neighbors = 2):
        """
        Run a simulation with multivariate imputations.

        Every (p_miss, p_obs) combination is passed to ``self._run_multivariate_imputations``;
        the results are stored in the object arrays ``self.acc_train``, ``self.acc_test``,
        ``self.conc_train``, ``self.conc_test`` and ``self.bias``, where cell [i, j]
        belongs to the i-th value of ``p_miss`` and the j-th value of ``p_obs``.

        Parameters
        ----------
        M : int
            Number of datasets to generate.

        p_miss : iterable of float, optional
            Proportions of missing values to generate for variables which will have missing values.
            Defaults to [0.1, 0.2, 0.3, 0.4, 0.5].

        p_obs : iterable of float, optional
            Proportions of variables to retain.
            Defaults to [8/9, 7/9, 6/9, 5/9, 4/9].

        n_neighbors : int
            Number of neighbors to use for KNN imputation.
        """

        # List literals used as defaults would be shared objects; because they are stored
        # on the instance they could be mutated and change the defaults of later calls.
        if p_miss is None:
            p_miss = [0.1, 0.2, 0.3, 0.4, 0.5]
        if p_obs is None:
            p_obs = [8/9, 7/9, 6/9, 5/9, 4/9]

        # Stored as private lists: the result formatters look grid values up with list.index().
        self.p_miss = list(p_miss)
        self.p_obs = list(p_obs)

        grid_shape = (len(self.p_miss), len(self.p_obs))
        self.acc_train = np.empty(grid_shape, dtype=object)
        self.acc_test = np.empty(grid_shape, dtype=object)
        self.conc_train = np.empty(grid_shape, dtype=object)
        self.conc_test = np.empty(grid_shape, dtype=object)
        self.bias = np.empty(grid_shape, dtype=object)

        # Positions come from enumerate rather than a value lookup, so a repeated grid
        # value fills its own cell instead of overwriting the first occurrence.
        for i, miss in enumerate(self.p_miss):
            for j, obs in enumerate(self.p_obs):
                print(f"Currently running for p_miss: {miss} and p_obs: {obs}")
                # The two entries are taken by position, i.e. in the order of the returned dict.
                train, test = self._run_multivariate_imputations(
                    M = M, p_miss = miss, p_obs = obs, n_neighbors = n_neighbors).values()

                self.acc_train[i, j] = train['acc']
                self.acc_test[i, j] = test['acc']
                self.conc_train[i, j] = train['c_index']
                self.conc_test[i, j] = test['c_index']
                # The bias is read from the train entry only.
                self.bias[i, j] = train['bias']

    def get_multivariate_imputed_results(self):
        """
        Collect the stored multivariate imputation output as formatted tables.

        Returns
        -------
        dict
            'acc_train' / 'acc_test' hold the tables built by ``self._get_multivariate_acc``,
            'c_index_train' / 'c_index_test' those built by ``self._get_multivariate_conc`` and
            'bias' the table built by ``self._get_multivariate_bias``.
        """

        # Dict displays evaluate their values left to right, so the helpers run
        # in the order accuracy, concordance, bias.
        return {
            'acc_train': self._get_multivariate_acc(self.acc_train),
            'acc_test': self._get_multivariate_acc(self.acc_test),
            'c_index_train': self._get_multivariate_conc(self.conc_train),
            'c_index_test': self._get_multivariate_conc(self.conc_test),
            'bias': self._get_multivariate_bias(),
        }

    def simulate_cca(self, M, p_miss = None, p_obs = None):
        """
        Run a simulation with CCA.

        Every (p_miss, p_obs) combination is passed to ``self._run_CCA`` with
        ``mecha = 'MAR'``; the results are stored in the object arrays ``self.conc_train``,
        ``self.conc_test`` and ``self.bias``, where cell [i, j] belongs to the i-th value
        of ``p_miss`` and the j-th value of ``p_obs``.

        Parameters
        ----------
        M : int
            Number of datasets to generate.

        p_miss : iterable of float, optional
            Proportions of missing values to generate for variables which will have missing values.
            Defaults to [0.1, 0.2, 0.3, 0.4, 0.5].

        p_obs : iterable of float, optional
            Proportions of variables to retain.
            Defaults to [8/9, 7/9, 6/9, 5/9, 4/9].
        """

        # List literals used as defaults would be shared objects; because they are stored
        # on the instance they could be mutated and change the defaults of later calls.
        if p_miss is None:
            p_miss = [0.1, 0.2, 0.3, 0.4, 0.5]
        if p_obs is None:
            p_obs = [8/9, 7/9, 6/9, 5/9, 4/9]

        # Stored as private lists: the result formatters look grid values up with list.index().
        self.p_miss = list(p_miss)
        self.p_obs = list(p_obs)

        grid_shape = (len(self.p_miss), len(self.p_obs))
        self.conc_train = np.empty(grid_shape, dtype=object)
        self.conc_test = np.empty(grid_shape, dtype=object)
        self.bias = np.empty(grid_shape, dtype=object)

        # Positions come from enumerate rather than a value lookup, so a repeated grid
        # value fills its own cell instead of overwriting the first occurrence.
        for i, miss in enumerate(self.p_miss):
            for j, obs in enumerate(self.p_obs):
                print(f"Currently running for p_miss: {miss} and p_obs: {obs}")
                # The two entries are taken by position, i.e. in the order of the returned dict.
                train, test = self._run_CCA(M = M, p_miss = miss, p_obs = obs, mecha = 'MAR').values()

                self.conc_train[i, j] = train['c_index']
                self.conc_test[i, j] = test['c_index']
                # The bias is read from the train entry only.
                self.bias[i, j] = train['bias']

    def get_cca_results(self):
        """
        Collect the stored CCA simulation output as formatted tables.

        Returns
        -------
        dict
            'c_index_train' and 'c_index_test' hold the tables built by ``self._get_cca_conc``
            from ``self.conc_train`` and ``self.conc_test``; 'bias' holds the table built by
            ``self._get_cca_bias``.
        """

        # Dict displays evaluate their values left to right, so the helpers run
        # in the order train, test, bias.
        return {
            'c_index_train': self._get_cca_conc(self.conc_train),
            'c_index_test': self._get_cca_conc(self.conc_test),
            'bias': self._get_cca_bias(),
        }


## Reading data

In [14]:
# Load the colon data; the first CSV column is used as the row index.
df = pd.read_csv('../../data/colon/colon.csv', sep=',', index_col=0)
# Start by subsetting data where etype == 2
df = df[df['etype'] == 2]
# Drop the node4 column
df = df.drop('node4', axis=1)

# Remove every row that still contains a missing value.
df.dropna(axis=0, inplace=True)
# Column names without id/study/age/etype (not referenced again in the cells shown below).
cox_ph_cols = df.columns.drop(['id', 'study', 'age', 'etype'])

# Restrict the frame itself to the same columns and renumber the rows from 0.
to_keep = [col for col in df.columns if col not in ['etype', 'study', 'id', 'age']]
df = df[to_keep]
df = df.reset_index(drop=True)
# Recode the integer-coded variables as string labels.
df['sex'] = df['sex'].map({1: 'M', 0: 'F'})
df['obstruct'] = df['obstruct'].map({1: 'Y', 0: 'N'})
df['perfor'] = df['perfor'].map({1: 'Y', 0: 'N'})
df['adhere'] = df['adhere'].map({1: 'Y', 0: 'N'})
df['extent'] = df['extent'].map({1: 'Submucosa', 2: 'Muscule', 3: 'Serosa', 4: 'Contiguous_structures'})
df['surg'] = df['surg'].map({1: 'L', 0: 'S'})
# status is used as the event column below, so store it as a boolean.
df['status'] = df['status'].map({1: True, 0: False})
# Bin `nodes` into four groups using the edges -1, 1, 3, 7, 100, labelled 1.0 to 4.0.
df["nodes"] = pd.cut(df["nodes"], bins = [-1, 1, 3, 7, 100], labels = [1.0, 2.0, 3.0, 4.0])
df.head()


Unnamed: 0,rx,sex,obstruct,perfor,adhere,nodes,status,differ,extent,surg,time
0,Lev+5FU,M,N,N,N,3.0,True,2.0,Serosa,S,1521
1,Lev+5FU,M,N,N,N,1.0,False,2.0,Serosa,S,3087
2,Obs,F,N,N,Y,3.0,True,2.0,Muscule,S,963
3,Lev+5FU,F,Y,N,N,3.0,True,2.0,Serosa,L,293
4,Obs,M,N,N,N,4.0,True,2.0,Serosa,L,659


In [15]:
# Columns handed to the simulation classes as categorical variables (nine in total).
cat_colnames = ['rx', 'sex', 'obstruct', 'perfor', 'adhere', 'nodes', 'differ', 'extent', 'surg']

## Running simulations

### Initial summary

In [16]:
# Summary on the complete data (no missing values generated yet).
init = SimulationMCAR(df, duration_col = 'time', event_col = 'status', cat_colnames = cat_colnames)
init.complete_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'time'
event col,'status'
baseline estimation,breslow
number of observations,888
number of events observed,430
partial log-likelihood,-2707.79
time fit was run,2024-05-10 11:46:22 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
rx,-0.08,0.92,0.05,-0.18,0.02,0.83,1.02,0.0,-1.6,0.11,3.19
nodes,0.44,1.55,0.05,0.34,0.53,1.41,1.71,0.0,9.06,<0.005,62.73
differ,0.09,1.09,0.05,-0.02,0.19,0.98,1.2,0.0,1.66,0.10,3.37
extent,-0.16,0.85,0.05,-0.27,-0.05,0.77,0.95,0.0,-2.94,<0.005,8.26
sex_M,-0.01,0.99,0.05,-0.11,0.08,0.9,1.08,0.0,-0.31,0.76,0.4
obstruct_Y,0.09,1.09,0.05,-0.0,0.18,1.0,1.2,0.0,1.89,0.06,4.1
perfor_Y,-0.03,0.98,0.05,-0.12,0.07,0.89,1.07,0.0,-0.54,0.59,0.76
adhere_Y,0.09,1.09,0.05,-0.0,0.18,1.0,1.19,0.0,1.86,0.06,4.0
surg_S,-0.11,0.9,0.05,-0.2,-0.02,0.82,0.98,0.0,-2.32,0.02,5.61

0,1
Concordance,0.66
Partial AIC,5433.59
log-likelihood ratio test,120.23 on 9 df
-log2(p) of ll-ratio test,69.50


### MCAR

#### Univariate

In [17]:
# Number of datasets generated per setting.
M = 50
# MCAR, univariate imputation: one run per missingness proportion in p_miss.
mcar_uni = SimulationMCAR(df, duration_col = 'time', event_col = 'status', cat_colnames = cat_colnames)
mcar_uni.simulate_univariate_imputations(M = M, p_miss = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5])

Currently simulating for p_miss: 0.05
Currently simulating for p_miss: 0.1
Currently simulating for p_miss: 0.2
Currently simulating for p_miss: 0.3
Currently simulating for p_miss: 0.4
Currently simulating for p_miss: 0.5


In [19]:
# The five tables are unpacked by position, i.e. in the order of the returned dict's values,
# and each one is written to ../results as CSV (M is part of the file name).
acc_train, acc_test, conc_train, conc_test, bias = mcar_uni.get_univariate_imputed_results().values()
acc_train.to_csv(f'../results/acc_train_MCAR{M}_univariate.csv')
acc_test.to_csv(f'../results/acc_test_MCAR{M}_univariate.csv')
conc_train.to_csv(f'../results/c_index_train_MCAR{M}_univariate.csv')
conc_test.to_csv(f'../results/c_index_test_MCAR{M}_univariate.csv')
bias.to_csv(f'../results/bias_MCAR{M}_univariate.csv')

#### Multivariate (kNN)

In [20]:
# Number of neighbours for the KNN imputation and number of datasets per setting.
n_neighbors = 10
M = 50

# MCAR, multivariate (KNN) imputation: one run per missingness proportion in p_miss.
mcar_knn = SimulationMCAR(df, duration_col = 'time', event_col = 'status', cat_colnames = cat_colnames)
mcar_knn.simulate_multivariate_imputations(M = M, p_miss = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5], n_neighbors = n_neighbors)

Currently simulating for p_miss: 0.05
Currently simulating for p_miss: 0.1
Currently simulating for p_miss: 0.2
Currently simulating for p_miss: 0.3
Currently simulating for p_miss: 0.4
Currently simulating for p_miss: 0.5


In [23]:
# The five tables are unpacked by position, i.e. in the order of the returned dict's values,
# and each one is written to ../results as CSV (M and n_neighbors are part of the file name).
acc_train, acc_test, conc_train, conc_test, bias = mcar_knn.get_multivariate_imputed_results().values()
acc_train.to_csv(f'../results/acc_train_MCAR{M}_knn_N{n_neighbors}.csv')
acc_test.to_csv(f'../results/acc_test_MCAR{M}_knn_N{n_neighbors}.csv')
conc_train.to_csv(f'../results/c_index_train_MCAR{M}_knn_N{n_neighbors}.csv')
conc_test.to_csv(f'../results/c_index_test_MCAR{M}_knn_N{n_neighbors}.csv')
bias.to_csv(f'../results/bias_MCAR{M}_knn_N{n_neighbors}.csv')

#### CCA

In [24]:
# Number of datasets generated per setting.
M = 50

# MCAR, CCA: note the p_miss grid here stops at 0.25, unlike the imputation runs above.
mcar_cca = SimulationMCAR(df, duration_col = 'time', event_col = 'status', cat_colnames = cat_colnames)
mcar_cca.simulate_cca(M = M, p_miss = [0.05, 0.1, 0.15, 0.2, 0.25])

Currently simulating for p_miss: 0.05
Currently simulating for p_miss: 0.1
Currently simulating for p_miss: 0.15
Currently simulating for p_miss: 0.2
Currently simulating for p_miss: 0.25


In [25]:
# The three tables are unpacked by position, i.e. in the order of the returned dict's values,
# and each one is written to ../results as CSV (M is part of the file name).
conc_train, conc_test, bias = mcar_cca.get_cca_results().values()
conc_train.to_csv(f'../results/c_index_train_MCAR{M}_cca.csv')
conc_test.to_csv(f'../results/c_index_test_MCAR{M}_cca.csv')
bias.to_csv(f'../results/bias_MCAR{M}_cca.csv')

### MAR

#### Univariate

In [26]:
# Number of datasets generated per setting.
M = 50

# MAR, univariate imputation: one run per (p_miss, p_obs) combination.
# NOTE(review): the ninths in p_obs presumably correspond to the nine categorical columns - confirm.
mar_uni = SimulationMAR(df, duration_col = 'time', event_col = 'status', cat_colnames = cat_colnames)
mar_uni.simulate_univariate_imputations(M = M, p_miss = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5], p_obs = [8/9, 7/9, 6/9, 5/9, 4/9, 3/9])

Currently running for p_miss: 0.05 and p_obs: 0.8888888888888888
Currently running for p_miss: 0.05 and p_obs: 0.7777777777777778
Currently running for p_miss: 0.05 and p_obs: 0.6666666666666666
Currently running for p_miss: 0.05 and p_obs: 0.5555555555555556
Currently running for p_miss: 0.05 and p_obs: 0.4444444444444444
Currently running for p_miss: 0.05 and p_obs: 0.3333333333333333
Currently running for p_miss: 0.1 and p_obs: 0.8888888888888888
Currently running for p_miss: 0.1 and p_obs: 0.7777777777777778
Currently running for p_miss: 0.1 and p_obs: 0.6666666666666666
Currently running for p_miss: 0.1 and p_obs: 0.5555555555555556
Currently running for p_miss: 0.1 and p_obs: 0.4444444444444444
Currently running for p_miss: 0.1 and p_obs: 0.3333333333333333
Currently running for p_miss: 0.2 and p_obs: 0.8888888888888888
Currently running for p_miss: 0.2 and p_obs: 0.7777777777777778
Currently running for p_miss: 0.2 and p_obs: 0.6666666666666666
Currently running for p_miss: 0.2 

In [None]:
# The five tables are unpacked in the order acc_train, acc_test, c_index_train, c_index_test, bias
# (the key order of the returned dict) and each one is written to ../results as CSV.
acc_train, acc_test, conc_train, conc_test, bias = mar_uni.get_univariate_imputed_results().values()
acc_train.to_csv(f'../results/acc_train_MAR{M}_univariate.csv')
acc_test.to_csv(f'../results/acc_test_MAR{M}_univariate.csv')
conc_train.to_csv(f'../results/c_index_train_MAR{M}_univariate.csv')
conc_test.to_csv(f'../results/c_index_test_MAR{M}_univariate.csv')
bias.to_csv(f'../results/bias_MAR{M}_univariate.csv')

#### Multivariate

In [52]:
# Number of datasets per setting and number of neighbours for the KNN imputation.
M = 50
n_neighbors = 10

# MAR, multivariate (KNN) imputation: one run per (p_miss, p_obs) combination.
mar_multi = SimulationMAR(df, duration_col = 'time', event_col = 'status', cat_colnames = cat_colnames)
mar_multi.simulate_multivariate_imputations(M = M, p_miss = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5], p_obs = [8/9, 7/9, 6/9, 5/9, 4/9, 3/9], n_neighbors = n_neighbors)

Currently running for p_miss: 0.05 and p_obs: 0.8888888888888888
Currently running for p_miss: 0.05 and p_obs: 0.7777777777777778
Currently running for p_miss: 0.05 and p_obs: 0.6666666666666666
Currently running for p_miss: 0.05 and p_obs: 0.5555555555555556
Currently running for p_miss: 0.05 and p_obs: 0.4444444444444444
Currently running for p_miss: 0.05 and p_obs: 0.3333333333333333
Currently running for p_miss: 0.1 and p_obs: 0.8888888888888888
Currently running for p_miss: 0.1 and p_obs: 0.7777777777777778
Currently running for p_miss: 0.1 and p_obs: 0.6666666666666666
Currently running for p_miss: 0.1 and p_obs: 0.5555555555555556
Currently running for p_miss: 0.1 and p_obs: 0.4444444444444444
Currently running for p_miss: 0.1 and p_obs: 0.3333333333333333
Currently running for p_miss: 0.2 and p_obs: 0.8888888888888888
Currently running for p_miss: 0.2 and p_obs: 0.7777777777777778
Currently running for p_miss: 0.2 and p_obs: 0.6666666666666666
Currently running for p_miss: 0.2 

In [None]:
# The five tables are unpacked in the order acc_train, acc_test, c_index_train, c_index_test, bias
# (the key order of the returned dict) and each one is written to ../results as CSV.
acc_train, acc_test, conc_train, conc_test, bias = mar_multi.get_multivariate_imputed_results().values()
acc_train.to_csv(f'../results/acc_train_MAR{M}_knn_N{n_neighbors}.csv')
acc_test.to_csv(f'../results/acc_test_MAR{M}_knn_N{n_neighbors}.csv')
conc_train.to_csv(f'../results/c_index_train_MAR{M}_knn_N{n_neighbors}.csv')
conc_test.to_csv(f'../results/c_index_test_MAR{M}_knn_N{n_neighbors}.csv')
bias.to_csv(f'../results/bias_MAR{M}_knn_N{n_neighbors}.csv')

#### Complete case analysis (CCA)

In [90]:
# Experiment setting (M is presumably the number of repetitions per
# (p_miss, p_obs) combination -- confirm against SimulationMAR).
M = 50

# Build the MAR simulation on the survival data set.
mar_cca = SimulationMAR(
    df,
    duration_col='time',
    event_col='status',
    cat_colnames=cat_colnames,
)

# Run the CCA experiment over the parameter grid.
# The p_obs grid is 8/9, 7/9, ..., 3/9.
mar_cca.simulate_cca(
    M=M,
    p_miss=[0.05, 0.1, 0.15, 0.2, 0.25],
    p_obs=[k / 9 for k in range(8, 2, -1)],
)

Currently running for p_miss: 0.05 and p_obs: 0.8888888888888888
Currently running for p_miss: 0.05 and p_obs: 0.7777777777777778
Currently running for p_miss: 0.05 and p_obs: 0.6666666666666666
Currently running for p_miss: 0.05 and p_obs: 0.5555555555555556
Currently running for p_miss: 0.05 and p_obs: 0.4444444444444444
Currently running for p_miss: 0.05 and p_obs: 0.3333333333333333


  risk_set2 += np.exp(xw[k])


Currently running for p_miss: 0.1 and p_obs: 0.8888888888888888
Currently running for p_miss: 0.1 and p_obs: 0.7777777777777778
Currently running for p_miss: 0.1 and p_obs: 0.6666666666666666
Currently running for p_miss: 0.1 and p_obs: 0.5555555555555556
Currently running for p_miss: 0.1 and p_obs: 0.4444444444444444
Currently running for p_miss: 0.1 and p_obs: 0.3333333333333333


  delta = solve(
  loss -= (numerator - n_events * np.log(risk_set)) / n_samples
  risk_set2 += np.exp(xw[k])
  loss -= (numerator - n_events * np.log(risk_set)) / n_samples
  exp_xw = np.exp(offset + np.dot(x, w))
  z = risk_set_x / risk_set
  a = risk_set_xx / risk_set


Failed with alpha=0: search direction contains NaN or infinite values
Currently running for p_miss: 0.15 and p_obs: 0.8888888888888888
Currently running for p_miss: 0.15 and p_obs: 0.7777777777777778
Currently running for p_miss: 0.15 and p_obs: 0.6666666666666666
Currently running for p_miss: 0.15 and p_obs: 0.5555555555555556
Currently running for p_miss: 0.15 and p_obs: 0.4444444444444444
Currently running for p_miss: 0.15 and p_obs: 0.3333333333333333


  risk_set2 += np.exp(xw[k])


Currently running for p_miss: 0.2 and p_obs: 0.8888888888888888
Currently running for p_miss: 0.2 and p_obs: 0.7777777777777778
Currently running for p_miss: 0.2 and p_obs: 0.6666666666666666
Currently running for p_miss: 0.2 and p_obs: 0.5555555555555556
Currently running for p_miss: 0.2 and p_obs: 0.4444444444444444


  risk_set2 += np.exp(xw[k])


Currently running for p_miss: 0.2 and p_obs: 0.3333333333333333
Currently running for p_miss: 0.25 and p_obs: 0.8888888888888888
Currently running for p_miss: 0.25 and p_obs: 0.7777777777777778
Currently running for p_miss: 0.25 and p_obs: 0.6666666666666666
Currently running for p_miss: 0.25 and p_obs: 0.5555555555555556
Currently running for p_miss: 0.25 and p_obs: 0.4444444444444444
Currently running for p_miss: 0.25 and p_obs: 0.3333333333333333


In [92]:
# Fetch the result tables of the CCA run. The positional unpacking relies on
# the order in which the returned mapping yields its values.
cca_results = mar_cca.get_cca_results()
conc_train, conc_test, bias = cca_results.values()

# Persist every table as its own CSV file; M is embedded in each file name.
for table, csv_path in (
    (conc_train, f'../results/c_index_train_MAR{M}_cca.csv'),
    (conc_test, f'../results/c_index_test_MAR{M}_cca.csv'),
    (bias, f'../results/bias_MAR{M}_cca.csv'),
):
    table.to_csv(csv_path)