# BASELINE IMPUTATION OF MISSING VALUES

## Description of this notebook

This notebook imputes the missing data using univariate and multivariate techniques.

In [23]:
# Import all the necessary libraries

import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import miceforest as mf
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import KFold


In [None]:
# os.chdir('..') # move to the general directory if necessary

In [24]:
# Load the incomplete dataset
data_incomplete = pd.read_csv('DATA/data.csv')

# Load the complete observed data to evaluate the performance
data_observed = pd.read_csv('DATA/input_DATA_NO_NAs_INPUT.csv')

In [None]:
# Define the group of features
subject_details = ['AGE', 'PTGENDER', 'PTEDUCAT', 'APOE4']
fdg_pet = ['AngularLeft', 'AngularRight', 'CingulumPostBilateral', 'TemporalLeft', 'TemporalRight']
nepb = ['MMSE', 'RAVLT_learning', 'RAVLT_immediate', 'RAVLT_perc_forgetting', 'FAQ']
av_45 = ['CEREBELLUMGREYMATTER_UCBERKELEYAV45_10_17_16', 'WHOLECEREBELLUM_UCBERKELEYAV45_10_17_16', 'ERODED_SUBCORTICALWM_UCBERKELEYAV45_10_17_16', 'FRONTAL_UCBERKELEYAV45_10_17_16', 
               'CINGULATE_UCBERKELEYAV45_10_17_16', 'PARIETAL_UCBERKELEYAV45_10_17_16', 'TEMPORAL_UCBERKELEYAV45_10_17_16']
csf_values = ['ABETA_UPENNBIOMK9_04_19_17', 'TAU_UPENNBIOMK9_04_19_17', 'PTAU_UPENNBIOMK9_04_19_17']
mri = ['Hippocampus', 'WholeBrain', 'Ventricles', 'Entorhinal', 'Fusiform', 'MidTemp', 'ICV']

groups = [subject_details, fdg_pet, nepb, av_45, csf_values, mri]
group_names = ['subject_details', 'fdg_pet', 'nepb', 'av_45', 'csf_values', 'mri']

## **1.** Univariate Imputation

### **Overall Performance**

In [None]:
# This function evaluates the overall imputation performance of the univariate models using cros-validation
def baseline_univariate_imputation_cv_performance(data, method_name, n_splits=10):
   
    r2_scores = []

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for train_index, test_index in kf.split(data):
        train_data = data.iloc[train_index].copy()
        test_data = data.iloc[test_index].copy()

        # Identify observed (not missing) values in the test set
        observed_mask = test_data.notnull().values
        observed_indices = np.where(observed_mask.flatten())[0]

        # Randomly select 20% of these observed positions to insert synthetic NAs
        n_mask = int(0.2 * len(observed_indices))
        mask_indices = np.random.choice(observed_indices, size=n_mask, replace=False)

        # Save the true values before masking for later comparison
        test_data_flat = test_data.values.flatten()
        true_values = test_data_flat[mask_indices].copy()

        # Mask the defined positions
        test_data_flat[mask_indices] = np.nan
        test_data_masked = test_data_flat.reshape(test_data.shape)
        test_data_masked_df = pd.DataFrame(test_data_masked, columns=test_data.columns, index=test_data.index)

        # Fit the imputer using the training data
        mean_imputer = SimpleImputer(strategy=method_name)
        mean_imputer.fit(train_data)

        # Impute the masked test data using the imputer fitted on training data
        imputed_array = mean_imputer.transform(test_data_masked_df)
        imputed_flat = imputed_array.flatten()

        # Save the imputed values for later comparison
        imputed_values = imputed_flat[mask_indices]

        # Compute r2 score for each fold
        score = r2_score(true_values, imputed_values)
        r2_scores.append(score)

    # Summarize the results computing the mean and standard deviation of the R2 scores across all 10 folds
    avg_r2 = np.mean(r2_scores)
    std_r2 = np.std(r2_scores)
    return (f"R² Score ({method_name} with CV): {avg_r2:.3f} ± {std_r2:.3f}")

### **Subset Performance** 

In [None]:
# This function evaluates the imputation performance in each subset of the univariate models using cros-validation
def baseline_imputation_groups_cv(data, method_name, groups, group_names, n_splits=10):

    group_r2_scores = {name: [] for name in group_names}

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for train_idx, test_idx in kf.split(data):
        train_data = data.iloc[train_idx].copy()
        test_data = data.iloc[test_idx].copy()

        # Fit the corresponding imputer using the training data
        imputer = SimpleImputer(strategy=method_name)
        imputer.fit(train_data)

        # Predict the masked data for each group
        for group, name in zip(groups, group_names):
            test_data_copy = test_data.copy()  

            # Identify the observed (non-missing) positions in the group columns
            observed_mask = test_data_copy[group].notnull().values
            observed_positions = np.where(observed_mask.flatten())[0]

            # Randomly select 20% of observed values to impute synthetic NAs
            n_mask = int(len(observed_positions) * 0.2)
            mask_indices = np.random.choice(observed_positions, size=n_mask, replace=False)

            # Save true values before masking
            group_values_flat = test_data_copy[group].values.flatten()
            true_values = group_values_flat[mask_indices].copy()

            # Mask the values in the full test_data_copy
            group_values_flat[mask_indices] = np.nan
            test_data_copy[group] = pd.DataFrame(group_values_flat.reshape(test_data_copy[group].shape), columns=group, 
                                                index=test_data_copy.index)

            # Impute the masked positions
            imputed_test_array = imputer.transform(test_data_copy)
            imputed_test_df = pd.DataFrame(imputed_test_array, columns=test_data.columns, index=test_data.index)

            # Store the imputed values 
            imputed_group_flat = imputed_test_df[group].values.flatten()
            imputed_values = imputed_group_flat[mask_indices]

            # Compute the r2 score
            r2 = r2_score(true_values, imputed_values)
            group_r2_scores[name].append(r2)

    # Summarize the results by computing mean and standard deviation throughout all the folds
    for name in group_names:
        avg_r2 = np.mean(group_r2_scores[name])
        std_r2 = np.std(group_r2_scores[name])
        print(f"R² ({name}): {avg_r2:.3f} ± {std_r2:.3f}")
        print('---------------------------------------------')

### **1.1 Mean Imputation**

#### Performance Evaluation

In [49]:
baseline_univariate_imputation_cv_performance(data_incomplete, 'mean')

'R² Score (mean with CV): 0.988 ± 0.002'

In [48]:
baseline_imputation_groups_cv(data_incomplete, 'mean', groups, group_names)

R² (subject_details): 0.984 ± 0.003
---------------------------------------------
R² (fdg_pet): 0.040 ± 0.034
---------------------------------------------
R² (nepb): 0.591 ± 0.088
---------------------------------------------
R² (av_45): 0.582 ± 0.048
---------------------------------------------
R² (csf_values): 0.604 ± 0.047
---------------------------------------------
R² (mri): 0.982 ± 0.003
---------------------------------------------


#### Create Imputed Dataset

In [None]:
# Impute the missing entries in the incomplete data

mean_imputer = SimpleImputer(strategy="mean")

mean_imputed_data = mean_imputer.fit_transform(data_incomplete)

mean_imputed_df = pd.DataFrame(mean_imputed_data, columns=data_incomplete.columns)

In [6]:
mean_imputed_df.to_csv('BASELINE IMPUTED DATASETS/mean_imputed_df.csv', index=False)

### **1.2 Median Imputation**

#### Performance Evaluation

In [27]:
baseline_univariate_imputation_cv_performance(data_incomplete, 'median')

'R² Score (median with CV): 0.987 ± 0.003'

In [51]:
baseline_imputation_groups_cv(data_incomplete, 'median', groups, group_names)

R² (subject_details): 0.981 ± 0.003
---------------------------------------------
R² (fdg_pet): 0.025 ± 0.068
---------------------------------------------
R² (nepb): 0.586 ± 0.118
---------------------------------------------
R² (av_45): 0.574 ± 0.066
---------------------------------------------
R² (csf_values): 0.559 ± 0.040
---------------------------------------------
R² (mri): 0.983 ± 0.004
---------------------------------------------


#### Create Imputed Dataset

In [None]:
# Impute the missing entries in the incomplete data

median_imputer = SimpleImputer(strategy="median")

median_imputed_data = median_imputer.fit_transform(data_incomplete)

median_imputed_df = pd.DataFrame(median_imputed_data, columns=data_incomplete.columns)

In [8]:
median_imputed_df.to_csv('BASELINE IMPUTED DATASETS/median_imputed_df.csv', index=False)

### **1.3 Mode Imputation**

#### Performance Evaluation

In [28]:
baseline_univariate_imputation_cv_performance(data_incomplete, 'most_frequent')

'R² Score (most_frequent with CV): 0.974 ± 0.006'

In [52]:
baseline_imputation_groups_cv(data_incomplete, 'most_frequent', groups, group_names)

R² (subject_details): 0.982 ± 0.003
---------------------------------------------
R² (fdg_pet): -0.730 ± 0.287
---------------------------------------------
R² (nepb): 0.019 ± 0.294
---------------------------------------------
R² (av_45): -0.075 ± 0.282
---------------------------------------------
R² (csf_values): 0.510 ± 0.074
---------------------------------------------
R² (mri): 0.968 ± 0.005
---------------------------------------------


#### Create Imputed Dataset

In [None]:
# Impute the missing entries in the incomplete data

mode_imputer = SimpleImputer(strategy="most_frequent")

mode_imputed_data = mode_imputer.fit_transform(data_incomplete)

mode_imputed_df = pd.DataFrame(mode_imputed_data, columns=data_incomplete.columns)

In [10]:
mode_imputed_df.to_csv('BASELINE IMPUTED DATASETS/mode_imputed_df.csv', index=False)

## **2.** Multivariate Imputation

### **2.1 KNN Imputation**

#### **Overall Performance**

In [None]:
# This function evaluates the overall imputation performance of KNN using cros-validation

def knn_univariate_imputation_cv_performance(data, method_name, n_neighbors=3, n_splits=10):
    
    r2_scores = []

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)


    for train_index, test_index in kf.split(data):
        train_data = data.iloc[train_index].copy()
        test_data = data.iloc[test_index].copy()

        # Identify observed values in the test set
        observed_mask = test_data.notnull().values
        observed_indices = np.where(observed_mask.flatten())[0]

        # Randomly select 20% of these observed positions to impute synthetic NAs
        n_mask = int(0.2 * len(observed_indices))
        mask_indices = np.random.choice(observed_indices, size=n_mask, replace=False)

        # Save the true values 
        test_data_flat = test_data.values.flatten()
        true_values = test_data_flat[mask_indices].copy()

        # Impute the synthetic NAs
        test_data_flat[mask_indices] = np.nan
        test_data_masked = test_data_flat.reshape(test_data.shape)
        test_data_masked_df = pd.DataFrame(test_data_masked, columns=test_data.columns, index=test_data.index)

        # Fit imputer on the training data 
        mean_imputer = KNNImputer(n_neighbors=n_neighbors)
        mean_imputer.fit(train_data)

        # Impute the masked test data 
        imputed_array = mean_imputer.transform(test_data_masked_df)
        imputed_flat = imputed_array.flatten()

        # Extract imputed values 
        imputed_values = imputed_flat[mask_indices]

        # Compute the r2 score
        score = r2_score(true_values, imputed_values)
        r2_scores.append(score)

    # Summarize results obtaining the mean and standard deviation across all the folds
    avg_r2 = np.mean(r2_scores)
    std_r2 = np.std(r2_scores)
    return (f"R² Score ({method_name} with CV): {avg_r2:.3f} ± {std_r2:.3f}")

#### **Subset Performance**

In [None]:
# This function evaluates the imputation performance in each subset of KNN using cros-validation
def knn_imputation_groups_cv(data, groups, group_names, n_splits=10):

    group_r2_scores = {name: [] for name in group_names}

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for train_idx, test_idx in kf.split(data):
        train_data = data.iloc[train_idx].copy()
        test_data = data.iloc[test_idx].copy()

        # Fit the imputer using the training data
        imputer = KNNImputer(n_neighbors=3)
        imputer.fit(train_data)

        # Impute the masked values in the test set for each group
        for group, name in zip(groups, group_names):
            test_data_copy = test_data.copy()  

            # Identify observed positions in the group columns
            observed_mask = test_data_copy[group].notnull().values
            observed_positions = np.where(observed_mask.flatten())[0]

            # Randomly select 20% of observed values to mask
            n_mask = int(len(observed_positions) * 0.2)
            mask_indices = np.random.choice(observed_positions, size=n_mask, replace=False)

            # Save true values 
            group_values_flat = test_data_copy[group].values.flatten()
            true_values = group_values_flat[mask_indices].copy()

            # Mask the values in the test data
            group_values_flat[mask_indices] = np.nan
            test_data_copy[group] = pd.DataFrame(group_values_flat.reshape(test_data_copy[group].shape), columns=group, 
                                                index=test_data_copy.index)

            # Impute the masked values
            imputed_test_array = imputer.transform(test_data_copy)
            imputed_test_df = pd.DataFrame(imputed_test_array, columns=test_data.columns, index=test_data.index)

            # Store the imputed values 
            imputed_group_flat = imputed_test_df[group].values.flatten()
            imputed_values = imputed_group_flat[mask_indices]

            # Compute the r2 score 
            r2 = r2_score(true_values, imputed_values)
            group_r2_scores[name].append(r2)

    # Summarize the results computing the mean and standard deviation across all the folds
    for name in group_names:
        avg_r2 = np.mean(group_r2_scores[name])
        std_r2 = np.std(group_r2_scores[name])
        print(f"R² ({name}): {avg_r2:.3f} ± {std_r2:.3f}")
        print('---------------------------------------------')

#### **Step 1:** Decide which is the best number of nearest neighbors

For a tradeoff between simplicity of the model and good performance, the number of nearest neighbors chosen to perform the imputation is 3. 

In [67]:
for n in range(2, 11):
    print(f'n_neighbors={n}')
    print(knn_univariate_imputation_cv_performance(data_incomplete, 'KNN', n))

n_neighbors=2
R² Score (KNN with CV): 0.979 ± 0.005
n_neighbors=3
R² Score (KNN with CV): 0.981 ± 0.003
n_neighbors=4
R² Score (KNN with CV): 0.979 ± 0.005
n_neighbors=5
R² Score (KNN with CV): 0.982 ± 0.004
n_neighbors=6
R² Score (KNN with CV): 0.983 ± 0.004
n_neighbors=7
R² Score (KNN with CV): 0.984 ± 0.003
n_neighbors=8
R² Score (KNN with CV): 0.985 ± 0.004
n_neighbors=9
R² Score (KNN with CV): 0.984 ± 0.002
n_neighbors=10
R² Score (KNN with CV): 0.985 ± 0.002


#### **Step 2:** Impute the missing values

##### Performance Evaluation

In [65]:
knn_univariate_imputation_cv_performance(data_incomplete, 'KNN', 3)

'R² Score (KNN with CV): 0.980 ± 0.005'

In [73]:
knn_imputation_groups_cv(data_incomplete, groups, group_names)

R² (subject_details): 0.981 ± 0.004
---------------------------------------------
R² (fdg_pet): -0.230 ± 0.189
---------------------------------------------
R² (nepb): 0.557 ± 0.036
---------------------------------------------
R² (av_45): 0.593 ± 0.059
---------------------------------------------
R² (csf_values): 0.465 ± 0.128
---------------------------------------------
R² (mri): 0.974 ± 0.006
---------------------------------------------


##### Create Imputed Dataset

In [None]:
# Impute the missing entries in the incomplete data

knn_imputer = KNNImputer(n_neighbors=3)  

knn_imputed_data = knn_imputer.fit_transform(data_incomplete)

knn_imputed_df = pd.DataFrame(knn_imputed_data, columns=data_incomplete.columns)

In [15]:
knn_imputed_df.to_csv('BASELINE IMPUTED DATASETS/knn_imputed_df.csv', index=False)

### **2.2 Iterative Imputation**

#### **Overall Performance**

In [None]:
# This function evaluates the overall imputation performance of Iterative using cros-validation

def iter_univariate_imputation_cv_performance(data, method_name, n_splits=10):

    r2_scores = []

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for train_index, test_index in kf.split(data):
        train_data = data.iloc[train_index].copy()
        test_data = data.iloc[test_index].copy()

        # Identify observed values in the test set
        observed_mask = test_data.notnull().values
        observed_indices = np.where(observed_mask.flatten())[0]

        # Randomly select 20% of these observed positions to mask 
        n_mask = int(0.2 * len(observed_indices))
        mask_indices = np.random.choice(observed_indices, size=n_mask, replace=False)

        # Save the true values 
        test_data_flat = test_data.values.flatten()
        true_values = test_data_flat[mask_indices].copy()

        # Mask the positions
        test_data_flat[mask_indices] = np.nan
        test_data_masked = test_data_flat.reshape(test_data.shape)
        test_data_masked_df = pd.DataFrame(test_data_masked, columns=test_data.columns, index=test_data.index)

        # Fit imputer on training data 
        mean_imputer = IterativeImputer(estimator=LinearRegression(), max_iter=50, random_state=42)
        mean_imputer.fit(train_data)

        # Impute the masked test data using the imputer fitted on training data
        imputed_array = mean_imputer.transform(test_data_masked_df)
        imputed_flat = imputed_array.flatten()

        # Store the imputed values 
        imputed_values = imputed_flat[mask_indices]

        # Compute r2 score
        score = r2_score(true_values, imputed_values)
        r2_scores.append(score)

    # Summarize results by computing the mean and standard deviation across all the folds
    avg_r2 = np.mean(r2_scores)
    std_r2 = np.std(r2_scores)
    return (f"R² Score ({method_name} with CV): {avg_r2:.3f} ± {std_r2:.3f}")

#### **Subset Performance**

In [None]:
# This function evaluates the imputation performance in each subset of Iterative using cros-validation

def iter_imputation_groups_cv(data, groups, group_names, n_splits=10):

    group_r2_scores = {name: [] for name in group_names}
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for train_idx, test_idx in kf.split(data):
        train_data = data.iloc[train_idx].copy()
        test_data = data.iloc[test_idx].copy()

        # Fit the imputer on the training data
        imputer = IterativeImputer(estimator=LinearRegression(), max_iter=50, random_state=42)
        imputer.fit(train_data)

        # Predict the mask positions in each group
        for group, name in zip(groups, group_names):
            test_data_copy = test_data.copy()  

            # Identify observed positions in the group columns
            observed_mask = test_data_copy[group].notnull().values
            observed_positions = np.where(observed_mask.flatten())[0]

            # Randomly select 20% of observed values to mask
            n_mask = int(len(observed_positions) * 0.2)
            mask_indices = np.random.choice(observed_positions, size=n_mask, replace=False)

            # Save true values 
            group_values_flat = test_data_copy[group].values.flatten()
            true_values = group_values_flat[mask_indices].copy()

            # Mask the values
            group_values_flat[mask_indices] = np.nan
            test_data_copy[group] = pd.DataFrame(group_values_flat.reshape(test_data_copy[group].shape), 
                                                columns=group, 
                                                index=test_data_copy.index)

            # Impute the masked values
            imputed_test_array = imputer.transform(test_data_copy)
            imputed_test_df = pd.DataFrame(imputed_test_array, columns=test_data.columns, index=test_data.index)

            # Store the imputed values 
            imputed_group_flat = imputed_test_df[group].values.flatten()
            imputed_values = imputed_group_flat[mask_indices]

            # Compute the r2 score
            r2 = r2_score(true_values, imputed_values)
            group_r2_scores[name].append(r2)

    # Summarize the results by computing the mean and standard deviation of the scores across all the folds
    for name in group_names:
        avg_r2 = np.mean(group_r2_scores[name])
        std_r2 = np.std(group_r2_scores[name])
        print(f"R² ({name}): {avg_r2:.3f} ± {std_r2:.3f}")
        print('---------------------------------------------')

#### Performance Evaluation

In [76]:
iter_univariate_imputation_cv_performance(data_incomplete, 'Iterative')

'R² Score (Iterative with CV): 0.997 ± 0.001'

In [78]:
iter_imputation_groups_cv(data_incomplete, groups, group_names)

R² (subject_details): 0.988 ± 0.001
---------------------------------------------
R² (fdg_pet): 0.763 ± 0.050
---------------------------------------------
R² (nepb): 0.823 ± 0.072
---------------------------------------------
R² (av_45): 0.966 ± 0.008
---------------------------------------------
R² (csf_values): 0.787 ± 0.072
---------------------------------------------
R² (mri): 0.996 ± 0.001
---------------------------------------------


#### Create Imputed Dataset

In [None]:
# Impute the missing entries in the incomplete data

iter_imputer = IterativeImputer(estimator=LinearRegression(), max_iter=50, random_state=42)

iterative_imputer_data = iter_imputer.fit_transform(data_incomplete)

iter_imputed_df = pd.DataFrame(iterative_imputer_data, columns=data_incomplete.columns)

In [17]:
iter_imputed_df.to_csv('BASELINE IMPUTED DATASETS/iter_imputed_df.csv', index=False)

### **2.3 MICE Imputation**

#### **Define the Imputer**

In [None]:
# Define the class to perform MICE
class MICE_Imputer:
    def __init__(self, num_datasets=10, mice_iterations=5, random_state=42):
        # number of imputed datasets to generate in each iteration
        self.num_datasets = num_datasets 
        # number of iterations desired
        self.mice_iterations = mice_iterations
        self.random_state = random_state

    def fit(self, X):
        # Create of kernel object and input data
        self.kernel = mf.ImputationKernel(
            X,
            num_datasets=self.num_datasets,
            random_state=self.random_state
        )
        # Run the MICE iteration 
        self.kernel.mice(self.mice_iterations)

    def transform(self, X):
        # Create a new kernel for imputing new values
        new_kernel = mf.ImputationKernel(
            X,
            num_datasets=self.num_datasets,
            random_state=self.random_state,
        )
        # Trigger initialization of the kernel
        new_kernel.complete_data(dataset=0)  

        # Use parameters from the firsr kernel to initialize the new kernel
        new_kernel.models = self.kernel.models

        # Run mice iterations on new data
        new_kernel.mice(self.mice_iterations)

        # Return completed dataset 
        completed_df = [self.kernel.complete_data(dataset=i) for i in range(self.num_datasets)]
        df = sum(completed_df) / len(completed_df)
        return df

    def fit_transform(self, X):
        # Fit the imputer
        self.fit(X)

        # Generate the multiple imputed datasets
        imputed_dfs = [self.kernel.complete_data(dataset=i) for i in range(self.num_datasets)]

        # Average the imputed datasets to general a final one
        df = sum(imputed_dfs) / len(imputed_dfs)
        
        return pd.DataFrame(df, columns=X.columns)


#### **Overall Performance**

In [None]:
# This function evaluates the overall imputation performance of MICE using cros-validation

def mice_univariate_imputation_cv_performance(data, method_name, n_splits=10):
    
    r2_scores = []

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)


    for train_index, test_index in kf.split(data):
        train_data = data.iloc[train_index].copy().reset_index(drop=True)
        test_data = data.iloc[test_index].copy().reset_index(drop=True)

        # Identify observed values in the test set
        observed_mask = test_data.notnull().values
        observed_indices = np.where(observed_mask.flatten())[0]

        # Randomly select 20% of these observed positions to mask 
        n_mask = int(0.2 * len(observed_indices))
        mask_indices = np.random.choice(observed_indices, size=n_mask, replace=False)

        # Save the true values
        test_data_flat = test_data.values.flatten()
        true_values = test_data_flat[mask_indices].copy()

        # Mask those positions
        test_data_flat[mask_indices] = np.nan
        test_data_masked = test_data_flat.reshape(test_data.shape)
        test_data_masked_df = pd.DataFrame(test_data_masked, columns=test_data.columns, index=test_data.index)

        # Fit imputer on training data 
        mean_imputer = MICE_Imputer(num_datasets=10, mice_iterations=5)
        mean_imputer.fit(train_data)

        # Impute the masked test data using the imputer fitted on training data
        imputed_array = mean_imputer.transform(test_data_masked_df)
        imputed_array = np.array(imputed_array)
        if isinstance(imputed_array, pd.DataFrame):
            imputed_flat = imputed_array.values.flatten()
        else:
            imputed_flat = imputed_array.flatten()

        # Store the imputed values 
        imputed_values = imputed_flat[mask_indices]

        # Compute r2 score
        score = r2_score(true_values, imputed_values)
        r2_scores.append(score)

    # Summarize results by computing the mean and standard deviation of the scores obtained in all the folds
    avg_r2 = np.mean(r2_scores)
    std_r2 = np.std(r2_scores)
    return (f"R² Score ({method_name} with CV): {avg_r2:.3f} ± {std_r2:.3f}")

#### **Subset Performance**

In [None]:
# This function evaluates the imputation performance in each subset of MICE using cros-validation

def mice_imputation_groups_cv(data, groups, group_names, n_splits=10):

    group_r2_scores = {name: [] for name in group_names}
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for train_idx, test_idx in kf.split(data):

        train_data = data.iloc[train_idx].copy().reset_index(drop=True)
        test_data = data.iloc[test_idx].copy().reset_index(drop=True)

        # Fit the imputer using the training data
        imputer = MICE_Imputer(num_datasets=10, mice_iterations=5)
        imputer.fit(train_data)
        
        # Predict the masked test data in each group
        for group, name in zip(groups, group_names):
            test_data_copy = test_data.copy()  

            # Identify observed positions in the group columns
            observed_mask = test_data_copy[group].notnull().values
            observed_positions = np.where(observed_mask.flatten())[0]

            # Randomly select 20% of observed values to mask
            n_mask = int(len(observed_positions) * 0.2)
            mask_indices = np.random.choice(observed_positions, size=n_mask, replace=False)

            # Save true values
            group_values_flat = test_data_copy[group].values.flatten()
            true_values = group_values_flat[mask_indices].copy()

            # Mask values
            group_values_flat[mask_indices] = np.nan
            test_data_copy[group] = pd.DataFrame(group_values_flat.reshape(test_data_copy[group].shape), 
                                                columns=group, 
                                                index=test_data_copy.index)

            # Impute the masked values
            imputed_test_array = imputer.transform(test_data_copy)
            imputed_test_df = pd.DataFrame(imputed_test_array, columns=test_data.columns, index=test_data.index)

            # Store the imputed values
            imputed_group_flat = imputed_test_df[group].values.flatten()
            imputed_values = imputed_group_flat[mask_indices]

            # Compute the r2 score
            r2 = r2_score(true_values, imputed_values)
            group_r2_scores[name].append(r2)

    # Summarize the results calculating the mean and standard deviation of the scores across all the folds
    for name in group_names:
        avg_r2 = np.mean(group_r2_scores[name])
        std_r2 = np.std(group_r2_scores[name])
        print(f"R² ({name}): {avg_r2:.3f} ± {std_r2:.3f}")
        print('---------------------------------------------')

#### Performance Evaluation

In [None]:
mice_univariate_imputation_cv_performance(data_incomplete, 'MICE')

'R² Score (MICE with CV): 0.994 ± 0.001'

In [None]:
mice_imputation_groups_cv(data_incomplete, groups, group_names)

R² (subject_details): 0.983 ± 0.004
---------------------------------------------
R² (fdg_pet): 0.638 ± 0.079
---------------------------------------------
R² (nepb): 0.770 ± 0.063
---------------------------------------------
R² (av_45): 0.914 ± 0.038
---------------------------------------------
R² (csf_values): 0.660 ± 0.094
---------------------------------------------
R² (mri): 0.993 ± 0.001
---------------------------------------------


#### Create Imputed Dataset

In [None]:
# Impute the missing entries in the incomplete data

mice_imputer = MICE_Imputer(num_datasets=10, mice_iterations=5)

mice_imputed_df = mice_imputer.fit_transform(data_incomplete)

In [24]:
mice_imputed_df.to_csv('BASELINE IMPUTED DATASETS/mice_imputed_df.csv', index=False)