# BASELINE IMPUTATION OF MISSING VALUES

## Description of this notebook

This notebook imputes the missing data using univariate and multivariate techniques.

In [35]:
# Import all the necessary libraries

import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import miceforest as mf
import warnings
warnings.filterwarnings("ignore")


In [36]:
os.chdir('..') # move to the general directory

In [39]:
# Load the incomplete dataset
data_incomplete = pd.read_csv('TFG_GABRIELA_MARIN/DATA/data.csv')

# Load the complete observed data to evaluate the performance
data_observed = pd.read_csv('TFG_GABRIELA_MARIN/DATA/input_DATA_NO_NAs_INPUT.csv')

In [41]:
# Function to evaluate the performance of the imputations

def baseline_imputation_performance(data_complete, imputer, method_name, n_iter=10):

    r2_scores = []

    for i in range(n_iter):
        # 1. Copy the invomplete data and mask some values to evaluate the performance
        data_incomplete_copy = data_complete.copy()

        # 2. Get indices of the non-missing values
        mask = data_incomplete_copy.notnull()

        # 3. Randomly mask 20% of the known values
        mask_idx = np.random.choice(np.where(mask.values.flatten())[0], size=int(0.2 * mask.values.sum()), replace=False)

        # Obtain the data values that will be masked
        data_masked = data_incomplete_copy.values.copy().flatten()

        # Save the corresponding original values
        original_values = data_masked[mask_idx].copy()

        # Mask the chosen values
        data_masked[mask_idx] = np.nan

        # Make sure the data masked is a 2D array
        data_masked = data_masked.reshape(data_incomplete_copy.shape)
        data_masked = pd.DataFrame(data_masked, columns=data_incomplete_copy.columns, index=data_incomplete_copy.index)


        # 4. Impute the masked data
        imputed_array = imputer.fit_transform(data_masked)
        imputed_array = np.array(imputed_array)
        if isinstance(imputed_array, pd.DataFrame):
            imputed_flat = imputed_array.values.flatten()
        else:
            imputed_flat = imputed_array.flatten()
        imputed_values = imputed_flat[mask_idx]

        # Compute the r-squared score
        score = r2_score(original_values, imputed_values)
        r2_scores.append(score)

    # Print the results
    avg_r2 = np.mean(r2_scores)
    print(f"Average R² Score over {n_iter} iterations ({method_name}): {avg_r2:.4f}")
    return avg_r2

In [46]:
def baseline_imputation_groups(data_complete, imputer, groups, group_names, n_runs=10):
    
    # Initialize dicts of lists to collect scores per group
    group_r2_scores = {name: [] for name in group_names}
    
    for group, name in zip(groups, group_names):
        for _ in range(n_runs):
            data_original = data_complete[group].copy()
            data_missing = data_original.copy()

            total_values = data_missing.size
            missing_count = int(total_values * 0.2)
            missing_indices = np.random.choice(total_values, size=missing_count, replace=False)
            row_indices, col_indices = np.unravel_index(missing_indices, data_missing.shape)
            data_missing.values[row_indices, col_indices] = np.nan

            imputed_array = imputer.fit_transform(data_missing)
            data_imputed = pd.DataFrame(imputed_array, columns=data_original.columns)

            mask = np.zeros_like(data_missing.values, dtype=bool)
            mask[row_indices, col_indices] = True

            true_values = data_original.values[mask]
            imputed_values = data_imputed.values[mask]

            r2 = r2_score(true_values, imputed_values)
            
            group_r2_scores[name].append(r2)
            
    for name in group_names:
        print(f"Overall R2 {name}: {np.mean(group_r2_scores[name]):.4f}")
        print('---------------------------------------------')


## 1. Univariate Imputation

### 1.1 Mean Imputation

In [5]:
mean_imputer = SimpleImputer(strategy="mean")

mean_imputed_data = mean_imputer.fit_transform(data_incomplete)

mean_imputed_df = pd.DataFrame(mean_imputed_data, columns=data_incomplete.columns)

In [6]:
mean_imputed_df.to_csv('BASELINE IMPUTED DATASETS/mean_imputed_df.csv', index=False)

### 1.2 Median Imputation

In [7]:
median_imputer = SimpleImputer(strategy="median")

median_imputed_data = median_imputer.fit_transform(data_incomplete)

median_imputed_df = pd.DataFrame(median_imputed_data, columns=data_incomplete.columns)

In [8]:
median_imputed_df.to_csv('BASELINE IMPUTED DATASETS/median_imputed_df.csv', index=False)

### 1.3 Mode Imputation

In [9]:
mode_imputer = SimpleImputer(strategy="most_frequent")

mode_imputed_data = mode_imputer.fit_transform(data_incomplete)

mode_imputed_df = pd.DataFrame(mode_imputed_data, columns=data_incomplete.columns)

In [10]:
mode_imputed_df.to_csv('BASELINE IMPUTED DATASETS/mode_imputed_df.csv', index=False)

## 2. Multivariate Imputation

### 2.1 KNN Imputation

#### **Step 1:** Decide which is the best number of nearest neighbors

For a tradeoff between simplicity of the model and good performance, the number of nearest neighbors chosen to perform the imputation is 3. 

In [13]:
# 1. Copy the invomplete data and mask some values to evaluate the performance
data_incomplete_copy = data_incomplete.copy()

# 2. Get indices of the non-missing values
mask = data_incomplete_copy.notnull()

# 3. Randomly mask 20% of the known values
mask_idx = np.random.choice(np.where(mask.values.flatten())[0], size=int(0.2 * mask.values.sum()), replace=False)

# Obtain the data values that will be masked
data_masked = data_incomplete_copy.values.copy().flatten()

# Save the corresponding original values
original_values = data_masked[mask_idx].copy()

# Mask the chosen values
data_masked[mask_idx] = np.nan

# Make sure the data masked is a 2D array
data_masked = data_masked.reshape(data_incomplete_copy.shape)
data_masked = pd.DataFrame(data_masked, columns=data_incomplete_copy.columns)

# 4. Try different numbers for the neighbors and evaluate the performance using r-squared
r2_scores_neighbors = {}

for k in range(2, 11):
    imputer = KNNImputer(n_neighbors=k)
    imputed = imputer.fit_transform(data_masked)

    # Compare the imputed values against the original values
    imputed_flat = imputed.flatten()
    imputed_values = imputed_flat[mask_idx]

    # Compute the r-squared score
    score = r2_score(original_values, imputed_values)
    r2_scores_neighbors[k] = score
    print(f"n_neighbors={k} → R²: {score:.4f}")

# Step 3: Select best k
best_k = max(r2_scores_neighbors, key=r2_scores_neighbors.get)
print(f"\nBest n_neighbors based on R²: {best_k}")


n_neighbors=2 → R²: 0.9817
n_neighbors=3 → R²: 0.9836
n_neighbors=4 → R²: 0.9845
n_neighbors=5 → R²: 0.9853
n_neighbors=6 → R²: 0.9860
n_neighbors=7 → R²: 0.9865
n_neighbors=8 → R²: 0.9865
n_neighbors=9 → R²: 0.9865
n_neighbors=10 → R²: 0.9867

Best n_neighbors based on R²: 10


#### **Step 2:** Impute the missing values

In [14]:
knn_imputer = KNNImputer(n_neighbors=3)  

knn_imputed_data = knn_imputer.fit_transform(data_incomplete)

knn_imputed_df = pd.DataFrame(knn_imputed_data, columns=data_incomplete.columns)

In [15]:
knn_imputed_df.to_csv('BASELINE IMPUTED DATASETS/knn_imputed_df.csv', index=False)

### 2.2 Iterative Imputation

In [16]:
iter_imputer = IterativeImputer(estimator=LinearRegression(), max_iter=50, random_state=42)

iterative_imputer_data = iter_imputer.fit_transform(data_incomplete)

iter_imputed_df = pd.DataFrame(iterative_imputer_data, columns=data_incomplete.columns)

In [17]:
iter_imputed_df.to_csv('BASELINE IMPUTED DATASETS/iter_imputed_df.csv', index=False)

### 2.3 MICE Imputation

In [None]:
class MICE_Imputer:
    def __init__(self, num_datasets=10, mice_iterations=5, random_state=42):
        self.num_datasets = num_datasets
        self.mice_iterations = mice_iterations
        self.random_state = random_state

    def fit_transform(self, X):
        kernel = mf.ImputationKernel(
            X,
            num_datasets=self.num_datasets,
            random_state=self.random_state
        )
        kernel.mice(self.mice_iterations)
        
        # Average the imputed datasets
        imputed_dfs = [kernel.complete_data(dataset=i) for i in range(self.num_datasets)]
        df = sum(imputed_dfs) / len(imputed_dfs)
        return pd.DataFrame(df, columns=X.columns)


In [22]:
mice_imputer = MICE_Imputer(num_datasets=10, mice_iterations=5)

mice_imputed_df = mice_imputer.fit_transform(data_incomplete)

In [24]:
mice_imputed_df.to_csv('BASELINE IMPUTED DATASETS/mice_imputed_df.csv', index=False)

## 3. Evaluate Imputation Performance

#### Overall Imputation Performance

In [43]:
methods = ['Mean', 'Median', 'Mode', 'KNN', 'Iterative', 'MICE']
imputers = [mean_imputer, median_imputer, mode_imputer, knn_imputer, iter_imputer, mice_imputer]

for idx, imp in enumerate(imputers):
    baseline_imputation_performance(data_observed, imp, methods[idx], n_iter=10)

Average R² Score over 10 iterations (Mean): 0.9884
Average R² Score over 10 iterations (Median): 0.9886
Average R² Score over 10 iterations (Mode): 0.9849
Average R² Score over 10 iterations (KNN): 0.9899
Average R² Score over 10 iterations (Iterative): 0.9967
Average R² Score over 10 iterations (MICE): 0.9970


#### Per Subset of Features Imputation Performance

In [44]:
subject_details = ['AGE', 'PTGENDER', 'PTEDUCAT', 'APOE4']
fdg_pet = ['AngularLeft', 'AngularRight', 'CingulumPostBilateral', 'TemporalLeft', 'TemporalRight']
nepb = ['MMSE', 'RAVLT_learning', 'RAVLT_immediate', 'RAVLT_perc_forgetting', 'FAQ']
av_45 = ['CEREBELLUMGREYMATTER_UCBERKELEYAV45_10_17_16', 'WHOLECEREBELLUM_UCBERKELEYAV45_10_17_16', 'ERODED_SUBCORTICALWM_UCBERKELEYAV45_10_17_16', 'FRONTAL_UCBERKELEYAV45_10_17_16', 
               'CINGULATE_UCBERKELEYAV45_10_17_16', 'PARIETAL_UCBERKELEYAV45_10_17_16', 'TEMPORAL_UCBERKELEYAV45_10_17_16']
csf_values = ['ABETA_UPENNBIOMK9_04_19_17', 'TAU_UPENNBIOMK9_04_19_17', 'PTAU_UPENNBIOMK9_04_19_17']
mri = ['Hippocampus', 'WholeBrain', 'Ventricles', 'Entorhinal', 'Fusiform', 'MidTemp', 'ICV']

groups = [subject_details, fdg_pet, nepb, av_45, csf_values, mri]
group_names = ['subject_details', 'fdg_pet', 'nepb', 'av_45', 'csf_values', 'mri']

In [49]:
for idx, imp in enumerate(imputers):
    print(methods[idx])
    baseline_imputation_groups(data_observed, imputer, groups, group_names, n_runs=10)
    print(" ")

Mean
Overall R2 subject_details: 0.9820
---------------------------------------------
Overall R2 fdg_pet: 0.6864
---------------------------------------------
Overall R2 nepb: 0.7280
---------------------------------------------
Overall R2 av_45: 0.9089
---------------------------------------------
Overall R2 csf_values: 0.6559
---------------------------------------------
Overall R2 mri: 0.9898
---------------------------------------------
 
Median
Overall R2 subject_details: 0.9828
---------------------------------------------
Overall R2 fdg_pet: 0.6766
---------------------------------------------
Overall R2 nepb: 0.7001
---------------------------------------------
Overall R2 av_45: 0.9194
---------------------------------------------
Overall R2 csf_values: 0.6732
---------------------------------------------
Overall R2 mri: 0.9893
---------------------------------------------
 
Mode
Overall R2 subject_details: 0.9817
---------------------------------------------
Overall R2 fdg_pet