# Imputation

Date: 21 August 2025<br>
By: Eunhye Yang<br>
Description:<br>
1. Train and validate a MissForest imputer (10-fold cross-validation and a hold-out test)
2. Impute dataset

In [32]:
import pandas as pd
import numpy as np
from missforest import MissForest
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, accuracy_score

# Cohort from the file tagged "QMH": used below to fit and validate the imputer.
df = pd.read_csv('./cleaned_nosurgery_QMH_2025Nov.csv')
# Cohort from the file tagged "EXT": imputed with the fitted imputer in the
# "Impute dataset" section.
data = pd.read_csv('./cleaned_nosurgery_EXT_2025Nov.csv')

## Keep complete rows only

In [2]:
# Keep only rows with no missing value in any column. These complete cases are
# the ground truth against which artificially masked values are scored below.
complete = df.dropna()

## Train Test split

In [3]:
# The '@6MmRS', '6M mortality' and 'ICH Score' columns are kept out of the
# feature matrix handed to the imputer. `y` is only carried along so that
# train_test_split can split it together with X.
y = complete.loc[:, '@6MmRS'].copy()
X = complete.drop(columns=['@6MmRS', '6M mortality', 'ICH Score'])

In [4]:
# Hold out 20% of the complete rows. The remaining 80% (X_train) is used for the
# k-fold validation and for fitting the final imputer. The seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## K-fold cross-validation of the imputer

In [5]:
# Display the feature columns that will be passed to the imputer.
X_train.columns

Index(['Age', 'Sex0female1male', 'Race0chinese1nonchinese', 'DM0no1yes',
       'HT0no1yes', 'Allstroke0no1yes', 'IschemicHeartdisease',
       'Atrialfibirllation', 'Antiplateletmedication',
       'Anticoagulant0no1warfarin2NOAC3Heparin', 'AdmissionGCS', 'Esubscore',
       'Vsubscore', 'Msubscore', 'ULpower', 'LLpower', 'AdmissionsystolicBP',
       'AdmissiondiastolicBP', 'CKDclass',
       'location1tha2ICGP3LN4caudate5lobar6brainstem7cerebellum8whitematter',
       'location0supra1infra', 'laterality0L1R3C', 'LARGESTICHVOLUME',
       'GRAEBSCORE', 'EverIVH'],
      dtype='object')

In [6]:
# Column groups used by the imputer and by the scoring helpers.

# Passed to MissForest(categorical=...); scored with accuracy.
categorical = [
    'Sex0female1male',
    'Race0chinese1nonchinese',
    'DM0no1yes',
    'HT0no1yes',
    'Allstroke0no1yes',
    'IschemicHeartdisease',
    'Atrialfibirllation',
    'Antiplateletmedication',
    'Anticoagulant0no1warfarin2NOAC3Heparin',
    'location1tha2ICGP3LN4caudate5lobar6brainstem7cerebellum8whitematter',
    'location0supra1infra',
    'laterality0L1R3C',
    'EverIVH',
]

# Graded scores. They are not listed in `categorical`; the scoring helpers round
# their imputed values and score them with accuracy, like any other column that
# is not in `continuous`.
ordinal = [
    'AdmissionGCS',
    'Esubscore',
    'Vsubscore',
    'Msubscore',
    'ULpower',
    'LLpower',
    'CKDclass',
    'GRAEBSCORE',
]

# Measurements scored with squared error; these are never rounded.
continuous = [
    'Age',
    'AdmissionsystolicBP',
    'AdmissiondiastolicBP',
    'LARGESTICHVOLUME',
]

In [7]:
def to_nearest_half(val):
    """Snap ``val`` to the closest multiple of 0.5.

    The value is doubled, rounded to an integer with the built-in ``round``
    (so exact ties follow its round-half-to-even rule), and halved again.

    Args:
        val: A real number.

    Returns:
        The nearest multiple of 0.5, rounded to one decimal place.
    """
    doubled = round(val * 2)
    halved = doubled / 2
    return round(halved, 1)

In [8]:
def evaluate_categoricalandordinal(pred, true, i):
    """Score one non-continuous column by accuracy after rounding the prediction.

    Side effect: ``pred[i]`` is overwritten with its rounded values.

    Args:
        pred: DataFrame of imputed values; column ``i`` is modified in place.
        true: DataFrame of ground-truth values with the same rows as ``pred``.
        i: Name of the column to score.

    Returns:
        The ``accuracy_score`` of the rounded ``pred[i]`` against ``true[i]``.

    NOTE(review): every row of the column is scored, not only the cells that
    were masked before imputation -- confirm this is the intended metric.
    """
    half_step_columns = ('ULpower', 'LLpower')

    if i in half_step_columns:
        # These two columns are snapped to the nearest 0.5 rather than to whole
        # numbers. Both sides are doubled before scoring so the half steps
        # become whole numbers (presumably so accuracy_score treats them as
        # class labels -- confirm).
        pred[i] = pred[i].map(to_nearest_half)
        return accuracy_score(true[i] * 2, pred[i] * 2)

    pred[i] = round(pred[i])
    return accuracy_score(true[i], pred[i])

In [9]:
def evaluate_kfold_performance(model_kfold_results, pred, true, i):
    """Record one score per column for fold ``i``.

    Columns listed in ``continuous`` get the mean squared error; every other
    column gets the accuracy returned by ``evaluate_categoricalandordinal``
    (which also rounds that column of ``pred`` in place).

    Args:
        model_kfold_results: Indexable collection of dicts, one per fold;
            entry ``i`` is filled as ``{column name: score}``.
        pred: DataFrame of imputed values.
        true: DataFrame of ground-truth values; its columns drive the loop.
        i: Index of the fold being scored.

    Returns:
        None. Results are written into ``model_kfold_results[i]``.

    NOTE(review): whole columns are compared, so cells that were never masked
    also contribute to each score -- confirm this is intended.
    """
    fold_scores = model_kfold_results[i]
    for column in true.columns:
        if column not in continuous:
            fold_scores[column] = evaluate_categoricalandordinal(pred, true, column)
            continue
        fold_scores[column] = mean_squared_error(true[column], pred[column])

In [10]:
def fit_evaluate(model, train, test, true, model_kfold_results, i):
    """Fit an imputer on one fold and score its imputations.

    Args:
        model: Object exposing ``fit(frame)`` and ``transform(frame)``.
        train: Training frame (with missing values) the model is fitted on.
        test: Frame (with missing values) to impute.
        true: Ground-truth frame matching ``test`` row for row.
        model_kfold_results: Per-fold result dicts; entry ``i`` is filled in.
        i: Index of the current fold.

    Returns:
        None. Scores are stored in ``model_kfold_results[i]``.
    """
    model.fit(train)

    # Impute the held-out rows and label the result with the test frame's columns
    # so it can be compared with ``true`` column by column.
    imputed = pd.DataFrame(model.transform(test), columns=test.columns)

    evaluate_kfold_performance(model_kfold_results, imputed, true, i)

In [11]:
# Single source of truth for the fold count and the simulated missing-data rate,
# so the splitter, the results list and both masks cannot drift apart.
N_SPLITS = 10
MISSING_RATE = 0.2

kf = KFold(n_splits=N_SPLITS, random_state=42, shuffle=True)

# One dict per fold, filled as {column name: score} by fit_evaluate.
imputer_kfold_results = [{} for _ in range(kf.get_n_splits())]

# Seed the global NumPy RNG so the random masks below are reproducible.
np.random.seed(42)

for i, (train_idx, test_idx) in enumerate(kf.split(X_train)):

    # Define and generate train-test of i-th fold.
    # `true` stays untouched as ground truth; `test` is the same rows with holes.
    train = X_train.iloc[train_idx].copy()
    true = X_train.iloc[test_idx].copy()
    test = X_train.iloc[test_idx].copy()

    # Blank out roughly MISSING_RATE of the cells at random. The train mask is
    # drawn before the test mask; keep this order so the RNG stream is unchanged.
    mask = np.random.rand(*train.shape) < MISSING_RATE
    train[mask] = np.nan
    mask = np.random.rand(*test.shape) < MISSING_RATE
    test[mask] = np.nan

    # Define imputer (a fresh one per fold so no fitted state carries over)
    clf = RandomForestClassifier(random_state=42)
    rgr = RandomForestRegressor(random_state=42)
    imputer = MissForest(clf=clf, rgr=rgr, max_iter=10, categorical=categorical)

    print(f'Fold {i+1}')

    fit_evaluate(imputer, train, test, true, imputer_kfold_results, i)

# Kept as an object array of dicts, as before, for the cells that read it.
imputer_kfold_results = np.array(imputer_kfold_results)



Fold 1


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [01:03<00:00,  6.34s/it]
 70%|███████   | 7/10 [00:01<00:00,  5.45it/s]


Fold 2


100%|██████████| 10/10 [01:01<00:00,  6.17s/it]
 60%|██████    | 6/10 [00:01<00:00,  5.39it/s]


Fold 3


100%|██████████| 10/10 [01:02<00:00,  6.27s/it]
100%|██████████| 10/10 [00:01<00:00,  6.59it/s]


Fold 4


100%|██████████| 10/10 [01:02<00:00,  6.27s/it]
100%|██████████| 10/10 [00:01<00:00,  6.06it/s]


Fold 5


100%|██████████| 10/10 [01:03<00:00,  6.36s/it]
 70%|███████   | 7/10 [00:01<00:00,  5.61it/s]


Fold 6


100%|██████████| 10/10 [01:03<00:00,  6.37s/it]
 70%|███████   | 7/10 [00:01<00:00,  5.11it/s]


Fold 7


100%|██████████| 10/10 [01:03<00:00,  6.36s/it]
100%|██████████| 10/10 [00:01<00:00,  6.44it/s]


Fold 8


100%|██████████| 10/10 [01:03<00:00,  6.36s/it]
 40%|████      | 4/10 [00:00<00:01,  4.98it/s]


Fold 9


100%|██████████| 10/10 [01:02<00:00,  6.27s/it]
100%|██████████| 10/10 [00:01<00:00,  6.48it/s]


Fold 10


100%|██████████| 10/10 [01:02<00:00,  6.23s/it]
100%|██████████| 10/10 [00:01<00:00,  6.62it/s]


## Imputer performance

In [12]:
# Build the fold-by-column score table in one call: one row per fold, one column
# per feature. This follows however many folds were actually run (no hard-coded
# fold count) and yields numeric columns instead of the object dtype produced by
# filling an empty frame cell by cell.
# list(...) matters: the results may be an object array of dicts, which the
# DataFrame constructor would otherwise treat as a single column of dicts.
imputer_kfold_results_df = pd.DataFrame(list(imputer_kfold_results))

### K-fold performance summary

In [13]:
# Summarise each column across folds.
for i in imputer_kfold_results_df.columns:
    fold_mean = np.mean(imputer_kfold_results_df[i])
    if i in continuous:
        # The per-fold values are MSEs; the number printed is the square root of
        # their mean, so it is labelled as RMSE rather than MSE.
        metric = 'rmse'
        val = round(np.sqrt(fold_mean), 5)
    else:
        metric = 'accuracy'
        val = round(fold_mean, 5)
    print(f'Average {metric} for {i} : {val}')

Average mse for Age : 6.25413
Average accuracy for Sex0female1male : 0.93373
Average accuracy for Race0chinese1nonchinese : 0.98784
Average accuracy for DM0no1yes : 0.96174
Average accuracy for HT0no1yes : 0.98953
Average accuracy for Allstroke0no1yes : 0.97562
Average accuracy for IschemicHeartdisease : 0.97559
Average accuracy for Atrialfibirllation : 0.98252
Average accuracy for Antiplateletmedication : 0.96352
Average accuracy for Anticoagulant0no1warfarin2NOAC3Heparin : 0.99135
Average accuracy for AdmissionGCS : 0.9494
Average accuracy for Esubscore : 0.96688
Average accuracy for Vsubscore : 0.95989
Average accuracy for Msubscore : 0.9686
Average accuracy for ULpower : 0.89522
Average accuracy for LLpower : 0.88857
Average mse for AdmissionsystolicBP : 11.29237
Average mse for AdmissiondiastolicBP : 7.02183
Average accuracy for CKDclass : 0.88155
Average accuracy for location1tha2ICGP3LN4caudate5lobar6brainstem7cerebellum8whitematter : 0.89047
Average accuracy for location0supra1

### Hold-out test

In [14]:
# Fresh, unfitted imputer for the hold-out test, configured exactly like the
# per-fold imputers in the k-fold loop.
# Presumably `clf` handles the columns listed in `categorical` and `rgr` the
# rest -- confirm against the MissForest documentation.
clf = RandomForestClassifier(random_state=42)
rgr = RandomForestRegressor(random_state=42)
imputer = MissForest(clf=clf, rgr=rgr, max_iter=10, categorical=categorical)

In [15]:
# Simulate missingness for the hold-out test: keep an untouched copy as ground
# truth and blank out roughly 20% of the cells in a second copy.
# NOTE(review): the masks are drawn from the global NumPy RNG without re-seeding
# in this cell, so they depend on every np.random call made earlier (the seed is
# set in the k-fold cell). Running cells out of order changes the masks.
X_train_true = X_train.copy()
X_train_mask = X_train.copy()
mask = np.random.rand(*X_train.shape) < 0.2
X_train_mask[mask] = np.nan

# Same procedure for the held-out 20% split; `mask` is reused and now refers to
# the test mask.
X_test_true = X_test.copy()
X_test_mask = X_test.copy()
mask = np.random.rand(*X_test.shape) < 0.2
X_test_mask[mask] = np.nan

In [16]:
# Fit the final imputer on the masked training split. This fitted object is the
# one reused later to impute the `data` frame.
imputer.fit(X_train_mask)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [01:07<00:00,  6.72s/it]


<missforest.missforest.MissForest at 0x7f8e68855610>

In [17]:
# Impute the masked hold-out rows with the imputer fitted on the training split
# and label the result with the hold-out column names.
X_test_pred = pd.DataFrame(
    imputer.transform(X_test_mask),
    columns=X_test_mask.columns,
)

100%|██████████| 10/10 [00:01<00:00,  5.57it/s]


In [18]:
# Score the hold-out imputations column by column.
for j in X_test_true.columns:
    if j in continuous:
        # The value reported is the square root of the MSE, so it is labelled
        # as RMSE rather than MSE.
        metric = 'rmse'
        val = np.sqrt(mean_squared_error(X_test_true[j], X_test_pred[j]))
    else:
        metric = 'accuracy'
        # This helper also rounds X_test_pred[j] in place before scoring.
        val = evaluate_categoricalandordinal(X_test_pred, X_test_true, j)
    print(f'{metric} for {j} : {round(val, 5)}')

mse for Age : 4.28778
accuracy for Sex0female1male : 0.94444
accuracy for Race0chinese1nonchinese : 0.97917
accuracy for DM0no1yes : 0.95833
accuracy for HT0no1yes : 0.99306
accuracy for Allstroke0no1yes : 0.97222
accuracy for IschemicHeartdisease : 0.98611
accuracy for Atrialfibirllation : 0.98611
accuracy for Antiplateletmedication : 0.97917
accuracy for Anticoagulant0no1warfarin2NOAC3Heparin : 0.98611
accuracy for AdmissionGCS : 0.94444
accuracy for Esubscore : 0.9375
accuracy for Vsubscore : 0.95139
accuracy for Msubscore : 0.98611
accuracy for ULpower : 0.91667
accuracy for LLpower : 0.86111
mse for AdmissionsystolicBP : 14.19724
mse for AdmissiondiastolicBP : 8.75237
accuracy for CKDclass : 0.90972
accuracy for location1tha2ICGP3LN4caudate5lobar6brainstem7cerebellum8whitematter : 0.88194
accuracy for location0supra1infra : 1.0
accuracy for laterality0L1R3C : 0.90972
mse for LARGESTICHVOLUME : 12.21598
accuracy for GRAEBSCORE : 0.94444
accuracy for EverIVH : 0.99306


## Impute dataset

In [33]:
# Identifier and outcome/score columns are set aside so the imputer only sees
# the feature columns it was fitted on; they are re-attached in a later cell.
passthrough_columns = ['IRB CODE', '@6MmRS', '6M mortality', 'ICH Score']
irb_code = data['IRB CODE'].copy()
mRS = data['@6MmRS'].copy()
mortality = data['6M mortality'].copy()
ichscore = data['ICH Score'].copy()

# Derive the feature frame under a new name instead of overwriting `data`, so
# this cell can be re-run without a KeyError from columns that are already gone.
data_features = data.drop(columns=passthrough_columns)

# Impute with the imputer fitted on the masked training split above.
df_imputed = imputer.transform(data_features)
df_imputed = pd.DataFrame(df_imputed, columns=data_features.columns)

# Snap every non-continuous column back onto its discrete grid: the nearest 0.5
# for the two power columns, whole numbers for everything else.
for i in data_features.columns:
    if i in continuous:
        continue
    if i in ('ULpower', 'LLpower'):
        df_imputed[i] = df_imputed[i].map(to_nearest_half)
    else:
        df_imputed[i] = round(df_imputed[i])

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 24.73it/s]


In [34]:
# Sanity check: number of missing values left in each imputed feature column.
print(df_imputed.isnull().sum())

Age                                                                    0
Sex0female1male                                                        0
Race0chinese1nonchinese                                                0
DM0no1yes                                                              0
HT0no1yes                                                              0
Allstroke0no1yes                                                       0
IschemicHeartdisease                                                   0
Atrialfibirllation                                                     0
Antiplateletmedication                                                 0
Anticoagulant0no1warfarin2NOAC3Heparin                                 0
AdmissionGCS                                                           0
Esubscore                                                              0
Vsubscore                                                              0
Msubscore                                          

In [35]:
# Re-attach the identifier and outcome/score columns around the imputed features.
# NOTE(review): this overwrites df_imputed, so re-running this cell on its own
# would append the same columns a second time.
# NOTE(review): concat(axis=1) aligns on the row index; presumably every piece
# still carries the index from read_csv -- confirm.
df_imputed = pd.concat([irb_code, df_imputed, mRS, mortality, ichscore], axis=1)

In [36]:
# Re-check missing values after concatenation; this now also covers the
# re-attached identifier and outcome/score columns.
print(df_imputed.isnull().sum())

IRB CODE                                                                0
Age                                                                     0
Sex0female1male                                                         0
Race0chinese1nonchinese                                                 0
DM0no1yes                                                               0
HT0no1yes                                                               0
Allstroke0no1yes                                                        0
IschemicHeartdisease                                                    0
Atrialfibirllation                                                      0
Antiplateletmedication                                                  0
Anticoagulant0no1warfarin2NOAC3Heparin                                  0
AdmissionGCS                                                            0
Esubscore                                                               0
Vsubscore                             

In [None]:
# df_imputed.to_csv('imputed_nosurgery_EXT_2025Nov.csv', index=False)