In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import copy
import shap
import pickle
import os

from scipy import linalg
from scipy.special import expit
from scipy import stats
from tqdm import tqdm
from matplotlib import cm
from sklearn.base import TransformerMixin, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import (roc_auc_score, f1_score, precision_score, recall_score, 
                             RocCurveDisplay, PrecisionRecallDisplay)
from pandas.api.types import CategoricalDtype

In [2]:
# Import MICE functions
# NOTE(review): the wildcard import is presumably what brings MICE,
# ChainStatsViz and plotImputedData into scope for the cells below -- none of
# them is imported explicitly anywhere else in this notebook.
from mice_functions import *

## Load incomplete dataset

In [3]:
# Missingness proportions (in percent) for which an incomplete CSV is read,
# and the directories holding those files.
props_name = [10, 20, 30, 40, 50]
boston_path = "../../data/toy-dataset/boston-processed/"
biopsy_path = "../../data/toy-dataset/biopsy-processed/"

# boston: one DataFrame per proportion, keyed by the proportion itself
boston_incomp = {
    p: pd.read_csv(os.path.join(boston_path, "boston_{}.csv".format(p)))
    for p in props_name
}

# biopsy: same layout as above
biopsy_incomp = {
    p: pd.read_csv(os.path.join(biopsy_path, "biopsy_{}.csv".format(p)))
    for p in props_name
}

In [4]:
# Columns to impute, split into categorical and numerical targets.
# The combined list keeps categorical targets first, then numerical ones.
boston_targets_cat = ["chas"]
boston_targets_num = ["nox"]
boston_imputed_vars = [*boston_targets_cat, *boston_targets_num]

# biopsy has no categorical imputation target
biopsy_targets_cat = []
biopsy_targets_num = ["V1", "V2", "V3"]
biopsy_imputed_vars = [*biopsy_targets_cat, *biopsy_targets_num]

## Apply MICE algorithm
### General setup

In [5]:
# Imputation method passed to MICE for categorical and numerical targets
method_cat = "pmm"
method_num = "pmm"

# Maximum number of iterations (maxit) and number of imputations (m)
maxit = 40
m = 20

# Number of donors for PMM
d = 10

# Random seed handed to MICE
SEED = 2023

# Directory that receives the convergence plots.
# makedirs(..., exist_ok=True) also creates missing parent directories
# (e.g. "results/figures" on a fresh checkout) and does not fail when the
# directory already exists, unlike the exists()/mkdir() pair it replaces.
conv_path = "../../results/figures/python_convergence_V2/"
os.makedirs(conv_path, exist_ok=True)

### `boston` dataset

In [6]:
# Prevent plots from being printed
%matplotlib agg

# Prepare dictionary to store imputed data
boston_imputed = {}

for p in props_name:
    print("Applying on dataset {}".format(p))
    
    # Apply MICE algorithm
    boston_imputed[p] = MICE(boston_incomp[p], boston_targets_cat, boston_targets_num, 
                             m=m, maxit=maxit, d=d, seed=SEED, 
                             method_cat=method_cat, method_num=method_num)
    
    # Construct trace plot
    f = ChainStatsViz(boston_imputed[p])
    f.savefig(conv_path + "boston_traceplot_{}.pdf".format(p))
    
    # Construct strip plots for imputed variables
    for c in ["chas", "nox"]:
        f = plotImputedData(boston_imputed[p], c)
        f.savefig(conv_path + "boston_{}_stripplot_{}.pdf".format(c, p))
    
plt.clf()

Applying on dataset 10


100%|███████████████████████████████████████████| 40/40 [00:05<00:00,  7.21it/s]


Applying on dataset 20


100%|███████████████████████████████████████████| 40/40 [00:06<00:00,  5.78it/s]


Applying on dataset 30


100%|███████████████████████████████████████████| 40/40 [00:07<00:00,  5.59it/s]


Applying on dataset 40


100%|███████████████████████████████████████████| 40/40 [00:07<00:00,  5.51it/s]


Applying on dataset 50


100%|███████████████████████████████████████████| 40/40 [00:07<00:00,  5.21it/s]


In [7]:
# Persist the boston imputation results (the whole dict, all proportions)
# next to the processed input files.
boston_pickle_file = os.path.join(boston_path, "imputed.pickle")
with open(boston_pickle_file, "wb") as fout:
    pickle.dump(boston_imputed, fout, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# Report chain statistics from the last iteration (index -1 on the second
# axis) for the report. The median is taken over the trailing axis.
# NOTE(review): rows 3:5 are presumably the two imputed variables
# (chas, nox) -- confirm against the column order used by MICE.
for p in props_name:
    print("p = {}".format(p / 100))
    result = boston_imputed[p]

    final_means = result["chainmean"][3:5, -1, :]
    tempmu = np.median(final_means, axis=1)
    print("Posterior median for mu = {}".format(np.round(tempmu, 3)))

    final_stds = result["chainstd"][3:5, -1, :]
    tempsig = np.median(final_stds, axis=1)
    print("Posterior median for sigma = {}".format(np.round(tempsig, 3)))

p = 0.1
Posterior median for mu = [0.028 0.628]
Posterior median for sigma = [0.118 0.074]
p = 0.2
Posterior median for mu = [0.39  0.626]
Posterior median for sigma = [0.493 0.053]
p = 0.3
Posterior median for mu = [0.285 0.605]
Posterior median for sigma = [0.43  0.061]
p = 0.4
Posterior median for mu = [0.427 0.625]
Posterior median for sigma = [0.495 0.079]
p = 0.5
Posterior median for mu = [0.083 0.541]
Posterior median for sigma = [0.277 0.051]


In [16]:
# Ad-hoc check: smallest chain mean recorded in row 4 of the p=10 run
# (presumably the nox variable -- TODO confirm).
np.min(boston_imputed[10]["chainmean"][4])

0.3981428571428572

In [17]:
# Ad-hoc check: largest chain mean recorded in row 4 of the p=10 run
# (presumably the nox variable -- TODO confirm).
np.max(boston_imputed[10]["chainmean"][4])

0.7026666666666666

### `biopsy` dataset

In [9]:
# Imputation results for the biopsy data, keyed by missingness proportion
biopsy_imputed = {}

for p in props_name:
    print("Applying on dataset {}".format(p))

    # Apply MICE algorithm
    biopsy_imputed[p] = MICE(biopsy_incomp[p], biopsy_targets_cat, biopsy_targets_num,
                             m=m, maxit=maxit, d=d, seed=SEED,
                             method_cat=method_cat, method_num=method_num)

    # Construct trace plot
    f = ChainStatsViz(biopsy_imputed[p])
    f.savefig(conv_path + "biopsy_traceplot_{}.pdf".format(p))

    # Construct strip plots for every imputed variable; iterating over
    # biopsy_imputed_vars keeps this in sync with the targets given to MICE
    for c in biopsy_imputed_vars:
        f = plotImputedData(biopsy_imputed[p], c)
        f.savefig(conv_path + "biopsy_{}_stripplot_{}.pdf".format(c, p))

    # Release every figure created for this dataset. plt.clf() only clears
    # the current figure and keeps all of them open, so figures would keep
    # accumulating in memory across the five datasets.
    plt.close("all")

Applying on dataset 10


100%|███████████████████████████████████████████| 40/40 [00:08<00:00,  4.53it/s]


Applying on dataset 20


100%|███████████████████████████████████████████| 40/40 [00:10<00:00,  3.99it/s]


Applying on dataset 30


100%|███████████████████████████████████████████| 40/40 [00:13<00:00,  2.93it/s]


Applying on dataset 40


100%|███████████████████████████████████████████| 40/40 [00:15<00:00,  2.53it/s]


Applying on dataset 50


100%|███████████████████████████████████████████| 40/40 [00:15<00:00,  2.53it/s]


In [10]:
# Persist the biopsy imputation results (the whole dict, all proportions)
# next to the processed input files.
biopsy_pickle_file = os.path.join(biopsy_path, "imputed.pickle")
with open(biopsy_pickle_file, "wb") as fout:
    pickle.dump(biopsy_imputed, fout, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# Report chain statistics from the last iteration (index -1 on the second
# axis) for the report. The median is taken over the trailing axis.
# NOTE(review): rows :3 are presumably the imputed variables V1, V2, V3 --
# confirm against the column order used by MICE.
for p in props_name:
    print("p = {}".format(p / 100))
    result = biopsy_imputed[p]

    final_means = result["chainmean"][:3, -1, :]
    tempmu = np.median(final_means, axis=1)
    print("Posterior median for mu = {}".format(np.round(tempmu, 2)))

    final_stds = result["chainstd"][:3, -1, :]
    tempsig = np.median(final_stds, axis=1)
    print("Posterior median for sigma = {}".format(np.round(tempsig, 2)))

p = 0.1
Posterior median for mu = [5.76 6.73 6.8 ]
Posterior median for sigma = [2.59 3.34 3.16]
p = 0.2
Posterior median for mu = [5.28 6.52 5.81]
Posterior median for sigma = [2.63 3.41 3.14]
p = 0.3
Posterior median for mu = [5.8  5.09 5.23]
Posterior median for sigma = [2.66 3.32 3.37]
p = 0.4
Posterior median for mu = [5.37 4.98 4.64]
Posterior median for sigma = [2.79 3.38 3.36]
p = 0.5
Posterior median for mu = [5.24 4.37 4.72]
Posterior median for sigma = [2.79 3.47 3.46]


## Apply complete case analysis

### `boston` dataset

In [12]:
# Complete-case versions of the boston data, keyed by missingness proportion
boston_cc = {}

for p in tqdm(props_name):
    # Keep only the rows without any missing value
    complete_rows = boston_incomp[p].dropna()
    boston_cc[p] = complete_rows
    print("Proportion = {}. Dimension of complete case data = {}".format(
        p/100, complete_rows.shape))

100%|███████████████████████████████████████████| 5/5 [00:00<00:00, 1378.07it/s]

Proportion = 0.1. Dimension of complete case data = (467, 13)
Proportion = 0.2. Dimension of complete case data = (411, 13)
Proportion = 0.3. Dimension of complete case data = (365, 13)
Proportion = 0.4. Dimension of complete case data = (321, 13)
Proportion = 0.5. Dimension of complete case data = (262, 13)





In [13]:
# Output directory for the boston complete-case files.
# makedirs(..., exist_ok=True) is a no-op when the directory already exists
# and avoids the check-then-create race of the exists()/mkdir() pair.
boston_cc_path = "../../data/toy-dataset/boston-complete-case/"
os.makedirs(boston_cc_path, exist_ok=True)

# Store complete case data, one CSV per missingness proportion
for p in props_name:
    boston_cc[p].to_csv(boston_cc_path + "boston_{}.csv".format(p),
                        index=False)

### `biopsy` dataset

In [14]:
# Complete-case versions of the biopsy data, keyed by missingness proportion
biopsy_cc = {}

for p in tqdm(props_name):
    # Keep only the rows without any missing value
    complete_rows = biopsy_incomp[p].dropna()
    biopsy_cc[p] = complete_rows
    print("Proportion = {}. Dimension of complete case data = {}".format(
        p/100, complete_rows.shape))

100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 911.77it/s]

Proportion = 0.1. Dimension of complete case data = (642, 9)
Proportion = 0.2. Dimension of complete case data = (583, 9)
Proportion = 0.3. Dimension of complete case data = (510, 9)
Proportion = 0.4. Dimension of complete case data = (432, 9)
Proportion = 0.5. Dimension of complete case data = (338, 9)





In [15]:
# Output directory for the biopsy complete-case files.
# makedirs(..., exist_ok=True) is a no-op when the directory already exists
# and avoids the check-then-create race of the exists()/mkdir() pair.
biopsy_cc_path = "../../data/toy-dataset/biopsy-complete-case/"
os.makedirs(biopsy_cc_path, exist_ok=True)

# Store complete case data, one CSV per missingness proportion
for p in props_name:
    biopsy_cc[p].to_csv(biopsy_cc_path + "biopsy_{}.csv".format(p),
                        index=False)