In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import copy
import shap
import pickle
import os

from scipy import linalg
from scipy.special import expit
from scipy import stats
from tqdm import tqdm
from matplotlib import cm
from sklearn.base import TransformerMixin, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import (roc_auc_score, f1_score, precision_score, recall_score, 
                             RocCurveDisplay, PrecisionRecallDisplay)
from pandas.api.types import CategoricalDtype

In [2]:
# Import libraries to interface with R
import rpy2
import rpy2.robjects as robjects
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from IPython.display import Image, display

In [3]:
# Load relevant R packages
base = importr("base")
rstats = importr("stats")
mice = importr("mice")
grdevices = importr("grDevices")

In [4]:
# Import MICE functions
# Only the helpers this notebook actually calls are imported by name, so that
# their origin is explicit and the namespace is not polluted by a wildcard import.
from mice_functions import ChainStatsViz, plotImputedData

## Load incomplete datasets

In [5]:
# Load the incomplete datasets, one data frame per entry of props_name
props_name = [10, 20, 30, 40, 50]
boston_path = "../../data/toy-dataset/boston-processed/"
biopsy_path = "../../data/toy-dataset/biopsy-processed/"

# Boston dataset: one file named boston_<p>.csv per entry
boston_incomp = {
    p: pd.read_csv("{}boston_{}.csv".format(boston_path, p)) for p in props_name
}

# Biopsy dataset: one file named biopsy_<p>.csv per entry
biopsy_incomp = {
    p: pd.read_csv("{}biopsy_{}.csv".format(biopsy_path, p)) for p in props_name
}

In [6]:
# Convert the pandas data frames into R objects.
# A single conversion context covers both datasets; the resulting dictionaries
# use the same keys as the pandas ones.
with localconverter(robjects.default_converter + pandas2ri.converter):
    boston_incompr = {
        p: robjects.conversion.py2rpy(boston_incomp[p]) for p in props_name
    }
    biopsy_incompr = {
        p: robjects.conversion.py2rpy(biopsy_incomp[p]) for p in props_name
    }

In [7]:
# Variables that are imputed in each dataset, split by type.
# The combined list keeps categorical targets first, then numerical ones.
boston_targets_cat = ["chas"]
boston_targets_num = ["nox"]
boston_imputed_vars = [*boston_targets_cat, *boston_targets_num]

biopsy_targets_cat = []
biopsy_targets_num = ["V1", "V2", "V3"]
biopsy_imputed_vars = [*biopsy_targets_cat, *biopsy_targets_num]

## Apply MICE algorithm

### General setup

In [8]:
# Select imputation models
# For each dataset, mice is called once with maxit=0 (no iterations) purely to
# obtain its "method" vector, which can then be edited before the real run.
pandas_r_converter = robjects.default_converter + pandas2ri.converter

# Boston dataset
temp_mice = mice.mice(boston_incompr[10], maxit=0, method="pmm", print=False)
with localconverter(pandas_r_converter):
    bostonmthd = robjects.conversion.rpy2py(temp_mice.rx2("method"))

# Add line here if any method is to be modified
#chas_idx = list(boston_incomp[10].columns).index("chas")
#bostonmthd[chas_idx] = "logreg.boot"
print(bostonmthd)

# Biopsy dataset
temp_mice = mice.mice(biopsy_incompr[10], maxit=0, method="pmm", print=False)
with localconverter(pandas_r_converter):
    biopsymthd = robjects.conversion.rpy2py(temp_mice.rx2("method"))

# Add line here if any method is to be modified
print(biopsymthd)

# Set number of imputations and maximum number of iterations
maxit = 40
m = 20

# Set number of donors for PMM
d = 10

# Set random seed
SEED = 2023

# Path to store any plots assessing convergence
conv_path = "../../results/figures/r_convergence_V2/"
# makedirs (unlike mkdir) also creates missing parent folders, and
# exist_ok=True makes re-running this cell a no-op when the folder exists
os.makedirs(conv_path, exist_ok=True)

   crim      zn   indus    chas     nox      rm     age     dis     tax ptratio 
     ""      ""      ""   "pmm"   "pmm"      ""      ""      ""      ""      "" 
  black   lstat    medv 
     ""      ""      "" 

             V1              V2              V3              V4              V5 
          "pmm"           "pmm"           "pmm"              ""              "" 
             V7              V8              V9 class_malignant 
             ""              ""              ""              "" 



### `boston` dataset

In [9]:
# Run MICE on every incomplete boston dataset and keep the returned R objects,
# keyed like the input dictionaries
boston_imputedr = {}

for p in props_name:
    print("Applying on dataset {}".format(p))

    # Same settings for every dataset: m imputations, maxit iterations,
    # d donors, the boston method vector and a fixed seed
    boston_imputedr[p] = mice.mice(
        boston_incompr[p],
        m=m,
        maxit=maxit,
        method=bostonmthd,
        seed=SEED,
        donors=d,
        print=False,
    )

# NOTE: trace plots and strip plots are not drawn through R here; they are
# produced further below in Python (ChainStatsViz / plotImputedData).
    

Applying on dataset 10
Applying on dataset 20
Applying on dataset 30
Applying on dataset 40
Applying on dataset 50


In [10]:
# Convert the R imputation results into Python objects.
# For each dataset we keep the completed data frames, the missing flag,
# the chain statistics and the number of iterations.
boston_imputed = {}
pandas_r_converter = robjects.default_converter + pandas2ri.converter

for p in tqdm(props_name):
    mids_r = boston_imputedr[p]

    # Fetch all completed datasets (include=False) and the chain statistics
    completed_r = mice.complete_mids(mids_r, action="all", include=False)
    with localconverter(pandas_r_converter):
        completed = robjects.conversion.rpy2py(completed_r)
        chain_mean = robjects.conversion.rpy2py(mids_r.rx2("chainMean"))
        chain_var = robjects.conversion.rpy2py(mids_r.rx2("chainVar"))

    # R stores variances rather than standard deviations, hence the square root;
    # warnings raised by the NA entries are silenced
    with np.errstate(invalid="ignore"):
        chain_std = np.sqrt(chain_var)

    boston_imputed[p] = {
        # reset_index is needed because R had converted all indices to strings
        "imp": [frame.reset_index(drop=True) for _, frame in completed.items()],
        # Missing flag is built from the original (incomplete) data
        "missingflag": boston_incomp[p].isna(),
        "chainmean": chain_mean,
        "chainstd": chain_std,
        "maxit": maxit,
    }

100%|█████████████████████████████████████████████| 5/5 [00:00<00:00, 11.20it/s]


In [11]:
# Prevent plots from being printed
%matplotlib agg

# Construct trace plot and strip plot
for p in tqdm(props_name):
    # Create and save trace plot
    f = ChainStatsViz(boston_imputed[p], missingvars=boston_imputed_vars)
    f.savefig(conv_path + "boston_traceplot_{}.pdf".format(p))

    # Create and save strip plot
    for c in boston_imputed_vars:
        f = plotImputedData(boston_imputed[p], c)
        f.savefig(conv_path + "boston_{}_stripplot_{}.pdf".format(c, p))

    # The figures are only needed on disk; close them so they do not pile up
    # in memory across iterations (matplotlib warns once too many stay open)
    plt.close("all")

100%|█████████████████████████████████████████████| 5/5 [00:14<00:00,  2.83s/it]


In [12]:
# Persist the multiply imputed boston data as a pickled Python object,
# stored next to the processed input files
boston_pickle_file = boston_path + "imputed_r.pickle"
with open(boston_pickle_file, "wb") as handle:
    pickle.dump(boston_imputed, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# Retrieve chain statistics from last iteration for the report
# Look up the rows of the imputed variables by column name rather than
# hard-coding their positions.
# NOTE(review): assumes the first axis of the chain arrays follows the column
# order of the data, which matches the previously hard-coded rows 3:5.
boston_imputed_rows = [
    boston_incomp[props_name[0]].columns.get_loc(v) for v in boston_imputed_vars
]

for p in props_name:
    print("p = {}".format(p / 100))
    # Slice out the last iteration, then take the median over the last axis
    # for each imputed variable
    last_mu = boston_imputed[p]["chainmean"][:, -1, :]
    tempmu = np.median(last_mu[boston_imputed_rows], axis=1)
    print("Posterior median for mu = {}".format(np.round(tempmu, 3)))
    last_sig = boston_imputed[p]["chainstd"][:, -1, :]
    tempsig = np.median(last_sig[boston_imputed_rows], axis=1)
    print("Posterior median for sigma = {}".format(np.round(tempsig, 3)))

p = 0.1
Posterior median for mu = [0.056 0.623]
Posterior median for sigma = [0.236 0.12 ]
p = 0.2
Posterior median for mu = [0.098 0.608]
Posterior median for sigma = [0.3   0.117]
p = 0.3
Posterior median for mu = [0.069 0.597]
Posterior median for sigma = [0.255 0.112]
p = 0.4
Posterior median for mu = [0.073 0.598]
Posterior median for sigma = [0.261 0.117]
p = 0.5
Posterior median for mu = [0.071 0.584]
Posterior median for sigma = [0.257 0.123]


In [22]:
# Smallest chain mean stored in row 4 of the chain statistics for the dataset
# labelled 10 (row 4 is presumably nox, going by the printed column order)
np.min(boston_imputed[10]["chainmean"][4])

0.5941428571428572

In [23]:
# Largest chain mean stored in row 4 of the chain statistics for the dataset
# labelled 10 (row 4 is presumably nox, going by the printed column order)
np.max(boston_imputed[10]["chainmean"][4])

0.6833333333333333

### `biopsy` dataset

In [14]:
# Run MICE on every incomplete biopsy dataset and keep the returned R objects,
# keyed like the input dictionaries
biopsy_imputedr = {}

for p in props_name:
    print("Applying on dataset {}".format(p))

    # Same settings for every dataset: m imputations, maxit iterations,
    # d donors, the biopsy method vector and a fixed seed
    biopsy_imputedr[p] = mice.mice(
        biopsy_incompr[p],
        m=m,
        maxit=maxit,
        method=biopsymthd,
        seed=SEED,
        donors=d,
        print=False,
    )

# NOTE: trace plots and strip plots are not drawn through R here; they are
# produced further below in Python (ChainStatsViz / plotImputedData).


Applying on dataset 10
Applying on dataset 20
Applying on dataset 30
Applying on dataset 40
Applying on dataset 50


In [15]:
# Convert the R imputation results into Python objects.
# For each dataset we keep the completed data frames, the missing flag,
# the chain statistics and the number of iterations.
biopsy_imputed = {}
pandas_r_converter = robjects.default_converter + pandas2ri.converter

for p in tqdm(props_name):
    mids_r = biopsy_imputedr[p]

    # Fetch all completed datasets (include=False) and the chain statistics
    completed_r = mice.complete_mids(mids_r, action="all", include=False)
    with localconverter(pandas_r_converter):
        completed = robjects.conversion.rpy2py(completed_r)
        chain_mean = robjects.conversion.rpy2py(mids_r.rx2("chainMean"))
        chain_var = robjects.conversion.rpy2py(mids_r.rx2("chainVar"))

    # R stores variances rather than standard deviations, hence the square root;
    # warnings raised by the NA entries are silenced
    with np.errstate(invalid="ignore"):
        chain_std = np.sqrt(chain_var)

    biopsy_imputed[p] = {
        # reset_index is needed because R had converted all indices to strings
        "imp": [frame.reset_index(drop=True) for _, frame in completed.items()],
        # Missing flag is built from the original (incomplete) data
        "missingflag": biopsy_incomp[p].isna(),
        "chainmean": chain_mean,
        "chainstd": chain_std,
        "maxit": maxit,
    }

100%|█████████████████████████████████████████████| 5/5 [00:00<00:00, 10.98it/s]


In [16]:
# Prevent plots from being printed
%matplotlib agg

# Construct trace plot and strip plot
for p in tqdm(props_name):
    # Create and save trace plot
    f = ChainStatsViz(biopsy_imputed[p], missingvars=biopsy_imputed_vars)
    f.savefig(conv_path + "biopsy_traceplot_{}.pdf".format(p))

    # Create and save strip plot
    for c in biopsy_imputed_vars:
        f = plotImputedData(biopsy_imputed[p], c)
        f.savefig(conv_path + "biopsy_{}_stripplot_{}.pdf".format(c, p))

    # The figures are only needed on disk; close them so they do not pile up
    # in memory across iterations (matplotlib warns once too many stay open)
    plt.close("all")

100%|█████████████████████████████████████████████| 5/5 [00:26<00:00,  5.39s/it]


In [17]:
# Persist the multiply imputed biopsy data as a pickled Python object,
# stored next to the processed input files
biopsy_pickle_file = biopsy_path + "imputed_r.pickle"
with open(biopsy_pickle_file, "wb") as handle:
    pickle.dump(biopsy_imputed, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
# Retrieve chain statistics from last iteration for the report
# Look up the rows of the imputed variables by column name rather than
# hard-coding their positions.
# NOTE(review): assumes the first axis of the chain arrays follows the column
# order of the data, which matches the previously hard-coded rows :3.
biopsy_imputed_rows = [
    biopsy_incomp[props_name[0]].columns.get_loc(v) for v in biopsy_imputed_vars
]

for p in props_name:
    print("p = {}".format(p / 100))
    # Slice out the last iteration, then take the median over the last axis
    # for each imputed variable
    last_mu = biopsy_imputed[p]["chainmean"][:, -1, :]
    tempmu = np.median(last_mu[biopsy_imputed_rows], axis=1)
    print("Posterior median for mu = {}".format(np.round(tempmu, 2)))
    last_sig = biopsy_imputed[p]["chainstd"][:, -1, :]
    tempsig = np.median(last_sig[biopsy_imputed_rows], axis=1)
    print("Posterior median for sigma = {}".format(np.round(tempsig, 2)))

p = 0.1
Posterior median for mu = [6.29 6.88 6.5 ]
Posterior median for sigma = [2.81 3.4  3.43]
p = 0.2
Posterior median for mu = [6.52 6.35 6.14]
Posterior median for sigma = [2.85 3.39 3.33]
p = 0.3
Posterior median for mu = [6.1  5.25 5.06]
Posterior median for sigma = [2.79 3.4  3.48]
p = 0.4
Posterior median for mu = [5.58 4.67 4.58]
Posterior median for sigma = [2.9 3.3 3.3]
p = 0.5
Posterior median for mu = [5.36 4.22 4.15]
Posterior median for sigma = [2.96 3.38 3.23]
