In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import copy
import shap
import pickle
import os

from scipy import linalg
from scipy.special import expit
from scipy import stats
from tqdm import tqdm
from matplotlib import cm
from sklearn.base import TransformerMixin, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import (roc_auc_score, f1_score, precision_score, recall_score, 
                             RocCurveDisplay, PrecisionRecallDisplay)
from pandas.api.types import CategoricalDtype

In [2]:
# Import libraries to interface with R
import rpy2
import rpy2.robjects as robjects
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from IPython.display import Image, display

In [3]:
# Bind the R packages used by this notebook to Python handles through rpy2.
# The packages are imported in the order listed: base, stats, mice, grDevices.
base, rstats, mice, grdevices = (
    importr(pkg_name) for pkg_name in ("base", "stats", "mice", "grDevices")
)

## Load incomplete dataset

In [4]:
# Read every incomplete dataset, keyed by the numeric suffix of its file name
# (presumably the percentage of missing values -- confirm against the data prep step)
props_name = [10, 20, 30, 40, 50]
boston_path = "../../data/toy-dataset/boston-processed/"
biopsy_path = "../../data/toy-dataset/biopsy-processed/"

# Boston dataset: one CSV per entry of props_name
boston_incomp = {
    prop: pd.read_csv(boston_path + "boston_{}.csv".format(prop))
    for prop in props_name
}

# Biopsy dataset: same file naming scheme
biopsy_incomp = {
    prop: pd.read_csv(biopsy_path + "biopsy_{}.csv".format(prop))
    for prop in props_name
}

In [5]:
# Turn each pandas data frame into an R object so it can be handed to mice.
# A single converter context covers both datasets.
with localconverter(robjects.default_converter + pandas2ri.converter):
    boston_incompr = {
        prop: robjects.conversion.py2rpy(boston_incomp[prop])
        for prop in props_name
    }
    biopsy_incompr = {
        prop: robjects.conversion.py2rpy(biopsy_incomp[prop])
        for prop in props_name
    }

In [6]:
# Target variables for imputation, grouped per dataset as (categorical, numerical)
boston_targets_cat, boston_targets_num = ["chas"], ["nox"]
biopsy_targets_cat, biopsy_targets_num = [], ["V1", "V2", "V3"]

## Apply MICE algorithm

### General setup

In [47]:
# Choose the imputation method for every column of each dataset.
# A zero-iteration mice run (maxit=0) is used only to obtain its "method" vector.

# Boston dataset
boston_dry_run = mice.mice(boston_incompr[10], maxit=0, method="pmm", print=False)
with localconverter(robjects.default_converter + pandas2ri.converter):
    bostonmthd = robjects.conversion.rpy2py(boston_dry_run.rx2("method"))

# Method overrides go here: the entry at the position of column "chas" becomes "logreg.boot"
chas_position = boston_incomp[10].columns.tolist().index("chas")
bostonmthd[chas_position] = "logreg.boot"
print(bostonmthd)

# Biopsy dataset
biopsy_dry_run = mice.mice(biopsy_incompr[10], maxit=0, method="pmm", print=False)
with localconverter(robjects.default_converter + pandas2ri.converter):
    biopsymthd = robjects.conversion.rpy2py(biopsy_dry_run.rx2("method"))

# Method overrides go here (none at present)
print(biopsymthd)

# Maximum number of iterations (maxit) and number of imputations (m)
maxit = 40
m = 20

# Number of donors for PMM
d = 10

# Random seed
SEED = 2023

# Directory that receives the convergence plots.
# makedirs also creates any missing parent directory, and exist_ok=True makes
# the call a no-op when the directory is already there.
conv_path = "../../results/figures/r_convergence/"
os.makedirs(conv_path, exist_ok=True)

         crim            zn         indus          chas           nox 
           ""            ""            "" "logreg.boot"         "pmm" 
           rm           age           dis           tax       ptratio 
           ""            ""            ""            ""            "" 
        black         lstat          medv 
           ""            ""            "" 

             V1              V2              V3              V4              V5 
          "pmm"           "pmm"           "pmm"              ""              "" 
             V7              V8              V9 class_malignant 
             ""              ""              ""              "" 



### `boston` dataset

In [8]:
# Run MICE on every incomplete boston dataset and write convergence plots as PDFs.
# The imputation objects returned by R are stored per dataset key.
boston_imputedr = {}

for p in props_name:
    print("Applying on dataset {}".format(p))

    # Apply MICE algorithm using the per-column method vector built for boston
    # in the setup cell (bostonmthd)
    boston_imputedr[p] = mice.mice(boston_incompr[p], m=m, maxit=maxit, method=bostonmthd,
                                   seed=SEED, donors=d, print=False)

    # Construct trace plot
    grdevices.pdf(file=conv_path + "boston_traceplot_{}.pdf".format(p))
    try:
        base.print(mice.plot_mids(boston_imputedr[p]))
    finally:
        # Close the PDF device even if plotting fails, so later plots
        # are not written to a stale device
        grdevices.dev_off()

    # Construct strip plots for the imputed variables (categorical, then numerical targets)
    for c in boston_targets_cat + boston_targets_num:
        # Set up image file
        grdevices.pdf(file=conv_path + "boston_{}_stripplot_{}.pdf".format(c, p))
        try:
            # Create and print plot
            base.print(mice.stripplot_mids(boston_imputedr[p],
                                           rstats.as_formula("{} ~ .imp".format(c)),
                                           pch=20, cex=2))
        finally:
            # Close file
            grdevices.dev_off()
    

Applying on dataset 10
Applying on dataset 20
Applying on dataset 30
Applying on dataset 40
Applying on dataset 50


In [28]:
# Bring the boston imputation results back into Python.
# For each dataset we keep the imputed data, the missing flag and the chain statistics.
boston_imputed = {}

for p in tqdm(props_name):
    imputation_r = boston_imputedr[p]

    # All completed datasets (include=False is passed to mice's complete)
    completed_r = mice.complete_mids(imputation_r, action="all", include=False)

    with localconverter(robjects.default_converter + pandas2ri.converter):
        completed = robjects.conversion.rpy2py(completed_r)
        # Chain statistics stored on the R object.
        # NOTE: "chainVar" holds variances, not standard deviations
        chain_mean = robjects.conversion.rpy2py(imputation_r.rx2("chainMean"))
        chain_var = robjects.conversion.rpy2py(imputation_r.rx2("chainVar"))

    boston_imputed[p] = {
        "imp": [frame for _, frame in completed.items()],
        # Missing flag is derived from the original incomplete data
        "missingflag": boston_incomp[p].isna(),
        "chainmean": chain_mean,
        "chainvar": chain_var,
    }

100%|█████████████████████████████████████████████| 5/5 [00:00<00:00, 13.07it/s]


In [None]:
# Write the converted boston imputations to disk as a pickle file
boston_pickle_file = boston_path + "imputed_r.pickle"
with open(boston_pickle_file, "wb") as out_file:
    pickle.dump(boston_imputed, out_file, protocol=pickle.HIGHEST_PROTOCOL)

### `biopsy` dataset

In [33]:
# Run MICE on every incomplete biopsy dataset and write convergence plots as PDFs.
# The imputation objects returned by R are stored per dataset key.
biopsy_imputedr = {}

for p in props_name:
    print("Applying on dataset {}".format(p))

    # Apply MICE algorithm using the per-column method vector built for biopsy
    # in the setup cell (biopsymthd)
    biopsy_imputedr[p] = mice.mice(biopsy_incompr[p], m=m, maxit=maxit, method=biopsymthd,
                                   seed=SEED, donors=d, print=False)

    # Construct trace plot
    grdevices.pdf(file=conv_path + "biopsy_traceplot_{}.pdf".format(p))
    try:
        base.print(mice.plot_mids(biopsy_imputedr[p]))
    finally:
        # Close the PDF device even if plotting fails, so later plots
        # are not written to a stale device
        grdevices.dev_off()

    # Construct strip plots for the imputed variables (categorical, then numerical targets)
    for c in biopsy_targets_cat + biopsy_targets_num:
        # Set up image file
        grdevices.pdf(file=conv_path + "biopsy_{}_stripplot_{}.pdf".format(c, p))
        try:
            # Create and print plot
            base.print(mice.stripplot_mids(biopsy_imputedr[p],
                                           rstats.as_formula("{} ~ .imp".format(c)),
                                           pch=20, cex=2))
        finally:
            # Close file
            grdevices.dev_off()


Applying on dataset 10
Applying on dataset 20
Applying on dataset 30
Applying on dataset 40
Applying on dataset 50


In [34]:
# Bring the biopsy imputation results back into Python.
# For each dataset we keep the imputed data, the missing flag and the chain statistics.
biopsy_imputed = {}

for p in tqdm(props_name):
    imputation_r = biopsy_imputedr[p]

    # All completed datasets (include=False is passed to mice's complete)
    completed_r = mice.complete_mids(imputation_r, action="all", include=False)

    with localconverter(robjects.default_converter + pandas2ri.converter):
        completed = robjects.conversion.rpy2py(completed_r)
        # Chain statistics stored on the R object.
        # NOTE: "chainVar" holds variances, not standard deviations
        chain_mean = robjects.conversion.rpy2py(imputation_r.rx2("chainMean"))
        chain_var = robjects.conversion.rpy2py(imputation_r.rx2("chainVar"))

    biopsy_imputed[p] = {
        "imp": [frame for _, frame in completed.items()],
        # Missing flag is derived from the original incomplete data
        "missingflag": biopsy_incomp[p].isna(),
        "chainmean": chain_mean,
        "chainvar": chain_var,
    }

100%|█████████████████████████████████████████████| 5/5 [00:00<00:00, 12.97it/s]


In [None]:
# Write the converted biopsy imputations to disk as a pickle file
biopsy_pickle_file = biopsy_path + "imputed_r.pickle"
with open(biopsy_pickle_file, "wb") as out_file:
    pickle.dump(biopsy_imputed, out_file, protocol=pickle.HIGHEST_PROTOCOL)