In [None]:
# Import libraries
import copy
import os
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import shap

from scipy import linalg
from scipy.special import expit
from scipy import stats
from tqdm import tqdm
from matplotlib import cm
from sklearn.base import TransformerMixin, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import (roc_auc_score, f1_score, precision_score, recall_score, 
                             RocCurveDisplay, PrecisionRecallDisplay)
from pandas.api.types import CategoricalDtype

In [None]:
# Import MICE functions
from mice_functions import *

## Load incomplete dataset

In [None]:
# Levels used in the file names (integer percentages; later cells print
# p/100 as the "proportion") and the folders holding the processed CSVs
props_name = [10, 20, 30, 40, 50]
boston_path = "../../data/toy-dataset/boston-processed/"
biopsy_path = "../../data/toy-dataset/biopsy-processed/"

# boston: one incomplete DataFrame per level, keyed by that level
boston_incomp = {
    p: pd.read_csv(boston_path + "boston_{}.csv".format(p))
    for p in props_name
}

# biopsy: same layout as boston
biopsy_incomp = {
    p: pd.read_csv(biopsy_path + "biopsy_{}.csv".format(p))
    for p in props_name
}

In [None]:
# Columns to impute in each dataset, split into the categorical list and the
# numerical list that are handed to MICE
boston_targets_cat, boston_targets_num = ["chas"], ["nox"]

# No categorical targets are listed for biopsy
biopsy_targets_cat, biopsy_targets_num = [], ["V1", "V2", "V3"]

## Apply MICE algorithm
### General setup

In [None]:
# Random seed handed to every MICE call
SEED = 2023

# Number of imputations (m) and maximum number of iterations (maxit)
m, maxit = 20, 10

# Imputation models, passed to MICE as method_cat / method_num
method_cat = "bayes"
method_num = "pmm"

# Number of donors for PMM
d = 5

### `boston` dataset

In [None]:
# Apply MICE on boston dataset, once per level.
# Iterate over `props_name` -- the keys `boston_incomp` was built with -- so
# the cell runs on a fresh kernel without relying on a `props` variable that
# no visible cell defines.
boston_imputed = {}

for p in props_name:
    # Result of MICE for level p (presumably the m imputed datasets -- see
    # mice_functions for the exact return type)
    boston_imputed[p] = MICE(boston_incomp[p], boston_targets_cat, boston_targets_num,
                             m=m, maxit=maxit, d=d, seed=SEED,
                             method_cat=method_cat, method_num=method_num)

In [None]:
# Persist the boston imputations as a single pickle inside the processed-data folder
boston_pickle_file = boston_path + "imputed.pickle"
with open(boston_pickle_file, mode="wb") as out_file:
    pickle.dump(boston_imputed, out_file, protocol=pickle.HIGHEST_PROTOCOL)

### `biopsy` dataset

In [None]:
# Apply MICE on biopsy dataset, once per level.
# Iterate over `props_name` -- the keys `biopsy_incomp` was built with -- so
# the cell runs on a fresh kernel without relying on a `props` variable that
# no visible cell defines.
biopsy_imputed = {}

for p in props_name:
    # Result of MICE for level p (presumably the m imputed datasets -- see
    # mice_functions for the exact return type)
    biopsy_imputed[p] = MICE(biopsy_incomp[p], biopsy_targets_cat, biopsy_targets_num,
                             m=m, maxit=maxit, d=d, seed=SEED,
                             method_cat=method_cat, method_num=method_num)

In [None]:
# Persist the biopsy imputations as a single pickle inside the processed-data folder
biopsy_pickle_file = biopsy_path + "imputed.pickle"
with open(biopsy_pickle_file, mode="wb") as out_file:
    pickle.dump(biopsy_imputed, out_file, protocol=pickle.HIGHEST_PROTOCOL)

## Apply complete case analysis

### `boston` dataset

In [None]:
# Apply complete case analysis on each boston dataset: dropna() with its
# default arguments removes every row that contains at least one missing value.
# Iterate over `props_name` (the keys of `boston_incomp`) so the cell does not
# depend on a `props` variable that no visible cell defines.
boston_cc = {}

for p in props_name:
    boston_cc[p] = boston_incomp[p].dropna()
    # p is an integer percentage, so p/100 is the proportion being reported
    print("Proportion = {}. Dimension of complete case data = {}".format(
        p/100, boston_cc[p].shape))

In [None]:
# Create the output directory if not present yet. makedirs also creates any
# missing parent folder, and exist_ok=True makes re-running the cell harmless.
boston_cc_path = "../../data/toy-dataset/boston-complete-case/"
os.makedirs(boston_cc_path, exist_ok=True)

# Store complete case data, one CSV per level.
# The keys of `boston_cc` are already integer percentages (10, 20, ...), so
# they go into the file name unchanged; scaling by 100 would produce
# boston_1000.csv etc. instead of matching the boston_10.csv input naming.
for p in props_name:
    boston_cc[p].to_csv(boston_cc_path + "boston_{}.csv".format(p),
                        index=False)

### `biopsy` dataset

In [None]:
# Apply complete case analysis on each biopsy dataset: dropna() with its
# default arguments removes every row that contains at least one missing value.
# Iterate over `props_name` (the keys of `biopsy_incomp`) so the cell does not
# depend on a `props` variable that no visible cell defines.
biopsy_cc = {}

for p in props_name:
    biopsy_cc[p] = biopsy_incomp[p].dropna()
    # p is an integer percentage, so p/100 is the proportion being reported
    print("Proportion = {}. Dimension of complete case data = {}".format(
        p/100, biopsy_cc[p].shape))

In [None]:
# Create the output directory if not present yet. makedirs also creates any
# missing parent folder, and exist_ok=True makes re-running the cell harmless.
biopsy_cc_path = "../../data/toy-dataset/biopsy-complete-case/"
os.makedirs(biopsy_cc_path, exist_ok=True)

# Store complete case data, one CSV per level.
# The keys of `biopsy_cc` are already integer percentages (10, 20, ...), so
# they go into the file name unchanged; scaling by 100 would produce
# biopsy_1000.csv etc. instead of matching the biopsy_10.csv input naming.
for p in props_name:
    biopsy_cc[p].to_csv(biopsy_cc_path + "biopsy_{}.csv".format(p),
                        index=False)