In [16]:
# list of all available imputers
from hyperimpute.plugins.imputers import Imputers
imputers = Imputers()
imputers.list()

['median',
 'sklearn_ice',
 'mice',
 'nop',
 'missforest',
 'EM',
 'ice',
 'most_frequent',
 'mean',
 'miracle',
 'miwae',
 'hyperimpute',
 'gain',
 'sklearn_missforest',
 'softimpute',
 'sinkhorn']

You can also add custom imputers if you wish

In [17]:
# load and pre-process dataset
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import pandas as pd
import numpy as np

preproc = MinMaxScaler()

# TODO: convert to generic dataset
X, y = load_breast_cancer(return_X_y=True)
X = pd.DataFrame(preproc.fit_transform(X, y))
y = pd.Series(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [26]:
# using mean imputer
from hyperimpute.plugins.utils.simulate import simulate_nan
# helps to simulate nan based on the mechanisms available!

def ampute(x, mechanism, p_miss):
    x_simulated = simulate_nan(np.asarray(x), p_miss, mechanism)

    mask = x_simulated["mask"]
    x_miss = x_simulated["X_incomp"]

    return pd.DataFrame(x), pd.DataFrame(x_miss), pd.DataFrame(mask)

datasets = {}
headers = ["Plugin"]

pct = 0.3

# Missing Completely At Random (MCAR) if the probability of being missing is the same for all observations
# Missing At Random (MAR) if the probability of being missing only depends on observed values.
# Missing Not At Random (MNAR) if the unavailability of the data depends on both observed and unobserved data such as its value itself.
mechanisms = ["MAR", "MNAR", "MCAR"]
percentages = [pct]

plugins = ["mean"]

for ampute_mechanism in mechanisms:
    for p_miss in percentages:
        if ampute_mechanism not in datasets:
            datasets[ampute_mechanism] = {}

        headers.append(ampute_mechanism + "-" + str(p_miss))
        datasets[ampute_mechanism][p_miss] = ampute(X_train, ampute_mechanism, p_miss)

In [29]:
import time
from tqdm import tqdm
from hyperimpute.plugins.utils.metrics import RMSE

results = []
duration = []

print(datasets)

for plugin in tqdm(plugins):
    plugin_results = [plugin]
    plugin_duration = [plugin]

    for ampute_mechanism in mechanisms:
        for p_miss in percentages:
            ctx = imputers.get(plugin)
            #print(datasets[ampute_mechanism][p_miss])
            x, x_miss, mask = datasets[ampute_mechanism][p_miss]
            start = time.time() * 1000
            x_imp = ctx.fit_transform(x_miss)

            plugin_duration.append(round(time.time() * 1000 - start, 4))
            plugin_results.append(RMSE(x_imp.values, x.values, mask.values))

    results.append(plugin_results)
    duration.append(plugin_duration)

{'MAR': {0.3: (           0         1         2         3         4         5         6   \
338  0.145251  0.264457  0.142492  0.070965  0.433962  0.165266  0.058833   
427  0.180747  0.414948  0.172759  0.091792  0.319401  0.116711  0.084677   
406  0.433480  0.174163  0.418147  0.278473  0.382053  0.201307  0.128866   
96   0.246060  0.274941  0.234953  0.130477  0.468268  0.157015  0.058341   
490  0.249373  0.430504  0.237648  0.137010  0.264422  0.100055  0.040159   
..        ...       ...       ...       ...       ...       ...       ...   
277  0.559847  0.347311  0.532859  0.406575  0.330414  0.121036  0.187910   
9    0.259312  0.484613  0.277659  0.140997  0.595558  0.675480  0.532568   
359  0.116191  0.291173  0.110773  0.057306  0.435768  0.123244  0.063496   
192  0.129632  0.287792  0.117062  0.061336  0.152298  0.012453  0.000000   
559  0.214350  0.480893  0.212356  0.110286  0.360928  0.253727  0.260544   

           7         8         9   ...        20        21  

100%|██████████| 1/1 [00:00<00:00, 44.91it/s]


In [20]:
# display 
from IPython.display import HTML, display
import tabulate

# results - RMSE; lower is better
display(HTML(tabulate.tabulate(results, headers=headers, tablefmt="html")))

# duration taken
display(HTML(tabulate.tabulate(duration, headers=headers, tablefmt="html")))

Plugin,MAR-0.3,MNAR-0.3,MCAR-0.3
mean,0.195269,0.170658,0.146133
mice,0.0776872,0.080468,0.0718948
EM,0.0648185,0.0674888,0.0506246


Plugin,MAR-0.3,MNAR-0.3,MCAR-0.3
mean,5.1609,3.489,3.551
mice,81235.1,47258.2,40774.2
EM,101821.0,350179.0,222651.0


In [21]:
# get more metrics to compare using xgboost

from sklearn import metrics
import xgboost as xgb

def get_metrics(X_train, y_train, X_test, y_test):
    xgb_clf = xgb.XGBClassifier(verbosity=0)
    xgb_clf = xgb_clf.fit(X_train, y_train)

    y_pred = xgb_clf.predict(X_test)

    score = xgb_clf.score(X_test, y_test)

    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    auroc = metrics.auc(fpr, tpr)

    prec, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred)
    aurpc = metrics.auc(recall, prec)

    return score, auroc, aurpc


metrics_headers = ["Plugin", "Accuracy", "AUROC", "AURPC"]
xgboost_test_score = []


x, x_miss, mask = datasets["MAR"][pct]

xgboost_test_score.append(
    ["original dataset", *get_metrics(X_train, y_train, X_test, y_test)]
)
# : accuracy, area under the ROC curve (AUROC), and area under the precision-recall curve (AURPC).
for plugin in plugins:
    X_train_imp = imputers.get(plugin).fit_transform(x_miss.copy())
    # run on impute data
    score, auroc, aurpc = get_metrics(X_train_imp, y_train, X_test, y_test)
    xgboost_test_score.append([plugin, score, auroc, aurpc])

  avg = a.mean(axis, **keepdims_kw)
  ret = um.true_divide(
  Sigma = np.cov(
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [22]:

# the higher the better
display(
    HTML(
        tabulate.tabulate(xgboost_test_score, headers=metrics_headers, tablefmt="html")
    )
)

Plugin,Accuracy,AUROC,AURPC
original dataset,0.95614,0.956335,0.975618
mean,0.964912,0.963798,0.978921
mice,0.964912,0.966974,0.983078
EM,0.973684,0.974436,0.986271
