In [2]:
import scipy.io
import numpy as np
import os

In [3]:
import sys
import warnings
import time
from tqdm import tqdm
from math import sqrt

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from hyperimpute.plugins.utils.metrics import RMSE
from hyperimpute.plugins.utils.simulate import simulate_nan

import xgboost as xgb


from sklearn.metrics import mean_squared_error
import numpy as np


In [4]:
from hyperimpute.plugins.imputers import Imputers, ImputerPlugin

# Plugin registry; used further down to list the available imputers and to
# instantiate them by name.
imputers = Imputers()

In [6]:
# Root directory that holds the datasets. Set the FWAL_DATA_DIR environment
# variable to point somewhere else on another machine; when it is unset the
# original hard-coded location is used, so existing runs behave as before.
data_dir = os.environ.get("FWAL_DATA_DIR", r"/home/er647/data/fwal-data")

In [7]:
def load_ASU_dataset(data_dir, dataset):
    """Load one ASU dataset stored as a MATLAB ``.mat`` file.

    Reads ``<data_dir>/ASU_datasets/<dataset>.mat`` and returns its ``'X'``
    and ``'Y'`` entries as a feature matrix and a label vector.

    Parameters
    ----------
    data_dir : str
        Directory that contains the ``ASU_datasets`` folder.
    dataset : str
        File name of the dataset without the ``.mat`` extension.

    Returns
    -------
    X : array
        The ``'X'`` entry cast to ``float64``.
    y : np.ndarray
        The ``'Y'`` entry, squeezed and cast to ``int64``. Labels are
        normalised to start at zero: ``1..K`` becomes ``0..K-1`` and
        ``{-1, 1}`` becomes ``{0, 1}``. Any other encoding is returned as is.

    Raises
    ------
    KeyError
        If the file has no ``'X'`` or ``'Y'`` entry.
    """
    mat = scipy.io.loadmat(os.path.join(data_dir, "ASU_datasets", f"{dataset}.mat"))
    X = mat['X'].astype(np.float64)

    # squeeze() drops the singleton axis of a column/row label matrix, but it
    # also collapses a one-sample file to a 0-d array that cannot be iterated;
    # atleast_1d keeps the label vector 1-D in that case.
    y = np.atleast_1d(np.squeeze(mat['Y'])).astype(np.int64)

    n_classes = len(np.unique(y))

    # 1-based labels 1..K -> 0..K-1
    if y.min() == 1 and y.max() == n_classes:
        y -= 1

    # binary labels {-1, 1} -> {0, 1}
    if y.min() == -1 and y.max() == 1 and n_classes == 2:
        y = (y + 1) // 2

    return X, y


In [8]:
# Load the 'madelon' dataset as a feature matrix and a label vector.
X, y = load_ASU_dataset(data_dir=data_dir, dataset='madelon')

In [9]:
# Sanity check: container type and shape of the loaded feature matrix.
type(X), X.shape

(numpy.ndarray, (2600, 500))

In [10]:
# Hold out 20% of the samples as the test split; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Sanity check: container type and shape of the training split.
type(X_train), X_train.shape


(numpy.ndarray, (2080, 500))

In [12]:
def _apply_missing_mask(X, mask, replace_val):
    """Return a copy of ``X`` with every entry where ``mask == 1`` set to ``replace_val``."""
    X_missing = X.copy()

    # Integer and boolean arrays cannot store NaN (the assignment would raise
    # ValueError), so upcast the copy to float64 before masking in that case.
    dtype = getattr(X_missing, "dtype", None)
    wants_nan = isinstance(replace_val, (float, np.floating)) and np.isnan(replace_val)
    if wants_nan and dtype is not None and dtype.kind in "iub":
        X_missing = X_missing.astype(np.float64)

    X_missing[mask == 1] = replace_val
    return X_missing


def gen_MCAR_datasets(X_valid, X_test, fraction = 0.1, seed = 0, replace_val = 0, include_train = False):
    """
    Corrupt two datasets with values missing completely at random (MCAR).

    Every entry is masked independently with probability ``fraction``, so the
    realised share of missing values only approximates ``fraction``. The
    inputs are not modified; masked copies are returned.

    - X_valid, X_test: arrays to corrupt (one mask is drawn for each)
    - fraction (float): probability in [0, 1] that an entry is masked
    - seed (int): seed for reproducibility; NOTE this reseeds NumPy's global
      random state via np.random.seed
    - replace_val: value written into masked entries, 0 or np.nan for example.
      Integer/boolean inputs are upcast to float64 when this is NaN.
    - include_train: currently unused; kept so existing callers keep working

    Returns (X_valid_missing, X_test_missing, missing_mask_valid,
    missing_mask_test), where the masks are 0/1 integer arrays with 1 marking
    a masked entry.

    Raises ValueError if ``fraction`` is outside [0, 1].
    """
    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be within [0, 1], got {fraction!r}")

    np.random.seed(seed)

    # Draw order (valid first, then test) is part of the reproducible stream.
    keep_or_mask = [1 - fraction, fraction]
    missing_mask_valid = np.random.choice([0, 1], size=X_valid.shape, p=keep_or_mask)
    missing_mask_test = np.random.choice([0, 1], size=X_test.shape, p=keep_or_mask)

    X_valid_missing = _apply_missing_mask(X_valid, missing_mask_valid, replace_val)
    X_test_missing = _apply_missing_mask(X_test, missing_mask_test, replace_val)
    return X_valid_missing, X_test_missing, missing_mask_valid, missing_mask_test
# Mask the train and test splits with NaN; the training split is passed in
# the function's X_valid slot.
(
    X_valid_missing,
    X_test_missing,
    missing_mask_valid,
    missing_mask_test,
) = gen_MCAR_datasets(X_train, X_test, replace_val=np.nan)

In [13]:
# Names of the imputation plugins the registry offers.
imputers.list()

['median',
 'most_frequent',
 'sinkhorn',
 'missforest',
 'ice',
 'miracle',
 'mean',
 'sklearn_missforest',
 'nop',
 'sklearn_ice',
 'EM',
 'miwae',
 'softimpute',
 'mice',
 'gain',
 'hyperimpute']

In [14]:


# Imputers to benchmark. The commented-out names are the other plugins
# reported by imputers.list(); those marked "too slow" were skipped for
# runtime reasons.
benchmarks = [
    'miwae',
    # 'softimpute',
    'mean',
    # 'miracle',
    # 'sinkhorn',
    'sklearn_missforest',
    # 'most_frequent',
    # 'EM', # too slow
    # 'nop',
    # 'hyperimpute', # too slow
    # 'gain',
    # 'median',
    # 'ice',
    # 'mice', # too slow
    # 'sklearn_ice',
    # 'missforest' # too slow
]

# The positions of the masked test entries do not depend on the imputer, so
# compute the mask and its count once instead of twice per iteration.
missing_mask = np.isnan(X_test_missing)
n_missing_test = missing_mask.sum()

# NOTE(review): every imputer is fitted on the uncorrupted X_train, while the
# masked copy X_valid_missing is not used in this loop -- confirm that this is
# the intended protocol.
for benchmark in benchmarks:
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during long fits.
    start = time.perf_counter()
    imputer = imputers.get(benchmark, random_state=42)
    fitted_imputer = imputer.fit(X_train)
    X_test_imputed = fitted_imputer.transform(X_test_missing).to_numpy()
    # Score only the entries that were masked: imputed value vs. ground truth.
    mse = mean_squared_error(X_test[missing_mask], X_test_imputed[missing_mask])
    elapsed = round(time.perf_counter() - start, 4)
    print(benchmark,' | # NaNs in test set: ', n_missing_test, '| # NaNs in imputed set: ',np.isnan(X_test_imputed).sum(),'| time (seconds): ', elapsed, '| MSE: ',mse)



miwae  | # NaNs in test set:  25963 | # NaNs in imputed set:  0 | time (seconds):  45.5957 | MSE:  214146.89860067665
mean  | # NaNs in test set:  25963 | # NaNs in imputed set:  0 | time (seconds):  0.0585 | MSE:  898.521319990077
sklearn_missforest  | # NaNs in test set:  25963 | # NaNs in imputed set:  0 | time (seconds):  118.7317 | MSE:  699.7983433008646


In [15]:
# Instantiate the 'ice' plugin on its own; no random_state is passed here.
imputer = imputers.get('ice')

In [27]:
# Fit on X_train, the training split that was not passed through masking.
fitted_imputer = imputer.fit(X_train)

In [28]:
# Impute the NaN-masked test split and convert the result to a NumPy array.
X_test_imputed = fitted_imputer.transform(X_test_missing).to_numpy()