In [90]:
import os
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
import pandas as pd
import json

In [105]:
# Names of the datasets processed by this notebook; each one is used to build
# paths of the form 'datasets/<name>/...'.
data_list = [
    'banknote',
    'california',
    'climate_model_crashes',
    'concrete_compression',
    'connectionist_bench_sonar',
    'qsar_biodegradation',
    'wine_quality_red',
    'wine_quality_white',
    'yacht_hydrodynamics',
    'yeast',
]

# Datasets grouped by task (classification vs. regression).
# NOTE(review): 'california' appears in data_list but in neither of the two
# lists below -- confirm whether that is intentional.
clf_list = [
    "banknote",
    "climate_model_crashes",
    "connectionist_bench_sonar",
    "qsar_biodegradation",
    "yeast",
]
reg_list = [
    'concrete_compression',
    'wine_quality_red',
    'wine_quality_white',
    'yacht_hydrodynamics',
]

# Missingness parameter values and missingness types; both are path
# components of the mask files ('datasets/<name>/<type>/<para>.npy').
para_set = [0.3, 0.5, 0.7]
type_list = ["diffuse", "logistic", "mar", "mcar"]

# Strategies used to pre-fill missing entries before computing statistics.
prefilled_type = ["zero", "mean", "mice"]

In [100]:
def find_mean_cov(train_data):
    """Compute column-wise means and the covariance matrix of ``train_data``.

    Args:
        train_data: array-like whose rows are observations and whose columns
            are variables (``np.cov`` is called with ``rowvar=False``).

    Returns:
        A ``(means, covariance_matrix)`` tuple: the mean taken along axis 0
        and the result of ``np.cov`` over the columns.
    """
    return np.mean(train_data, axis=0), np.cov(train_data, rowvar=False)

In [103]:
def fill_zero(train_data_copy, train_data):
    """Return imputed data with the same shape as the original data.

    Imputers may drop columns that contain no observed value at all, so the
    imputed array can have fewer columns than the original. In that case the
    imputed columns are placed back at their original positions and every
    all-NaN column is filled with zeros.

    Args:
        train_data_copy: 2-D array produced by the imputation step.
        train_data: the original 2-D array (with NaN marking missing
            entries) that the imputation was computed from. It is not
            modified.

    Returns:
        ``train_data_copy`` itself when the shapes already agree; otherwise a
        new array with the shape of ``train_data``.
    """
    if train_data_copy.shape == train_data.shape:
        return train_data_copy

    all_nan_cols = np.all(np.isnan(train_data), axis=0)
    kept_cols = ~all_nan_cols

    # Expected case: the imputer removed exactly the all-NaN columns and kept
    # the remaining ones in their original order.
    expected_shape = (train_data.shape[0], int(kept_cols.sum()))
    if train_data_copy.shape == expected_shape:
        restored = np.zeros(train_data.shape, dtype=train_data_copy.dtype)
        restored[:, kept_cols] = train_data_copy
        return restored

    # Unexpected shape: the imputed columns cannot be mapped back reliably, so
    # fall back to zero-filling the all-NaN columns of a copy of the original
    # data (other missing entries stay NaN).
    fallback = train_data.copy()
    fallback[:, all_nan_cols] = 0
    return fallback

In [99]:
def run_norm(train_data, save_name):
    """Pre-fill missing values, then save the mean and covariance to disk.

    Args:
        train_data: 2-D float array with NaN marking missing entries.
        save_name: sequence ``[fillintype, prefilled_path, norm_full]`` where
            ``fillintype`` is one of ``"mean"``, ``"mice"`` or ``"zero"``,
            ``prefilled_path`` is the output root directory and ``norm_full``
            is the file-name prefix.

    Raises:
        ValueError: if ``fillintype`` is not a supported strategy.

    Side effects:
        Creates ``<prefilled_path>/<fillintype>/`` if needed and writes
        ``<norm_full>_cov.txt`` and ``<norm_full>_mu.txt`` into it.
    """
    fillintype = save_name[0]
    prefilled_path = save_name[1]
    norm_full = save_name[2]

    if fillintype == "mean":
        # Replace every missing entry with the mean of its column.
        filled = SimpleImputer(strategy='mean').fit_transform(train_data)
    elif fillintype == "mice":
        # Multivariate iterative imputation with scikit-learn's defaults.
        filled = IterativeImputer().fit_transform(train_data.copy())
    elif fillintype == "zero":
        filled = train_data.copy()
        filled[np.isnan(filled)] = 0
    else:
        # Previously this fell through and failed later on an unbound name.
        raise ValueError("Unknown fill-in type: {!r}".format(fillintype))

    # The imputers can drop columns that are entirely NaN; fill_zero handles
    # that shape mismatch before the statistics are computed.
    train_data = fill_zero(filled, train_data)
    means, covariance_matrix = find_mean_cov(train_data)

    # Write parameters to files.
    out_dir = "{}/{}".format(prefilled_path, fillintype)
    os.makedirs(out_dir, exist_ok=True)
    np.savetxt("{}/{}_cov.txt".format(out_dir, norm_full), covariance_matrix, delimiter=',', fmt='%f')
    np.savetxt("{}/{}_mu.txt".format(out_dir, norm_full), means, delimiter=',', fmt='%f')


In [107]:
# Driver: for every dataset / missingness type / missingness parameter / CV
# fold, mask the normalised training data and store the pre-filled mean and
# covariance via run_norm.
# NOTE: this cell calls load_data, which is defined in a cell further down
# the notebook; that cell has to be executed first.
for data in data_list:
    # Pristine arrays are loaded once per dataset and never modified; each
    # (type, para) combination below masks its own copy. Masking the shared
    # arrays in place would carry NaNs from one parameter value over to the
    # next.
    norm_data_raw = np.load('datasets/{}/{}_norm.npy'.format(data, data))
    full_data_raw, y = load_data(data)

    # The cross-validation split depends only on the dataset.
    with open('datasets/{}/split_index_cv_seed-1_nfold-5.json'.format(data), 'r') as file:
        index = json.load(file)

    for types in type_list:
        for para in para_set:

            # for original data
            path = 'datasets/{}/{}/{}'.format(data, types, para)
            mask = np.load('{}.npy'.format(path))

            # for prefilled data
            prefilled_path = 'prefilled_data/{}/{}/{}'.format(data, types, para)
            os.makedirs(prefilled_path, exist_ok=True)

            print("Data: {} | Type: {} | Para: {}".format(data, types, para))

            # Entries where the mask equals 0 are replaced with NaN.
            missing = mask == 0
            full_data = full_data_raw.copy()
            norm_data = norm_data_raw.copy()
            full_data[missing] = np.nan
            norm_data[missing] = np.nan

            for fold_n, fold in index.items():
                train_index = fold['train_index']
                test_index = fold['test_index']

                full_X_train = full_data[train_index]
                full_X_test = full_data[test_index]

                norm_X_train = norm_data[train_index]
                norm_X_test = norm_data[test_index]

                y_train = y[train_index]
                y_test = y[test_index]

                for fillintype in prefilled_type:
                    full_X_train_copy = full_X_train.copy()
                    norm_X_train_copy = norm_X_train.copy()
                    # Statistics of the un-normalised data are currently disabled:
                    # run_norm(full_X_train_copy, [fillintype, prefilled_path, "full_{}".format(fold_n)])
                    run_norm(norm_X_train_copy, [fillintype, prefilled_path, "norm_{}".format(fold_n)])

Data: banknote | Type: diffuse | Para: 0.3
Data: banknote | Type: diffuse | Para: 0.5
Data: banknote | Type: diffuse | Para: 0.7




Data: banknote | Type: logistic | Para: 0.3
Data: banknote | Type: logistic | Para: 0.5




Data: banknote | Type: logistic | Para: 0.7
Data: banknote | Type: mar | Para: 0.3
Data: banknote | Type: mar | Para: 0.5
Data: banknote | Type: mar | Para: 0.7
Data: banknote | Type: mcar | Para: 0.3




Data: banknote | Type: mcar | Para: 0.5
Data: banknote | Type: mcar | Para: 0.7




Data: california | Type: diffuse | Para: 0.3




Data: california | Type: diffuse | Para: 0.5




Data: california | Type: diffuse | Para: 0.7




Data: california | Type: logistic | Para: 0.3




Data: california | Type: logistic | Para: 0.5
Data: california | Type: logistic | Para: 0.7
Data: california | Type: mar | Para: 0.3
Data: california | Type: mar | Para: 0.5




Data: california | Type: mar | Para: 0.7
Data: california | Type: mcar | Para: 0.3




Data: california | Type: mcar | Para: 0.5




Data: california | Type: mcar | Para: 0.7




Data: climate_model_crashes | Type: diffuse | Para: 0.3
Data: climate_model_crashes | Type: diffuse | Para: 0.5
Data: climate_model_crashes | Type: diffuse | Para: 0.7




Data: climate_model_crashes | Type: logistic | Para: 0.3
Data: climate_model_crashes | Type: logistic | Para: 0.5




Data: climate_model_crashes | Type: logistic | Para: 0.7




Data: climate_model_crashes | Type: mar | Para: 0.3
Data: climate_model_crashes | Type: mar | Para: 0.5
Data: climate_model_crashes | Type: mar | Para: 0.7
Data: climate_model_crashes | Type: mcar | Para: 0.3
Data: climate_model_crashes | Type: mcar | Para: 0.5




Data: climate_model_crashes | Type: mcar | Para: 0.7




Data: concrete_compression | Type: diffuse | Para: 0.3




Data: concrete_compression | Type: diffuse | Para: 0.5




Data: concrete_compression | Type: diffuse | Para: 0.7
Data: concrete_compression | Type: logistic | Para: 0.3




Data: concrete_compression | Type: logistic | Para: 0.5




Data: concrete_compression | Type: logistic | Para: 0.7
Data: concrete_compression | Type: mar | Para: 0.3
Data: concrete_compression | Type: mar | Para: 0.5
Data: concrete_compression | Type: mar | Para: 0.7
Data: concrete_compression | Type: mcar | Para: 0.3




Data: concrete_compression | Type: mcar | Para: 0.5




Data: concrete_compression | Type: mcar | Para: 0.7
Data: connectionist_bench_sonar | Type: diffuse | Para: 0.3




Data: connectionist_bench_sonar | Type: diffuse | Para: 0.5




Data: connectionist_bench_sonar | Type: diffuse | Para: 0.7




Data: connectionist_bench_sonar | Type: logistic | Para: 0.3




Data: connectionist_bench_sonar | Type: logistic | Para: 0.5




Data: connectionist_bench_sonar | Type: logistic | Para: 0.7




Data: connectionist_bench_sonar | Type: mar | Para: 0.3


 31 32 33 34 36 38 39 40 41 42 48 49 50 51 53 55 58 59]. At least one non-missing value is needed for imputation with strategy='mean'.
 31 32 33 34 36 38 39 40 41 42 48 49 50 51 53 55 58 59]. At least one non-missing value is needed for imputation with strategy='mean'.
 31 32 33 34 36 38 39 40 41 42 48 49 50 51 53 55 58 59]. At least one non-missing value is needed for imputation with strategy='mean'.
 31 32 33 34 36 38 39 40 41 42 48 49 50 51 53 55 58 59]. At least one non-missing value is needed for imputation with strategy='mean'.
 31 32 33 34 36 38 39 40 41 42 48 49 50 51 53 55 58 59]. At least one non-missing value is needed for imputation with strategy='mean'.
 31 32 33 34 36 38 39 40 41 42 48 49 50 51 53 55 58 59]. At least one non-missing value is needed for imputation with strategy='mean'.
 31 32 33 34 36 38 39 40 41 42 48 49 50 51 53 55 58 59]. At least one non-missing value is needed for imputation with strategy='mean'.
 31 32 33 34 36 38 39 40 41 42 48 49 50 51 53 55 58 59]

Data: connectionist_bench_sonar | Type: mar | Para: 0.5
Data: connectionist_bench_sonar | Type: mar | Para: 0.7
Data: connectionist_bench_sonar | Type: mcar | Para: 0.3


 31 32 33 34 36 38 39 40 41 42 48 49 50 51 53 55 58 59]. At least one non-missing value is needed for imputation with strategy='mean'.
 31 32 33 34 36 38 39 40 41 42 48 49 50 51 53 55 58 59]. At least one non-missing value is needed for imputation with strategy='mean'.
 25 26 27 29 30 31 32 33 34 35 36 38 39 40 41 42 43 45 47 48 49 50 51 52
 53 54 55 56 58 59]. At least one non-missing value is needed for imputation with strategy='mean'.
 25 26 27 29 30 31 32 33 34 35 36 38 39 40 41 42 43 45 47 48 49 50 51 52
 53 54 55 56 58 59]. At least one non-missing value is needed for imputation with strategy='mean'.
 25 26 27 29 30 31 32 33 34 35 36 38 39 40 41 42 43 45 47 48 49 50 51 52
 53 54 55 56 58 59]. At least one non-missing value is needed for imputation with strategy='mean'.
 25 26 27 29 30 31 32 33 34 35 36 38 39 40 41 42 43 45 47 48 49 50 51 52
 53 54 55 56 58 59]. At least one non-missing value is needed for imputation with strategy='mean'.
 25 26 27 29 30 31 32 33 34 35 36 38 39 40

Data: connectionist_bench_sonar | Type: mcar | Para: 0.5




Data: connectionist_bench_sonar | Type: mcar | Para: 0.7




Data: qsar_biodegradation | Type: diffuse | Para: 0.3




Data: qsar_biodegradation | Type: diffuse | Para: 0.5




Data: qsar_biodegradation | Type: diffuse | Para: 0.7




Data: qsar_biodegradation | Type: logistic | Para: 0.3




Data: qsar_biodegradation | Type: logistic | Para: 0.5




Data: qsar_biodegradation | Type: logistic | Para: 0.7




Data: qsar_biodegradation | Type: mar | Para: 0.3


 37 38 39]. At least one non-missing value is needed for imputation with strategy='mean'.
 37 38 39]. At least one non-missing value is needed for imputation with strategy='mean'.
 37 38 39]. At least one non-missing value is needed for imputation with strategy='mean'.
 37 38 39]. At least one non-missing value is needed for imputation with strategy='mean'.
 37 38 39]. At least one non-missing value is needed for imputation with strategy='mean'.
 37 38 39]. At least one non-missing value is needed for imputation with strategy='mean'.


Data: qsar_biodegradation | Type: mar | Para: 0.5
Data: qsar_biodegradation | Type: mar | Para: 0.7
Data: qsar_biodegradation | Type: mcar | Para: 0.3


 37 38 39]. At least one non-missing value is needed for imputation with strategy='mean'.
 37 38 39]. At least one non-missing value is needed for imputation with strategy='mean'.
 37 38 39]. At least one non-missing value is needed for imputation with strategy='mean'.
 37 38 39]. At least one non-missing value is needed for imputation with strategy='mean'.
 26 27 28 30 31 32 33 34 35 36 37 38 39 40]. At least one non-missing value is needed for imputation with strategy='mean'.
 26 27 28 30 31 32 33 34 35 36 37 38 39 40]. At least one non-missing value is needed for imputation with strategy='mean'.
 26 27 28 30 31 32 33 34 35 36 37 38 39 40]. At least one non-missing value is needed for imputation with strategy='mean'.
 26 27 28 30 31 32 33 34 35 36 37 38 39 40]. At least one non-missing value is needed for imputation with strategy='mean'.
 26 27 28 30 31 32 33 34 35 36 37 38 39 40]. At least one non-missing value is needed for imputation with strategy='mean'.
 26 27 28 30 31 32 33 34 

Data: qsar_biodegradation | Type: mcar | Para: 0.5




Data: qsar_biodegradation | Type: mcar | Para: 0.7




Data: wine_quality_red | Type: diffuse | Para: 0.3
Data: wine_quality_red | Type: diffuse | Para: 0.5




Data: wine_quality_red | Type: diffuse | Para: 0.7




Data: wine_quality_red | Type: logistic | Para: 0.3




Data: wine_quality_red | Type: logistic | Para: 0.5
Data: wine_quality_red | Type: logistic | Para: 0.7
Data: wine_quality_red | Type: mar | Para: 0.3
Data: wine_quality_red | Type: mar | Para: 0.5
Data: wine_quality_red | Type: mar | Para: 0.7
Data: wine_quality_red | Type: mcar | Para: 0.3




Data: wine_quality_red | Type: mcar | Para: 0.5
Data: wine_quality_red | Type: mcar | Para: 0.7




Data: wine_quality_white | Type: diffuse | Para: 0.3




Data: wine_quality_white | Type: diffuse | Para: 0.5




Data: wine_quality_white | Type: diffuse | Para: 0.7




Data: wine_quality_white | Type: logistic | Para: 0.3




Data: wine_quality_white | Type: logistic | Para: 0.5
Data: wine_quality_white | Type: logistic | Para: 0.7
Data: wine_quality_white | Type: mar | Para: 0.3




Data: wine_quality_white | Type: mar | Para: 0.5
Data: wine_quality_white | Type: mar | Para: 0.7
Data: wine_quality_white | Type: mcar | Para: 0.3




Data: wine_quality_white | Type: mcar | Para: 0.5




Data: wine_quality_white | Type: mcar | Para: 0.7
Data: yacht_hydrodynamics | Type: diffuse | Para: 0.3
Data: yacht_hydrodynamics | Type: diffuse | Para: 0.5




Data: yacht_hydrodynamics | Type: diffuse | Para: 0.7
Data: yacht_hydrodynamics | Type: logistic | Para: 0.3




Data: yacht_hydrodynamics | Type: logistic | Para: 0.5
Data: yacht_hydrodynamics | Type: logistic | Para: 0.7




Data: yacht_hydrodynamics | Type: mar | Para: 0.3
Data: yacht_hydrodynamics | Type: mar | Para: 0.5
Data: yacht_hydrodynamics | Type: mar | Para: 0.7
Data: yacht_hydrodynamics | Type: mcar | Para: 0.3




Data: yacht_hydrodynamics | Type: mcar | Para: 0.5
Data: yacht_hydrodynamics | Type: mcar | Para: 0.7




Data: yeast | Type: diffuse | Para: 0.3
Data: yeast | Type: diffuse | Para: 0.5




Data: yeast | Type: diffuse | Para: 0.7




Data: yeast | Type: logistic | Para: 0.3




Data: yeast | Type: logistic | Para: 0.5
Data: yeast | Type: logistic | Para: 0.7




Data: yeast | Type: mar | Para: 0.3
Data: yeast | Type: mar | Para: 0.5
Data: yeast | Type: mar | Para: 0.7
Data: yeast | Type: mcar | Para: 0.3




Data: yeast | Type: mcar | Para: 0.5




Data: yeast | Type: mcar | Para: 0.7




In [5]:
from sklearn.datasets import load_iris, load_wine, fetch_california_housing
from sklearn.preprocessing import LabelEncoder

# Assuming df is your DataFrame and the label column is 'target'


# NOTE(review): this repeats the data_list assignment made earlier in the
# notebook with the same entries; keep the two in sync or drop one of them.
data_list = [
    'banknote',
    'california',
    'climate_model_crashes',
    'concrete_compression',
    'connectionist_bench_sonar',
    'qsar_biodegradation',
    'wine_quality_red',
    'wine_quality_white',
    'yacht_hydrodynamics',
    'yeast',
]

def load_data(name):
    """Load the features and targets of one named dataset.

    Args:
        name: dataset identifier, e.g. ``'banknote'`` or ``'yeast'``.

    Returns:
        ``(X, y)``. For 'banknote', 'climate_model_crashes',
        'connectionist_bench_sonar', 'qsar_biodegradation' and 'yeast' the
        targets are passed through ``LabelEncoder``; for every other dataset
        they are returned as loaded.

    Raises:
        ValueError: if ``name`` is not a known dataset.
    """
    if name == 'banknote':
        # NOTE(review): read_csv runs with its default header handling, so the
        # first line of the file is consumed as column names -- confirm the
        # file really has a header row.
        with open('datasets/banknote/data_banknote_authentication.txt', 'rb') as f:
            df = pd.read_csv(f, low_memory=False, sep=',')
        X = df.values[:, :-1]
        y = df.values[:, -1]
    elif name == 'california':
        data = fetch_california_housing()
        X = data["data"]
        y = data["target"]
    elif name == 'climate_model_crashes':
        X, y = fetch_climate_model_crashes()
    elif name == 'concrete_compression':
        X, y = fetch_concrete_compression()
    elif name == 'yacht_hydrodynamics':
        X, y = fetch_yacht_hydrodynamics()
    elif name == 'airfoil_self_noise':
        X, y = fetch_airfoil_self_noise()
    elif name == 'connectionist_bench_sonar':
        X, y = fetch_connectionist_bench_sonar()
    elif name == 'qsar_biodegradation':
        X, y = fetch_qsar_biodegradation()
    elif name == 'wine_quality_red':
        X, y = fetch_wine_quality_red()
    elif name == 'wine_quality_white':
        X, y = fetch_wine_quality_white()
    elif name == 'yeast':
        X, y = fetch_yeast()
    else:
        # Without this branch an unknown name only failed at the return
        # statement with an unbound-variable error.
        raise ValueError("Unknown dataset name: {!r}".format(name))

    # These datasets get their targets encoded as integer class labels.
    if name in ('banknote', 'climate_model_crashes', 'connectionist_bench_sonar',
                'qsar_biodegradation', 'yeast'):
        y = LabelEncoder().fit_transform(y)

    return X, y



In [6]:
def fetch_climate_model_crashes():
    """Read the climate-model-crashes file and split it into X and y.

    The file is parsed as whitespace-separated text whose first line is the
    header.

    Returns:
        ``(X, y)``: ``X`` holds every column except the first two and the
        last one; ``y`` is the last column.
    """
    with open('datasets/climate_model_crashes/pop_failures.dat', 'rb') as f:
        # Raw string: '\s' is not a valid escape in an ordinary string literal.
        df = pd.read_csv(f, delimiter=r'\s+', header=0)
        # Ignore the two blocking factors (first two columns).
        X = df.values[:, 2:-1]
        y = df.values[:, -1]

    return X, y

def fetch_concrete_compression():
    """Read the concrete-compression Excel sheet and split it into X and y.

    Returns:
        ``(X, y)``: ``X`` is every column but the last, ``y`` is the last
        column.
    """
    with open('datasets/concrete_compression/Concrete_Data.xls', 'rb') as f:
        table = pd.read_excel(io=f).values

    features = table[:, :-1]
    target = table[:, -1]
    return features, target


def fetch_yacht_hydrodynamics():
    """Read the yacht-hydrodynamics file and split it into X and y.

    The file is parsed as whitespace-separated text without a header row.

    Returns:
        ``(X, y)``: ``X`` is every column but the last, ``y`` is the last
        column.
    """
    with open('datasets/yacht_hydrodynamics/yacht_hydrodynamics.data', 'rb') as f:
        # Raw string: '\s' is not a valid escape in an ordinary string literal.
        df = pd.read_csv(f, delimiter=r'\s+', header=None)
        X = df.values[:, :-1]
        y = df.values[:, -1]
    return X, y


def fetch_connectionist_bench_sonar():
    """Read the sonar CSV (comma-separated, no header) and split it into X and y.

    Returns:
        ``(X, y)``: ``X`` is every column but the last, cast to float;
        ``y`` is the last column, left as read.
    """
    with open('datasets/connectionist_bench_sonar/sonar.all-data', 'rb') as f:
        table = pd.read_csv(f, delimiter=',', header=None).values

    features = table[:, :-1].astype('float')
    labels = table[:, -1]
    return features, labels


def fetch_qsar_biodegradation():
    """Read the QSAR biodegradation CSV (';'-separated, no header) into X and y.

    Returns:
        ``(X, y)``: ``X`` is every column but the last, cast to float;
        ``y`` is the last column, left as read.
    """
    with open('datasets/qsar_biodegradation/biodeg.csv', 'rb') as f:
        table = pd.read_csv(f, delimiter=';', header=None).values

    features = table[:, :-1].astype('float')
    labels = table[:, -1]
    return features, labels

def fetch_yeast():
    """Read the yeast file and split it into X and y.

    The file is parsed as whitespace-separated text without a header row.

    Returns:
        ``(X, y)``: ``X`` holds every column except the first and the last,
        cast to float; ``y`` is the last column, left as read.
    """
    with open('datasets/yeast/yeast.data', 'rb') as f:
        # Raw string: '\s' is not a valid escape in an ordinary string literal.
        df = pd.read_csv(f, delimiter=r'\s+', header=None)
        X = df.values[:, 1:-1].astype('float')
        y = df.values[:, -1]

    return X, y


def fetch_wine_quality_red():
    """Read the red-wine CSV (';'-separated, first line is the header) into X and y.

    Returns:
        ``(X, y)``: ``X`` is every column but the last, cast to float;
        ``y`` is the last column.
    """
    with open('datasets/wine_quality_red/winequality-red.csv', 'rb') as f:
        table = pd.read_csv(f, delimiter=';').values

    features = table[:, :-1].astype('float')
    target = table[:, -1]
    return features, target

# Done!
def fetch_wine_quality_white():
    """Read the white-wine CSV (';'-separated, first line is the header) into X and y.

    Returns:
        ``(X, y)``: ``X`` is every column but the last, cast to float;
        ``y`` is the last column.
    """
    with open('datasets/wine_quality_white/data.csv', 'rb') as f:
        table = pd.read_csv(f, delimiter=';').values

    features = table[:, :-1].astype('float')
    target = table[:, -1]
    return features, target
