In [36]:
import numpy as np
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import json
import os

In [2]:
# Names of the datasets handled by this notebook. Each name must match a
# dataset known to load_data and a sub-directory of data/.
dataname_list = [
    "banknote", "yeast", "climate_model_crashes",
    "wine_quality_white", "yacht_hydrodynamics", "concrete_compression",
    "breast_cancer", "solar_fire",
    "car_evaluation",  # previously misspelled as "car_evluation"
]

In [41]:
def save_split_index_cv(scaled_data, directory_path, seed=1, nfold=5):
    """Create shuffled k-fold train/valid/test index splits and save them as JSON.

    The row indices ``0 .. len(scaled_data) - 1`` are shuffled with ``seed`` and
    cut into ``nfold`` consecutive test folds of ``len(scaled_data) // nfold``
    indices each; the last fold additionally receives the remainder. For every
    fold, the indices outside the test fold are split 90% / 10% (in shuffled
    order) into a training and a validation subset.

    The result is written to
    ``data/<directory_path>/split_index_cv_seed-<seed>_nfold-<nfold>.json``.
    The target directory is created when it does not exist yet.

    Parameters
    ----------
    scaled_data : sized object
        Only ``len(scaled_data)`` is used.
    directory_path : str
        Sub-directory of ``data/`` that receives the JSON file.
    seed : int, default 1
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global random state as a side effect.
    nfold : int, default 5
        Number of folds.

    Returns
    -------
    dict
        ``{"fold_1": {"test_index": [...], "train_index": [...],
        "valid_index": [...]}, ...}`` -- the same structure that is written to
        the JSON file.
    """
    n_samples = len(scaled_data)
    indlist = np.arange(n_samples)

    np.random.seed(seed)
    np.random.shuffle(indlist)

    fold_size = n_samples // nfold
    save_index = {}

    for fold in range(nfold):
        start = fold * fold_size
        # The last fold absorbs the remainder so that every index is tested once.
        end = start + fold_size if fold < nfold - 1 else n_samples

        test_index = indlist[start:end]
        train_index = np.concatenate([indlist[:start], indlist[end:]])

        # Hold out the last 10% of the non-test indices as a validation set.
        num_train = int(len(train_index) * 0.9)
        train_subindex = train_index[:num_train]
        valid_subindex = train_index[num_train:]

        # Convert to plain Python ints so that json can serialise the lists.
        save_index[f"fold_{fold+1}"] = {
            "test_index": test_index.astype(np.int64).tolist(),
            "train_index": train_subindex.astype(np.int64).tolist(),
            "valid_index": valid_subindex.astype(np.int64).tolist()
        }

    out_path = f"data/{directory_path}/split_index_cv_seed-{seed}_nfold-{nfold}.json"
    # Without this, open() fails with FileNotFoundError for a new dataset directory.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w') as file:
        json.dump(save_index, file)

    return save_index

In [30]:
# Loading recipe for every supported dataset:
#   paths     - file(s) to read; several files are concatenated row-wise
#   reader    - pandas reader function used for each file
#   kwargs    - keyword arguments passed to the reader
#   skip_cols - number of leading columns excluded from the features
#   as_float  - whether the feature block is cast to float
# In every dataset the last column is the target.
_DATASET_SPECS = {
    "banknote": {
        "paths": ["data/banknote/data_banknote_authentication.txt"],
        "reader": pd.read_csv,
        "kwargs": {"low_memory": False, "sep": ",", "header": None},
        "skip_cols": 0,
        "as_float": False,
    },
    "yeast": {
        "paths": ["data/yeast/yeast.data"],
        "reader": pd.read_csv,
        "kwargs": {"delimiter": r"\s+", "header": None},
        "skip_cols": 1,  # remove the leading index column
        "as_float": True,
    },
    "climate_model_crashes": {
        "paths": ["data/climate_model_crashes/pop_failures.dat"],
        "reader": pd.read_csv,
        "kwargs": {"delimiter": r"\s+", "header": 0},
        "skip_cols": 2,  # ignore the two blocking-factor columns
        "as_float": False,
    },
    "wine_quality_white": {
        "paths": ["data/wine_quality_white/data.csv"],
        "reader": pd.read_csv,
        "kwargs": {"delimiter": ";"},
        "skip_cols": 0,
        "as_float": True,
    },
    "yacht_hydrodynamics": {
        "paths": ["data/yacht_hydrodynamics/yacht_hydrodynamics.data"],
        "reader": pd.read_csv,
        "kwargs": {"delimiter": r"\s+", "header": None},
        "skip_cols": 0,
        "as_float": False,
    },
    "concrete_compression": {
        "paths": ["data/concrete_compression/Concrete_Data.xls"],
        "reader": pd.read_excel,
        "kwargs": {},
        "skip_cols": 0,
        "as_float": False,
    },
    "breast_cancer": {
        "paths": ["data/breast_cancer/breast_cancer.data"],
        "reader": pd.read_csv,
        "kwargs": {"delimiter": ",", "header": None},
        "skip_cols": 0,
        "as_float": False,
    },
    "solar_fire": {
        "paths": ["data/solar_fire/flare.data1", "data/solar_fire/flare.data2"],
        "reader": pd.read_csv,
        "kwargs": {"delimiter": r"\s+", "header": None},
        "skip_cols": 0,
        "as_float": False,
    },
    "car_evaluation": {
        "paths": ["data/car_evaluation/car.data"],
        "reader": pd.read_csv,
        "kwargs": {"delimiter": ",", "header": None},
        "skip_cols": 0,
        "as_float": False,
    },
}


def load_data(name):
    """Load one of the supported datasets from the local ``data/`` directory.

    Parameters
    ----------
    name : str
        Dataset name; one of the keys of ``_DATASET_SPECS``.

    Returns
    -------
    dict
        ``{'data': features, 'target': target}`` where ``target`` is the last
        column of the table and ``features`` are the remaining columns (minus
        any leading columns the dataset's recipe skips), both as NumPy arrays.

    Raises
    ------
    ValueError
        If ``name`` is not a supported dataset. (Previously an unknown name
        surfaced as an ``UnboundLocalError`` at the ``return`` statement.)
    """
    try:
        spec = _DATASET_SPECS[name]
    except KeyError:
        raise ValueError(
            f"Unknown dataset name {name!r}; expected one of {sorted(_DATASET_SPECS)}"
        ) from None

    frames = []
    for path in spec["paths"]:
        with open(path, 'rb') as f:
            frames.append(spec["reader"](f, **spec["kwargs"]))
    # Only multi-file datasets need concatenation.
    df = frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True)

    values = df.values
    features = values[:, spec["skip_cols"]:-1]
    if spec["as_float"]:
        features = features.astype('float')

    return {'data': features, 'target': values[:, -1]}


In [68]:
# Datasets to process and the per-column missing rates applied to each of them.
dataname_list = ["banknote","yeast","climate_model_crashes",
                 "wine_quality_white", "yacht_hydrodynamics","concrete_compression",
                 "breast_cancer","solar_fire","car_evaluation"
                 ]
# Set to True to write the CV split indices and the feature/label arrays to disk.
save = False
missing_rate = [0.05, 0.1, 0.3, 0.5]

# NOTE: MCAR is defined in a later cell of this notebook; that cell must be run
# before this one, otherwise a fresh "Run All" stops here with a NameError.
for name in dataname_list:
    # Load once per dataset: the data do not depend on the missing rate.
    Xy = load_data(name)

    feature = Xy['data']
    label = Xy['target']

    for p in missing_rate:
        MCAR_mask = MCAR(feature, p, seed=1)
        # TODO: MAR / MNAR masks are not generated yet; no such mask
        # functions are visible in this part of the notebook.

    # The saved files do not depend on the missing rate, so write them once
    # per dataset instead of once per (dataset, missing rate) pair.
    if save:
        save_split_index_cv(Xy['data'],name,seed = 1,nfold = 5)
        np.save(f"data/{name}/feature.npy", Xy['data'])
        np.save(f"data/{name}/label.npy", Xy['target'])
    print()


[0.04956268 0.04956268 0.04956268 0.04956268] 0.04956268221574344
[0.09985423 0.09985423 0.09985423 0.09985423] 0.09985422740524781
[0.29956268 0.29956268 0.29956268 0.29956268] 0.29956268221574345
[0.5 0.5 0.5 0.5] 0.5

[0.04986523 0.04986523 0.04986523 0.04986523 0.04986523 0.04986523
 0.04986523 0.04986523] 0.04986522911051213
[0.09973046 0.09973046 0.09973046 0.09973046 0.09973046 0.09973046
 0.09973046 0.09973046] 0.09973045822102426
[0.29986523 0.29986523 0.29986523 0.29986523 0.29986523 0.29986523
 0.29986523 0.29986523] 0.2998652291105121
[0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5] 0.5

[0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05
 0.05 0.05 0.05 0.05] 0.05
[0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1] 0.1
[0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3] 0.3
[0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5] 0.5

[0.04981625 0.04981625 0.04981625 0.04981625 0.04981625 0.04981625
 0.04981

## Missing Mechanisms

In [69]:
def MCAR(observed_values, p, seed=1):
    """Build a missing-completely-at-random mask for a 2-D array.

    In every column, ``int(num_rows * p)`` distinct rows are drawn at random
    and flagged as missing, independently of the stored values.

    Parameters
    ----------
    observed_values : 2-D array
        Only its shape and dtype are used; the array itself is not modified.
    p : float
        Fraction of rows to flag as missing in each column.
    seed : int, default 1
        Seed passed to ``np.random.seed``; this reseeds NumPy's global
        random state as a side effect.

    Returns
    -------
    ndarray
        Array shaped and typed like ``observed_values`` (``np.ones_like``),
        holding 1 for observed entries and 0 for entries flagged as missing.
        ``calculate_missing_rates`` can be used to inspect the result.
    """
    np.random.seed(seed)
    n_rows, n_cols = observed_values.shape
    n_drop = int(n_rows * p)
    masks = np.ones_like(observed_values)

    # Draw the rows to drop column by column, in column order, so the sequence
    # of random draws stays the same for a given seed.
    dropped_rows_per_col = [
        np.random.choice(n_rows, n_drop, replace=False) for _ in range(n_cols)
    ]
    for col_idx, dropped_rows in enumerate(dropped_rows_per_col):
        masks[dropped_rows, col_idx] = 0

    return masks


In [58]:
def calculate_missing_rates(mask):
    """Print the per-column and the overall missing rate of a 2-D mask.

    An entry counts as missing when it equals 0. The per-column rates and the
    overall rate are printed on a single line; nothing is returned.
    """
    n_rows, n_cols = mask.shape
    is_missing = mask == 0

    # Share of missing entries within each column.
    per_column_rate = np.sum(is_missing, axis=0) / n_rows

    # Share of missing entries over the whole mask.
    overall_rate = np.sum(is_missing) / (n_rows * n_cols)

    print(per_column_rate, overall_rate)