In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
def MCAR(observed_values, missing_ratio, masks):
    for col in range(observed_values.shape[1]):  # col #

        obs_indices = np.where(observed_values[:, col])[0]
        miss_indices = np.random.choice(
        obs_indices, (int)(len(obs_indices) * missing_ratio), replace=False
        )
        masks[miss_indices, col] = False

    return masks

In [9]:
import torch

from data_loaders import *

from missing_process.missing_method import * 

import os

In [174]:
import pickle

In [19]:
from torch.utils.data import DataLoader, Dataset

In [10]:
data = dataset_loader("california")

In [None]:
import sys
sys.path.append("..")
import pickle
import yaml
import os
import re
import sys
import numpy as np
import pandas as pd

from data_loaders import *
import missing_process.missing_method as missing_method

In [158]:
def generate_middle_single_column(lower,upper,partial_missing,dataset):
    if lower == 0:
        lower_quantile = np.min(dataset, axis=0)
    else:
        lower_quantile = np.quantile(dataset,lower, axis=0)
    if upper == 1:
        upper_quantile = np.max(dataset, axis=0)
    else:
        upper_quantile = np.quantile(dataset,upper, axis=0)


    ix_larger_than = dataset >= lower_quantile
    ix_smaller_than = dataset <= upper_quantile

    
    combined_ix = np.equal(ix_larger_than, ix_smaller_than)
    combined_ix = random_missing_single_column(combined_ix,partial_missing)
    return combined_ix

In [157]:
def random_missing_single_column(array, fraction_to_change):

    result = array.copy()
    n_to_change = int(np.sum(result) * fraction_to_change)
    ix_to_change = np.random.choice(np.flatnonzero(result), size=n_to_change, replace=False)

    result[ix_to_change] = False

    return result

In [156]:
def missing_single(X,multiple_block,missing_dim = 1):
    
    Xnan = X.copy()

    #---- Missing Dimention
    
    ix_list = []
    for key in multiple_block.keys():
        info = multiple_block[key]
        combined_ix = generate_middle_single_column(info["lower"],info["upper"],info["partial_missing"], X)
        ix_list.append(combined_ix)
    combined_ix = np.logical_or.reduce(ix_list)
    
    Xnan[:,][combined_ix] = np.nan

    masks = np.array(~np.isnan(Xnan), dtype=np.float)
    return masks

In [154]:
def BM_missing(observed_values,missing_para):

    mask = np.ones(observed_values.shape)
    mask_single = missing_single(observed_values[:,missing_para["column"]],missing_para["missing"])
    mask[:,missing_para["column"]]= mask_single

    return mask

In [153]:
data = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5],[6,6,6],[7,7,7],[8,8,8],[9,9,9],[10,10,10]])


In [159]:
def process_func(dataname,path: str, aug_rate=1,missing_type = "MCAR",
                  missing_para = ""):
 
    data = dataset_loader(dataname)
    # print(data)
    # data.replace("?", np.nan, inplace=True)
    # Don't apply data argument (use n*dataset)
    # data_aug = pd.concat([data] * aug_rate)


    observed_values = data["data"].astype("float32")

    #observed_values = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5],[6,6,6],[7,7,7],[8,8,8],[9,9,9],[10,10,10]]).astype("float32")

    #np.random.shuffle(observed_values)

    # Print the shuffled array

    observed_masks = ~np.isnan(observed_values)
    masks = observed_masks.copy()
    
    "Need input origin dataset and parameters"
    if missing_type == "MCAR":
        masks = MCAR(observed_values,missing_para,masks)

    elif missing_type == "quantile":
        Xnan, Xz = missing_method.missing_by_range(observed_values, missing_para)
        masks = np.array(~np.isnan(Xnan), dtype=np.float)

    elif missing_type == "logistic":
        masks = missing_method.MNAR_mask_logistic(observed_values, missing_para)

    elif missing_type == "self_mask":
        masks = missing_method.MNAR_self_mask_logistic(observed_values, missing_para)

    elif missing_type == "BN":
        masks == BM_missing(observed_values, missing_para)


    # gt_mask: 0 for missing elements and manully maksed elements
    gt_masks = masks.reshape(observed_masks.shape)

    observed_values = np.nan_to_num(observed_values)
    observed_masks = observed_masks.astype(int)
    gt_masks = gt_masks.astype(int)

    return observed_values, observed_masks, gt_masks, data["data"].shape[1]

In [171]:

class tabular_dataset(Dataset):
    # eval_length should be equal to attributes number.
    def __init__(
        self, dataname, use_index_list=None, 
        aug_rate=1, seed=0,
        missing_type = "MCAR", missing_para = "",missing_name = "MCAR"
        ):
        #self.eval_length = eval_length
        np.random.seed(seed)
        
        dataset_path = f"datasets/{dataname}/data.csv"
        processed_data_path = (
            f"datasets/{dataname}/{missing_type}-{missing_name}_seed-{seed}.pk"
        )
        processed_data_path_norm = (
            f"datasets/{dataname}/{missing_type}-{missing_name}_seed-{seed}_max-min_norm.pk"
        )

        # print(processed_data_path)
        # print(processed_data_path_norm)
        # If no dataset created
        if not os.path.isfile(processed_data_path):
            self.observed_values, self.observed_masks, self.gt_masks, self.eval_length = process_func(
                dataname, dataset_path, aug_rate=aug_rate,
                missing_type = missing_type, missing_para = missing_para
            )
            with open(processed_data_path, "wb") as f:
                pickle.dump(
                    [self.observed_values, self.observed_masks, self.gt_masks, self.eval_length], f
                )
            print("--------Dataset created--------")

        elif os.path.isfile(processed_data_path_norm):
            with open(processed_data_path_norm, "rb") as f:
                self.observed_values, self.observed_masks, self.gt_masks, self.eval_length = pickle.load(
                    f
                )
            print("--------Normalized dataset loaded--------")
        
        if use_index_list is None:
            self.use_index_list = np.arange(len(self.observed_values))
        else:
            self.use_index_list = use_index_list

    def __getitem__(self, org_index):
        index = self.use_index_list[org_index]
        s = {
            "observed_data": self.observed_values[index],
            "observed_mask": self.observed_masks[index],
            "gt_mask": self.gt_masks[index],
            "timepoints": np.arange(self.eval_length),
        }
        return s

    def __len__(self):
        return len(self.use_index_list)





In [172]:
def prepare_dataset(dataname, seed=1, nfold=5, batch_size=16,
                   missing_type = "Quantile", missing_para = 0.5, missing_name = "Q1_complete"):

    dataset = tabular_dataset(dataname = dataname,seed=seed,
                              missing_type = missing_type, missing_para = missing_para,
                                missing_name = missing_name)
    # print("Missing Type:",missing_type)
    # print("Missing Para:",missing_para)
    print("Missing Name:",missing_name)
    
    
    indlist = np.arange(len(dataset))

    np.random.seed(seed + 1)
    np.random.shuffle(indlist)

    tmp_ratio = 1 / nfold
    start = (int)((nfold - 1) * len(dataset) * tmp_ratio)
    
    end = (int)(nfold * len(dataset) * tmp_ratio)

    test_index = indlist[start:end]
    remain_index = np.delete(indlist, np.arange(start, end))

    np.random.shuffle(remain_index)

    # Modify here to change train,valid ratio
    num_train = (int)(len(remain_index) * 1)
    train_index = remain_index[:num_train]
    valid_index = remain_index[num_train:]



    # Here we perform max-min normalization.
    print("Here we perform max-min normalization.")
    processed_data_path_norm = (
        f"datasets/{dataname}/{missing_type}-{missing_name}_seed-{seed}_max-min_norm.pk"
    )
    if not os.path.isfile(processed_data_path_norm):
        print(
            "--------------Dataset has not been normalized yet. Perform data normalization and store the mean value of each column.--------------"
        )
        # data transformation after train-test split.
        col_num = dataset.observed_values.shape[1]
        max_arr = np.zeros(col_num)
        min_arr = np.zeros(col_num)
        mean_arr = np.zeros(col_num)
        for k in range(col_num):
            # Using observed_mask to avoid counting missing values.
            obs_ind = dataset.observed_masks[train_index, k].astype(bool)
            temp = dataset.observed_values[train_index, k]
            max_arr[k] = max(temp[obs_ind])
            min_arr[k] = min(temp[obs_ind])
        # print(f"--------------Max-value for each column {max_arr}--------------")
        # print(f"--------------Min-value for each column {min_arr}--------------")

        dataset.observed_values = (
            (dataset.observed_values - 0 + 1) / (max_arr - 0 + 1)
        ) * dataset.observed_masks

        with open(processed_data_path_norm, "wb") as f:
            pickle.dump(
                [dataset.observed_values, dataset.observed_masks, dataset.gt_masks, dataset.eval_length], f
            )



In [162]:
missing_name = "C1_double"



In [176]:
missing_type = "BN"
for missing_name in missing_list:
    missing_para = missing_list[missing_name]
    prepare_dataset("california",seed=1, nfold=5, batch_size=16,
                   missing_type = missing_type, missing_para = missing_para, missing_name = missing_name)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)


--------Dataset created--------
Missing Name: C0_lower
Here we perform max-min normalization.
--------------Dataset has not been normalized yet. Perform data normalization and store the mean value of each column.--------------
--------Dataset created--------
Missing Name: C0_upper
Here we perform max-min normalization.
--------------Dataset has not been normalized yet. Perform data normalization and store the mean value of each column.--------------
--------Dataset created--------
Missing Name: C0_double
Here we perform max-min normalization.
--------------Dataset has not been normalized yet. Perform data normalization and store the mean value of each column.--------------
--------Dataset created--------
Missing Name: C1_lower
Here we perform max-min normalization.
--------------Dataset has not been normalized yet. Perform data normalization and store the mean value of each column.--------------
--------Dataset created--------
Missing Name: C1_upper
Here we perform max-min normalizatio

In [165]:
missing_list = {
    "C0_lower":{
        "column":0,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C0_upper":{
        "column":0,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C0_double":{
        "column":0,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    "C1_lower":{
        "column":1,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C1_upper":{
        "column":1,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C1_double":{
        "column":1,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    "C2_lower":{
        "column":2,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C2_upper":{
        "column":2,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C2_double":{
        "column":2,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    "C3_lower":{
        "column":3,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C3_upper":{
        "column":3,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C3_double":{
        "column":3,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    "C4_lower":{
        "column":4,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C4_upper":{
        "column":4,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C4_double":{
        "column":4,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    "C5_lower":{
        "column":5,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C5_upper":{
        "column":5,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C5_double":{
        "column":5,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    "C6_lower":{
        "column":6,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C6_upper":{
        "column":6,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C6_double":{
        "column":6,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    "C7_lower":{
        "column":7,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C7_upper":{
        "column":7,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C7_double":{
        "column":7,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}}
}

In [142]:
import json

with open("BM_missing.json", "w") as file:
    json.dump(missing_list, file)