In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
import torch


In [6]:
from torch.utils.data import DataLoader, Dataset
from data_loaders import *

In [4]:
data = dataset_loader("california")

In [7]:
from missing_process.block_rules import *

In [8]:
import sys
sys.path.append("..")
import pickle
import os
import re
import sys
import numpy as np
import pandas as pd

from data_loaders import *
import missing_process.missing_method as missing_method

In [9]:
def generate_middle_single_column(lower,upper,partial_missing,dataset):
    if lower == 0:
        lower_quantile = np.min(dataset, axis=0)
    else:
        lower_quantile = np.quantile(dataset,lower, axis=0)
    if upper == 1:
        upper_quantile = np.max(dataset, axis=0)
    else:
        upper_quantile = np.quantile(dataset,upper, axis=0)


    ix_larger_than = dataset >= lower_quantile
    ix_smaller_than = dataset <= upper_quantile

    
    combined_ix = np.equal(ix_larger_than, ix_smaller_than)
    combined_ix = random_missing_single_column(combined_ix,partial_missing)
    return combined_ix

In [10]:
def random_missing_single_column(array, fraction_to_change):

    result = array.copy()
    n_to_change = int(np.sum(result) * fraction_to_change)
    ix_to_change = np.random.choice(np.flatnonzero(result), size=n_to_change, replace=False)

    result[ix_to_change] = False

    return result

In [11]:
def missing_single(X,multiple_block,missing_dim = 1):
    
    Xnan = X.copy()

    #---- Missing Dimention
    
    ix_list = []
    for key in multiple_block.keys():
        info = multiple_block[key]
        combined_ix = generate_middle_single_column(info["lower"],info["upper"],info["partial_missing"], X)
        ix_list.append(combined_ix)
    combined_ix = np.logical_or.reduce(ix_list)
    
    Xnan[:,][combined_ix] = np.nan

    masks = np.array(~np.isnan(Xnan), dtype=np.float)
    return masks

In [12]:
def BM_missing(observed_values,missing_para):

    mask = np.ones(observed_values.shape)
    mask_single = missing_single(observed_values[:,missing_para["column"]],missing_para["missing"])
    mask[:,missing_para["column"]]= mask_single

    print("MASK from BM_missing",mask)

    return mask

In [None]:
data = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5],[6,6,6],[7,7,7],[8,8,8],[9,9,9],[10,10,10]])


In [13]:
def generate_mask(dataname,path: str, aug_rate=1,missing_type = "MCAR",
                  missing_para = ""):
 
    data = dataset_loader(dataname)
    observed_values = data["data"].astype("float32")

    #observed_values = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5],[6,6,6],[7,7,7],[8,8,8],[9,9,9],[10,10,10]]).astype("float32")



    observed_masks = ~np.isnan(observed_values)
    masks = observed_masks.copy()
    
    "Need input origin dataset and parameters"
    if missing_type == "MCAR":
        #masks = MCAR(observed_values,missing_para,masks)
        pass

    elif missing_type == "quantile":
        Xnan, Xz = missing_method.missing_by_range(observed_values, missing_para)
        masks = np.array(~np.isnan(Xnan), dtype=np.float)

    elif missing_type == "logistic":
        masks = missing_method.MNAR_mask_logistic(observed_values, missing_para)

    elif missing_type == "diffuse":
        #masks = missing_method.MNAR_self_mask_logistic(observed_values, missing_para)

        masks = diffuse_mnar_single(observed_values, missing_para[0],missing_para[1])

    elif missing_type == "BN":
        print("go BN")
        masks = BM_missing(observed_values, missing_para)


    # gt_mask: 0 for missing elements and manully maksed elements

    gt_masks = masks.reshape(observed_masks.shape)

    observed_values = np.nan_to_num(observed_values)
    observed_masks = observed_masks.astype(int)
    gt_masks = gt_masks.astype(int)

    return observed_values, observed_masks, gt_masks, data["data"].shape[1]

In [14]:

class tabular_dataset(Dataset):
    # eval_length should be equal to attributes number.
    def __init__(
        self, dataname, use_index_list=None, 
        aug_rate=1, seed=0,
        missing_type = "MCAR", missing_para = "",missing_name = "MCAR"
        ):
        #self.eval_length = eval_length
        np.random.seed(seed)
        
        dataset_path = f"datasets/{dataname}/data.csv"
        processed_data_path = (
            f"datasets/{dataname}/{missing_type}-{missing_name}_seed-{seed}.pk"
        )
        processed_data_path_norm = (
            f"datasets/{dataname}/{missing_type}-{missing_name}_seed-{seed}_max-min_norm.pk"
        )

        # print(processed_data_path)
        # print(processed_data_path_norm)
        # If no dataset created
        if not os.path.isfile(processed_data_path):
            self.observed_values, self.observed_masks, self.gt_masks, self.eval_length = generate_mask(
                dataname, dataset_path, aug_rate=aug_rate,
                missing_type = missing_type, missing_para = missing_para)
            print("Self.gtMasks",self.gt_masks)
            with open(processed_data_path, "wb") as f:
                pickle.dump(
                    [self.observed_values, self.observed_masks, self.gt_masks, self.eval_length], f
                )
            print("--------Dataset created--------")

        elif os.path.isfile(processed_data_path_norm):
            with open(processed_data_path_norm, "rb") as f:
                self.observed_values, self.observed_masks, self.gt_masks, self.eval_length = pickle.load(
                    f
                )
            print("--------Normalized dataset loaded--------")
        
        if use_index_list is None:
            self.use_index_list = np.arange(len(self.observed_values))
        else:
            self.use_index_list = use_index_list

    def __getitem__(self, org_index):
        index = self.use_index_list[org_index]
        s = {
            "observed_data": self.observed_values[index],
            "observed_mask": self.observed_masks[index],
            "gt_mask": self.gt_masks[index],
            "timepoints": np.arange(self.eval_length),
        }
        return s

    def __len__(self):
        return len(self.use_index_list)





In [15]:
def columnwise_min_max_scaling(dataset):
    col_num = dataset.observed_values.shape[1]
    max_arr = np.zeros(col_num)
    min_arr = np.zeros(col_num)

    for k in range(col_num):
        # Using observed_mask to avoid counting missing values.
        obs_ind = dataset.gt_masks[:, k].astype(bool)
        temp = dataset.observed_values[:, k][obs_ind]
        if len(temp) > 0:  # Check if there are non-zero mask values
            max_arr[k] = max(temp)
            min_arr[k] = min(temp)
        else:
            max_arr[k] = 1
            min_arr[k] = 0

    dataset.observed_values = ((dataset.observed_values - min_arr) / (max_arr - min_arr + 1e-6))



In [16]:
def prepare_dataset(dataname, seed=1, nfold=5, batch_size=16,
                   missing_type = "Quantile", missing_para = 0.5, missing_name = "Q1_complete"):

    dataset = tabular_dataset(dataname = dataname,seed=seed,
                              missing_type = missing_type, missing_para = missing_para,
                                missing_name = missing_name)
    # print("Missing Type:",missing_type)
    # print("Missing Para:",missing_para)
    print("Missing Name:",missing_name)
    
    
    indlist = np.arange(len(dataset))

    np.random.seed(seed + 1)
    np.random.shuffle(indlist)

    tmp_ratio = 1 / nfold
    start = (int)((nfold - 1) * len(dataset) * tmp_ratio)
    
    end = (int)(nfold * len(dataset) * tmp_ratio)

    test_index = indlist[start:end]
    remain_index = np.delete(indlist, np.arange(start, end))

    np.random.shuffle(remain_index)

    # Modify here to change train,valid ratio
    num_train = (int)(len(remain_index) * 1)
    train_index = remain_index[:num_train]
    valid_index = remain_index[num_train:]



    # Here we perform max-min normalization.
    print("Here we perform max-min normalization.")
    processed_data_path_norm = (
        f"datasets/{dataname}/{missing_type}-{missing_name}_seed-{seed}_max-min_norm.pk"
    )
    if not os.path.isfile(processed_data_path_norm):
        print(
            "--------------Dataset has not been normalized yet. Perform data normalization and store the mean value of each column.--------------"
        )
# 使用上面的函数来进行数据处理
        columnwise_min_max_scaling(dataset)

        with open(processed_data_path_norm, "wb") as f:
            pickle.dump(
                [dataset.observed_values, dataset.observed_masks, dataset.gt_masks, dataset.eval_length], f
            )



In [17]:
def diffuse_mnar_single(data, up_percentile = 0.5, obs_percentile = 0.5):
    
    def scale_data(data):
        min_vals = np.min(data, axis=0)
        max_vals = np.max(data, axis=0)
        scaled_data = (data - min_vals) / (max_vals - min_vals)
        return scaled_data

    data = scale_data(data)

    mask = np.ones(data.shape)

    n_cols = data.shape[1]
    n_miss_cols = int(n_cols * 0.5)  # 选择50%的列作为缺失列
    miss_cols = np.random.choice(n_cols, size=n_miss_cols, replace=False)  # 随机选择缺失列的索引

    obs_cols = [col for col in range(data.shape[1]) if col not in miss_cols]
    
    for miss_col in miss_cols:
        missvar_bounds = np.quantile(data[:, miss_col], up_percentile)
        temp = data[:, miss_col] > missvar_bounds
        
        obsvar_bounds = np.quantile(data[temp][:, obs_cols], obs_percentile)
        temp2 = data[:, miss_col] > obsvar_bounds

        merged_temp = np.logical_or(temp, temp2).astype(int)
        mask[:, miss_col] = merged_temp
    print("Missing Rate",1 - np.count_nonzero(mask) / mask.size)
    return mask

In [None]:
missing_name = "C1_double"



In [62]:
dataset = "california"
dataset = "concrete_compression"
#dataset = "wine_quality_white"
#dataset = "wine_quality_red"
#dataset = "banknote"

#"concrete_compression"#,"wine_quality_white","banknote",
seed = 1
nfold = 5


missing_type = "logistic"
#missing_type = "diffuse"
#missing_type = "quantile"
#missing_type = "BM"

missing_rule = load_json_file("q_ratio.json")
#missing_rule = load_json_file("diffuse_ratio.json")

#missing_rule = load_json_file("single_quantile.json")
#missing_rule = load_json_file("double_quantile_1.json")
#missing_rule = load_json_file("double_quantile_2.json")

#missing_rule = load_json_file("BM_missing.json")


for missing_name in missing_rule:
    missing_para = missing_rule[missing_name]
    prepare_dataset(
        dataname=dataset,
        seed=seed,nfold=nfold,batch_size=128,
        missing_type = missing_type,missing_para = missing_para,missing_name = missing_name)

Self.gtMasks [[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 1 0]
 [0 1 0 ... 0 1 1]
 ...
 [0 0 0 ... 0 0 0]
 [0 1 1 ... 1 0 0]
 [0 0 0 ... 0 0 1]]
--------Dataset created--------
Missing Name: 0.25
Here we perform max-min normalization.
--------------Dataset has not been normalized yet. Perform data normalization and store the mean value of each column.--------------
Self.gtMasks [[0 1 0 ... 0 1 0]
 [0 1 1 ... 1 1 0]
 [1 1 1 ... 1 0 1]
 ...
 [0 0 0 ... 1 0 1]
 [0 1 0 ... 1 1 1]
 [0 0 0 ... 1 1 1]]
--------Dataset created--------
Missing Name: 0.5
Here we perform max-min normalization.
--------------Dataset has not been normalized yet. Perform data normalization and store the mean value of each column.--------------
Self.gtMasks [[0 0 1 ... 1 1 1]
 [1 1 1 ... 0 1 1]
 [0 0 1 ... 1 1 1]
 ...
 [0 1 1 ... 1 1 1]
 [1 1 1 ... 0 1 1]
 [1 0 0 ... 1 1 1]]
--------Dataset created--------
Missing Name: 0.75
Here we perform max-min normalization.
--------------Dataset has not been normalized yet. Perform data no

In [17]:
missing_rule

{'Q2_Q3_complete': {'1': {'lower': 0.25, 'upper': 0.5, 'partial_missing': 0.0},
  '2': {'lower': 0.5, 'upper': 0.75, 'partial_missing': 0.0}},
 'Q2_Q3_partial': {'1': {'lower': 0.25, 'upper': 0.5, 'partial_missing': 0.5},
  '2': {'lower': 0.5, 'upper': 0.75, 'partial_missing': 0.5}},
 'Q2_Q4_complete': {'1': {'lower': 0.25, 'upper': 0.5, 'partial_missing': 0.0},
  '2': {'lower': 0.75, 'upper': 1, 'partial_missing': 0.0}},
 'Q2_Q4_partial': {'1': {'lower': 0.25, 'upper': 0.5, 'partial_missing': 0.5},
  '2': {'lower': 0.75, 'upper': 1, 'partial_missing': 0.5}},
 'Q3_Q4_complete': {'1': {'lower': 0.5, 'upper': 0.75, 'partial_missing': 0.0},
  '2': {'lower': 0.75, 'upper': 1, 'partial_missing': 0.0}},
 'Q3_Q4_partial': {'1': {'lower': 0.5, 'upper': 0.75, 'partial_missing': 0.5},
  '2': {'lower': 0.75, 'upper': 1, 'partial_missing': 0.5}}}

In [18]:
missing_type = "BN"
dataset = "wine_quality_white"
for missing_name in missing_list:
    missing_para = missing_list[missing_name]
    prepare_dataset(dataset,seed=1, nfold=5, batch_size=16,
                   missing_type = missing_type, missing_para = missing_para, missing_name = missing_name)


--------Normalized dataset loaded--------
Missing Name: C0_lower
Here we perform max-min normalization.
--------Normalized dataset loaded--------
Missing Name: C0_upper
Here we perform max-min normalization.
--------Normalized dataset loaded--------
Missing Name: C0_double
Here we perform max-min normalization.
--------Normalized dataset loaded--------
Missing Name: C1_lower
Here we perform max-min normalization.
--------Normalized dataset loaded--------
Missing Name: C1_upper
Here we perform max-min normalization.
--------Normalized dataset loaded--------
Missing Name: C1_double
Here we perform max-min normalization.
--------Normalized dataset loaded--------
Missing Name: C2_lower
Here we perform max-min normalization.
--------Normalized dataset loaded--------
Missing Name: C2_upper
Here we perform max-min normalization.
--------Normalized dataset loaded--------
Missing Name: C2_double
Here we perform max-min normalization.
--------Normalized dataset loaded--------
Missing Name: C3_lo

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)


IndexError: index 11 is out of bounds for axis 1 with size 11

In [18]:
missing_list = {
    "C0_lower":{
        "column":0,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C0_upper":{
        "column":0,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C0_double":{
        "column":0,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    "C1_lower":{
        "column":1,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C1_upper":{
        "column":1,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C1_double":{
        "column":1,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    "C2_lower":{
        "column":2,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C2_upper":{
        "column":2,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C2_double":{
        "column":2,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    "C3_lower":{
        "column":3,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C3_upper":{
        "column":3,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C3_double":{
        "column":3,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    "C4_lower":{
        "column":4,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C4_upper":{
        "column":4,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C4_double":{
        "column":4,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    "C5_lower":{
        "column":5,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C5_upper":{
        "column":5,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C5_double":{
        "column":5,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    "C6_lower":{
        "column":6,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C6_upper":{
        "column":6,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C6_double":{
        "column":6,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    "C7_lower":{
        "column":7,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C7_upper":{
        "column":7,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C7_double":{
        "column":7,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    
    "C8_lower":{
        "column":8,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C8_upper":{
        "column":8,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C8_double":{
        "column":8,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    
    "C9_lower":{
        "column":9,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C8_upper":{
        "column":9,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C9_double":{
        "column":8,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    

    "C10_lower":{
        "column":10,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C10_upper":{
        "column":10,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C10_double":{
        "column":10,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
    
    "C11_lower":{
        "column":11,
        "missing":{1:{"lower": 0.0, "upper": 0.35, "partial_missing": 0}}
    },
    "C11_upper":{
        "column":11,
        "missing":{1:{"lower": 0.65, "upper": 1, "partial_missing": 0.0}}
    },
    "C11_double":{
        "column":11,
        "missing": {"1": {"lower": 0.0, "upper": 0.35, "partial_missing": 0.0}, 
                    "2": {"lower": 0.65, "upper": 1, "partial_missing": 0.0}}},
}

In [15]:
import json

with open("BM_missing.json", "w") as file:
    json.dump(missing_list, file)

In [49]:
processed_data_path_norm = "datasets/banknote/quantile-Q1_complete_seed-1_max-min_norm.pk"
    

with open(processed_data_path_norm, "rb") as f:
        observed_values, observed_masks, gt_masks, eval_length = pickle.load(
                f
        )

print(observed_values, observed_masks, gt_masks, eval_length)

[[ 0.7348913   0.67358271 -0.0466758   0.1950446 ]
 [ 0.65579731 -0.06367053  0.17834973  0.5178313 ]
 [ 0.60817107  0.76605918 -0.12639074 -0.24375432]
 ...
 [-0.23022065 -0.80191988  0.98284117 -0.0755649 ]
 [-0.20851314 -0.45560047  0.71584793  0.23204505]
 [-0.08964529  0.071439    0.21737031  0.74188154]] [[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 ...
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]] [[1 1 0 1]
 [1 0 1 1]
 [1 1 0 0]
 ...
 [0 0 1 0]
 [0 0 1 1]
 [0 1 1 1]] 4
