In [88]:
import sys
sys.path.append("..")
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm


In [89]:
import os
import sys
from scipy import optimize
from torch.utils.data import DataLoader, Dataset
from data_loaders import *
import missing_process.missing_method as missing_method
from missing_process.block_rules import *

In [90]:
def minmaxscaler(array,column_min,column_max):
    scaled_array = np.zeros_like(array)  
    for i in range(array.shape[1]):
        if column_min[i] == column_max[i]:
            # If min and max are the same for this column, leave the column unchanged
            scaled_array[:, i] = array[:, i]
        else:
            # Otherwise, apply Min-Max scaling
            scaled_array[:, i] = (array[:, i] - column_min[i]) / (column_max[i] - column_min[i])


    return scaled_array


In [91]:
def MCAR(observed_values, missing_ratio, masks):
    for col in range(observed_values.shape[1]):  # col #

        obs_indices = np.where(observed_values[:, col])[0]
        miss_indices = np.random.choice(
        obs_indices, (int)(len(obs_indices) * missing_ratio), replace=False
        )
        masks[miss_indices, col] = False

    return masks

def process_func(dataname, path: str, aug_rate=1,missing_type = "MCAR",
                  missing_para = ""):
 
    data = dataset_loader(dataname)
    # print(data)
    # data.replace("?", np.nan, inplace=True)
    # Don't apply data argument (use n*dataset)
    # data_aug = pd.concat([data] * aug_rate)

    observed_values = data["data"].astype("float32")

    observed_masks = ~np.isnan(observed_values)
    masks = observed_masks.copy()

    "Need input origin dataset and parameters"
    if missing_type == "MCAR":
        masks = MCAR(observed_values,missing_para,masks)

    elif missing_type == "quantile":
        Xnan, Xz = missing_method.missing_by_range(observed_values, missing_para)
        masks = np.array(~np.isnan(Xnan), dtype=np.float)

    elif missing_type == "logistic":
        masks = missing_method.MNAR_mask_logistic(observed_values, missing_para)

    elif missing_type == "diffuse":
        #print("Go Diffuse")
        #masks = missing_method.MNAR_self_mask_logistic(observed_values, missing_para)

        masks = diffuse_mnar_single(observed_values, missing_para[0],missing_para[1])


    # gt_mask: 0 for missing elements and manully maksed elements
    gt_masks = masks.reshape(observed_masks.shape)

    observed_values = np.nan_to_num(observed_values)
    observed_masks = observed_masks.astype(int)
    gt_masks = gt_masks.astype(int)

    return observed_values, observed_masks, gt_masks, data["data"].shape[1]

In [99]:
class tabular_dataset(Dataset):
    # eval_length should be equal to attributes number.
    def __init__(
        self, dataname, use_index_list=None, 
        aug_rate=1, seed=0,
        missing_type = "MCAR", missing_para = "",missing_name = "MCAR"
        ):
        #self.eval_length = eval_length
        np.random.seed(seed)
        
        dataset_path = f"datasets/{dataname}/data.csv"
        processed_data_path = (
            f"datasets/{dataname}/{missing_type}-{missing_name}_seed-{seed}.pk"
        )
        processed_data_path_norm = (
            f"datasets/{dataname}/{missing_type}-{missing_name}_seed-{seed}_max-min_norm.pk"
        )
        # If no dataset created
        if not os.path.isfile(processed_data_path):
            print("--------NO Dataset--------")
            self.observed_values, self.observed_masks, self.gt_masks, self.eval_length = process_func(
                dataname, dataset_path, aug_rate=aug_rate,
                missing_type = missing_type, missing_para = missing_para
            )
            with open(processed_data_path, "wb") as f:
                pickle.dump(
                    [self.observed_values, self.observed_masks, self.gt_masks, self.eval_length], f
                )
            print("--------Dataset created--------")

        elif os.path.isfile(processed_data_path) and os.path.isfile(processed_data_path_norm):
            with open(processed_data_path_norm, "rb") as f:
                self.observed_values, self.observed_masks, self.gt_masks, self.eval_length = pickle.load(
                    f
                )

            print("--------Normalized Dataset loaded--------")


        elif os.path.isfile(processed_data_path_norm):
            print("Have Norm")
            with open(processed_data_path_norm, "rb") as f:
                self.observed_values, self.observed_masks, self.gt_masks, self.eval_length = pickle.load(
                    f
                )
            print("--------Normal Dataset loaded--------")

        else:
            with open(processed_data_path, "rb") as f:
                self.observed_values, self.observed_masks, self.gt_masks, self.eval_length = pickle.load(
                    f
                )
            print("Go to Normalized")


        #计算0的占比
        zero_percentage = (self.gt_masks == 0).mean() * 100

        print(f"0的占比: {zero_percentage}%")
        
        if use_index_list is None:
            self.use_index_list = np.arange(len(self.observed_values))
        else:
            self.use_index_list = use_index_list

    def __getitem__(self, org_index):
        index = self.use_index_list[org_index]
        s = {
            "observed_data": self.observed_values[index],
            "observed_mask": self.observed_masks[index],
            "gt_mask": self.gt_masks[index],
            "timepoints": np.arange(self.eval_length),
        }
        return s

    def __len__(self):
        return len(self.use_index_list)

In [93]:
def get_dataloader(dataname, seed=1, nfold=5, batch_size=16,
                   missing_type = "MCAR", missing_para = "", missing_name = "MCAR"):

    dataset = tabular_dataset(dataname = dataname,seed=seed,
                              missing_type = missing_type, missing_para = missing_para,
                                missing_name = missing_name)
    #print(f"Dataset size:{len(dataset)} entries")
    
    
    indlist = np.arange(len(dataset))

    np.random.seed(seed + 1)
    np.random.shuffle(indlist)

    tmp_ratio = 1 / nfold
    start = (int)((nfold - 1) * len(dataset) * tmp_ratio)
    
    end = (int)(nfold * len(dataset) * tmp_ratio)

    test_index = indlist[start:end]
    remain_index = np.delete(indlist, np.arange(start, end))

    np.random.shuffle(remain_index)

    # Modify here to change train,valid ratio
    num_train = (int)(len(remain_index) * 0.9)
    train_index = remain_index[:num_train]
    valid_index = remain_index[num_train:]



    # Here we perform max-min normalization.
    processed_data_path_norm = (
        f"datasets/{dataname}/{missing_type}-{missing_name}_seed-{seed}_max-min_norm.pk"
    )
    
    if not os.path.isfile(processed_data_path_norm):
        #print(
        #    "--------------Dataset has not been normalized yet. Perform data normalization and store the mean value of each column.--------------"
        #)
        # data transformation after train-test split.
        col_num = dataset.observed_values.shape[1]
        max_arr = np.zeros(col_num)
        min_arr = np.zeros(col_num)
        mean_arr = np.zeros(col_num)

        print("Missing Rate",1 - np.count_nonzero(dataset.gt_masks) / dataset.observed_values.size)
        for k in range(col_num):
            # Using observed_mask to avoid counting missing values.
            obs_ind = dataset.gt_masks[train_index, k].astype(bool)
            temp = dataset.observed_values[train_index, k]
            #print(temp[obs_ind])
            try:
                max_arr[k] = max(temp[obs_ind])
                min_arr[k] = min(temp[obs_ind])
            except:
                max_arr[k] = max(temp)
                min_arr[k] = min(temp)
        
       
        # print(max_arr)
        # print(min_arr)
        # print(f"--------------Max-value for each column {max_arr}--------------")
        # print(f"--------------Min-value for each column {min_arr}--------------")

        #dataset.observed_values = minmaxscaler(dataset.observed_values,min_arr,max_arr)
        dataset.observed_values = (
            (dataset.observed_values - ( min_arr + 1)) / (max_arr - min_arr + 1)
        )  * dataset.observed_masks

        with open(processed_data_path_norm, "wb") as f:
            pickle.dump(
                [dataset.observed_values, dataset.observed_masks, dataset.gt_masks, dataset.eval_length], f
            )

#     # Create datasets and corresponding data loaders objects.
#     train_dataset = tabular_dataset(dataname = dataname,
#         use_index_list=train_index, seed=seed,
#         missing_type = missing_type, missing_para = missing_para, missing_name = missing_name
#     )
#     #train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=1)
#     valid_dataset = tabular_dataset(dataname = dataname,
#         use_index_list=valid_index, seed=seed,
#         missing_type = missing_type, missing_para = missing_para, missing_name = missing_name
#     )
#     #valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=0)

#     test_dataset = tabular_dataset(dataname = dataname,
#         use_index_list=test_index, seed=seed,
#         missing_type = missing_type, missing_para = missing_para, missing_name = missing_name
#     )
#    #test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=0)

#     print(f"Training dataset size: {len(train_dataset)}")
#     print(f"Validation dataset size: {len(valid_dataset)}")
#     print(f"Testing dataset size: {len(test_dataset)}")

    #return train_loader, valid_loader, test_loader


# cc : self_mask

# Red: Self_mask

# white: Self_mask

In [94]:
def diffuse_mnar_single(data, up_percentile = 0.5, obs_percentile = 0.5):
    
    def scale_data(data):
        min_vals = np.min(data, axis=0)
        max_vals = np.max(data, axis=0)
        scaled_data = (data - min_vals) / (max_vals - min_vals)
        return scaled_data

    data = scale_data(data)

    mask = np.ones(data.shape)

    n_cols = data.shape[1]
    n_miss_cols = int(n_cols)  # 选择50%的列作为缺失列
    miss_cols = np.random.choice(n_cols, size=n_miss_cols, replace=False)  # 随机选择缺失列的索引

    obs_cols = [col for col in range(data.shape[1]) if col not in miss_cols]
    
    for miss_col in miss_cols:
        missvar_bounds = np.quantile(data[:, miss_col], up_percentile)
        temp = data[:, miss_col] >= missvar_bounds

        obsvar_bounds = np.quantile(data[temp][:, -miss_cols], obs_percentile)
        temp2 = data[:, miss_col] > obsvar_bounds

        merged_temp = np.logical_or(temp, temp2).astype(int)
        mask[:, miss_col] = merged_temp
    print("Missing Rate",1 - np.count_nonzero(mask) / mask.size)
    return mask

In [103]:
datalist = ["banknote","concrete_compression",
            "wine_quality_white","wine_quality_red",
            "california","climate_model_crashes",
            "connectionist_bench_sonar","qsar_biodegradation",
            "yeast","yacht_hydrodynamics"
            ]



missingtypelist = ["quantile","diffuse","logistic"]
missingtypelist = ["quantile"]


In [107]:
seed = 1
nfold = 5


for dataset in tqdm(datalist):
    
    for missingtype in missingtypelist:
        if missingtype == "logistic":
            missing_rule = load_json_file("missing_rate.json")
        elif missingtype == "diffuse":
            missing_rule = load_json_file("diffuse_ratio.json")
        elif missingtype == "quantile":
            missing_rule = load_json_file("quantile_full.json")
        

        for rule_name in missing_rule:
            
            rule = missing_rule[rule_name]

            # Create folder
            # Every loader contains "observed_data", "observed_mask", "gt_mask", "timepoints"
            print(dataset,missingtype,rule_name)
            try:
                get_dataloader(
                    dataname=dataset,
                    seed=seed,
                    nfold=nfold,
                    batch_size=128,
                    missing_type = missingtype,
                    missing_para = rule,
                    missing_name = rule_name
                )

            except:
                print(dataset,missingtype,rule_name, "Not have")



Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)


banknote quantile Q1_Q2_0.25
--------Normalized Dataset loaded--------
0的占比: 12.545587162654998%
banknote quantile Q1_Q3_0.25
--------Normalized Dataset loaded--------
0的占比: 12.545587162654998%
banknote quantile Q1_Q4_0.25
--------Normalized Dataset loaded--------
0的占比: 12.563822027716995%
banknote quantile Q2_Q3_0.25
--------Normalized Dataset loaded--------
0的占比: 12.527352297592998%
banknote quantile Q2_Q4_0.25
--------Normalized Dataset loaded--------
0的占比: 12.563822027716995%
banknote quantile Q3_Q4_0.25
--------Normalized Dataset loaded--------
0的占比: 12.563822027716995%
banknote quantile Q1_0.25
--------Normalized Dataset loaded--------
0的占比: 6.272793581327499%
banknote quantile Q2_0.25
--------Normalized Dataset loaded--------
0的占比: 6.272793581327499%
banknote quantile Q3_0.25
--------Normalized Dataset loaded--------
0的占比: 6.272793581327499%
banknote quantile Q4_0.25
--------Normalized Dataset loaded--------
0的占比: 6.291028446389496%
banknote quantile Q1_Q2_0.5
--------Normalized

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)


--------Dataset created--------
0的占比: 18.094660194174757%
Missing Rate 0.1809466019417476
concrete_compression quantile Q2_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 16.820388349514563%
Missing Rate 0.1682038834951456
concrete_compression quantile Q3_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 15.291262135922329%
Missing Rate 0.15291262135922334
concrete_compression quantile Q1_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 8.410194174757281%
Missing Rate 0.0841019417475728
concrete_compression quantile Q2_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 10.048543689320388%
Missing Rate 0.10048543689320388
concrete_compression quantile Q3_0.25
--------NO Dataset--------


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)


--------Dataset created--------
0的占比: 8.762135922330097%
Missing Rate 0.08762135922330094
concrete_compression quantile Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 6.771844660194175%
Missing Rate 0.0677184466019417
concrete_compression quantile Q1_Q2_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 31.820388349514563%
Missing Rate 0.3182038834951456
concrete_compression quantile Q1_Q3_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 32.5%
Missing Rate 0.32499999999999996
concrete_compression quantile Q1_Q4_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 30.218446601941746%
Missing Rate 0.3021844660194175
concrete_compression quantile Q2_Q3_0.5
--------NO Dataset--------


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)


--------Dataset created--------
0的占比: 34.381067961165044%
Missing Rate 0.34381067961165046
concrete_compression quantile Q2_Q4_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 33.49514563106796%
Missing Rate 0.3349514563106796
concrete_compression quantile Q3_Q4_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 30.254854368932037%
Missing Rate 0.3025485436893204
concrete_compression quantile Q1_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 16.735436893203882%
Missing Rate 0.1673543689320388
concrete_compression quantile Q2_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 20.012135922330096%
Missing Rate 0.200121359223301
concrete_compression quantile Q3_0.5
--------NO Dataset--------


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)


--------Dataset created--------
0的占比: 17.487864077669904%
Missing Rate 0.174878640776699
concrete_compression quantile Q4_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 13.483009708737864%
Missing Rate 0.1348300970873786
concrete_compression quantile Q1_Q2_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 44.21116504854369%
Missing Rate 0.4421116504854369
concrete_compression quantile Q1_Q3_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 47.560679611650485%
Missing Rate 0.4756067961165048
concrete_compression quantile Q1_Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 45.351941747572816%
Missing Rate 0.4535194174757281
concrete_compression quantile Q2_Q3_0.75
--------NO Dataset--------


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)


--------Dataset created--------
0的占比: 49.1383495145631%
Missing Rate 0.49138349514563107
concrete_compression quantile Q2_Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 50.23058252427185%
Missing Rate 0.5023058252427184
concrete_compression quantile Q3_Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 44.817961165048544%
Missing Rate 0.4481796116504855
concrete_compression quantile Q1_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 25.13349514563107%
Missing Rate 0.2513349514563107
concrete_compression quantile Q2_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 30.012135922330096%
Missing Rate 0.30012135922330097
concrete_compression quantile Q3_0.75
--------NO Dataset--------


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)


--------Dataset created--------
0的占比: 26.25%
Missing Rate 0.26249999999999996
concrete_compression quantile Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 20.218446601941746%
Missing Rate 0.20218446601941753
concrete_compression quantile Q1_Q2_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 53.83495145631068%
Missing Rate 0.5383495145631068
concrete_compression quantile Q1_Q3_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 61.529126213592235%
Missing Rate 0.6152912621359223
concrete_compression quantile Q1_Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 60.36407766990292%
Missing Rate 0.603640776699029
concrete_compression quantile Q2_Q3_1.0
--------NO Dataset--------


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)


--------Dataset created--------
0的占比: 62.20873786407767%
Missing Rate 0.6220873786407767
concrete_compression quantile Q2_Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 66.89320388349515%
Missing Rate 0.6689320388349514
concrete_compression quantile Q3_Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 58.88349514563107%
Missing Rate 0.5888349514563107
concrete_compression quantile Q1_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 33.44660194174757%
Missing Rate 0.3344660194174758
concrete_compression quantile Q2_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 39.97572815533981%
Missing Rate 0.39975728155339807
concrete_compression quantile Q3_1.0
--------NO Dataset--------


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

--------Dataset created--------
0的占比: 34.95145631067961%
Missing Rate 0.3495145631067961
concrete_compression quantile Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 26.91747572815534%
Missing Rate 0.2691747572815534
wine_quality_white quantile Q1_Q2_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 13.486023980103196%
Missing Rate 0.134860239801032
wine_quality_white quantile Q1_Q3_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 13.504584431493374%
Missing Rate 0.13504584431493372
wine_quality_white quantile Q1_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 13.254018337725974%
Missing Rate 0.1325401833772597
wine_quality_white quantile Q2_Q3_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 13.567689966219978%
Missing Rate 0.13567689966219976
wine_quality_white quantile Q2_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 13.473031664130072%
Missing R

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)


wine_quality_white quantile Q2_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 6.936040684509447%
Missing Rate 0.06936040684509448
wine_quality_white quantile Q3_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 6.787557073388024%
Missing Rate 0.06787557073388029
wine_quality_white quantile Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 6.536990979620624%
Missing Rate 0.06536990979620627
wine_quality_white quantile Q1_Q2_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 26.60269497754185%
Missing Rate 0.2660269497754185
wine_quality_white quantile Q1_Q3_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 26.985040276179518%
Missing Rate 0.26985040276179517
wine_quality_white quantile Q1_Q4_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 26.478339953227664%
Missing Rate 0.2647833995322766
wine_quality_white quantile Q2_Q3_0.5
--------NO Dataset--------
--------Dat

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

--------Dataset created--------
0的占比: 13.860945098184787%
Missing Rate 0.13860945098184785
wine_quality_white quantile Q3_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 13.56583392108096%
Missing Rate 0.1356583392108096
wine_quality_white quantile Q4_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 13.059133598129105%
Missing Rate 0.13059133598129102
wine_quality_white quantile Q1_Q2_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 39.355581127733025%
Missing Rate 0.3935558112773303
wine_quality_white quantile Q1_Q3_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 40.474776346560745%
Missing Rate 0.4047477634656075
wine_quality_white quantile Q1_Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 39.71194179442444%
Missing Rate 0.39711941794424443
wine_quality_white quantile Q2_Q3_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 39.48550428746427%
Missing Rate 0

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

--------Dataset created--------
0的占比: 20.34782285905193%
Missing Rate 0.20347822859051934
wine_quality_white quantile Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 19.584988306915623%
Missing Rate 0.1958498830691563
wine_quality_white quantile Q1_Q2_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 51.7632428820669%
Missing Rate 0.5176324288206688
wine_quality_white quantile Q1_Q3_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 53.94223987527377%
Missing Rate 0.5394223987527377
wine_quality_white quantile Q1_Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 52.93069527450908%
Missing Rate 0.5293069527450908
wine_quality_white quantile Q2_Q3_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 51.83191655221056%
Missing Rate 0.5183191655221056
wine_quality_white quantile Q2_Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 53.81788485095957%
Missing Rate 0.53817

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
 30%|███       | 3/10 [00:02<00:06,  1.15it/s]

--------Dataset created--------
0的占比: 27.11681948104978%
Missing Rate 0.2711681948104978
wine_quality_white quantile Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 26.10527488028509%
Missing Rate 0.2610527488028509
wine_quality_red quantile Q1_Q2_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 13.616464835977032%
Missing Rate 0.13616464835977027
wine_quality_red quantile Q1_Q3_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 13.298084029791346%
Missing Rate 0.13298084029791346
wine_quality_red quantile Q1_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 13.093410654386265%
Missing Rate 0.13093410654386262
wine_quality_red quantile Q2_Q3_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 13.855250440616294%
Missing Rate 0.13855250440616296
wine_quality_red quantile Q2_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 13.838194325999204%
Missing Rate 0.138

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

--------Dataset created--------
0的占比: 39.53038831087611%
Missing Rate 0.3953038831087612
wine_quality_red quantile Q1_Q3_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 39.80897151628859%
Missing Rate 0.39808971516288594
wine_quality_red quantile Q1_Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 39.240434362385585%
Missing Rate 0.3924043436238558
wine_quality_red quantile Q2_Q3_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 40.275171982489056%
Missing Rate 0.40275171982489055
wine_quality_red quantile Q2_Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 41.423617033373134%
Missing Rate 0.4142361703337313
wine_quality_red quantile Q3_Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 38.632099607709364%
Missing Rate 0.3863209960770937
wine_quality_red quantile Q1_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 19.67707089658309%
Missing Rate 0.1967707

 40%|████      | 4/10 [00:03<00:04,  1.46it/s]

--------Dataset created--------
0的占比: 26.050372391835808%
Missing Rate 0.2605037239183581
california quantile Q1_Q2_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.663517441860463%
Missing Rate 0.1266351744186046
california quantile Q1_Q3_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.590237403100776%
Missing Rate 0.1259023740310078
california quantile Q1_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.602349806201552%
Missing Rate 0.12602349806201552
california quantile Q2_Q3_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.703488372093021%
Missing Rate 0.1270348837209302
california quantile Q2_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.733769379844961%
Missing Rate 0.1273376937984496
california quantile Q3_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.604166666666666%
Missing Rate 0.12604166666666672
california quantile 

 50%|█████     | 5/10 [00:04<00:04,  1.03it/s]

california quantile Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 25.27374031007752%
Missing Rate 0.2527374031007752
climate_model_crashes quantile Q1_Q2_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.592592592592592%
Missing Rate 0.12592592592592589
climate_model_crashes quantile Q1_Q3_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.592592592592592%
Missing Rate 0.12592592592592589
climate_model_crashes quantile Q1_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.592592592592592%
Missing Rate 0.12592592592592589
climate_model_crashes quantile Q2_Q3_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.592592592592592%
Missing Rate 0.12592592592592589
climate_model_crashes quantile Q2_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.592592592592592%
Missing Rate 0.12592592592592589
climate_model_crashes quantile Q3_Q4_0.25
--------NO Data

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

--------Dataset created--------
0的占比: 6.296296296296296%
Missing Rate 0.062962962962963
climate_model_crashes quantile Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 6.296296296296296%
Missing Rate 0.062962962962963
climate_model_crashes quantile Q1_Q2_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 25.185185185185183%
Missing Rate 0.2518518518518519
climate_model_crashes quantile Q1_Q3_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 25.185185185185183%
Missing Rate 0.2518518518518519
climate_model_crashes quantile Q1_Q4_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 25.185185185185183%
Missing Rate 0.2518518518518519
climate_model_crashes quantile Q2_Q3_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 25.185185185185183%
Missing Rate 0.2518518518518519
climate_model_crashes quantile Q2_Q4_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 25.185185185185183%


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

--------Dataset created--------
0的占比: 12.592592592592592%
Missing Rate 0.12592592592592589
climate_model_crashes quantile Q1_Q2_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 37.77777777777778%
Missing Rate 0.37777777777777777
climate_model_crashes quantile Q1_Q3_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 37.77777777777778%
Missing Rate 0.37777777777777777
climate_model_crashes quantile Q1_Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 37.77777777777778%
Missing Rate 0.37777777777777777
climate_model_crashes quantile Q2_Q3_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 37.77777777777778%
Missing Rate 0.37777777777777777
climate_model_crashes quantile Q2_Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 37.77777777777778%
Missing Rate 0.37777777777777777
climate_model_crashes quantile Q3_Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 37.7777

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

--------Dataset created--------
0的占比: 50.0%
Missing Rate 0.5
climate_model_crashes quantile Q1_Q3_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 50.0%
Missing Rate 0.5
climate_model_crashes quantile Q1_Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 50.0%
Missing Rate 0.5
climate_model_crashes quantile Q2_Q3_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 50.0%
Missing Rate 0.5
climate_model_crashes quantile Q2_Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 50.0%
Missing Rate 0.5
climate_model_crashes quantile Q3_Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 50.0%
Missing Rate 0.5
climate_model_crashes quantile Q1_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 25.0%
Missing Rate 0.25
climate_model_crashes quantile Q2_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 25.0%
Missing Rate 0.25
climate_model_crashes quantile Q3_1.0
--

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)


connectionist_bench_sonar quantile Q1_Q2_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.612179487179487%
Missing Rate 0.12612179487179487
connectionist_bench_sonar quantile Q1_Q3_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.636217948717949%
Missing Rate 0.12636217948717954
connectionist_bench_sonar quantile Q1_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.580128205128204%
Missing Rate 0.12580128205128205
connectionist_bench_sonar quantile Q2_Q3_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.66826923076923%
Missing Rate 0.12668269230769236
connectionist_bench_sonar quantile Q2_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.620192307692307%
Missing Rate 0.12620192307692313
connectionist_bench_sonar quantile Q3_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.604166666666666%
Missing Rate 0.12604166666666672
connectionist_ben

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

0的占比: 25.096153846153847%
Missing Rate 0.25096153846153846
connectionist_bench_sonar quantile Q1_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.572115384615385%
Missing Rate 0.1257211538461539
connectionist_bench_sonar quantile Q2_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.612179487179487%
Missing Rate 0.12612179487179487
connectionist_bench_sonar quantile Q3_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.612179487179487%
Missing Rate 0.12612179487179487
connectionist_bench_sonar quantile Q4_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.532051282051281%
Missing Rate 0.12532051282051282
connectionist_bench_sonar quantile Q1_Q2_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 37.628205128205124%
Missing Rate 0.3762820512820513
connectionist_bench_sonar quantile Q1_Q3_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 37.756410256410255%
Missing 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

connectionist_bench_sonar quantile Q1_Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 50.15224358974359%
Missing Rate 0.501522435897436
connectionist_bench_sonar quantile Q2_Q3_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 50.10416666666667%
Missing Rate 0.5010416666666666
connectionist_bench_sonar quantile Q2_Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 50.20032051282052%
Missing Rate 0.5020032051282051
connectionist_bench_sonar quantile Q3_Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 50.128205128205124%
Missing Rate 0.5012820512820513
connectionist_bench_sonar quantile Q1_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 25.112179487179485%
Missing Rate 0.25112179487179487
connectionist_bench_sonar quantile Q2_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 25.16025641025641%
Missing Rate 0.2516025641025641
connectionist_bench_sonar quantile Q3_

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

--------Dataset created--------
0的占比: 25.525372789272915%
Missing Rate 0.25525372789272915
qsar_biodegradation quantile Q3_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 25.009825453704774%
Missing Rate 0.2500982545370477
qsar_biodegradation quantile Q1_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 13.494393711709629%
Missing Rate 0.1349439371170963
qsar_biodegradation quantile Q2_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 14.812160443879321%
Missing Rate 0.1481216044387932
qsar_biodegradation quantile Q3_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 14.493122182406658%
Missing Rate 0.1449312218240666
qsar_biodegradation quantile Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.530343312911802%
Missing Rate 0.12530343312911807
qsar_biodegradation quantile Q1_Q2_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 45.40515547335568%
Missing Rate

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

--------Dataset created--------
0的占比: 47.3956768003699%
Missing Rate 0.473956768003699
qsar_biodegradation quantile Q3_Q4_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 45.825916079066005%
Missing Rate 0.4582591607906601
qsar_biodegradation quantile Q1_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 26.940238122760373%
Missing Rate 0.26940238122760374
qsar_biodegradation quantile Q2_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 29.57577158709976%
Missing Rate 0.29575771587099753
qsar_biodegradation quantile Q3_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 28.94000693561438%
Missing Rate 0.2894000693561438
qsar_biodegradation quantile Q4_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 25.032944168304244%
Missing Rate 0.2503294416830424
qsar_biodegradation quantile Q1_Q2_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 59.91446075598197%
Missing Rate 0.599144

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

--------Dataset created--------
0的占比: 65.70569876314877%
Missing Rate 0.6570569876314877
qsar_biodegradation quantile Q3_Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 62.51300427696221%
Missing Rate 0.625130042769622
qsar_biodegradation quantile Q1_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 40.411513119870534%
Missing Rate 0.40411513119870535
qsar_biodegradation quantile Q2_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 44.36481331637961%
Missing Rate 0.44364813316379614
qsar_biodegradation quantile Q3_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 43.410010403421566%
Missing Rate 0.43410010403421573
qsar_biodegradation quantile Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 37.542480638076526%
Missing Rate 0.37542480638076525
qsar_biodegradation quantile Q1_Q2_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 68.67876546064039%
Missing Rate 0

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

--------Dataset created--------
0的占比: 80.31441451855277%
Missing Rate 0.8031441451855277
qsar_biodegradation quantile Q3_Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 75.01098138943475%
Missing Rate 0.7501098138943475
qsar_biodegradation quantile Q1_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 53.83655068778176%
Missing Rate 0.5383655068778176
qsar_biodegradation quantile Q2_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 59.109929487920475%
Missing Rate 0.5910992948792047
qsar_biodegradation quantile Q3_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 57.836088313489775%
Missing Rate 0.5783608831348976
qsar_biodegradation quantile Q4_1.0
--------NO Dataset--------
--------Dataset created--------
0的占比: 50.005779678649866%
Missing Rate 0.5000577967864988
yeast quantile Q1_Q2_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 22.734164420485175%
Missing Rate 0.2273416442048517
ye

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

--------Dataset created--------
0的占比: 12.41576819407008%
Missing Rate 0.12415768194070076
yeast quantile Q3_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 12.912735849056602%
Missing Rate 0.129127358490566
yeast quantile Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 11.40498652291105%
Missing Rate 0.11404986522911054
yeast quantile Q1_Q2_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 40.97877358490566%
Missing Rate 0.4097877358490566
yeast quantile Q1_Q3_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 42.857142857142854%
Missing Rate 0.4285714285714286
yeast quantile Q1_Q4_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 41.36623989218329%
Missing Rate 0.4136623989218329
yeast quantile Q2_Q3_0.5
--------NO Dataset--------
--------Dataset created--------
0的占比: 41.80424528301887%
Missing Rate 0.4180424528301887
yeast quantile Q2_Q4_0.5
--------NO Dataset--------
--------Datase

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

--------Dataset created--------
0的占比: 56.28369272237197%
Missing Rate 0.5628369272237197
yeast quantile Q2_Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 57.488207547169814%
Missing Rate 0.5748820754716981
yeast quantile Q3_Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 57.5050539083558%
Missing Rate 0.575050539083558
yeast quantile Q1_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 37.0872641509434%
Missing Rate 0.370872641509434
yeast quantile Q2_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 37.17991913746631%
Missing Rate 0.3717991913746631
yeast quantile Q3_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 38.68766846361186%
Missing Rate 0.3868766846361186
yeast quantile Q4_0.75
--------NO Dataset--------
--------Dataset created--------
0的占比: 34.18126684636118%
Missing Rate 0.3418126684636119
yeast quantile Q1_Q2_1.0
--------NO Dataset--------
--------Dataset create

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks = np.array(~np.isnan(Xnan), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masks 

--------Dataset created--------
0的占比: 45.56098382749326%
Missing Rate 0.4556098382749326
yacht_hydrodynamics quantile Q1_Q2_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 17.37012987012987%
Missing Rate 0.1737012987012987
yacht_hydrodynamics quantile Q1_Q3_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 19.75108225108225%
Missing Rate 0.19751082251082253
yacht_hydrodynamics quantile Q1_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 17.261904761904763%
Missing Rate 0.17261904761904767
yacht_hydrodynamics quantile Q2_Q3_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 19.85930735930736%
Missing Rate 0.19859307359307354
yacht_hydrodynamics quantile Q2_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 17.857142857142858%
Missing Rate 0.1785714285714286
yacht_hydrodynamics quantile Q3_Q4_0.25
--------NO Dataset--------
--------Dataset created--------
0的占比: 19.101731601731604%
M

100%|██████████| 10/10 [00:07<00:00,  1.37it/s]


In [90]:
dataset = "qsar_biodegradation"
missingtype = "diffuse"
rule_name = "0.4"


if missingtype == "logistic":
    missing_rule = load_json_file("missing_rate.json")
elif missingtype == "diffuse":
    missing_rule = load_json_file("diffuse_ratio.json")
elif missingtype == "quantile":
    missing_rule = load_json_file("complete.json")

rule = missing_rule[rule_name] 

get_dataloader(
    dataname=dataset,
    seed=seed,
    nfold=nfold,
    batch_size=128,
    missing_type = missingtype,
    missing_para = rule,
    missing_name = rule_name
)

Missing Rate 0.025939197780603385
self.eval_length 41
--------Dataset created--------
[[ 3.919   2.6909  0.     ...  7.253   0.      0.    ]
 [ 4.17    2.1144  0.     ...  7.257   0.      0.    ]
 [ 3.932   3.2512  0.     ...  7.601   0.      0.    ]
 ...
 [ 4.869   1.767   0.     ...  9.537   1.      0.    ]
 [ 5.158   1.6914  2.     ... 11.055   0.      1.    ]
 [ 5.076   2.6588  2.     ...  9.13    0.      2.    ]]
[[1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 ...
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]]
Missing Rate 0.025939197780603385
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[[ 3.919       2

In [None]:
# 定义需要替换的字符串和对应的替换字符串
replacements = {
    "logistic-0.25+": "logistic-0.75",
    "logistic-0.75+": "logistic-0.25"
}



In [None]:
mod_name = os.listdir("datasets")
for data in mod_name:
    path = "datasets/{}".format(data)
    files = os.listdir(path)
    # 循环遍历文件列表
    for filename in files:
        # 遍历替换字典中的键值对
        for old_str, new_str in replacements.items():
            # 检查文件名中是否包含需要替换的字符串
            if old_str in filename:
                # 使用字符串的replace方法进行替换
                new_filename = filename.replace(old_str, new_str)
                # 使用os.rename()函数来重命名文件
                os.rename("{}/{}".format(path,filename), "{}/{}".format(path,new_filename))

In [None]:
from scipy.stats import multivariate_normal
n_var = 3
min_corr = 0.1
max_corr = 0.3

mu = np.zeros(n_var)  # mean vector
np.random.seed(1)

corr = np.random.uniform(min_corr, max_corr, size=int(n_var * (n_var - 1) / 2))  # correlation vector (n_var, 2)
cov = np.zeros((n_var, n_var))  # covariance matrix

diag = np.eye(n_var)


cov[np.triu_indices(n_var, k=1)] = corr  # fill upper triangular part with correlations
cov = cov + cov.T + diag


n = 300  # sample size
np.random.seed(2)  # set seed so results are replicable
dat = multivariate_normal.rvs(mean=mu, cov=cov, size=n)  # data


In [None]:
import numpy as np

def missing(data, prob_miss, seed=100):
    np.random.seed(seed)
    m = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        m[i] = np.random.binomial(n=1, size=1, p=prob_miss[i])
    return m

In [None]:
import pandas as pd
import numpy as np

def miss_data(miss_ind, dat, miss_col, n_var):
    miss_dat = pd.DataFrame(dat)
    miss_dat['miss.ind'] = miss_ind
    miss_dat[miss_col] = np.where(miss_ind == 1, np.nan, miss_dat[miss_col])
    
    colnames = ['obs. var' + str(i) for i in range(1, n_var)] + ['miss.ind', 'miss.val']
    miss_dat.columns = colnames
    miss_dat.columns.values[miss_col] = 'miss.var'
    
    return miss_dat

In [None]:
def diffuse_mnar(target_miss, up_percentile, obs_percentile, dat, miss_col, n_var):
    missvar_bounds = np.quantile(dat[:, miss_col], up_percentile)
    temp = dat[:, miss_col] > missvar_bounds

    obsvar_bounds = np.quantile(dat[temp, :-1][:, 0], obs_percentile)

    miss_ind = np.zeros(len(dat))
    miss_ind[temp] = dat[temp, :-1][:, 0] > obsvar_bounds

    miss_dat = np.column_stack((dat, miss_ind, dat[:, miss_col]))
    miss_dat[miss_ind == 1, miss_col] = np.nan

    colnames = ['obs. var' + str(i) for i in range(1, n_var)] + ['miss. ind', 'miss. val']
    miss_dat = pd.DataFrame(miss_dat)
    #miss_dat.columns.values[miss_col] = 'miss.var'

    return miss_dat,dat

In [None]:
def diffuse_mnar(target_miss, up_percentile, obs_percentile, dat, miss_col, n_var):
    missvar_bounds = np.quantile(dat[:, miss_col], up_percentile)
    temp = dat[:, miss_col] > missvar_bounds

    obsvar_bounds = np.quantile(dat[temp, :-1][:, 0], obs_percentile)

    mask = np.zeros(len(dat))
    mask[temp] = dat[temp, :-1][:, 0] > obsvar_bounds


    return mask.sum()


In [None]:
diffuse_mnar(target_miss, up_percentile, obs_percentile, dat, miss_col, n_var)

In [None]:
target_miss = 0.2
up_percentile = 0.6  # 必须小于1 - target.miss
obs_percentile = 1 - target_miss/(1-up_percentile)

miss_col =2

In [None]:
def diffuse_mnar_single(data, up_percentile=0.5, obs_percentile=0.5):
    
    def scale_data(data):
      min_vals = np.min(data, axis=0)
      max_vals = np.max(data, axis=0)
      scaled_data = (data - min_vals) / (max_vals - min_vals)
      return scaled_data

    data = scale_data(data)

    mask = np.ones(data.shape)

    n_cols = data.shape[1]
    n_miss_cols = int(n_cols * 0.5)  # 选择50%的列作为缺失列
    miss_cols = np.random.choice(n_cols, size=n_miss_cols, replace=False)  # 随机选择缺失列的索引

    obs_cols = [col for col in range(data.shape[1]) if col not in miss_cols]
    for miss_col in miss_cols:
      print(miss_col)
      missvar_bounds = np.quantile(data[:, miss_col], up_percentile)
      print(data[:, miss_col])
      print(missvar_bounds)
      temp = data[:, miss_col] > missvar_bounds
      print(temp)


      obsvar_bounds = np.quantile(data[temp, :][:,obs_cols], obs_percentile)
      print(obsvar_bounds)
      
        # 初始化与原始数据维度相同的mask，所有元素均为1
      #mask[temp, miss_col] = (data[temp, :][:,obs_cols][:, 0] <= obsvar_bounds).astype(int)
      temp2 = data[:, miss_col]> obsvar_bounds
      #temp2 = data[:, miss_col] > missvar_bounds  # 根据缺失值情况将对应位置的值设为0
      print(temp2)
      print()
      # print(mask)
      # print()
    return mask

In [None]:
def diffuse_mnar_single(data, up_percentile = 0.5, obs_percentile = 0.5):
    
    def scale_data(data):
        min_vals = np.min(data, axis=0)
        max_vals = np.max(data, axis=0)
        scaled_data = (data - min_vals) / (max_vals - min_vals)
        return scaled_data

    data = scale_data(data)

    mask = np.ones(data.shape)

    n_cols = data.shape[1]
    n_miss_cols = int(n_cols * 0.5)  # 选择50%的列作为缺失列
    miss_cols = np.random.choice(n_cols, size=n_miss_cols, replace=False)  # 随机选择缺失列的索引

    obs_cols = [col for col in range(data.shape[1]) if col not in miss_cols]
    
    for miss_col in miss_cols:
        missvar_bounds = np.quantile(data[:, miss_col], up_percentile)
        temp = data[:, miss_col] > missvar_bounds
        
        obsvar_bounds = np.quantile(data[temp][:, obs_cols], obs_percentile)
        temp2 = data[:, miss_col] > obsvar_bounds

        merged_temp = np.logical_or(temp, temp2).astype(int)
        mask[:, miss_col] = merged_temp
    print("Missing Rate",1 - np.count_nonzero(mask) / mask.size)
    return mask

In [None]:
diffuse_mnar_single(dat,0.75,0.75)

In [None]:
def diffuse_mnar_single(data, target_miss_rate, up_percentile, obs_percentile):
    
    def scale_data(data):
        min_vals = np.min(data, axis=0)
        max_vals = np.max(data, axis=0)
        scaled_data = (data - min_vals) / (max_vals - min_vals)
        return scaled_data
    
    def compute_miss_rate(mask):
        return 1 - np.count_nonzero(mask) / mask.size
    
    data = scale_data(data)

    mask = np.ones(data.shape)

    n_cols = data.shape[1]
    n_miss_cols = int(n_cols * 0.5)  # 选择50%的列作为缺失列
    miss_cols = np.random.choice(n_cols, size=n_miss_cols, replace=False)  # 随机选择缺失列的索引

    obs_cols = [col for col in range(data.shape[1]) if col not in miss_cols]
    
    while True:
        for miss_col in miss_cols:
            missvar_bounds = np.quantile(data[:, miss_col], up_percentile)
            temp = data[:, miss_col] > missvar_bounds
            obsvar_bounds = np.quantile(data[temp][:, obs_cols], obs_percentile)
            temp2 = data[:, miss_col] > obsvar_bounds

            merged_temp = np.logical_or(temp, temp2).astype(int)
            mask[:, miss_col] = merged_temp
        
        miss_rate = compute_miss_rate(mask)

        print("Missing Rate",miss_rate)
        
        if miss_rate <= target_miss_rate:
            #up_percentile += 0.05
            obs_percentile += 0.05
            print(up_percentile, obs_percentile)
        else:
            #print(up_percentile,obs_percentile)
            return mask

In [None]:
data = np.array([[10040,10001,10002],[1,2,1],[10,20,50],[1,1,2]]).T

In [None]:
mask = diffuse_mnar_single(dat,0.2,0.5,0.5)
mask