In [2]:
import sys
sys.path.append("..")
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
from scipy import optimize
from torch.utils.data import DataLoader, Dataset
from data_loaders import *
import missing_process.missing_method as missing_method
from missing_process.block_rules import *
import json
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [3]:
# Dataset names; create_missing() resolves each one to ../datasets/<name>/<name>_norm.npy.
real_datalist = [
    "banknote",
    "concrete_compression",
    "wine_quality_white",
    "wine_quality_red",
    "california",
    "climate_model_crashes",
    "connectionist_bench_sonar",
    "qsar_biodegradation",
    "yeast",
    "yacht_hydrodynamics",
]
# Uncomment to restrict the run to a single dataset:
# real_datalist = ["yacht_hydrodynamics"]

syn_datalist = ["syn1"]

# Missingness mechanisms to generate masks for.
# The other types handled by create_missing() are "quantile", "diffuse" and "logistic".
missingtypelist = ["mcar", "mar"]

In [7]:
def create_missing(missingtypelist,datalist_name):
    '''
    Generate and save observation masks for every dataset / missingness rule.

    For each missingness type the matching rule file is loaded with
    ``load_json_file``. For each dataset in the selected list the array
    ``../datasets/<name>/<name>_norm.npy`` is loaded, one mask per rule is
    produced with ``create_mask`` and saved to
    ``../datasets/<name>/<missingtype>/<rule_name>.npy``.

    The fraction of zero entries of every mask is collected and written, once
    per missingness type, to
    ``../datasets/<datalist_name>_<missingtype>_missing_rate.csv``
    (rows = rule names, columns = dataset names).

    Parameters
    ----------
    missingtypelist : iterable of str
        Any of "logistic", "diffuse", "quantile", "mcar", "mar".
    datalist_name : str
        "real" (uses ``real_datalist``) or "syn" (uses ``syn_datalist``).

    Raises
    ------
    ValueError
        If a missingness type or ``datalist_name`` is not recognised. Both are
        checked before any file is read or written.
    '''
    # Function-scope import: no top-level `import os` is visible in the notebook,
    # so do not rely on a star import to provide it.
    import os

    # Rule file for each supported missingness type.
    rule_files = {
        "logistic": "missing_rate.json",
        "diffuse": "diffuse_ratio.json",
        "quantile": "quantile_full.json",
        "mcar": "mcar.json",
        "mar": "mcar.json",  # "mar" reads the same rule file as "mcar"
    }

    # Materialise once so generators can be validated and then iterated.
    missing_types = list(missingtypelist)
    unknown_types = [t for t in missing_types if t not in rule_files]
    if unknown_types:
        # Previously an unknown type reused the rule loaded for the previous
        # type (or failed with a NameError on the first iteration).
        raise ValueError(
            f"Unknown missing type(s) {unknown_types!r}; "
            f"expected one of {sorted(rule_files)!r}"
        )

    if datalist_name == "real":
        datalist = real_datalist
    elif datalist_name == "syn":
        datalist = syn_datalist
    else:
        raise ValueError(
            f"Unknown datalist_name {datalist_name!r}; expected 'real' or 'syn'"
        )

    for missingtype in missing_types:
        missing_rule = load_json_file(rule_files[missingtype])

        # dataset name -> list of missing rates, one per rule (in rule order)
        missing_rate_d = {}

        for dataname in datalist:
            directory_path = f"../datasets/{dataname}"
            norm_values = np.load(f'{directory_path}/{dataname}_norm.npy')

            print(dataname)
            missing_rate_d[dataname] = []

            # Create the output directory if needed (no-op when it already exists).
            os.makedirs(f'{directory_path}/{missingtype}', exist_ok=True)

            for rule_name in missing_rule:
                rule = missing_rule[rule_name]

                observed_masks = create_mask(norm_values,missingtype,rule)
                np.save(f'{directory_path}/{missingtype}/{rule_name}.npy', observed_masks)

                # Share of entries that are zero (i.e. masked out).
                missing_rate = 1 - np.count_nonzero(observed_masks) / observed_masks.size
                missing_rate_d[dataname].append(missing_rate)

        # Passing the index to the constructor also works for an empty dataset list.
        df = pd.DataFrame(missing_rate_d, index=list(missing_rule))

        df.to_csv(f"../datasets/{datalist_name}_{missingtype}_missing_rate.csv")

In [8]:
# Display the currently configured missingness mechanisms.
missingtypelist

['mcar', 'mar']

In [102]:
# NOTE(review): this cell was executed as In [102], i.e. after the create_mask (In [99]) and
# MCAR (In [92]) cells that appear further down. On a top-to-bottom run from a fresh kernel,
# create_mask is not defined yet at this point (unless one of the star imports provides it);
# consider moving this call below those definitions.
create_missing(["mar"],"real")

banknote
0.3
0.5
0.7
concrete_compression
0.3
0.5
0.7
wine_quality_white
0.3
0.5
0.7
wine_quality_red
0.3
0.5
0.7
california
0.3
0.5
0.7
climate_model_crashes
0.3
0.5
0.7
connectionist_bench_sonar
0.3
0.5
0.7
qsar_biodegradation
0.3
0.5
0.7
yeast
0.3
0.5
0.7
yacht_hydrodynamics
0.3
0.5
0.7


In [99]:
def create_mask(observed_values,missing_type = "MCAR",
                  missing_para = None):
    """
    Build an integer observation mask for ``observed_values``.

    Needs the original dataset and the parameters of the chosen mechanism.

    Parameters
    ----------
    observed_values : numpy array
        Input data; must support ``.astype("float32")``.
    missing_type : str
        One of "mcar", "quantile", "logistic", "diffuse", "mar". Matching is
        case-sensitive, so the default ``"MCAR"`` selects none of them.
    missing_para :
        Parameter(s) forwarded to the selected generator. For "diffuse" it is
        indexed as ``missing_para[0]`` and ``missing_para[1]``.

    Returns
    -------
    numpy array of int
        The mask returned by the selected generator, cast to ``int``. When
        ``missing_type`` is not recognised, a ``UserWarning`` is issued and the
        mask of non-NaN entries of ``observed_values`` is returned (1 = not NaN).
    """
    # Function-scope import so the block does not depend on a top-level import.
    import warnings

    # Baseline mask: entries that are not NaN in the input.
    observed_masks = ~np.isnan(observed_values.astype("float32"))
    masks = observed_masks.copy().astype("float32")

    if missing_type == "mcar":
        masks = MCAR(observed_values,missing_para)

    elif missing_type == "quantile":
        Xnan, _ = missing_method.missing_by_range(observed_values, missing_para)
        # The `np.float` alias was removed in NumPy 1.24; the builtin `float`
        # is the same dtype (float64).
        masks = np.array(~np.isnan(Xnan), dtype=float)

    elif missing_type == "logistic":
        masks = missing_method.MNAR_mask_logistic(observed_values, missing_para)

    elif missing_type == "diffuse":
        masks =  missing_method.diffuse_mnar_single(observed_values, missing_para[0],missing_para[1])

    elif missing_type == "mar":
        masks = missing_method.MAR_mask(observed_values,1,missing_para)

    else:
        # Keep the historical result (baseline mask) but stop doing it silently.
        warnings.warn(
            f"create_mask: unrecognised missing_type {missing_type!r}; no missingness "
            "mechanism applied. Expected one of 'mcar', 'quantile', 'logistic', "
            "'diffuse', 'mar' (case-sensitive).",
            stacklevel=2,
        )

    observed_masks = masks.astype(int)
    return observed_masks



In [92]:
def MCAR(observed_values, p, seed=1):
    """
    Generate an MCAR mask that removes the same number of entries from every column.

    Parameters:
    - observed_values: numpy array, 2-D (its shape is unpacked as rows, columns)
    - p: float, fraction of rows to remove in each column (between 0 and 1);
      ``int(num_rows * p)`` entries are removed per column
    - seed: int, random seed for reproducibility (default is 1)

    Returns:
    - masks: numpy array with the shape and dtype of ``observed_values``,
      0 for removed and 1 for present values

    Raises:
    - ValueError: if ``p`` is outside [0, 1]
    """
    if not 0 <= p <= 1:
        raise ValueError(f"p must be between 0 and 1, got {p!r}")

    num_rows, num_cols = observed_values.shape

    # Number of elements to be removed per column
    num_to_remove_per_column = int(num_rows * p)

    # Private generator: produces the same draws as np.random.seed(seed) followed by
    # np.random.choice, but leaves NumPy's global random state untouched.
    rng = np.random.RandomState(seed)

    # Start with everything present
    masks = np.ones_like(observed_values)

    # Independently pick the rows to remove for each column
    for col in range(num_cols):
        indices_to_remove = rng.choice(num_rows, num_to_remove_per_column, replace=False)
        masks[indices_to_remove, col] = 0

    return masks
