In [365]:
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets
%matplotlib inline
from scipy.io import arff
import sys
sys.path.append("..")
import missing_process.missing_method as missing_method
from missing_process.block_rules import *
from sklearn.datasets import make_blobs
import pandas as pd

#  https://cs.joensuu.fi/sipu/datasets/
# dataset
def MCAR(observed_values, missing_rate, masks):
    """Build a missing-completely-at-random mask for column 0.

    A uniformly random subset of rows is marked as missing (0) in column 0
    of a copy of ``masks``. Only the mask is modified; ``observed_values`` is
    used solely for its row count and is never altered.

    Parameters
    ----------
    observed_values : array-like with a ``shape`` attribute
        Data whose first dimension gives the number of samples.
    missing_rate : float
        Fraction of rows to mark as missing, in ``[0, 1]``. The row count is
        ``int(n_samples * missing_rate)``, i.e. truncated toward zero.
    masks : numpy.ndarray
        Existing 2-D mask; it is copied, not modified in place.

    Returns
    -------
    numpy.ndarray
        Copy of ``masks`` with the selected rows set to 0 in column 0.

    Raises
    ------
    ValueError
        If ``missing_rate`` is outside ``[0, 1]``.
    """
    if not 0 <= missing_rate <= 1:
        raise ValueError("Missing rate must be between 0 and 1.")

    n_samples = observed_values.shape[0]
    n_missing = int(n_samples * missing_rate)

    # Sample distinct row indices; uses NumPy's global random state.
    missing_indices = np.random.choice(n_samples, n_missing, replace=False)

    new_mask = masks.copy()
    new_mask[missing_indices, 0] = 0
    return new_mask


def MAR(observed_values, missing_rate,masks):
    """Build a missing-at-random mask for column 0 driven by column 1.

    Rows whose column-1 value is strictly below the
    ``missing_rate * 100``-th percentile of column 1 are marked as missing
    (0) in column 0 of a copy of ``masks``.

    Parameters
    ----------
    observed_values : numpy.ndarray
        2-D data with at least two columns.
    missing_rate : float
        Multiplied by 100 and passed to ``np.percentile`` as the percentile.
    masks : numpy.ndarray
        Existing 2-D mask; it is copied, not modified in place.

    Returns
    -------
    numpy.ndarray
        Copy of ``masks`` with the selected rows zeroed in column 0.

    Raises
    ------
    ValueError
        If ``observed_values`` has fewer than two columns.
    """
    n_columns = observed_values.shape[1]
    if n_columns < 2:
        raise ValueError("The input array must have at least two columns for MAR.")

    result = masks.copy()

    # Threshold comes from the *other* column, which is what makes this MAR.
    driver = observed_values[:, 1]
    cutoff = np.percentile(driver, missing_rate * 100)

    # Strict comparison: rows equal to the cutoff stay observed.
    rows_to_hide = np.where(driver < cutoff)[0]
    result[rows_to_hide, 0] = 0
    return result



def MNAR(observed_values, missing_rate, masks):
    """Build a self-masking (MNAR) mask for column 0.

    Rows whose column-0 value is strictly below the
    ``missing_rate * 100``-th percentile of column 0 itself are marked as
    missing (0) in column 0 of a copy of ``masks``.

    Parameters
    ----------
    observed_values : numpy.ndarray
        2-D data. At least two columns are required, although only column 0
        is read here.
    missing_rate : float
        Multiplied by 100 and passed to ``np.percentile`` as the percentile.
    masks : numpy.ndarray
        Existing 2-D mask; it is copied, not modified in place.

    Returns
    -------
    numpy.ndarray
        Copy of ``masks`` with the selected rows zeroed in column 0.

    Raises
    ------
    ValueError
        If ``observed_values`` has fewer than two columns.
    """
    if observed_values.shape[1] < 2:
        # The message previously said "MAR" (copy-paste from the MAR function).
        raise ValueError("The input array must have at least two columns for MNAR.")

    new_mask = masks.copy()

    # The threshold is taken from the same column that goes missing.
    target = observed_values[:, 0]
    threshold = np.percentile(target, missing_rate * 100)

    # Strict comparison: rows equal to the threshold stay observed.
    missing_indices = np.where(target < threshold)[0]
    new_mask[missing_indices, 0] = 0

    return new_mask



def simple_diffuse(observed_values, masks, up_percentile, obs_percentile):
    # This is actually MAR
    new_mask = masks.copy()

    # Calculate the median of the second column
    bound_1 = np.percentile(observed_values[:, 0], up_percentile)
    bound_2 = np.percentile(observed_values[:, 1], obs_percentile)

    # Identify indices where first column's value is larger than the median of the second column
    missing_indices = np.where((observed_values[:, 0] < bound_1) & (observed_values[:, 1] < bound_2))[0]

    # Set these indices to 0 in the mask for the first column
    new_mask[missing_indices, 0] = 0

    return new_mask


def diffuse_mnar_single(data, up_percentile = 0.5, obs_percentile = 0.5):
    """Build a diffuse MNAR mask for column 0 using both columns.

    Column 0 stays observed (1) for a row when its column-0 value exceeds the
    ``up_percentile`` quantile of column 0 OR its column-1 value exceeds the
    ``obs_percentile`` quantile of column 1. Otherwise it is marked as
    missing (0). Column 1 of the mask is always 1.

    Parameters
    ----------
    data : numpy.ndarray
        2-D data; columns 0 and 1 are read.
    up_percentile : float, default 0.5
        Quantile level (0-1 scale, passed to ``np.quantile``) for column 0.
    obs_percentile : float, default 0.5
        Quantile level (0-1 scale, passed to ``np.quantile``) for column 1.

    Returns
    -------
    numpy.ndarray
        Float mask with the same shape as ``data``.
    """
    mask = np.ones(data.shape)

    miss_col = 0  # index of the column that receives missing values (fixed)
    obs_col = 1   # index of the fully observed companion column

    missvar_bounds = np.quantile(data[:, miss_col], up_percentile)
    above_miss_bound = data[:, miss_col] > missvar_bounds

    obsvar_bounds = np.quantile(data[:, obs_col], obs_percentile)
    # Compare the observed column with its own quantile. The previous code
    # compared the missing column against this bound, so the observed column
    # never influenced the mask.
    above_obs_bound = data[:, obs_col] > obsvar_bounds

    mask[:, miss_col] = np.logical_or(above_miss_bound, above_obs_bound).astype(int)
    return mask


In [314]:
def missing_rate(data):
    """Print the percentage of NaN entries in each column of ``data``.

    The percentage for a column is its NaN count divided by the number of
    rows (``data.shape[0]``), times 100. Columns are reported 1-based with
    two decimals. Nothing is returned.
    """
    n_rows = data.shape[0]
    nan_per_column = np.sum(np.isnan(data), axis=0)
    percent_missing = nan_per_column / n_rows * 100

    print("Missing rate for each column:")
    for position, percent in enumerate(percent_missing):
        print(f"Column {position + 1}: {percent:.2f}%")

In [315]:
def create_dataset(data):
    """Create or load the two-column dataset identified by ``data``.

    Supported names:

    * ``"normal"`` - 1000 samples of two independent standard normals.
    * ``"4-clusters"`` - 1000 samples from ``make_blobs`` with 4 centers
      (``cluster_std=0.7``, ``random_state=0``).
    * ``"negtive_correlated"`` / ``"positive_correlated"`` - 1000 samples of
      a zero-mean, unit-variance bivariate normal with correlation -0.7 / 0.7.
    * ``"syn_donut"`` / ``"syn_spiral"`` - columns ``a0`` and ``a1`` read from
      ``../datasets/<name>/<name>.arff``.

    The randomly generated variants (except ``"4-clusters"``) draw from
    NumPy's global random state.

    Parameters
    ----------
    data : str
        Dataset name (spelling as listed above).

    Returns
    -------
    numpy.ndarray
        The dataset as a 2-D array with two columns.

    Raises
    ------
    ValueError
        If ``data`` is not one of the supported names. Previously this fell
        through to the ``return`` and raised an UnboundLocalError.
    """
    n_samples = 1000

    if data == "normal":
        orgin = np.random.randn(n_samples, 2)
    elif data == "4-clusters":
        orgin, _labels = make_blobs(n_samples=n_samples, centers=4, cluster_std=0.7, random_state=0)
    elif data == "negtive_correlated" or data == "positive_correlated":
        correlation = -0.7 if data == "negtive_correlated" else 0.7
        mean = [0, 0]
        std_devs = [1, 1]
        off_diagonal = correlation * std_devs[0] * std_devs[1]
        covariance = np.array([[std_devs[0]**2, off_diagonal],
                               [off_diagonal, std_devs[1]**2]])
        orgin = np.random.multivariate_normal(mean, covariance, size=n_samples)
    elif data == "syn_donut" or data == "syn_spiral":
        # loadarff returns (records, metadata); only the records are needed.
        records, _meta = arff.loadarff(f'../datasets/{data}/{data}.arff')
        df = pd.DataFrame(records)
        orgin = np.array(df[["a0","a1"]])
    else:
        raise ValueError(f"Unknown dataset name: {data!r}")

    return orgin

In [362]:
def _apply_missing_mask(values, mask):
    """Return ``values`` with NaN wherever ``mask`` equals 0 (scaled by ``mask`` elsewhere)."""
    nan_mask = np.where(mask == 0, np.nan, mask)
    return values * nan_mask


def make_dic(dataname_list):
    """Precompute masked versions of each dataset for every mechanism and rate.

    For each name in ``dataname_list`` the dataset is created with
    ``create_dataset`` and, for each rate in 0, 0.1, ..., 1.0, six masks are
    generated and applied (masked entries become NaN).

    The result is laid out as::

        full_d[name]["origin"]            -> the complete data
        full_d[name][mechanism][rate]     -> the data with NaNs inserted

    where ``mechanism`` is one of ``"MCAR"``, ``"MAR"``, ``"MNAR"``,
    ``"logistic"``, ``"diffuse"``, ``"diffuse_simple"`` and ``rate`` is the
    literal loop value (note the first key is the int ``0``).

    Parameters
    ----------
    dataname_list : iterable of str
        Dataset names accepted by ``create_dataset``.

    Returns
    -------
    dict
        Nested dictionary as described above.
    """
    rates = [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
    mechanisms = ("MCAR", "MAR", "MNAR", "logistic", "diffuse", "diffuse_simple")

    full_d = {}

    for data in dataname_list:
        entry = {name: {} for name in mechanisms}
        full_d[data] = entry

        orgin = create_dataset(data)
        entry["origin"] = orgin

        # Loop-invariant: boolean "is observed" mask and its float32 twin.
        observed_masks = ~np.isnan(orgin.astype("float32"))
        masks = observed_masks.astype("float32")

        for i in rates:
            # The generators are called in a fixed order because some of them
            # draw from the global random state.

            # MCAR
            entry["MCAR"][i] = _apply_missing_mask(orgin, MCAR(orgin, i, masks))

            # MAR
            entry["MAR"][i] = _apply_missing_mask(orgin, MAR(orgin, i, masks))

            # logistic: the helper is skipped at rate 1, where an all-False
            # mask is used instead (every entry in both columns becomes NaN).
            if i != 1:
                logistic_mask = missing_method.MNAR_mask_logistic(orgin,1-i,exclude_inputs=True)
            else:
                logistic_mask = ~observed_masks
            entry["logistic"][i] = _apply_missing_mask(orgin, logistic_mask)

            # pure focus MNAR
            entry["MNAR"][i] = _apply_missing_mask(orgin, MNAR(orgin, i, masks))

            # diffuse: at rate 0 nothing is masked.
            if i != 0:
                diffuse_mask = diffuse_mnar_single(orgin,i,i)
            else:
                diffuse_mask = observed_masks
            entry["diffuse"][i] = _apply_missing_mask(orgin, diffuse_mask)

            # simple diffuse: takes percentiles on a 0-100 scale.
            if i != 0:
                simple_mask = simple_diffuse(orgin, observed_masks,i*100,i*100)
            else:
                simple_mask = observed_masks
            entry["diffuse_simple"][i] = _apply_missing_mask(orgin, simple_mask)

    return full_d

In [373]:
def make_plot(parameter = 0.5,mechanism = ["MCAR","MAR",
                                           "MNAR-Focus(percentile)",
                                           "MNAR-Focus(logistic)",
                                           "MNAR-Diffuse","MNAR-Diffuse(Simple)",
                                           ], 
              dataset = ["normal","4-clusters","negtive_correlated",
                         "positive_correlated","syn_spiral"],
                ):
    """Scatter-plot a dataset before and after applying a missingness mechanism.

    Reads the precomputed results from the notebook-level ``full_d``
    dictionary (built by ``make_dic``). The list defaults exist so that
    ``ipywidgets.interact`` renders dropdowns; when the function is actually
    called, ``mechanism`` and ``dataset`` must each be a single string.

    Parameters
    ----------
    parameter : float
        Missing-rate key to display. It is matched against the stored keys
        with ``np.isclose`` so that slider values carrying floating-point
        round-off (e.g. 0.30000000000000004) still find their entry.
    mechanism : str
        One of the display names listed in the default.
    dataset : str
        A key of ``full_d``.

    Raises
    ------
    ValueError
        If ``mechanism`` is not a known display name.
    KeyError
        If no stored rate matches ``parameter``.
    """
    # Display name -> key used inside full_d[dataset].
    mechanism_keys = {
        "MCAR": "MCAR",
        "MAR": "MAR",
        "MNAR-Focus(logistic)": "logistic",
        "MNAR-Focus(percentile)": "MNAR",
        "MNAR-Diffuse": "diffuse",
        "MNAR-Diffuse(Simple)": "diffuse_simple",
    }
    if mechanism not in mechanism_keys:
        raise ValueError(f"Unknown mechanism: {mechanism!r}")

    by_rate = full_d[dataset][mechanism_keys[mechanism]]
    # Tolerant lookup: exact float equality is fragile for slider output.
    rate_key = next((key for key in by_rate if np.isclose(key, parameter)), None)
    if rate_key is None:
        raise KeyError(parameter)
    masked_value_na = by_rate[rate_key]

    orgin = full_d[dataset]["origin"]
    fig, ax = plt.subplots(figsize=(6, 6))
    # Rows with NaN in column 0 are not drawn by scatter, so the red layer
    # shows only the rows that survive the mask.
    ax.scatter(masked_value_na[:, 0], masked_value_na[:, 1], s=10,label = "Data After Missing",marker="o",alpha=1, c="red")
    ax.scatter(orgin[:, 0], orgin[:, 1], s=10,label = "Complete Data",marker=".")
    ax.set_title(f'{mechanism}')
    ax.set_xlabel('Missing Column')
    ax.set_ylabel('Observed Column')
    ax.legend(loc = 1)
    # Fixed axis limits for the two datasets that have them; others autoscale.
    if dataset == "normal":
        ax.set_xlim(-3,3)
        ax.set_ylim(-3,3)
    elif dataset == "4-clusters":
        ax.set_xlim(-5,4)
        ax.set_ylim(-5,10)

In [374]:
# Precompute every (dataset, mechanism, rate) combination once so the
# interactive plot only has to look results up.
full_d = make_dic(
    [
        "normal",
        "4-clusters",
        "negtive_correlated",
        "positive_correlated",
        "syn_spiral",
    ]
)
# The (min, max, step) tuple makes interact render `parameter` as a float slider.
ipywidgets.interact(make_plot, parameter=(0, 1, 0.1))


interactive(children=(FloatSlider(value=0.5, description='parameter', max=1.0), Dropdown(description='mechanis…

<function __main__.make_plot(parameter=0.5, mechanism=['MCAR', 'MAR', 'MNAR-Focus(percentile)', 'MNAR-Focus(logistic)', 'MNAR-Diffuse', 'MNAR-Diffuse(Simple)'], dataset=['normal', '4-clusters', 'negtive_correlated', 'positive_correlated', 'syn_spiral'])>

array([ 515., 1000.])