In [1]:
import numpy as np
import pandas as pd
import os
import torch
import numpy as np
import json
from scipy import optimize

In [200]:
def calculate_missing_rates(data):    
    # Calculate the overall missing rate
    total_missing = np.isnan(data).sum()
    total_elements = data.size
    overall_missing_rate = total_missing / total_elements
    print(f"Overall missing rate: {overall_missing_rate:.2%}")
    
    # # Calculate the missing rate for each column
    # column_missing_rates = {}
    # for i in range(data.shape[1]):
    #     column_missing_rate = np.isnan(data[:, i]).mean()
    #     column_missing_rates[f'feature_{i}'] = column_missing_rate
    #     print(f"Missing rate for column 'feature_{i}': {column_missing_rate:.2%}")
    
    # return {"overall": overall_missing_rate, "columns": column_missing_rates}

In [9]:
namelist = ["car","breast",
            "australian","heart","adult","student",
            "banknote","sonar","spam","wine"]

In [2]:
namelist = [
            "adult"]

## MCAR

Missing rate : 5，10，20，30，40，50，60，70，80

In [3]:
missing_rate = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

In [4]:
import numpy as np
import pandas as pd

def make_mcar(data, missing_rate=0.1, seed=1):
    # Convert the data to float if it's not already, so it can hold NaN values
    data = data.astype(float)
    
    total_elements = data.size
    missing_elements = int(total_elements * missing_rate)

    # Create a copy of the data to avoid modifying the original array
    data_with_missing = data.copy()

    np.random.seed(seed)
    mask_indices = np.random.choice(total_elements, missing_elements, replace=False)

    # Convert flat indices to multi-dimensional indices
    multi_indices = np.unravel_index(mask_indices, data.shape)

    # Set selected elements to NaN
    data_with_missing[multi_indices] = np.nan

    return data_with_missing

In [5]:
for dataname in namelist:
    data = np.array(pd.read_csv(f"dataset/{dataname}/features.csv"))
    for rate in missing_rate:
        missingdata = make_mcar(data, missing_rate=rate, seed=1)

        #print(f"Missing rate analysis for dataset: {dataname}")
        #missing_rates = calculate_missing_rates(missingdata)
        
            # Create the directory if it doesn't exist
        output_dir = f"dataset_nan/{dataname}/mcar"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

    # Save the data with missing values as a NumPy array
        np.save(f"{output_dir}/{rate}.npy", missingdata)
    

## MAR

BREAST Heart, Adult

In [74]:
#namelist = ["breast","heart","adult"]

In [7]:
for dataname in namelist:
    data = np.array(pd.read_csv(f"dataset/{dataname}/features.csv"))
    for rate in missing_rate:
        missingdata = make_mar(data, p=rate, p_obs= 0.2)

        print(f"Missing rate analysis for dataset: {dataname}")
        #missing_rates = calculate_missing_rates(missingdata)
        output_dir = f"dataset_nan/{dataname}/mar"
        if not os.path.exists(output_dir):
           os.makedirs(output_dir)

    #Save the data with missing values as a NumPy array
        np.save(f"{output_dir}/{rate}.npy", missingdata)

Missing rate analysis for dataset: adult
Missing rate analysis for dataset: adult
Missing rate analysis for dataset: adult
Missing rate analysis for dataset: adult
Missing rate analysis for dataset: adult
Missing rate analysis for dataset: adult
Missing rate analysis for dataset: adult
Missing rate analysis for dataset: adult
Missing rate analysis for dataset: adult


In [6]:
import torch
import numpy as np
from scipy import optimize

def fit_intercepts(X, coeffs, p, self_mask=False):
    if self_mask:
        d = len(coeffs)
        intercepts = torch.zeros(d)
        for j in range(d):
            def f(x):
                return torch.sigmoid(X * coeffs[j] + x).mean().item() - p
            intercepts[j] = optimize.bisect(f, -50, 50)
    else:
        d_obs, d_na = coeffs.shape
        intercepts = torch.zeros(d_na)
        
        # Ensure X and coeffs are in floating-point format
        X = X.float()
        coeffs = coeffs.float()

        for j in range(d_na):
            def f(x):
                return torch.sigmoid(X.mv(coeffs[:, j]) + x).mean().item() - p
            
            intercepts[j] = optimize.bisect(f, -50, 50)
    
    return intercepts

def pick_coeffs(X, idxs_obs=None, idxs_nas=None, self_mask=False):
    n, d = X.shape
    if self_mask:
        coeffs = torch.randn(d).float()  # Ensure coeffs are float
        Wx = X * coeffs
        coeffs /= torch.std(Wx, 0)
    else:
        d_obs = len(idxs_obs)
        d_na = len(idxs_nas)
        coeffs = torch.randn(d_obs, d_na).float()  # Ensure coeffs are float

        # Convert indices to LongTensor for PyTorch operations
        idxs_obs = torch.tensor(idxs_obs, dtype=torch.long)
        idxs_nas = torch.tensor(idxs_nas, dtype=torch.long)

        # Ensure the data is in floating-point format
        X = X.float()  # Ensure X is a floating-point tensor

        # Perform operations
        Wx = X[:, idxs_obs].mm(coeffs)
        coeffs /= torch.std(Wx, 0, keepdim=True)
    return coeffs

def make_mar(X, p, p_obs):
    n, d = X.shape

    to_torch = torch.is_tensor(X)  # Determine if X is a PyTorch tensor or a NumPy array
    if not to_torch:
        X = torch.from_numpy(X)

    # Initialize a boolean mask
    mask = torch.zeros(n, d).bool()

    # Calculate the number of observed variables and the number of potentially missing variables
    d_obs = max(int(p_obs * d), 1)
    d_na = d - d_obs

    # Select indices for observed and potentially missing variables
    idxs_obs = np.random.choice(d, d_obs, replace=False)
    idxs_nas = np.array([i for i in range(d) if i not in idxs_obs])

    # Generate coefficients and intercepts for the logistic model
    coeffs = pick_coeffs(X, idxs_obs, idxs_nas)
    intercepts = fit_intercepts(X[:, idxs_obs], coeffs, p)

    # Ensure X, coeffs, and intercepts are floating-point tensors
    X = X.float()
    coeffs = coeffs.float()
    intercepts = intercepts.float()

    # Calculate the probabilities using the logistic model
    ps = torch.sigmoid(X[:, idxs_obs].mm(coeffs) + intercepts)

    # Generate random values and apply the mask
    ber = torch.rand(n, d_na)
    mask[:, idxs_nas] = ber < ps

    # Apply the mask to X, setting the masked elements to NaN
    X[mask] = float('nan')

    # Convert back to numpy array if the input was a numpy array
    if not to_torch:
        X = X.numpy()

    return X

## MNAR

In [8]:
import numpy as np
import pandas as pd

# namelist = [#"car","breast",
#             #"australian","heart","adult","student",
#             "banknote","sonar","spam","wine"
#             ]

def make_mnar(X, percentile):
    # Copy the array to avoid altering the original one
    X_mnar = X.copy()
    percentile = percentile * 100
    # Iterate over each column in the array
    for col in range(X_mnar.shape[1]):
        # Calculate the percentile value for the current column
        threshold = np.percentile(X_mnar[:, col], percentile)

        # Replace values less than the threshold with np.nan
        X_mnar[:, col] = np.where(X_mnar[:, col] < threshold, np.nan, X_mnar[:, col])

    return X_mnar

# Example usage
# Assuming `data` is your NumPy array and `percentile` is your threshold:
# missingdata = make_mnar(data, percentile=20)


In [8]:
def make_mnar_columnwise(data, col_info, percentile):
    data_mnar = data.copy()
    percentile = percentile*100
    for col, col_type in col_info.items():
        col_idx = int(col)  # Assuming the keys in `col_info` correspond to column indices

        if "numerical" in col_type:
            # Calculate the percentile value for the numerical column
            threshold = np.percentile(data_mnar[:, col_idx], percentile)
            # Replace values less than the threshold with np.nan
            data_mnar[:, col_idx] = np.where(data_mnar[:, col_idx] < threshold, np.nan, data_mnar[:, col_idx])

        elif "ordinal" in col_type:
            # Use the ordinal mapping from JSON to process the column
            ordinal_map = col_type['ordinal']
            threshold = np.percentile([ordinal_map.get(val, np.nan) for val in data_mnar[:, col_idx]], percentile)
            # Replace values less than the threshold with np.nan
            data_mnar[:, col_idx] = np.where([ordinal_map.get(val, np.nan) < threshold for val in data_mnar[:, col_idx]], np.nan, data_mnar[:, col_idx])

        elif "nominal" in col_type:
            # Nominal data typically isn't ordinal or numerical, but let's apply a similar logic
            # Convert nominal categories to numbers using some encoding (this is an assumption)
            unique_vals = list(set(data_mnar[:, col_idx]))
            mapping = {val: i for i, val in enumerate(unique_vals)}
            # Calculate the percentile value for the nominal column (based on its encoded values)
            threshold = np.percentile([mapping.get(val, np.nan) for val in data_mnar[:, col_idx]], percentile)
            # Replace values less than the threshold with np.nan
            data_mnar[:, col_idx] = np.where([mapping.get(val, np.nan) < threshold for val in data_mnar[:, col_idx]], np.nan, data_mnar[:, col_idx])

    return data_mnar


In [9]:
import os
import numpy as np
import pandas as pd
import json
import random

# namelist = [#"car","breast",
#             "australian",
#             #"heart","adult","student",
#             #"banknote","sonar","spam","wine"
#             ]
namelist = [#"car",
    "adult",
            #"australian",
            #"heart","adult","student",
            #"banknote","sonar","spam","wine"
            ]
def make_mnar_columnwise(data, col_info, q, random_seed=1):
    np.random.seed(random_seed)
    random.seed(random_seed)
    q = q * 100
    data_mnar = data.astype(float)

    missing_rates = {}

    for col, col_type in col_info.items():
        col_idx = int(col)  # Assuming the keys in `col_info` correspond to column indices
        num_to_remove = int(len(data_mnar) * q / 100)
        if "numerical" in col_type:
            # Calculate the percentile value for the numerical column
            threshold = np.percentile(data_mnar[:, col_idx], q)
            # Replace values less than the threshold with np.nan
            data_mnar[:, col_idx] = np.where(data_mnar[:, col_idx] < threshold, np.nan, data_mnar[:, col_idx])

            # Calculate the missing rate for this column
            missing_rate = np.mean(np.isnan(data_mnar[:, col_idx])) * 100
            missing_rates[col_idx] = missing_rate
            #print("numerical" ,missing_rate)

        elif "ordinal" in col_type:
            # Use the ordinal mapping from JSON to find the top two largest ordinal values
            ordinal_map = col_type['ordinal']
            max_value = max(ordinal_map.values())

            # Find the indices where the values in the column are greater than or equal to max_value - 1
            max_indices = np.where(data_mnar[:, col_idx] >= (max_value - 2))[0].tolist()

            # Find the rest of the indices (those not in max_indices)
            all_indices = set(range(data_mnar.shape[0]))
            other_indices = list(all_indices - set(max_indices))

            # Determine which indices to remove based on the number to remove
            if len(max_indices) >= num_to_remove:
                remove_indices = random.sample(max_indices, num_to_remove)
            else:
                # If there are not enough max_indices, take all max_indices and supplement with random others
                remove_indices = max_indices
                random_indices = random.sample(other_indices, num_to_remove - len(remove_indices))
                #remove_indices = remove_indices + random_indices

            data_mnar[remove_indices, col_idx] = np.nan

            # Calculate the missing rate for this column
            missing_rate = np.mean(np.isnan(data_mnar[:, col_idx])) * 100
            missing_rates[col_idx] = missing_rate
            #print("ordinal" ,missing_rate)

        elif "nominal" in col_type:
            # Nominal data: Randomly choose one category and make a portion of the data missing
            unique_vals = list(set(data_mnar[:, col_idx]))
            chosen_val = random.choice(unique_vals)

            # Get indices of the chosen category
            chosen_indices = np.where(data_mnar[:, col_idx] == chosen_val )[0].tolist()


            # Find the rest of the indices (those not in max_indices)
            all_indices = set(range(data_mnar.shape[0]))
            other_indices = list(all_indices - set(chosen_indices))

            # Determine which indices to remove based on the number to remove
            if len(chosen_indices) >= num_to_remove:
                remove_indices = random.sample(chosen_indices, num_to_remove)
            else:
                # If there are not enough max_indices, take all max_indices and supplement with random others
                remove_indices = chosen_indices
                random_indices = random.sample(other_indices, num_to_remove - len(remove_indices))
                remove_indices = remove_indices + random_indices


            data_mnar[remove_indices, col_idx] = np.nan

            # Calculate the missing rate for this column
            missing_rate = np.mean(np.isnan(data_mnar[:, col_idx])) * 100
            #print("nominal",missing_rate)
            missing_rates[col_idx] = missing_rate

    return data_mnar


for dataname in namelist:
    data = np.array(pd.read_csv(f"dataset/{dataname}/features.csv"))
    with open(f"dataset/{dataname}/column_info.json", 'r') as f:
        col_info = json.load(f)
    
    for rate in missing_rate:
        missingdata = make_mnar_columnwise(data, col_info, q=rate)

        #missing_rates = calculate_missing_rates(missingdata)
        
        output_dir = f"dataset_nan/{dataname}/mnar"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)


    #Save the data with missing values as a NumPy array
        np.save(f"{output_dir}/{rate}.npy", missingdata)

In [77]:

import numpy as np

def apply_missing_rate(data, missing_rate):
    # Flatten the data to simplify the process
    flat_data = data.flatten()

    # Count the existing missing values
    total_elements = flat_data.size
    current_missing_count = np.sum(np.isnan(flat_data))

    # Calculate the target number of missing values
    target_missing_count = int(missing_rate * total_elements)

    # Calculate how many more values need to be removed
    additional_missing_count = target_missing_count - current_missing_count

    if additional_missing_count <= 0:
        # If the current missing rate is already higher than or equal to the target, return the original data
        return data

    # Identify indices that are not already missing
    available_indices = np.where(~np.isnan(flat_data))[0]

    # Randomly select indices to remove additional data
    indices_to_remove = np.random.choice(available_indices, additional_missing_count, replace=False)

    # Set the selected indices to np.nan to represent missing data
    flat_data[indices_to_remove] = np.nan

    # Reshape the flat data back to the original shape
    return flat_data.reshape(data.shape)


In [142]:
dataname = "heart"
used_rate = 0.2
actual_rate = 0.6
output_dir = f"dataset_nan/{dataname}/mar"
used_data = np.load(f"{output_dir}/{used_rate}.npy")
calculate_missing_rates(used_data)

actual_data = np.load(f"{output_dir}/{actual_rate}.npy")
calculate_missing_rates(actual_data)
save = 1

if save:
    modified = apply_missing_rate(used_data, 0.50)

    calculate_missing_rates(modified)

    np.save(f"{output_dir}/{actual_rate}.npy", modified)

Overall missing rate: 16.68%
Overall missing rate: 49.99%
Overall missing rate: 49.99%


## Check Missing Rate

In [144]:
missing_type = ["mcar", "mar"]
namelist = ["car", "breast", "australian", "heart", "adult", "student", 
            "banknote", "sonar", "spam", "wine"]
missing_rate = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

for mtype in missing_type:
    for dataname in namelist:
        for rate in missing_rate:
            output_dir = f"dataset_nan/{dataname}/{mtype}"
            used_data = np.load(f"{output_dir}/{rate}.npy")
            
            print(f"Processing: Type = {mtype}, Data = {dataname}, Rate = {rate}")
            calculate_missing_rates(used_data)
        print()

Processing: Type = mcar, Data = car, Rate = 0.05
Overall missing rate: 5.00%
Processing: Type = mcar, Data = car, Rate = 0.1
Overall missing rate: 9.99%
Processing: Type = mcar, Data = car, Rate = 0.2
Overall missing rate: 19.99%
Processing: Type = mcar, Data = car, Rate = 0.3
Overall missing rate: 30.00%
Processing: Type = mcar, Data = car, Rate = 0.4
Overall missing rate: 40.00%
Processing: Type = mcar, Data = car, Rate = 0.5
Overall missing rate: 50.00%
Processing: Type = mcar, Data = car, Rate = 0.6
Overall missing rate: 59.99%
Processing: Type = mcar, Data = car, Rate = 0.7
Overall missing rate: 69.99%
Processing: Type = mcar, Data = car, Rate = 0.8
Overall missing rate: 80.00%

Processing: Type = mcar, Data = breast, Rate = 0.05
Overall missing rate: 9.05%
Processing: Type = mcar, Data = breast, Rate = 0.1
Overall missing rate: 13.75%
Processing: Type = mcar, Data = breast, Rate = 0.2
Overall missing rate: 23.27%
Processing: Type = mcar, Data = breast, Rate = 0.3
Overall missing 