In [1]:
import numpy as np
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import json
import os

In [2]:
dataname_list = ["banknote","yeast","climate_model_crashes",
                 "wine_quality_white", "yacht_hydrodynamics","concrete_compression",
                 "breast_cancer","solar_fire","car_evluation"
                 ]

In [3]:
def save_split_index_cv(scaled_data, directory_path, seed=1, nfold=5):
    indlist = np.arange(len(scaled_data))

    np.random.seed(seed)
    np.random.shuffle(indlist)

    fold_size = len(scaled_data) // nfold
    save_index = {}

    for fold in range(nfold):
        start = fold * fold_size
        end = start + fold_size if fold < nfold - 1 else len(scaled_data)
        
        test_index = indlist[start:end]
        train_index = np.concatenate([indlist[:start], indlist[end:]])

        # If you want to split the training set into train and validation sets
        num_train = int(len(train_index) * 0.9)
        train_subindex = train_index[:num_train]
        valid_subindex = train_index[num_train:]

        fold_index = {
            "test_index": test_index.astype(np.int64).tolist(),
            "train_index": train_subindex.astype(np.int64).tolist(),
            "valid_index": valid_subindex.astype(np.int64).tolist()
        }
        save_index[f"fold_{fold+1}"] = fold_index

    with open(f"data/{directory_path}/split_index_cv_seed-{seed}_nfold-{nfold}.json", 'w') as file:
        json.dump(save_index, file)

In [4]:
def load_data(name):
    if name == "banknote":
        with open('data/banknote/data_banknote_authentication.txt', 'rb') as f:
            df = pd.read_csv(f, low_memory=False, sep=',',header = None)
            Xy = {}
            # Ignore the two blocking factor
            Xy['data'] = df.values[:, :-1]
            Xy['target'] =  df.values[:, -1]
    elif name == "yeast":
        with open('data/yeast/yeast.data', 'rb') as f:
            df = pd.read_csv(f, delimiter='\s+', header = None)
            Xy = {}
            # remove index
            Xy['data'] = df.values[:, 1:-1].astype('float')
            Xy['target'] =  df.values[:, -1]
    elif name == "climate_model_crashes":
        with open('data/climate_model_crashes/pop_failures.dat', 'rb') as f:
            df = pd.read_csv(f, delimiter='\s+', header = 0)
            Xy = {}
            # Ignore the two blocking factor
            Xy['data'] = df.values[:, 2:-1]
            Xy['target'] =  df.values[:, -1]
    elif name == "wine_quality_white":
        with open('data/wine_quality_white/data.csv', 'rb') as f:
            df = pd.read_csv(f, delimiter=';')
            Xy = {}
            Xy['data'] = df.values[:, :-1].astype('float')
            Xy['target'] =  df.values[:, -1]

    elif name == "yacht_hydrodynamics":
        with open('data/yacht_hydrodynamics/yacht_hydrodynamics.data', 'rb') as f:
            df = pd.read_csv(f, delimiter='\s+', header = None)
            Xy = {}
            Xy['data'] = df.values[:, :-1]
            Xy['target'] =  df.values[:, -1]

    elif name == "concrete_compression":
        with open('data/concrete_compression/Concrete_Data.xls', 'rb') as f:
            df = pd.read_excel(io=f)
            Xy = {}
            Xy['data'] = df.values[:, :-1]
            Xy['target'] =  df.values[:, -1]

    elif name == "breast_cancer":
        with open('data/breast_cancer/breast_cancer.data', 'rb') as f:
            df = pd.read_csv(f, delimiter=',', header = None)
            Xy = {}
            Xy['data'] = df.values[:, :-1]
            Xy['target'] =  df.values[:, -1]

    elif name == "solar_fire":
        with open('data/solar_fire/flare.data1', 'rb') as f:
            df1 = pd.read_csv(f, delimiter='\s+', header = None)
        with open('data/solar_fire/flare.data2', 'rb') as f:
            df2 = pd.read_csv(f, delimiter='\s+', header = None)
            df = pd.concat([df1, df2], ignore_index=True)
            Xy = {}
            Xy['data'] = df.values[:, :-1]
            Xy['target'] =  df.values[:, -1]

    elif name == "car_evaluation":
        with open('data/car_evaluation/car.data', 'rb') as f:
            df = pd.read_csv(f, delimiter=',', header = None)
            Xy = {}
            Xy['data'] = df.values[:, :-1]
            Xy['target'] =  df.values[:, -1]

    return Xy


In [169]:
dataname_list = ["banknote","yeast","climate_model_crashes",
                 "wine_quality_white", 
                 "yacht_hydrodynamics",
                 "concrete_compression",
                #  "breast_cancer",
                #  "solar_fire",
                #  "car_evaluation"
                 ]
save = False
p = 0.5 

missing_rate = [0.05,0.1,0.3]


#dataname_list = ["yeast"]
for name in dataname_list:
    print(name)
    Xy = load_data(name)
    
    feature = Xy['data']
    label = Xy['target']

    #MCAR_mask = MCAR(feature, p, seed=1)
    # print("MAR")
    #MAR_mask = MAR(feature, p)
    #print("MNAR")
    MNAR_mask = MNAR(feature, p)
    
    

    if save:
        save_split_index_cv(Xy['data'],name,seed = 1,nfold = 5)
        np.save(f"data/{name}/feature.npy", Xy['data'])
        np.save(f"data/{name}/label.npy", Xy['target'])
    print()


banknote
[0.5        0.53571429 0.5393586  0.35058309] 0.4814139941690962

yeast
[0.66442049 0.35377358 0.71698113 0.29716981 0.17318059 0.98382749
 0.22506739 0.74326146] 0.5197102425876011

climate_model_crashes
[0.48333333 0.5        0.50925926 0.47592593 0.46851852 0.52407407
 0.48518519 0.4962963  0.50555556 0.50555556 0.48888889 0.53888889
 0.50185185 0.48333333 0.52777778 0.47407407 0.52777778 0.51851852] 0.5008230452674897

wine_quality_white
0.02927946078051824 0.5
[0.98734177 0.         1.         0.48101266 1.         0.21314822
 0.92935892 1.         0.86729277 1.         0.99612087] 0.7703886558521104

yacht_hydrodynamics
[0.94805195 1.         1.         0.0487013  0.00974026 0.        ] 0.5010822510822511

concrete_compression
[0.65728155 0.71747573 0.19223301 0.7592233  1.         0.
 0.         0.84563107] 0.5214805825242719



## Missing Mechan

In [5]:
def MCAR(observed_values, p, seed=1):
    np.random.seed(seed)
    num_rows, num_cols = observed_values.shape
    num_to_remove_per_column = int(num_rows * p)
    masks = np.ones_like(observed_values)
    for col in range(num_cols):
        indices_to_remove = np.random.choice(num_rows, num_to_remove_per_column, replace=False)
        masks[indices_to_remove, col] = 0
    calculate_missing_rates(masks)
    return masks

def force_mask(mask, p, tolerance=0.01,seed = 1):
    rows, cols = mask.shape
    
    for col in range(cols):
        missing_rate = np.mean(mask[:, col] == 0)
        
        while abs(missing_rate - p) > tolerance:
            if missing_rate < p:
                # Need more zeros, randomly change 1s to 0s
                one_indices = np.where(mask[:, col] == 1)[0]
                if len(one_indices) == 0:
                    break  # No more 1s to change to 0s
                np.random.seed(seed)
                random_index = np.random.choice(one_indices)
                mask[random_index, col] = 0
            else:
                # Need more ones, randomly change 0s to 1s
                zero_indices = np.where(mask[:, col] == 0)[0]
                if len(zero_indices) == 0:
                    break  # No more 0s to change to 1s
                np.random.seed(seed)
                random_index = np.random.choice(zero_indices)
                mask[random_index, col] = 1

            missing_rate = np.mean(mask[:, col] == 0)
    
    return mask

In [6]:
def calculate_missing_rates(mask):
    # Calculate missing rate for each column
    num_rows, num_cols = mask.shape
    missing_rate_per_column = np.sum(mask == 0, axis=0) / num_rows

    # Calculate overall missing rate
    total_elements = num_rows * num_cols
    overall_missing_rate = np.sum(mask == 0) / total_elements

    print(missing_rate_per_column, overall_missing_rate)

In [162]:
import torch
from scipy import optimize

def pick_coeffs(X, idxs_obs=None, idxs_nas=None, self_mask=False):
    n, d = X.shape
    if self_mask:
        torch.manual_seed(d)
        coeffs = torch.randn(d)
        Wx = X * coeffs
        coeffs /= torch.std(Wx, 0)
    else:
        d_obs = len(idxs_obs)
        d_na = len(idxs_nas)
        torch.manual_seed(d)
        coeffs = torch.randn(d_obs, d_na).float()

        # Dynamically adjust coeffs to match the type of X[:, idxs_obs]
        if X[:, idxs_obs].dtype == torch.double:
            coeffs = coeffs.double()
        # Add more conditions here if there are other types you need to handle

        # Perform operations
        Wx = X[:, idxs_obs].mm(coeffs)
        coeffs /= torch.std(Wx, 0, keepdim=True)
    return coeffs


def fit_intercepts(X, coeffs, p, self_mask=False):
    if self_mask:
        d = len(coeffs)
        intercepts = torch.zeros(d)
        for j in range(d):
            def f(x):
                return torch.sigmoid(X * coeffs[j] + x).mean().item() - p
            
            try:
                intercepts[j] = optimize.bisect(f, -500, 500)
            except:
                print(f(-500),f(500))
    else:
        d_obs, d_na = coeffs.shape
        intercepts = torch.zeros(d_na)
        for j in range(d_na):
            def f(x):
                return torch.sigmoid(X.mv(coeffs[:, j]) + x).mean().item() - p
            #intercepts[j] = optimize.bisect(f, -500, 500)
            try:
                intercepts[j] = optimize.bisect(f, -500, 500)
            except:
                print(f(-500),f(500))
            
    return intercepts


def MAR(X, p, p_obs = 0.5,seed = 1):

    n, d = X.shape

    to_torch = torch.is_tensor(X) ## output a pytorch tensor, or a numpy array
    if not to_torch:
        X = torch.from_numpy(X)

    mask = torch.zeros(n, d).bool() if to_torch else np.zeros((n, d)).astype(bool)

    d_obs = max(int(p_obs * d), 1) ## number of variables that will have no missing values (at least one variable)
    d_na = d - d_obs ## number of variables that will have missing values

    ### Sample variables that will all be observed, and those with missing values:
    np.random.seed(n)
    idxs_obs = np.random.choice(d, d_obs, replace=False)
    idxs_nas = np.array([i for i in range(d) if i not in idxs_obs])

    ### Other variables will have NA proportions that depend on those observed variables, through a logistic model
    ### The parameters of this logistic model are random.

    ### Pick coefficients so that W^Tx has unit variance (avoids shrinking)
    coeffs = pick_coeffs(X, idxs_obs, idxs_nas)
    ### Pick the intercepts to have a desired amount of missing values
    intercepts = fit_intercepts(X[:, idxs_obs], coeffs, p)

    ps = torch.sigmoid(X[:, idxs_obs].mm(coeffs) + intercepts)
    torch.manual_seed(n)
    ber = torch.rand(n, d_na)
    mask[:, idxs_nas] = ber >= ps
    #mask = force_mask(mask,p)
    calculate_missing_rates(mask)
    return mask

#

def MNAR(X, p):

    n, d = X.shape

    to_torch = torch.is_tensor(X) ## output a pytorch tensor, or a numpy array
    if not to_torch:
        X = torch.from_numpy(X)

    ### Variables will have NA proportions that depend on those observed variables, through a logistic model
    ### The parameters of this logistic model are random.

    ### Pick coefficients so that W^Tx has unit variance (avoids shrinking)
    coeffs = pick_coeffs(X, self_mask=True)
    
    ### Pick the intercepts to have a desired amount of missing values
    intercepts = fit_intercepts(X, coeffs, p, self_mask=True)
    

    ps = torch.sigmoid(X * coeffs + intercepts)


    np.random.seed(n)
    ber = np.random.rand(n, d)
    mask = ber >= ps if to_torch else ber >= ps.numpy()
    #mask = force_mask(mask,p)
    
    calculate_missing_rates(mask)
    return mask
