In [2]:
# `os` is used by get_data below; import it explicitly rather than relying on
# it leaking in through one of the star-imports.
# NOTE: the original statement order is kept on purpose -- the star-imports can
# shadow (or be shadowed by) the names around them, so reordering is not safe.
import os
import sys
sys.path.append("..")
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
from scipy import optimize
from torch.utils.data import DataLoader, Dataset
from data_loaders import *
import missing_process.missing_method as missing_method
from missing_process.block_rules import *
import json
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [3]:
# Dataset names that get_data routes to `dataset_loader`.
real_datalist = [
    "banknote",
    "concrete_compression",
    "wine_quality_white",
    "wine_quality_red",
    "california",
    "climate_model_crashes",
    "connectionist_bench_sonar",
    "qsar_biodegradation",
    "yeast",
    "yacht_hydrodynamics",
]

# Dataset names that get_data reads back from an existing .npy file.
syn_datalist = ["syn1"]

## Create Data Folder

In [4]:
def get_data(dataname,observed_values = None):
    """Return the raw data matrix for ``dataname`` and cache it under ``datasets/``.

    Three cases are handled, in this order:

    * ``dataname`` is listed in ``real_datalist``: the data is fetched with
      ``dataset_loader``, its ``"data"`` entry is cast to float32 and saved to
      ``datasets/<dataname>/<dataname>.npy``.
    * ``dataname`` is listed in ``syn_datalist``: the array is loaded from
      ``datasets/<dataname>/<dataname>.npy``, which must already exist.
    * any other name together with a non-``None`` ``observed_values``: the
      supplied data is saved (as float32) to the same location.

    Parameters
    ----------
    dataname : str
        Name of the dataset; also used as folder and file name.
    observed_values : array-like, optional
        Data to register when ``dataname`` is not a known dataset.

    Returns
    -------
    The float32 array for real datasets, the loaded array for synthetic ones,
    and ``observed_values`` exactly as passed in otherwise (``None`` when the
    request was invalid, in which case "Invalid Data" is printed).
    """
    directory_path = f"datasets/{dataname}"
    file_path = f'{directory_path}/{dataname}.npy'

    if dataname in real_datalist:
        data = dataset_loader(dataname)
        observed_values = data["data"].astype("float32")
        # The original only *listed* the directory here, which raised
        # FileNotFoundError on a fresh checkout; create it instead.
        os.makedirs(directory_path, exist_ok=True)
        np.save(file_path, observed_values)
    elif dataname in syn_datalist:
        observed_values = np.load(file_path)
    elif observed_values is not None:
        # Reaching this branch already implies the name is in neither list.
        already_present = os.path.exists(directory_path)
        os.makedirs(directory_path, exist_ok=True)
        # Always store float32 so the file on disk does not depend on whether
        # the folder happened to exist beforehand.
        np.save(file_path, np.asarray(observed_values, dtype="float32"))
        if already_present:
            print(f"Directory '{directory_path}' already exists.")
        else:
            print(f"Create directory '{directory_path}'.")
    else:
        print("Invalid Data")

    return observed_values


            

## Do Normalization

In [None]:

def min_max_scaler(observed_values):
    """Min-max scale ``observed_values`` column by column.

    The minimum and maximum are taken along axis 0. A small constant (1e-10)
    is added to both the shifted values and the column range, so a column
    whose values are all equal maps to 1.0 instead of dividing by zero.
    The result is rounded to 10 decimal places.
    """
    eps = 1e-10
    col_min = np.min(observed_values, axis=0)
    col_max = np.max(observed_values, axis=0)

    shifted = observed_values - col_min + eps
    span = col_max - col_min + eps

    return np.round(shifted / span, 10)


def sensitivity_check(observed_values):
    """Print whether the per-column minima and maxima all lie inside [0, 1].

    Minima and maxima are computed along axis 0. If any of them is greater
    than 1 or smaller than 0 a failure message is printed, otherwise a success
    message. Nothing is returned.
    """
    column_extremes = (
        np.min(observed_values, axis=0),
        np.max(observed_values, axis=0),
    )
    out_of_range = any(
        np.any(extreme > 1) or np.any(extreme < 0) for extreme in column_extremes
    )

    if out_of_range:
        print("Test did not pass: Min or Max values out of range")
        return
    print("Test passed: Min and Max values are within range")
        

## Do Data Split

In [17]:
def save_split_index(scaled_data,directory_path,seed = 1,nfold = 5,train_ratio = 0.9):
    """Create a shuffled train/valid/test index split and write it to JSON.

    The row indices of ``scaled_data`` are shuffled; the last ``1 / nfold``
    share becomes the test set. The remaining indices are shuffled again and
    the first ``train_ratio`` share becomes the training set, the rest the
    validation set. The result is written to
    ``{directory_path}/split_index_seed-{seed}.json`` with the keys
    ``"test_index"``, ``"train_index"`` and ``"valid_index"``.

    Parameters
    ----------
    scaled_data : sized
        Only ``len(scaled_data)`` is used.
    directory_path : str
        Existing directory the JSON file is written into.
    seed : int
        Appears in the file name; the RNG itself is seeded with ``seed + 1``.
    nfold : int
        Number of folds; the test set is the last fold.
    train_ratio : float
        Share of the non-test indices assigned to the training set.

    Raises
    ------
    ValueError
        If ``nfold`` is smaller than 1.

    Notes
    -----
    This reseeds NumPy's *global* random state (``np.random.seed``), exactly
    as the previous implementation did, so existing splits stay reproducible.
    """
    if nfold < 1:
        raise ValueError(f"nfold must be a positive integer, got {nfold}")

    n_samples = len(scaled_data)
    indlist = np.arange(n_samples)

    np.random.seed(seed + 1)
    np.random.shuffle(indlist)

    # Integer arithmetic for the fold boundary: the former float expression
    # `nfold * n * (1 / nfold)` can land just below n (e.g. nfold=49, n=100
    # gives 99.99999999999999) and silently dropped a row from the test fold.
    start = int((nfold - 1) * n_samples // nfold)

    test_index = indlist[start:]
    # Copy so the in-place shuffle below cannot touch `indlist`.
    remain_index = indlist[:start].copy()

    np.random.shuffle(remain_index)

    num_train = int(len(remain_index) * train_ratio)
    train_index = remain_index[:num_train]
    valid_index = remain_index[num_train:]

    # Plain Python ints so the lists are JSON-serialisable.
    save_index = {
        "test_index": test_index.astype(np.int64).tolist(),
        "train_index": train_index.astype(np.int64).tolist(),
        "valid_index": valid_index.astype(np.int64).tolist(),
    }
    with open(f"{directory_path}/split_index_seed-{seed}.json", 'w') as file:
        json.dump(save_index, file)

In [19]:
# Split configuration, forwarded to save_split_index below.
seed = 1
nfold = 5


# For every real dataset: load it, scale it, check the scaled range, then save
# the scaled array and the split indices next to the raw data.
for data_name in real_datalist:

    directory_path = f"datasets/{data_name}"
    observed_values = get_data(data_name)

    scaled_data = min_max_scaler(observed_values)
    sensitivity_check(scaled_data)

    np.save(f'{directory_path}/{data_name}_norm.npy', scaled_data)

    # Pass the configured values explicitly; previously `seed` and `nfold`
    # were defined above but never handed over, so editing them had no effect.
    save_split_index(scaled_data, directory_path, seed=seed, nfold=nfold)

Test passed: Min and Max values are within range


## Syn 1

 - normal distributions for the first two columns
 - discrete distributions for columns 3-4 with variable outcomes
 - skewed distributions for columns 5-6
 - mixed normal distributions for columns 7-8

In [None]:
# Shape of the synthetic table: `rows` samples, `cols` features.
rows = 1000
cols = 8
np.random.seed(1)  # seed NumPy's global RNG before any value is drawn

# Uninitialised buffer; every column is overwritten below.
syn1 = np.empty((rows, cols))

# Columns 0-1: normal distribution with a randomly drawn mean and std. dev.
for col in (0, 1):
    mu = np.random.uniform(-10, 10)
    sigma = np.random.uniform(0.1, 5)
    syn1[:, col] = np.random.normal(mu, sigma, size=rows)

# Columns 2-3: discrete values drawn uniformly from a pool; a value that is
# repeated in the pool is proportionally more likely.
discrete_pools = (
    (2, [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5]),
    (3, [1, 1, 1, 2, 2, 3, 3, 3, 4]),
)
for col, pool in discrete_pools:
    syn1[:, col] = np.random.choice(pool, size=rows)

# Columns 4-5: skew-normal with a randomly drawn shape parameter.
for col in (4, 5):
    skew_param = np.random.uniform(-100, 100)
    syn1[:, col] = stats.skewnorm.rvs(skew_param, size=rows)

# Columns 6-7: mixture of two normals, each given as (mean, std. dev.).
# Both components are sampled in full, pooled, and `rows` values are then
# drawn (with replacement) from the pool.
mixture_components = (
    (6, (2, 1), (6, 0.5)),
    (7, (-2, 3), (-6, 0.5)),
)
for col, first, second in mixture_components:
    component_a = np.random.normal(first[0], first[1], size=rows)
    component_b = np.random.normal(second[0], second[1], size=rows)
    pooled = np.concatenate([component_a, component_b])
    syn1[:, col] = np.random.choice(pooled, size=rows)

