# In [6]:  (notebook cell marker)
# Read the complete dataset, split it into train/validation/test sets, and save both the
# original (REAL) splits and copies with values removed at the given missing rate (RANDOM).
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
# Base seed for the train/validation/test splits below. It is taken from the
# wall-clock time (whole seconds), so runs started at different times use
# different seeds.
SEED = int(time.time())

def generate_missing_data(dataset, missing_rate, rng=None):
    """Return a copy of ``dataset`` with random feature cells replaced by NaN.

    The last column is treated as the label column and is never masked. In
    every row one randomly chosen feature cell is protected from masking, so
    no row loses all of its feature values.

    Args:
        dataset: DataFrame whose last column holds the labels.
        missing_rate: Fraction (0 to 1) of all feature cells to blank out.
            The number of masked cells is
            ``int(rows * feature_columns * missing_rate)``, capped by the
            number of unprotected cells.
        rng: Optional ``random.Random`` instance used for every random draw.
            Defaults to the module-level ``random`` generator.

    Returns:
        A new DataFrame made of the masked feature columns followed by the
        unchanged label column. ``dataset`` itself is not modified.

    Raises:
        ValueError: If ``missing_rate`` is outside ``[0, 1]``.
    """
    # A negative rate would otherwise turn into a negative slice bound below
    # and silently mask almost every cell.
    if not 0 <= missing_rate <= 1:
        raise ValueError(
            f"missing_rate must be within [0, 1], got {missing_rate!r}"
        )
    if rng is None:
        rng = random

    features = dataset.iloc[:, :-1]  # everything except the label column
    labels = dataset.iloc[:, -1]
    n_rows, n_cols = features.shape
    num_missing_values = int(n_rows * n_cols * missing_rate)

    if n_rows and n_cols:
        # One protected feature column per row (drawn in row order).
        protected_col = [rng.randint(0, n_cols - 1) for _ in range(n_rows)]
        candidates = [
            (row, col)
            for row in range(n_rows)
            for col in range(n_cols)
            if col != protected_col[row]
        ]
        rng.shuffle(candidates)
        chosen = candidates[:num_missing_values]

        # Build one boolean mask and apply it in a single vectorised step
        # instead of writing NaN cell by cell through ``.iloc``.
        to_blank = np.zeros((n_rows, n_cols), dtype=bool)
        if chosen:
            rows, cols = zip(*chosen)
            to_blank[list(rows), list(cols)] = True
        missing_dataset = features.mask(to_blank)
    else:
        # No feature cells at all (empty frame or label-only frame).
        missing_dataset = features.copy()

    # Re-add the label column
    return pd.concat([missing_dataset, labels], axis=1)

def mask_test_labels(dataset):
    """Return a copy of ``dataset`` whose last (label) column is all NaN.

    The input frame is left untouched; only the copy is modified.
    """
    masked = dataset.copy()
    label_position = masked.shape[1] - 1  # positional index of the last column
    masked.iloc[:, label_position] = np.nan
    return masked

# missing_rate = 0.3  # Set the missing rate
# MISSING_RATE = missing_rate*100
DATANAME = 'banknote'

# The source file is the same for every iteration, so read it only once.
dataset = pd.read_csv(f'datasets/{DATANAME}/{DATANAME}.csv', header=None)

for ID_NUM in [1, 2, 3, 4, 5]:
    for missing_rate in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]:
        # round() instead of int(): a product such as 0.29 * 100 evaluates to
        # 28.999999999999996, which int() would truncate to the wrong label.
        MISSING_RATE = round(missing_rate * 100)
        split_seed = SEED + ID_NUM + MISSING_RATE

        # Split the dataset into training set, validation set, and test set; 70%, 10%, 20%
        train_val, test_set = train_test_split(
            dataset, test_size=0.2, random_state=split_seed
        )  # Test set: 20% of total
        train_set, val_set = train_test_split(
            train_val, test_size=0.125, random_state=split_seed
        )  # Validation set: 12.5% of the remaining 80% = 10% of total

        # Seed the generator used for masking as well, so the missing-value
        # pattern is tied to the same seed as the splits.
        random.seed(split_seed)

        # Generate missing data
        missing_data_train = generate_missing_data(train_set, missing_rate)
        missing_data_val = generate_missing_data(val_set, missing_rate)
        missing_data_test = generate_missing_data(test_set, missing_rate)

        # Save the generated data: RANDOM = with injected missing values,
        # REAL = the untouched split used as ground truth.
        stem = f'datasets/{DATANAME}/{DATANAME}'
        suffix = f'{MISSING_RATE}%_NUM_{ID_NUM}.csv'
        missing_data_train.to_csv(f'{stem}_train_RANDOM_{suffix}', header=False, index=False)
        missing_data_val.to_csv(f'{stem}_val_RANDOM_{suffix}', header=False, index=False)
        missing_data_test.to_csv(f'{stem}_test_RANDOM_{suffix}', header=False, index=False)
        train_set.to_csv(f'{stem}_train_REAL_{suffix}', header=False, index=False)
        val_set.to_csv(f'{stem}_val_REAL_{suffix}', header=False, index=False)
        test_set.to_csv(f'{stem}_test_REAL_{suffix}', header=False, index=False)
