In [34]:
import pickle
import numpy as np

import os
import sys
from scipy import optimize
from torch.utils.data import DataLoader, Dataset
from data_loaders import *
import missing_process.missing_method as missing_method
from missing_process.block_rules import *
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.ensemble import ExtraTreesRegressor

In [6]:
def load_index(observed_values, seed, fold=5, train_ratio=0.9):
    """Split row indices into a training part and a held-out test fold.

    The row indices are shuffled with NumPy's global RNG (seeded with
    ``seed + 1``) and cut into five equally sized folds; fold number ``fold``
    becomes the test set.  The remaining indices are shuffled once more and
    the first ``train_ratio`` share of them is returned as the training set.
    The rest of the remainder (the validation share) is not returned.

    Parameters
    ----------
    observed_values : array-like with a ``shape`` attribute
        Only the length of the first axis (number of rows) is used.
    seed : int
        Base seed; the global NumPy RNG is seeded with ``seed + 1``.
    fold : int, default 5
        1-based number of the fold used as the test set (1..5).  The default
        selects the last fifth of the shuffled rows.
    train_ratio : float, default 0.9
        Share of the non-test rows returned as training indices.

    Returns
    -------
    train_index, test_index : numpy.ndarray
        Integer row indices into ``observed_values``.

    Raises
    ------
    ValueError
        If ``fold`` is outside 1..5 or ``train_ratio`` is outside [0, 1].

    Notes
    -----
    Side effect: the global NumPy RNG is reseeded, so random draws made by the
    caller after this function depend on ``seed``.
    """
    n_folds = 5
    if not 1 <= fold <= n_folds:
        raise ValueError(f"fold must be between 1 and {n_folds}, got {fold}")
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio}")

    n_rows = observed_values.shape[0]
    indlist = np.arange(n_rows)

    np.random.seed(seed + 1)
    np.random.shuffle(indlist)

    # Fold boundaries as positions inside the shuffled index list.
    tmp_ratio = 1 / n_folds
    start = int((fold - 1) * n_rows * tmp_ratio)
    end = int(fold * n_rows * tmp_ratio)

    test_index = indlist[start:end]
    remain_index = np.delete(indlist, np.arange(start, end))

    # Second shuffle decides which of the remaining rows are used for training.
    np.random.shuffle(remain_index)

    num_train = int(len(remain_index) * train_ratio)
    train_index = remain_index[:num_train]

    return train_index, test_index

In [134]:
# Baseline imputation benchmark: for every missingness rule, load the
# pre-processed (max-min normalised) dataset, split it with load_index, run the
# four baseline imputers and record their RMSE on the held-out entries.
#
# NOTE(review): this cell calls load_index and the *_imputer helpers. The
# imputer helpers are defined in a cell placed further down in the notebook, so
# on a fresh kernel that cell has to be executed before this one.
# NOTE(review): `pd` and `load_json_file` are not imported explicitly in the
# visible import cell - presumably they arrive through the star imports; verify.

# --- Configuration -----------------------------------------------------------
# Other dataset names previously assigned here: "banknote",
# "concrete_compression", "wine_quality_white".
dataname = "wine_quality_red"
seed = 1
nfold = 5  # not read in this cell
# Other missingness types previously used here: "logistic", "self_mask".
missingtype = "diffuse"

# Alternative rule file: "q_ratio.json".
missing_rule = load_json_file("diffuse_ratio.json")

results_dir = "results/baselines"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(results_dir, exist_ok=True)

rule_list = []
mean_list = []
mice_list = []
knn_list = []
missforest_list = []

# (file-name tag, imputer function, list collecting its RMSE per rule)
baselines = [
    ("mean", mean_imputer, mean_list),
    ("mice", mice_imputer, mice_list),
    ("knn", knn_imputer, knn_list),
    ("missforest", missforest_imputer, missforest_list),
]

for rule_name in missing_rule:
    rule = missing_rule[rule_name]  # not read in this cell
    print("Current Rule",rule_name)
    np.random.seed(seed)

    # Only processed_data_path_norm is read below; the other two paths are
    # kept because they were defined here originally.
    dataset_path = f"datasets/{dataname}/data.csv"
    processed_data_path = (
        f"datasets/{dataname}/{missingtype}-{rule_name}_seed-{seed}.pk"
    )
    processed_data_path_norm = (
        f"datasets/{dataname}/{missingtype}-{rule_name}_seed-{seed}_max-min_norm.pk"
    )

    if os.path.isfile(processed_data_path_norm):
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file - only load pickles produced by your own preprocessing run.
        with open(processed_data_path_norm, "rb") as f:
            observed_values, observed_masks, gt_masks, eval_length = pickle.load(
                f
            )
    else:
        print("no data")
        # Stops at the first rule without a normalised pickle; the rules after
        # it are not processed either.
        break

    # Calculate the percentage of zeros
    zero_percentage = (gt_masks == 0).mean() * 100

    print(f"0 的占比: {zero_percentage}%")

    train_idx, test_idx = load_index(observed_values,seed)

    Xtrain = observed_values[train_idx]
    Xtest = observed_values[test_idx]

    Xtrain_mask = gt_masks[train_idx]
    Xtest_mask = gt_masks[test_idx]

    # Computed for inspection only; nothing in this cell reads them.
    zero_percentage_train = (Xtrain_mask == 0).mean() * 100
    zero_percentage_test = (Xtest_mask == 0).mean() * 100

    # Entries whose mask is 0 are hidden from the imputers: NaN for the
    # sklearn imputers, plus a zero-filled copy (not used in this cell).
    Xnan = Xtrain.copy()
    Xz = Xtrain.copy()
    Xnan[Xtrain_mask == 0] = np.nan
    Xz[Xtrain_mask == 0] = 0

    X_test_nan = Xtest.copy()
    X_test_z = Xtest.copy()
    X_test_nan[Xtest_mask == 0] = np.nan
    X_test_z[Xtest_mask == 0] = 0

    for method, imputer, scores in baselines:
        rmse, Ximp = imputer(Xnan,X_test_nan,Xtest,Xtest_mask)
        # One file per method: the file name used to be identical for all four
        # baselines, so each write overwrote the previous imputation.
        pd.DataFrame(Ximp).to_csv(
            f"{results_dir}/Imputation_{dataname}_{missingtype}_{rule_name}_{method}.csv",
            index=False,
        )
        scores.append(rmse)

    rule_list.append(rule_name)

    print(mean_list)

result = pd.DataFrame({"Missing_Rule":rule_list,"Mean":mean_list,"Mice":mice_list,"Missforest":missforest_list,"KNN":knn_list})
result.to_csv("results/baselines/RMSE_{}_{}.csv".format(dataname,missingtype),index=False)

Current Rule 0.25
0 的占比: 11.421911421911423%




[0.15354315966112275]
Current Rule 0.5
0 的占比: 23.696628574677355%




[0.15354315966112275, 0.14056792539125404]
Current Rule 0.75
0 的占比: 34.58980044345898%
[0.15354315966112275, 0.14056792539125404, 0.13030045179665234]




In [38]:
def _masked_rmse(test_real, Ximp, test_mask):
    """Return the RMSE between ``test_real`` and ``Ximp`` weighted by ``1 - test_mask``.

    With a 0/1 mask this is the RMSE over the entries whose mask value is 0.
    When the weights sum to zero there is nothing to score and NaN is returned
    directly (the plain formula would evaluate 0/0 and emit a RuntimeWarning).
    """
    weights = 1 - test_mask
    n_scored = np.sum(weights)
    if n_scored == 0:
        return np.nan
    return np.sqrt(np.sum((test_real - Ximp) ** 2 * weights) / n_scored)


def _fit_impute_score(imp, train_nan, test_nan, test_real, test_mask):
    """Fit ``imp`` on ``train_nan``, impute ``test_nan`` and score the result.

    Returns ``(rmse, Ximp)`` where ``Ximp`` is ``imp.transform(test_nan)`` and
    ``rmse`` is the masked RMSE of ``Ximp`` against ``test_real``.
    """
    imp.fit(train_nan)
    Ximp = imp.transform(test_nan)
    return _masked_rmse(test_real, Ximp, test_mask), Ximp


def mean_imputer(train_nan,test_nan,test_real,test_mask):
    """Impute with column means learned from the training data.

    Parameters
    ----------
    train_nan : array with NaN marking missing entries; used to fit the imputer.
    test_nan : array with NaN marking missing entries; the data to impute.
    test_real : array of the same shape as ``test_nan`` holding reference values.
    test_mask : array of the same shape; entries are weighted by ``1 - test_mask``
        in the RMSE.

    Returns
    -------
    (rmse, Ximp) : the masked RMSE and the imputed test array.
    """
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    return _fit_impute_score(imp, train_nan, test_nan, test_real, test_mask)


def mice_imputer(train_nan,test_nan,test_real,test_mask):
    """Impute with ``IterativeImputer`` using posterior sampling (``random_state=0``).

    Parameters and return value are the same as for ``mean_imputer``.
    """
    imp = IterativeImputer(random_state=0, sample_posterior=True)
    return _fit_impute_score(imp, train_nan, test_nan, test_real, test_mask)


def knn_imputer(train_nan,test_nan,test_real,test_mask):
    """Impute with ``KNNImputer`` using 2 neighbours.

    Parameters and return value are the same as for ``mean_imputer``.
    """
    imp = KNNImputer(n_neighbors=2)
    return _fit_impute_score(imp, train_nan, test_nan, test_real, test_mask)


def missforest_imputer(train_nan,test_nan,test_real,test_mask):
    """Impute with ``IterativeImputer`` driven by a 10-tree ``ExtraTreesRegressor``.

    Both the regressor and the imputer use ``random_state=0``.  Parameters and
    return value are the same as for ``mean_imputer``.
    """
    impute_estimator = ExtraTreesRegressor(n_estimators=10, random_state=0)
    imp = IterativeImputer(random_state=0, estimator=impute_estimator)
    return _fit_impute_score(imp, train_nan, test_nan, test_real, test_mask)
