In [None]:
import numpy as np
import sys
sys.path.append("..")
import pandas as pd
from tqdm import tqdm
import sys
from data_loaders import *
from missing_process.block_rules import *
import json
import numpy as np
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.svm import SVR, SVC
from sklearn.metrics import f1_score,mean_squared_error, f1_score, accuracy_score, r2_score

In [None]:
# Dataset identifiers covered by this notebook.
real_datalist = [
    "banknote",
    "concrete_compression",
    "wine_quality_white",
    "wine_quality_red",
    "california",
    "climate_model_crashes",
    "connectionist_bench_sonar",
    "qsar_biodegradation",
    "yeast",
    "yacht_hydrodynamics",
]

# Both values are interpolated into the cross-validation split file name.
seed, nfold = 1, 5

In [None]:
def load_impute_data(missingtype,model_name,rule_name,dataname,fold,seed = 1):
    """Load the saved train and test imputation arrays for one fold.

    Files are read from
    ``<root>/<missingtype>/<dataname>/<model_name>/<rule_name>_seed-<seed>_<fold>_{train,test}.npy``
    where ``<root>`` is ``impute_hpc`` for the ``tabcsdi`` model under the
    ``mcar``/``mar`` missing types and ``impute`` for everything else.

    Returns
    -------
    tuple
        ``(train_impute, test_impute)`` as returned by ``np.load``.
    """
    # Only the root directory differs between the two storage locations.
    if model_name == "tabcsdi" and missingtype in ("mcar", "mar"):
        root = "impute_hpc"
    else:
        root = "impute"

    prefix = f'{root}/{missingtype}/{dataname}/{model_name}/{rule_name}_seed-{seed}_{fold}'
    train_impute = np.load(f'{prefix}_train.npy')
    test_impute = np.load(f'{prefix}_test.npy')
    return train_impute, test_impute

def load_train_test(index_file,norm_values,observed_masks,label_values):
    """Split values, masks and labels into the train and test rows of one fold.

    ``index_file`` is a mapping with ``"train_index"`` and ``"test_index"``
    entries; each entry is used to select rows of ``norm_values`` and
    ``observed_masks`` and elements of ``label_values``.

    Returns
    -------
    tuple
        ``(train_values, train_masks, train_label,
        test_values, test_masks, test_label)``.
    """
    train_index = index_file["train_index"]
    test_index = index_file["test_index"]

    def _select(rows):
        # Same row selection applied to the features, the masks and the labels.
        return norm_values[rows,:], observed_masks[rows,:], label_values[rows]

    train_values, train_masks, train_label = _select(train_index)
    test_values, test_masks, test_label = _select(test_index)

    return train_values,train_masks,train_label,test_values,test_masks,test_label


def process_target(dataname,y):
    """Return the modelling target for a dataset together with its task type.

    Parameters
    ----------
    dataname : str
        Dataset identifier. The five names listed below are treated as
        regression datasets; every other name is treated as classification.
    y : array-like
        Target values of the dataset.

    Returns
    -------
    tuple
        ``(y, "reg")`` with ``y`` untouched for regression datasets, otherwise
        ``(y_encoded, "clf")`` where ``y_encoded`` holds the integer codes
        produced by ``LabelEncoder``.
    """
    regression_datasets = (
        "concrete_compression",
        "wine_quality_white",
        "wine_quality_red",
        "california",
        "yacht_hydrodynamics",
    )
    if dataname in regression_datasets:
        return y,"reg"

    # LabelEncoder expects a 1-D label array. Passing an (n, 1) column makes
    # scikit-learn emit a DataConversionWarning before flattening it anyway,
    # so flatten explicitly; np.ravel also accepts non-ndarray targets.
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(np.ravel(y))
    return y_encoded,"clf"
    


def model_eval(label_train, impute_train, impute_test, label_test, task_type,model):
    """Fit ``model`` on the training data and score its test predictions.

    Parameters
    ----------
    label_train, impute_train
        Training targets and training feature matrix passed to ``model.fit``.
    impute_test, label_test
        Test feature matrix passed to ``model.predict`` and the matching
        targets the predictions are scored against.
    task_type : str
        ``"reg"`` selects the regression metrics; any other value selects the
        classification metrics.
    model
        Estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns
    -------
    tuple
        ``(rmse, r2)`` for regression, ``(macro_f1, accuracy)`` otherwise.
    """
    # Fitting and predicting are the same for both task types.
    model.fit(impute_train, label_train)
    y_pred_test = model.predict(impute_test)

    if task_type != "reg":
        test_f1 = f1_score(label_test, y_pred_test, average='macro')
        accuracy = accuracy_score(label_test, y_pred_test)
        return test_f1, accuracy

    test_rmse = np.sqrt(mean_squared_error(label_test, y_pred_test))
    r2 = r2_score(label_test, y_pred_test)
    return test_rmse, r2
    

def fillin_imputed_data(imputed,mask,original):
    """Take ``original`` where ``mask`` equals 1 and ``imputed`` elsewhere."""
    keep_original = mask == 1
    return np.where(keep_original, original, imputed)

In [None]:


# Imputation methods whose saved outputs are evaluated.
model_name_list = [
    "random",
    "zero",
    "mean",
    "knn",
    "mf",
    "mice",
    "missforest",
    "XGB",
    "ot",
    "hyper",
    "gain",
    "miwae",
    "notmiwae",
    "tabcsdi",
]

real_datalist = [
    "banknote",
    "california",
    "climate_model_crashes",
    "concrete_compression",
    "qsar_biodegradation",
    "wine_quality_red",
    "connectionist_bench_sonar",
    "wine_quality_white",
    "yacht_hydrodynamics",
    "yeast",
]
# Alias of the list above (same object, not a copy).
datalist = real_datalist

# Datasets passed to run_ml with task_type="reg".
reg_data = [
    "california",
    "concrete_compression",
    "wine_quality_red",
    "wine_quality_white",
    "yacht_hydrodynamics",
]

# Datasets passed to run_ml with task_type="clf".
clf_data = [
    "banknote",
    "climate_model_crashes",
    "qsar_biodegradation",
    "connectionist_bench_sonar",
    "yeast",
]

def run_ml(rule_name,model_number,missingtype,datalist,task_type ="reg",use_imputed_test=False):
    """Train a downstream model on imputed data and collect averaged test scores.

    For every dataset in ``datalist`` and every imputation method in the
    module-level ``model_name_list``, the selected estimator is fitted on each
    fold's imputed training matrix and scored on that fold's test rows; the
    per-fold scores are then averaged.

    Parameters
    ----------
    rule_name : str
        Missingness rule name; selects ``<rule_name>.npy`` as the mask file and
        is part of the imputation file names.
    model_number : int
        Index of the estimator to use: 0 -> Ridge / LogisticRegression,
        1 -> MLPRegressor / MLPClassifier, 2 -> SVR / SVC (regression /
        classification respectively).
    missingtype : str
        Missingness mechanism; names the mask and imputation sub-directories.
    datalist : iterable of str
        Dataset identifiers to evaluate.
    task_type : str
        Accepted for backward compatibility only. The task type is derived per
        dataset by ``process_target``, so the value passed here is ignored.
    use_imputed_test : bool, default False
        If False, the model is scored on the test rows of the ``*_norm.npy``
        matrix (the behaviour of the original code). If True, it is scored on
        the imputed test matrix with entries whose mask equals 1 replaced by
        the ``*_norm.npy`` values.
        NOTE(review): the original code built that filled-in matrix but never
        used it; confirm which protocol is intended.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(d_f1_rmse, d_acc_r2)``, one row per dataset and one column per
        imputation method. The first frame holds RMSE (regression) or macro F1
        (classification); the second holds R^2 or accuracy.
    """
    d_f1_rmse = {}
    d_acc_r2 = {}

    for dataname in tqdm(datalist):
        directory_path = f"datasets/{dataname}"
        data = dataset_loader(dataname)
        norm_values = np.load(f'{directory_path}/{dataname}_norm.npy')
        # The dataset decides the task; this overrides the task_type argument.
        label_values, task_type = process_target(dataname,data["target"])

        if task_type == "reg":
            ml_model_list = [Ridge(), MLPRegressor(random_state=1), SVR()]
        else:
            ml_model_list = [LogisticRegression(random_state=1), MLPClassifier(random_state=1), SVC()]
        model = ml_model_list[model_number]

        observed_masks = np.load(f'{directory_path}/{missingtype}/{rule_name}.npy')
        # Context manager so the split file is closed even if parsing fails.
        with open(f'{directory_path}/split_index_cv_seed-{seed}_nfold-{nfold}.json') as f:
            index_file = json.load(f)

        # The fold slices do not depend on the imputation method, so they are
        # computed once per dataset rather than once per method.
        fold_splits = {
            fold: load_train_test(index_file[fold],norm_values,observed_masks,label_values)
            for fold in index_file
        }

        d_f1_rmse[dataname] = {}
        d_acc_r2[dataname] = {}

        for model_name in model_name_list:
            f1_rmse_list = []
            acc_r2_list = []

            for fold, split in fold_splits.items():
                _, _, train_label, test_values, test_masks, test_label = split
                # Forward the same seed that names the split file so the
                # imputation files always belong to the same run.
                impute_train,impute_test = load_impute_data(missingtype,model_name,rule_name,dataname,fold,seed=seed)

                if use_imputed_test:
                    test_features = fillin_imputed_data(impute_test,test_masks,test_values)
                else:
                    test_features = test_values

                evl1,evl2 = model_eval(train_label, impute_train, test_features, test_label, task_type,model)
                f1_rmse_list.append(evl1)
                acc_r2_list.append(evl2)

            d_f1_rmse[dataname][model_name] = np.mean(f1_rmse_list)
            d_acc_r2[dataname][model_name] = np.mean(acc_r2_list)

    # Rows = datasets, columns = imputation methods.
    d_f1_rmse = pd.DataFrame(d_f1_rmse).T
    d_acc_r2 = pd.DataFrame(d_acc_r2).T
    return d_f1_rmse, d_acc_r2




In [None]:
real_datalist = [
    "banknote",
    "california",
    "climate_model_crashes",
    "concrete_compression",
    "qsar_biodegradation",
    "wine_quality_red",
    "connectionist_bench_sonar",
    "wine_quality_white",
    "yacht_hydrodynamics",
    "yeast",
]

# Datasets passed to run_ml with task_type="reg".
reg_data = [
    "california",
    "concrete_compression",
    "wine_quality_red",
    "wine_quality_white",
    "yacht_hydrodynamics",
]

# Datasets passed to run_ml with task_type="clf".
clf_data = [
    "banknote",
    "climate_model_crashes",
    "qsar_biodegradation",
    "connectionist_bench_sonar",
    "yeast",
]


# Each entry pairs a group of missingness mechanisms with the rule names to
# run for them; the groups are processed in the order listed.
experiment_grid = [
    (["diffuse","logistic"], ["0.3","0.5","0.7"]),
    (["quantile"], ["Q1_Q4_0.5","Q2_Q3_0.5","Q2_Q4_0.5"]),
    (["mcar","mar"], ["0.3","0.5","0.7"]),
]

for missingtypes, rule_names in experiment_grid:
    for missingtype in missingtypes:
        for rule_name in rule_names:
            for model_number in [0,1,2]:
                # Regression datasets first, then classification datasets.
                run_ml(rule_name,model_number,missingtype,reg_data,task_type ="reg")
                run_ml(rule_name,model_number,missingtype,clf_data,task_type ="clf")
