In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
def split_mean_std(dataframe):
    """Split a table of ``"mean ± std"`` strings into two numeric tables.

    Every column except one literally named ``'index'`` must contain strings
    made of two float literals joined by ``' ± '``.  Each such column is
    replaced by the left-hand float in the first returned frame and by the
    right-hand float in the second.  A column named ``'index'`` is carried
    over unchanged in both results.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose value cells look like ``"0.12345 ± 0.010"``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(mean_df, std_df)``; both are copies, the input is not modified.
    """
    separator = ' ± '

    def _field(position):
        # Build a cell parser that keeps one side of the separator as a float.
        return lambda cell: float(cell.split(separator)[position])

    take_mean, take_std = _field(0), _field(1)
    means, stds = dataframe.copy(), dataframe.copy()

    value_columns = [name for name in dataframe.columns if name != 'index']
    for name in value_columns:
        source = dataframe[name]
        means[name] = source.apply(take_mean)
        stds[name] = source.apply(take_std)

    return means, stds


## RMSE

In [12]:
# Names of the datasets whose result folders are processed.
real_datalist = [
    "banknote",
    "california",
    "climate_model_crashes",
    "concrete_compression",
    "qsar_biodegradation",
    "wine_quality_red",
    "connectionist_bench_sonar",
    "wine_quality_white",
    "yacht_hydrodynamics",
    "yeast",
]

# Method names, grouped into three families.
baseline_modellist = ["random", "zero", "mean"]
ml_modellist = ["knn", "mice", "XGB", "missforest", "mf"]
dl_modellist = ["hyper", "gain", "notmiwae", "miwae", "tabcsdi", "ot"]

# Complete list: the three families back to back, in that order.
modellist = baseline_modellist + ml_modellist + dl_modellist

# Hand-picked subset of methods.
selected_modellist = ["mean", "mice", "hyper", "gain", "notmiwae", "miwae", "tabcsdi", "ot"]

def choose_model_list(name,df):
    """Return ``df`` restricted to the columns of the named method group.

    Parameters
    ----------
    name : str
        One of ``"baseline"``, ``"full"``, ``"ml"``, ``"dl"`` or
        ``"selected"``.  Any other value leaves ``df`` untouched.
    df : pandas.DataFrame
        Table with one column per method; it must contain every column of
        the requested group (pandas raises ``KeyError`` otherwise).

    Returns
    -------
    pandas.DataFrame
        The selected columns, in the order listed for the group, or ``df``
        itself when ``name`` is not a known group.
    """
    column_groups = {
        "baseline": ["random","zero","mean"],
        "full": ["random", "zero", "mean", "knn", "mf", "mice", "missforest", "XGB",
                 "ot", "hyper", "gain", "miwae", "notmiwae", "tabcsdi"],
        "ml": ["knn","mice","XGB","missforest","mf"],
        "dl": ["hyper","gain","notmiwae","miwae","tabcsdi","ot"],
        "selected": ["mean","mice","hyper","gain","notmiwae","miwae","tabcsdi","ot"],
    }
    wanted = column_groups.get(name)
    if wanted is None:
        # Unknown group name: hand the frame back unchanged.
        return df
    return df[wanted]


In [5]:
def make_rmse_list(missing_type,datalist,modellist):
    """Collect per-model RMSE tables for every dataset and write them to disk.

    For each dataset the file
    ``../results/{missing_type}/{dataname}/{model}/{missing_type}_RMSE.csv``
    is read for every model.  Its ``train_rmse_mean``/``train_rmse_std`` and
    ``test_rmse_mean``/``test_rmse_std`` columns are formatted as
    ``"{mean:.5f} ± {std:.3f}"`` strings and stored in a column named after
    the model.  A model whose file cannot be read or lacks those columns gets
    the placeholder ``"0 ± 0"`` in every row.

    The two tables of each dataset are written to
    ``{path}/RMSE_train_results.csv`` and ``{path}/RMSE_test_results.csv``.

    Parameters
    ----------
    missing_type : str
        Name of the results sub-folder and file-name prefix.
    datalist : iterable of str
        Dataset folder names.
    modellist : iterable of str
        Model folder names; they become the column names, in this order.

    Returns
    -------
    tuple of (list, list)
        ``(train_list, test_list)``, one DataFrame per dataset in each.
    """
    placeholder = "0 ± 0"
    train_list = []
    test_list = []
    for dataname in datalist:
        # Computed once per dataset so it is defined even when modellist is empty.
        path = f"../results/{missing_type}/{dataname}"
        train_df = pd.DataFrame()
        test_df = pd.DataFrame()
        unavailable = []
        for model in modellist:
            column = f'{model}'
            try:
                data = pd.read_csv(f"{path}/{model}/{missing_type}_RMSE.csv",index_col=0)
                train_df[column] = data['train_rmse_mean'].map('{:.5f}'.format) + ' ± ' + data['train_rmse_std'].map('{:.3f}'.format)
                test_df[column] = data['test_rmse_mean'].map('{:.5f}'.format) + ' ± ' + data['test_rmse_std'].map('{:.3f}'.format)
            except Exception:
                # Best effort: a missing or malformed result file must not stop
                # the sweep.  `Exception` (rather than a bare except) lets
                # KeyboardInterrupt / SystemExit propagate.
                unavailable.append(column)
                train_df[column] = placeholder
                test_df[column] = placeholder
        # A placeholder assigned while the table had no rows yet is left as an
        # empty column and turns into NaN once a later model supplies the row
        # index.  Re-assigning after the loop fills every row.
        for column in unavailable:
            train_df[column] = placeholder
            test_df[column] = placeholder
        train_df.to_csv(f"{path}/RMSE_train_results.csv")
        test_df.to_csv(f"{path}/RMSE_test_results.csv")
        train_list.append(train_df)
        test_list.append(test_df)
    return train_list,test_list

In [6]:
import matplotlib.cm as cm
def make_grouped_bar_single(dataframe,ylabel,xlabel,
                            title,task_type = "ml",
                            model_list="full",
                            missing_type = "logistic"):
    """Save a grouped bar chart with error bars for one results table.

    One group of bars is drawn per row of ``dataframe`` and one bar per
    method column; bar heights and error bars come from the ``"mean ± std"``
    cells (parsed with ``split_mean_std``).  The figure is written to
    ``../plot/{task_type}/{model_list}/{xlabel}_{title}_bar.png`` and then
    closed, so it is not left open in the pyplot figure registry.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table of ``"mean ± std"`` strings, one column per method.
    ylabel : str
        Y-axis label.
    xlabel : str
        Prefix of the x-axis label (``"{xlabel} Parameter"``) and of the
        output file name.
    title : str
        Figure title; also used in the file name and for the y-limit lookup.
    task_type : str
        Output sub-folder; the part before the first ``"-"`` is passed to
        ``customized_ylim``.
    model_list : str
        Method-group name understood by ``choose_model_list``.
    missing_type : str
        Passed to ``customized_ylim``.
    """
    task_type_name = task_type.split("-")[0]
    dataframe = choose_model_list(model_list,dataframe)
    index_levels = dataframe.index
    methods = dataframe.columns
    # Hatching patterns, cycled by method position; the leading entries are
    # empty, i.e. those bars are drawn without a hatch.
    hatches = ["","","","","","","","","","",'\\\\', 'o', 'O', '.', '*']
    mean_df,std_df = split_mean_std(dataframe)

    x = np.arange(len(index_levels))  # the label locations
    width = 0.8/len(methods)  # the width of the bars

    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        # Creating bars for each method
        for i, method in enumerate(methods):
            ax.bar(x + i * width, mean_df[method], width, label=method, yerr=std_df[method],
                   hatch=hatches[i % len(hatches)])

        # Add some text for labels, title and custom x-axis tick labels, etc.
        ax.set_ylabel(ylabel)
        ax.set_xlabel(f"{xlabel} Parameter")
        ax.set_title(title,y=1.15)
        # Bars sit at x + i*width for i in 0..n-1, so the centre of a group is
        # x + width*(n-1)/2 (a fixed 2.5*width is only centred for 6 methods).
        ax.set_xticks(x + width * (len(methods) - 1) / 2)
        ax.set_xticklabels(index_levels,rotation=60,ha = "right")
        ylim_lower, ylim_upper = customized_ylim(title,model_list,task_type_name,missing_type)
        ax.set_ylim([ylim_lower, ylim_upper])
        ax.legend(loc='upper center',
                  bbox_to_anchor=(0.5, 1.17),
                  ncol=max(1, len(methods)//2)  # never ask for zero legend columns
                  )
        fig.subplots_adjust(bottom=0.4)
        out_dir = f"../plot/{task_type}/{model_list}"
        os.makedirs(out_dir, exist_ok=True)
        fig.savefig(f"{out_dir}/{xlabel}_{title}_bar.png")
    finally:
        # This function is called in long loops; release the figure every time.
        plt.close(fig)

In [7]:
def make_grouped_line_plot_single(dataframe, ylabel, xlabel, 
                                  title,task_type = "ml",
                                  model_list = "full",
                                  missing_type = "logistic"):
    """Save a line plot with error bars for one results table.

    One line is drawn per method column, with one point per row of
    ``dataframe``; values and error bars come from the ``"mean ± std"`` cells
    (parsed with ``split_mean_std``).  The figure is written to
    ``../plot/{task_type}/{model_list}/{xlabel}_{title}_line.png`` and then
    closed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table of ``"mean ± std"`` strings, one column per method.
    ylabel : str
        Y-axis label.
    xlabel : str
        Prefix of the x-axis label (``"{xlabel} Parameter"``) and of the
        output file name.
    title : str
        Figure title; also used in the file name and for the y-limit lookup.
    task_type : str
        Output sub-folder; the part before the first ``"-"`` is passed to
        ``customized_ylim``.
    model_list : str
        Method-group name understood by ``choose_model_list``.
    missing_type : str
        Passed to ``customized_ylim``.
    """
    task_type_name = task_type.split("-")[0]
    dataframe = choose_model_list(model_list,dataframe)
    index_levels = dataframe.index
    methods = dataframe.columns
    markers = ["o", "X", "^", "D", "*"]
    line_styles = ["-", "--", "-.", ":"]

    # Combine markers and line styles for up to 20 unique lines
    styles = [f'{line}{marker}' for line in line_styles for marker in markers]

    mean_df, std_df = split_mean_std(dataframe)

    x = np.arange(len(index_levels))  # the label locations

    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        # Creating lines for each method
        for i, method in enumerate(methods):
            # Styles are cycled so more than 20 methods do not raise IndexError.
            ax.errorbar(x, mean_df[method], yerr=std_df[method], label=method,
                        fmt=styles[i % len(styles)], capsize=5)

        # Add some text for labels, title, and custom x-axis tick labels, etc.
        ax.set_ylabel(ylabel)
        ax.set_xlabel(f"{xlabel} Parameter")
        ax.set_title(title, y=1.15)
        ax.set_xticks(x)
        ax.set_xticklabels(index_levels, rotation=60, ha="right")
        ylim_lower, ylim_upper = customized_ylim(title,model_list,task_type_name,missing_type)
        ax.set_ylim([ylim_lower, ylim_upper])
        ax.legend(loc='upper center',
                  bbox_to_anchor=(0.5,1.17),
                  ncol=max(1, len(methods)//2))  # never ask for zero legend columns

        # Leave room above the axes for the title and legend.
        fig.subplots_adjust(bottom=0.4, top=0.85)
        out_dir = f"../plot/{task_type}/{model_list}"
        os.makedirs(out_dir, exist_ok=True)
        fig.savefig(f"{out_dir}/{xlabel}_{title}_line.png")
    finally:
        # This function is called in long loops; release the figure every time.
        plt.close(fig)

In [8]:
def customized_ylim(title,model_list,task_type,missing_type):
    """Return the hand-tuned ``(lower, upper)`` y-axis limits for one plot.

    Note: a later cell of this notebook defines another function with the same
    name; once that cell has run, it replaces this one.

    Parameters
    ----------
    title : str
        Dataset name.  When ``task_type == "ml"`` and ``missing_type`` is not
        ``"quantile"``, only the first whitespace-separated word of the title
        is used for the lookup.
    model_list : str
        Method-group name; only ``"ml"``, ``"dl"`` and ``"selected"`` have
        specific limits, and only when ``task_type`` is not ``"ml"``.
    task_type : str
        ``"ml"`` selects the ML table; any other value selects the tables
        keyed by ``missing_type`` and ``model_list``.
    missing_type : str
        ``"logistic"``, ``"diffuse"`` or ``"quantile"`` have specific limits
        when ``task_type`` is not ``"ml"``.

    Returns
    -------
    tuple
        ``(ylim_lower, ylim_upper)``.  Defaults are ``(0, 1)`` for the ML
        table and ``(-0.05, 1.05)`` otherwise.
    """
    if task_type == "ml":
        ml_limits = {
            "banknote": (0.2, 1.05),
            "wine_quality_white": (0.5, 0.9),
            "california": (0.6, 1.2),
            "yacht_hydrodynamics": (5, 20),
            "concrete_compression": (10, 18),
            "wine_quality_red": (0.4, 0.85),
            "climate_model_crashes": (0.4, 1),
            "connectionist_bench_sonar": (0.4, 1.05),
            "qsar_biodegradation": (0.3, 1),
            "yeast": (0, 0.7),
        }
        if missing_type == "quantile":
            key = title
        else:
            # Keep only the first word of the title; a blank title falls
            # through to the default instead of raising IndexError.
            words = title.split()
            key = words[0] if words else title
        return ml_limits.get(key, (0, 1))

    def _by_title(groups):
        # Expand {(title, ...): limits} into a flat {title: limits} mapping.
        return {name: limits for names, limits in groups.items() for name in names}

    bank_concrete = ("banknote", "concrete_compression")
    wines = ("wine_quality_white", "wine_quality_red")

    limits = {
        "logistic": {
            "ml": _by_title({
                bank_concrete: (0, 0.5),
                ("california", "yeast"): (0, 0.5),
                wines: (0, 0.2),
                ("climate_model_crashes", "yacht_hydrodynamics"): (0.2, 0.6),
                ("connectionist_bench_sonar", "qsar_biodegradation"): (0, 0.5),
            }),
            "dl": _by_title({
                bank_concrete: (0, 0.3),
                ("california", "yeast"): (0, 0.25),
                wines: (0, 0.2),
                ("climate_model_crashes", "yacht_hydrodynamics"): (0.2, 0.5),
                ("connectionist_bench_sonar",): (0, 0.5),
                ("qsar_biodegradation",): (0, 0.25),
            }),
            "selected": _by_title({
                bank_concrete: (0, 0.5),
                ("california", "yeast"): (0.05, 0.3),
                wines: (0, 0.25),
                ("climate_model_crashes", "yacht_hydrodynamics"): (0.2, 0.5),
                ("connectionist_bench_sonar",): (0, 0.6),
                ("qsar_biodegradation",): (0.05, 0.25),
            }),
        },
        "diffuse": {
            "ml": _by_title({
                bank_concrete: (0, 0.6),
                ("california",): (0, 0.25),
                wines: (0, 0.3),
                ("climate_model_crashes",): (0.4, 0.6),
                ("yacht_hydrodynamics",): (0.2, 0.7),
                ("connectionist_bench_sonar",): (0, 0.4),
                ("qsar_biodegradation",): (0.2, 0.8),
                ("yeast",): (0.2, 1),
            }),
            "dl": _by_title({
                bank_concrete: (0, 0.6),
                ("california",): (0, 0.3),
                wines: (0, 0.3),
                ("climate_model_crashes",): (0, 0.6),
                ("yacht_hydrodynamics",): (0.2, 1),
                ("connectionist_bench_sonar",): (0, 0.5),
                ("qsar_biodegradation",): (0.2, 1),
                ("yeast",): (0.2, 1),
            }),
            "selected": _by_title({
                bank_concrete: (0, 0.55),
                ("california",): (0, 0.3),
                wines: (0, 0.3),
                ("climate_model_crashes",): (0.2, 0.6),
                ("yacht_hydrodynamics",): (0.2, 1),
                ("connectionist_bench_sonar",): (0, 0.5),
                ("qsar_biodegradation",): (0.2, 1),
                ("yeast",): (0.2, 1),
            }),
        },
        "quantile": {
            "ml": _by_title({
                bank_concrete: (0, 0.6),
                ("california",): (0, 0.5),
                wines: (0, 0.3),
                ("climate_model_crashes",): (0.0, 0.55),
                ("yacht_hydrodynamics",): (0.2, 0.7),
                ("connectionist_bench_sonar",): (0, 0.4),
                ("qsar_biodegradation",): (0.0, 0.5),
                ("yeast",): (0, 0.5),
            }),
            "dl": _by_title({
                bank_concrete: (0, 0.5),
                ("california",): (0, 0.3),
                wines: (0, 0.3),
                ("climate_model_crashes",): (0, 0.6),
                ("yacht_hydrodynamics",): (0, 0.8),
                ("connectionist_bench_sonar",): (0, 0.5),
                ("qsar_biodegradation",): (0.0, 0.5),
                ("yeast",): (0, 0.5),
            }),
            "selected": _by_title({
                # The original chain also had a separate, unreachable
                # "concrete_compression" -> (0, 0.5) branch after this one;
                # (0, 0.45) is the value that was actually returned.
                bank_concrete: (0, 0.45),
                ("california",): (0, 0.3),
                wines: (0, 0.3),
                ("climate_model_crashes",): (0., 0.6),
                ("yacht_hydrodynamics",): (0, 1),
                ("connectionist_bench_sonar",): (0, 0.5),
                ("qsar_biodegradation",): (0., 0.5),
                ("yeast",): (0., 0.8),
            }),
        },
    }

    # Any combination without a specific entry gets the generic range.
    return limits.get(missing_type, {}).get(model_list, {}).get(title, (-0.05, 1.05))


In [9]:
def customized_ylim(title,model_list,task_type,missing_type):
    """Return the ``(lower, upper)`` y-axis limits for one plot.

    Parameters
    ----------
    title : str
        Dataset name, optionally followed by more words.  Outside the
        ``"quantile"`` case only the first whitespace-separated word is used
        for the lookup.
    model_list : str
        Accepted for signature compatibility; not used by this function.
    task_type : str
        Any value other than ``"ml"`` yields the fixed range
        ``(-0.05, 1.05)``.
    missing_type : str
        ``"quantile"`` selects one table of limits, any other value selects
        the other.

    Returns
    -------
    tuple
        ``(ylim_lower, ylim_upper)``.
    """
    if task_type != "ml":
        return -0.05, 1.05

    if missing_type in ["quantile"]:
        # The title is looked up as given (no word splitting) in this case.
        lookup_key = title
        fallback = (0, 1)
        limits_by_title = {
            "banknote": (0., 1.05),
            "wine_quality_white": (0., 0.9),
            "california": (0., 1.2),
            "yacht_hydrodynamics": (5, 20),
            "concrete_compression": (10, 30),
            "wine_quality_red": (0.4, 0.85),
            "climate_model_crashes": (0., 1),
            "connectionist_bench_sonar": (0., 1.05),
            "qsar_biodegradation": (0., 1),
            "yeast": (0, 0.7),
        }
    else:
        # Only the first word of the title is used as the lookup key.
        lookup_key = title.split()[0]
        fallback = (-0.05, 1.05)
        limits_by_title = {
            "banknote": (0.2, 1.05),
            "wine_quality_white": (0., 1.2),
            "california": (0., 1.2),
            "yacht_hydrodynamics": (5, 25),
            "concrete_compression": (5, 40),
            "wine_quality_red": (0., 1.05),
            "climate_model_crashes": (0., 1.05),
            "connectionist_bench_sonar": (0., 1.05),
            "qsar_biodegradation": (0., 1.05),
            "yeast": (-0.05, 0.7),
        }

    return limits_by_title.get(lookup_key, fallback)


## Downstream Tasks

In [10]:
def ml_tasks(dataname):
    """Return the metric tag used for a dataset.

    ``"ML_rmse"`` is returned for the five datasets listed below and
    ``"ML_f1"`` for every other name.
    """
    rmse_datasets = (
        "concrete_compression",
        "wine_quality_white",
        "wine_quality_red",
        "california",
        "yacht_hydrodynamics",
    )
    return "ML_rmse" if dataname in rmse_datasets else "ML_f1"
    

In [9]:
#modellist = ["zero","mean","knn","mice","missforest","mf","hyper","gain","notmiwae","miwae","tabcsdi","ot"]
# Datasets to process in the cells that follow.
real_datalist = [
    "banknote",
    "california",
    "climate_model_crashes",
    "concrete_compression",
    "qsar_biodegradation",
    "wine_quality_red",
    "connectionist_bench_sonar",
    "wine_quality_white",
    "yacht_hydrodynamics",
    "yeast",
]

# Missingness settings to process.
missingtypelist = [
    "diffuse",
    "logistic",
]

In [113]:
# Method names read by make_diff_log_ml (one result folder per name).
modellist = [
    "random", "zero", "mean",
    "knn", "mf", "mice", "missforest", "XGB",
    "ot", "hyper", "gain", "miwae", "notmiwae", "tabcsdi",
]

# Downstream model names, indexed by ``ml_index``: one list for regression
# datasets and one for classification datasets.
reg_model_list = ["Ridge", "MLP", "SVM"]
clf_model_list = ["LR", "MLP", "SVM"]
ml_index = 1

missingtypelist = ["diffuse", "logistic"]

# Datasets for the downstream-task plots.
real_datalist = [
    "banknote",
    "concrete_compression",
    "climate_model_crashes",
    "connectionist_bench_sonar",
    "qsar_biodegradation",
    "yeast",
    "yacht_hydrodynamics",
    "wine_quality_white",
    "wine_quality_red",
]



def make_diff_log_ml(missingtypelist,real_datalist,ml_index,model_list_name,sep = "imp_train"):
    """Build, save and plot downstream-task score tables.

    For every ``(missingtype, dataname)`` pair, the per-model file
    ``{path}/{model}/{missingtype}_ML_{metric}_{ml_index}.csv`` is read for
    each entry of the module-level ``modellist``.  ``metric`` is ``rmse`` for
    the five datasets in ``regression_datasets`` below and ``f1`` otherwise.
    The mean/std columns are formatted as ``"{mean:.5f} ± {std:.3f}"`` and
    collected into a train and a test table, which are written next to the
    results; the test table is then plotted with ``make_grouped_bar_single``
    and ``make_grouped_line_plot_single``.

    Parameters
    ----------
    missingtypelist : iterable of str
        Names of the results sub-folders to process.
    real_datalist : iterable of str
        Dataset folder names.
    ml_index : int
        Index into the module-level ``reg_model_list`` / ``clf_model_list``;
        also part of the input file name.
    model_list_name : str
        Method-group name forwarded to the plotting functions.
    sep : str
        Plot sub-folder (passed as ``task_type``).  The values
        ``"ml-imp_train"`` and ``"ml-imp_test"`` make the function read from
        ``../results_sep`` instead of ``../results``.
    """
    regression_datasets = ("concrete_compression","wine_quality_white","wine_quality_red",
                           "california","yacht_hydrodynamics")
    results_root = "../results_sep" if sep in ("ml-imp_train", "ml-imp_test") else "../results"

    for missingtype in missingtypelist:
        for dataname in real_datalist:
            # Per-dataset settings are fixed before the model loop so they are
            # defined even when modellist is empty.
            path = f"{results_root}/{missingtype}/{dataname}"
            reg = dataname in regression_datasets
            if reg:
                metric, ylabel, file_tag = "rmse", "RMSE", "ML_RMSE"
                ml_model = reg_model_list[ml_index]
            else:
                metric, ylabel, file_tag = "f1", "F1", "ML_f1"
                ml_model = clf_model_list[ml_index]

            train_df = pd.DataFrame()
            test_df = pd.DataFrame()
            for model in modellist:
                try:
                    data = pd.read_csv(f"{path}/{model}/{missingtype}_ML_{metric}_{ml_index}.csv",index_col=0)
                    train_df[f'{model}'] = data[f'train_ML_{metric}_mean'].map('{:.5f}'.format) + ' ± ' + data[f'train_ML_{metric}_std'].map('{:.3f}'.format)
                    test_df[f'{model}'] = data[f'test_ML_{metric}_mean'].map('{:.5f}'.format) + ' ± ' + data[f'test_ML_{metric}_std'].map('{:.3f}'.format)
                except Exception:
                    # Best effort: a model without (complete) results is left
                    # out of the table.  `Exception` rather than a bare except
                    # lets KeyboardInterrupt / SystemExit propagate.
                    continue

            train_df.to_csv(f"{path}/{file_tag}_{ml_model}_train.csv")
            test_df.to_csv(f"{path}/{file_tag}_{ml_model}_test.csv")

            plot_title = f"{dataname} - {ml_model} Model"
            # The loop variable is `missingtype`; the original passed the
            # undefined name `missing_type` here, which raises NameError.
            make_grouped_bar_single(test_df,ylabel,missingtype,plot_title,task_type = sep,
                                    model_list=model_list_name,missing_type = missingtype)
            make_grouped_line_plot_single(test_df,ylabel,missingtype,plot_title,task_type = sep,
                                          model_list=model_list_name,missing_type = missingtype)




In [109]:
def choose_model_list(name,df):
    """Return ``df`` restricted to the columns of the named method group.

    Parameters
    ----------
    name : str
        One of ``"baseline"``, ``"full"``, ``"ml"``, ``"dl"`` or
        ``"selected"``.  Any other value leaves ``df`` untouched.
    df : pandas.DataFrame
        Table with one column per method; it must contain every column of
        the requested group (pandas raises ``KeyError`` otherwise).

    Returns
    -------
    pandas.DataFrame
        The selected columns, in the order listed for the group, or ``df``
        itself when ``name`` is not a known group.
    """
    baseline = ["random", "zero", "mean"]
    classical = ["knn", "mice", "XGB", "missforest", "mf"]
    deep = ["hyper", "gain", "notmiwae", "miwae", "tabcsdi", "ot"]
    column_groups = {
        "baseline": baseline,
        "full": ["random", "zero", "mean", "knn", "mice", "missforest", "mf", "XGB",
                 "hyper", "gain", "notmiwae", "miwae", "tabcsdi", "ot"],
        "ml": classical,
        "dl": deep,
        "selected": ["mean", "mice", "hyper", "gain", "notmiwae", "miwae", "tabcsdi", "ot"],
    }
    if name not in column_groups:
        # Unknown group name: hand the frame back unchanged.
        return df
    return df[column_groups[name]]

In [1]:
# Produce the downstream-task tables and plots for every downstream model
# index and every method group, once per result variant ("ml",
# "ml-imp_test", "ml-imp_train").
for ml_index in range(3):
    for model_list_name in ("full", "baseline", "ml", "dl", "selected"):
        make_diff_log_ml(
            missingtypelist, real_datalist, ml_index, model_list_name,
            sep="ml",
        )
        make_diff_log_ml(
            missingtypelist, real_datalist, ml_index, model_list_name,
            sep="ml-imp_test",
        )
        make_diff_log_ml(
            missingtypelist, real_datalist, ml_index, model_list_name,
            sep="ml-imp_train",
        )

#make_diff_log_ml(["logistic"],["banknote"],0,"ml")

## RMSE

In [48]:
modellist

['one',
 'zero',
 'mean',
 'knn',
 'mice',
 'XGB',
 'missforest',
 'mf',
 'hyper',
 'gain',
 'notmiwae',
 'miwae',
 'tabcsdi',
 'ot']

In [14]:
# Datasets for the RMSE tables.
real_datalist = [
    "banknote",
    "california",
    "climate_model_crashes",
    "concrete_compression",
    "qsar_biodegradation",
    "wine_quality_red",
    "connectionist_bench_sonar",
    "wine_quality_white",
    "yacht_hydrodynamics",
    "yeast",
]

# Missingness settings for the RMSE tables.
missingtypelist = [
    "diffuse",
    "logistic",
    "quantile",
    "mar",
    "mcar",
]
#missingtypelist = ["quantile"]

def rmse_and_plot(missingtypelist,real_datalist,model_list_name):
    """Write the per-dataset RMSE tables for every missingness setting.

    For each ``(missing_type, dataname)`` pair and each entry of the
    module-level ``modellist``, the file
    ``../results/{missing_type}/{dataname}/{model}/{missing_type}_RMSE.csv``
    is read and its train/test mean and std columns are formatted as
    ``"{mean:.5f} ± {std:.3f}"``.  A model whose file cannot be read or lacks
    those columns gets the placeholder ``'0 ± 0'`` in every row.  The tables
    are written to ``RMSE_train_results.csv`` and ``RMSE_test_results.csv``
    in the dataset folder.

    Parameters
    ----------
    missingtypelist : iterable of str
        Names of the results sub-folders to process.
    real_datalist : iterable of str
        Dataset folder names.
    model_list_name : str
        Not used at present: the plotting calls that consumed it
        (``make_grouped_bar_single`` / ``make_grouped_line_plot_single`` with
        task type ``"rmse"``) were commented out in the original cell.

    Returns
    -------
    None
    """
    placeholder = '0 ± 0'
    for missing_type in missingtypelist:
        for dataname in real_datalist:
            # Computed once per dataset so it is defined even when modellist is empty.
            path = f"../results/{missing_type}/{dataname}"
            train_df = pd.DataFrame()
            test_df = pd.DataFrame()
            unavailable = []
            for model in modellist:
                column = f'{model}'
                try:
                    data = pd.read_csv(f"{path}/{model}/{missing_type}_RMSE.csv",index_col=0)
                    train_df[column] = data['train_rmse_mean'].map('{:.5f}'.format) + ' ± ' + data['train_rmse_std'].map('{:.3f}'.format)
                    test_df[column] = data['test_rmse_mean'].map('{:.5f}'.format) + ' ± ' + data['test_rmse_std'].map('{:.3f}'.format)
                except Exception:
                    # Best effort: keep going when a result file is missing or
                    # malformed.  `Exception` rather than a bare except lets
                    # KeyboardInterrupt / SystemExit propagate.
                    unavailable.append(column)
                    train_df[column] = placeholder
                    test_df[column] = placeholder
            # A placeholder assigned while the table had no rows yet is left as
            # an empty column and turns into NaN once a later model supplies
            # the row index.  Re-assigning after the loop fills every row.
            for column in unavailable:
                train_df[column] = placeholder
                test_df[column] = placeholder
            train_df.to_csv(f"{path}/RMSE_train_results.csv")
            test_df.to_csv(f"{path}/RMSE_test_results.csv")


In [15]:
# Write the RMSE tables once, for the "full" method group.
for model_list_name in ("full",):
    rmse_and_plot(
        missingtypelist,
        real_datalist,
        model_list_name,
    )

## Quantile

### RMSE

In [61]:
# Datasets for the quantile plots.
real_datalist = [
    "banknote",
    "concrete_compression",
    "california",
    "climate_model_crashes",
    "connectionist_bench_sonar",
    "qsar_biodegradation",
    "yeast",
    "yacht_hydrodynamics",
    "wine_quality_white",
    "wine_quality_red",
]

# Only the "quantile" setting is processed in this section.
missingtypelist = [
    "quantile",
]
def rmse_quantile(missingtypelist,real_datalist,model_list_name):
    """Plot the test-RMSE table of every dataset, split into three row groups.

    ``{path}/RMSE_test_results.csv`` is read for each
    ``(missingtype, dataname)`` pair, sorted by index, and its rows are
    partitioned by their index label:

    * labels whose length is neither 9 nor 10 -> plotted as ``"single"``;
    * labels of length 9 or 10 that start with ``Q1_Q2``, ``Q2_Q3`` or
      ``Q3_Q4`` -> plotted as ``"adjacent"``;
    * the remaining labels of length 9 or 10 -> plotted as
      ``"non-adjacent"``.

    Each group is passed to ``quantile_bar`` and then to ``quantile_line``.

    Parameters
    ----------
    missingtypelist : iterable of str
        Names of the results sub-folders to process.
    real_datalist : iterable of str
        Dataset folder names.
    model_list_name : str
        Method-group name forwarded to the plotting functions.
    """
    adjacent_prefixes = ('Q1_Q2', 'Q2_Q3', 'Q3_Q4')

    for missingtype in missingtypelist:
        for dataname in real_datalist:
            path = f"../results/{missingtype}/{dataname}"
            test_df = pd.read_csv(f"{path}/RMSE_test_results.csv", index_col=0).sort_index()

            # Split on the label length directly instead of adding a helper
            # column and dropping it in place from filtered slices (which
            # triggers SettingWithCopyWarning).
            label_length = test_df.index.str.len()
            is_len_9_or_10 = (label_length == 9) | (label_length == 10)
            long_rows = test_df[is_len_9_or_10]
            other_rows = test_df[~is_len_9_or_10]

            is_adjacent = long_rows.index.str.startswith(adjacent_prefixes)
            groups = [
                (other_rows, "single"),
                (long_rows[is_adjacent], "adjacent"),
                (long_rows[~is_adjacent], "non-adjacent"),
            ]

            # All bar charts first, then all line charts, as before.
            for plot in (quantile_bar, quantile_line):
                for group, label in groups:
                    plot(group,'RMSE',missingtype,dataname,"rmse",label,model_list = model_list_name)

In [89]:
# Draw the quantile RMSE plots for every method group.
missingtypelist = ["quantile"]
for model_list_name in ("full", "baseline", "ml", "dl", "selected"):
    rmse_quantile(
        missingtypelist,
        real_datalist,
        model_list_name,
    )

In [131]:

def quantile_bar(dataframe, ylabel, xlabel, title, task_type="rmse",
                 quantile="single",model = "-",model_list = "full"):
    """Save a grouped bar chart with error bars for one quantile results table.

    One group of bars is drawn per row of ``dataframe`` and one bar per
    method column; heights and error bars come from the ``"mean ± std"``
    cells (parsed with ``split_mean_std``).  A dashed vertical line is drawn
    before every fourth row group.  The figure is written to
    ``../plot/{task_type}/{model_list}/{xlabel}_{title}_{quantile}_{model}_bar.png``
    (the folder is created if needed) and then closed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table of ``"mean ± std"`` strings, one column per method.
    ylabel : str
        Y-axis label.
    xlabel : str
        Prefix of the x-axis label (``"{xlabel} Parameter"``) and of the
        output file name.
    title : str
        Used in the figure title and file name, and passed to
        ``customized_ylim``.
    task_type : str
        Output sub-folder; the part before the first ``"-"`` is passed to
        ``customized_ylim``.
    quantile : str
        Label added to the figure title and file name.
    model : str
        Label added to the figure title and file name.
    model_list : str
        Method-group name understood by ``choose_model_list``.
    """
    task_type_name = task_type.split("-")[0]
    dataframe = choose_model_list(model_list,dataframe)
    index_levels = dataframe.index
    methods = dataframe.columns
    # Hatching patterns, cycled by method position; the leading entries are
    # empty, i.e. those bars are drawn without a hatch.
    hatches = ["", "", "", "", "", "", "", "", "", "\\\\", 'o', 'O', '.', '*']

    mean_df, std_df = split_mean_std(dataframe)

    x = np.arange(len(index_levels))  # the label locations
    width = 0.8/len(methods)  # the width of the bars

    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        # Creating bars for each method
        for i, method in enumerate(methods):
            ax.bar(x + i * width, mean_df[method], width, label=method, yerr=std_df[method],
                   hatch=hatches[i % len(hatches)])

        # Add dotted lines to separate every four groups of bars
        for i in range(1, len(index_levels)+1, 4):
            ax.axvline(x=i - 1.2, color='gray', linestyle='--')

        # Add some text for labels, title and custom x-axis tick labels, etc.
        ax.set_ylabel(ylabel)
        ax.set_xlabel(f"{xlabel} Parameter")
        ax.set_title(f"{title} {model} {quantile}", y=1.15)
        # Bars sit at x + i*width for i in 0..n-1, so the centre of a group is
        # x + width*(n-1)/2 (a fixed 2.5*width is only centred for 6 methods).
        ax.set_xticks(x + width * (len(methods) - 1) / 2)
        ax.set_xticklabels(index_levels, rotation=45, ha="right")

        ylim_lower, ylim_upper = customized_ylim(title,model_list,task_type_name,"quantile")
        ax.set_ylim([ylim_lower, ylim_upper])

        ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.17),
                  ncol=max(1, len(methods) // 2))  # never ask for zero legend columns
        fig.subplots_adjust(bottom=0.4)

        # savefig does not create missing folders, so make sure it exists.
        out_dir = f"../plot/{task_type}/{model_list}"
        os.makedirs(out_dir, exist_ok=True)
        fig.savefig(f"{out_dir}/{xlabel}_{title}_{quantile}_{model}_bar.png")
    finally:
        # This function is called in long loops; release the figure every time.
        plt.close(fig)

In [129]:
def quantile_line(dataframe, ylabel, xlabel, title, task_type="rmse",
                  quantile="single",model = "-",model_list = "full"):
    """Draw one error-bar line per method (mean with std) and save it as a PNG.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One row per x-axis position and one column per method. Cells are
        strings of the form "<mean> ± <std>", which ``split_mean_std`` parses.
    ylabel : str
        Label of the y axis.
    xlabel : str
        Label of the x axis; also used in the file name.
    title : str
        Used in the figure title and file name, and passed to
        ``customized_ylim`` to pick the y-axis limits.
    task_type : str
        Name of the output sub-directory. The part before the first "-" is
        passed to ``customized_ylim``.
    quantile : str
        Tag added to the figure title and file name.
    model : str
        Tag added to the figure title and file name.
    model_list : str
        Preset name understood by ``choose_model_list``; it selects which
        method columns are plotted and is also an output sub-directory.

    Side effects
    ------------
    Writes ``../plot/{task_type}/{model_list}/{xlabel}_{title}_{quantile}_{model}_line.png``
    (creating the directory if needed) and closes the figure afterwards, so
    nothing is left open when this is called in a loop.
    """
    task_type_name = task_type.split("-")[0]
    dataframe = choose_model_list(model_list, dataframe)
    index_levels = dataframe.index
    methods = dataframe.columns
    n_methods = len(methods)

    markers = ["o", "X", "^", "D", "*"]
    line_styles = ["-", "--", "-.", ":"]
    # 4 line styles x 5 markers = 20 distinct matplotlib format strings.
    styles = [f'{line}{marker}' for line in line_styles for marker in markers]

    mean_df, std_df = split_mean_std(dataframe)

    x = np.arange(len(index_levels))  # one x position per index label

    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        for i, method in enumerate(methods):
            ax.errorbar(x, mean_df[method],
                        yerr=std_df[method],
                        label=method,
                        # Wrap around instead of raising IndexError past 20 methods.
                        fmt=styles[i % len(styles)],
                        capsize=5)

        # Dashed vertical guide at every fourth x position.
        for i in range(1, len(index_levels) + 1, 4):
            ax.axvline(x=i - 1, color='gray', linestyle='--')

        ax.set_ylabel(ylabel)
        ax.set_xlabel(xlabel)
        ax.set_title(f"{title} {model} {quantile}", y=1.15)
        ax.set_xticks(x)
        ax.set_xticklabels(index_levels, rotation=45, ha='right')

        ylim_lower, ylim_upper = customized_ylim(title, model_list, task_type_name, "quantile")
        ax.set_ylim([ylim_lower, ylim_upper])

        # ncol must be at least 1, otherwise a single-method frame breaks the legend.
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.17),
                  ncol=max(1, n_methods // 2))
        fig.subplots_adjust(bottom=0.4)

        out_dir = f"../plot/{task_type}/{model_list}"
        os.makedirs(out_dir, exist_ok=True)
        fig.savefig(f"{out_dir}/{xlabel}_{title}_{quantile}_{model}_line.png")
    finally:
        # Release the figure even if plotting or saving failed.
        plt.close(fig)

## Quantile
### ML

In [132]:
# Datasets plotted in the quantile section ("california" is currently commented out).
real_datalist = [
    "banknote",
    "concrete_compression",
    # "california",
    "climate_model_crashes",
    "connectionist_bench_sonar",
    "qsar_biodegradation",
    "yeast",
    "yacht_hydrodynamics",
    "wine_quality_white",
    "wine_quality_red",
]

# Downstream model names used to build the result file names:
# plot_quantile picks reg_model_list[ml_index] for regression datasets
# and clf_model_list[ml_index] for all other datasets.
reg_model_list = ["Ridge", "MLP", "SVM"]
clf_model_list = ["LR", "MLP", "SVM"]

# Position in the two lists above; the driver loop below reassigns it.
ml_index = 2

# Missingness mechanisms to iterate over (sub-directory names under the results root).
missingtypelist = ["quantile"]

def plot_quantile(missingtypelist,real_datalist,model_list_name,ml_index,task_type = "ml"):
    """Plot bar and line charts of downstream-model scores for every dataset.

    For each missingness type and dataset, one result CSV is loaded and its
    rows are split into three groups ("single", "adjacent", "non-adjacent").
    Each group is drawn with ``quantile_bar`` and ``quantile_line``.

    Parameters
    ----------
    missingtypelist : list of str
        Sub-directory names under the results root; each is also passed on as
        the x-axis label of the plots.
    real_datalist : list of str
        Dataset names (sub-directories below each missingness type).
    model_list_name : str
        Method preset forwarded to the plotting functions as ``model_list``.
    ml_index : int
        Position in the module-level ``reg_model_list`` / ``clf_model_list``
        that selects the downstream model.
    task_type : str
        "ml" reads from ``../results``; any other value reads from
        ``../results_sep``. "ml-imp_train" plots the ``*_train.csv`` scores,
        every other value plots ``*_test.csv``.

    Notes
    -----
    Only the CSV that is actually plotted is read. Rows are grouped without
    adding a helper column, so no frame slice is modified in place (which
    would raise SettingWithCopyWarning).
    """
    # Datasets scored with RMSE; everything else is scored with F1.
    regression_datasets = {"concrete_compression", "wine_quality_white",
                           "wine_quality_red", "california", "yacht_hydrodynamics"}
    adjacent_prefixes = ('Q1_Q2', 'Q2_Q3', 'Q3_Q4')

    results_root = "../results" if task_type == "ml" else "../results_sep"
    split = "train" if task_type == "ml-imp_train" else "test"

    for missingtype in missingtypelist:
        for dataname in real_datalist:
            path = f"{results_root}/{missingtype}/{dataname}"

            if dataname in regression_datasets:
                y_label, metric = "RMSE", "RMSE"
                ml_model = reg_model_list[ml_index]
            else:
                y_label, metric = "F1", "f1"
                ml_model = clf_model_list[ml_index]

            df = pd.read_csv(f"{path}/ML_{metric}_{ml_model}_{split}.csv",
                             index_col=0).sort_index()

            # Index labels of 9 or 10 characters are treated as two-quantile
            # settings; presumably names like "Q1_Q2_<value>" (verify against
            # the result files). All other rows are single-quantile settings.
            label_len = df.index.str.len()
            is_pair = (label_len == 9) | (label_len == 10)
            single = df[~is_pair]
            pairs = df[is_pair]

            is_adjacent = pairs.index.str.startswith(adjacent_prefixes)
            adjacent = pairs[is_adjacent]
            non_adjacent = pairs[~is_adjacent]

            groups = (
                (single, "single"),
                (adjacent, "adjacent"),
                (non_adjacent, "non-adjacent"),
            )
            # Same call order as before: all bar charts first, then all line charts.
            for draw in (quantile_bar, quantile_line):
                for group, quantile_tag in groups:
                    draw(group, y_label, missingtype, dataname, task_type,
                         quantile_tag, ml_model, model_list=model_list_name)

In [2]:
# Render every quantile figure: each method preset x each downstream model
# (positions 0-2 of reg_model_list / clf_model_list) x the three task types.
for model_list_name in ("full", "ml", "dl", "selected", "baseline"):
    for ml_index in range(3):
        plot_quantile(missingtypelist, real_datalist, model_list_name, ml_index,
                      task_type="ml")
        plot_quantile(missingtypelist, real_datalist, model_list_name, ml_index,
                      task_type="ml-imp_test")
        plot_quantile(missingtypelist, real_datalist, model_list_name, ml_index,
                      task_type="ml-imp_train")