In [1]:
import json
import pandas as pd
import statistics as stats
import re
import pprint

In [2]:
# Dataset names iterated by the tabular evaluation cells; each name is
# interpolated into the metrics / execution-score file paths.
ALL_TABULAR_DATASETS = ["adult", "census", "child", "covtype", "credit", "insurance", "health_insurance", 
                       "intrusion", "drugs", "loan", "pums"]
# Library key -> list of model names. The scoring functions iterate this as
# `for lib, models in ....items()` and label each row "<model>_<lib>".
# "syn" is the short key used for the "synthcity" library.
ALL_TABULAR_MODELS = {
    "gretel": ["actgan"], 
    "sdv": ["ctgan", "tvae", "gaussian_copula"], 
    "syn": ["ctgan", "tvae", "goggle", "arf", "ddpm", "nflow", "rtvae"], 
    "llm": ["great"], 
    "betterdata": ["gan", "gan_dp"]
}

# Dataset names iterated by the sequential evaluation cell.
ALL_SEQUENTIAL_DATASETS = ["taxi", "nasdaq", "pums"]

# Library key -> list of sequential model names (same layout as ALL_TABULAR_MODELS).
ALL_SEQUENTIAL_MODELS = {
    "gretel": ["dgan"],
    "sdv": ["par"]
}

# Jobs Evaluation

In [3]:
def get_cmds_configs(command_string):
    """Extract the library, synthesizer and dataset from a job command line.

    Example input::

        python3 run_model.py --m sequential --l sdv --s par --data nasdaq --o outputs --run_model_training --use_gpu

    Args:
        command_string: The command line of a job. Non-string values (for
            example the NaN pandas produces for an empty CSV cell) are accepted
            and treated as "nothing to parse".

    Returns:
        A ``(library, synthesizer, data)`` tuple holding the values that follow
        ``--l``, ``--s`` and ``--data``. All three are ``None`` when the input
        is not a string or the flags are not found in that order.
    """
    # `re.search` raises TypeError on non-string input such as NaN or None.
    if not isinstance(command_string, str):
        return (None, None, None)

    # The three flags must appear in this order; anything may sit between them.
    pattern = r"--l (\S+).*?--s (\S+).*?--data (\S+)"
    match = re.search(pattern, command_string)
    if match is None:
        return (None, None, None)

    library, synthesizer, data = match.groups()
    return (library, synthesizer, data)

In [4]:
# Load the exported job table. An 'Unnamed: 0' column (presumably an index
# written by an earlier `to_csv` -- TODO confirm) is discarded when present.
sdg_jobs_df = pd.read_csv("../final_outs/sdg_jobs.csv")
sdg_jobs_df = sdg_jobs_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
sdg_jobs_df.columns

Index(['Experiment Name', 'Run ID', 'Status', 'Creation Time', 'End Time',
       'Duration (seconds)', 'Compute Target', 'Command Parameters'],
      dtype='object')

In [6]:
# sdg_jobs_df[["Command Parameters"]]

## Incomplete Jobs -- Failed, Canceled, Running

In [7]:
# Collect every job that did not finish as library -> synthesizer -> dataset -> status.
# Despite the name, sequential jobs are recorded here too, since the command
# parser does not distinguish modalities.
TABULAR_INCOMPLETE_JOBS = {
    "gretel": {},
    "sdv": {},
    "syn": {}
}
for index, row in sdg_jobs_df.iterrows():
    if row['Status'] not in ['Failed', 'Canceled', 'Running']:
        continue

    (library, synthesizer, data) = get_cmds_configs(row['Command Parameters'])
    if not (library and synthesizer and data):
        # The command line could not be parsed; nothing to record.
        continue

    # The job commands use the full library name; the rest of the notebook uses "syn".
    if library == "synthcity":
        library = "syn"

    # `setdefault` on both levels so a library that is not pre-seeded above
    # (for example "llm" or "betterdata") no longer raises KeyError.
    jobs_for_model = TABULAR_INCOMPLETE_JOBS.setdefault(library, {}).setdefault(synthesizer, {})
    jobs_for_model[data] = row['Status']


pprint.pprint(TABULAR_INCOMPLETE_JOBS)

{'gretel': {'dgan': {'pums': 'Failed'}},
 'sdv': {'ctgan': {'insurance': 'Failed'}, 'par': {'pums': 'Failed'}},
 'syn': {'arf': {'pums': 'Failed'},
         'ctgan': {'covtype': 'Running',
                   'credit': 'Canceled',
                   'pums': 'Failed'},
         'ddpm': {'pums': 'Canceled'},
         'goggle': {'adult': 'Failed',
                    'covtype': 'Failed',
                    'intrusion': 'Failed',
                    'loan': 'Failed',
                    'pums': 'Running'},
         'nflow': {'adult': 'Failed',
                   'covtype': 'Failed',
                   'loan': 'Failed',
                   'pums': 'Failed'},
         'rtvae': {'credit': 'Canceled', 'pums': 'Failed'},
         'tvae': {'pums': 'Failed'}}}


In [10]:
# Sentinel stored in place of any score that could not be computed.
ERROR_VAL = -99

# Cases whose metric files live under the shared "hyperimpute" output directory.
_IMPUTATION_CASES = (
    "hyperimpute", "ice", "missforest", "simple",
    "only_hyperimpute", "only_ice", "only_missforest", "only_simple",
)

# Columns that identify a row rather than hold a score.
_ID_COLUMNS = ("model", "dataset", "case")


def _metric_paths(exp_dataset, model, lib, case):
    """Return ``(metrics_json_path, correlation_csv_path)`` for one model/dataset/case."""
    if case in _IMPUTATION_CASES:
        base = f"../metrics_out/hyperimpute/{model}_{lib}/{exp_dataset}/{case}"
    else:
        base = f"../metrics_out/{case}/{model}_{lib}/{exp_dataset}"
    return (
        f"{base}/{exp_dataset}_{model}_metrics.json",
        f"{base}/{exp_dataset}_{model}_correlation.csv",
    )


def _mean_or_error(values):
    """Return the mean of ``values`` rounded to 2 places, or ``ERROR_VAL`` if there are none."""
    values = list(values)
    if not values:
        return ERROR_VAL
    return round(stats.mean(values), 2)


def _statistic_similarity(metrics_scores):
    """Average the per-column mean/median/std similarity; ``ERROR_VAL`` if unavailable."""
    try:
        per_column = [
            (s["mean"] + s["median"] + s["std"]) / 3
            for s in metrics_scores["similarity"]["statistic"].values()
        ]
        overall = sum(per_column) / len(per_column) if per_column else ERROR_VAL
        return round(overall, 2)
    except Exception:
        # Best effort: a missing section or a non-numeric entry yields the sentinel.
        return ERROR_VAL


def _build_score_row(metrics_scores, corr_df):
    """Summarise one metrics JSON and its correlation CSV into a ``{column: score}`` dict."""
    coverage = metrics_scores["coverage"]
    quality = metrics_scores["sdv_quality_report"]

    # KSComplement / TVComplement scores are mixed in one mapping; split them by metric name.
    distribution = list(quality["distribution"].values())
    ks_scores = [entry["score"] for entry in distribution if entry["metric"] == "KSComplement"]
    tv_scores = [entry["score"] for entry in distribution if entry["metric"] == "TVComplement"]

    contingency_average = corr_df.loc[corr_df["Metric"] == "ContingencySimilarity", "Score"].mean()
    correlation_average = corr_df.loc[corr_df["Metric"] == "CorrelationSimilarity", "Score"].mean()

    new_row_synthesis = metrics_scores["privacy"]["new_row_synthesis"]

    return {
        "domain_cov": _mean_or_error(coverage["domain_coverage"].values()),
        "stats_cov": _statistic_similarity(metrics_scores),
        "outliers_cov": _mean_or_error(coverage["outlier_coverage"].values()),
        "missing_cov": _mean_or_error(coverage["missing_values_coverage"].values()),
        "ks_sim": _mean_or_error(ks_scores),
        "tv_sim": _mean_or_error(tv_scores),
        "corr_sim": round(correlation_average, 2),
        "contin_sim": round(contingency_average, 2),
        "sdv_quality_report": round(quality["score"], 2),
        "wass_dist": _mean_or_error(metrics_scores["similarity"]["wass_distance"].values()),
        "js_dist": _mean_or_error(metrics_scores["similarity"]["js_distance"].values()),
        "new_row_synthesis": (
            round(new_row_synthesis["score"], 2) if "score" in new_row_synthesis else ERROR_VAL
        ),
    }


def get_scores_df(exp_dataset, models_list, case="tabular", all_jobs=False):
    """Collect the evaluation scores of every model for one dataset.

    Despite the name, the result is a dict of equally long lists (one entry per
    reported model), ready to be passed to ``pd.DataFrame``.

    Args:
        exp_dataset: Dataset name, used to build the metric file paths.
        models_list: Mapping of library key to a list of model names.
        case: Evaluation case. Imputation cases are read from the "hyperimpute"
            output directory; for ``"hpo"`` only the ``"syn"`` library is read.
        all_jobs: When ``True``, a model whose metric files cannot be read still
            gets a row, with every score set to ``ERROR_VAL``. When ``False``
            such models are skipped.

    Returns:
        Dict with the keys ``dataset``, ``model``, ``case`` and one key per score.

    Raises:
        KeyError: If a metrics file was read but lacks an expected section.
    """
    final_report = {
        "dataset": [],
        "model": [],
        "case": [],
        "domain_cov": [],
        "stats_cov": [],
        "outliers_cov": [],
        "missing_cov": [],
        "ks_sim": [],
        "tv_sim": [],
        "corr_sim": [],
        "contin_sim": [],
        "sdv_quality_report": [],
        "wass_dist": [],
        "js_dist": [],
        "new_row_synthesis": []
    }
    score_columns = [column for column in final_report if column not in _ID_COLUMNS]

    for lib, models in models_list.items():

        if case == "hpo" and lib != "syn":
            continue

        for model in models:
            metrics_file, corr_file = _metric_paths(exp_dataset, model, lib, case)

            try:
                with open(metrics_file, 'r') as file:
                    metrics_scores = json.load(file)
                corr_df = pd.read_csv(corr_file)
            except Exception:
                # Missing or unreadable files mean the job produced no metrics.
                if not all_jobs:
                    continue
                row = dict.fromkeys(score_columns, ERROR_VAL)
            else:
                row = _build_score_row(metrics_scores, corr_df)

            # Identifier and score columns are appended together so every list
            # keeps the same length, including for placeholder rows.
            final_report["model"].append(f"{model}_{lib}")
            final_report["dataset"].append(exp_dataset)
            final_report["case"].append(case)
            for column, value in row.items():
                final_report[column].append(value)

    return final_report

# Standard Tabular Evaluation

In [190]:
# ALL_TABULAR_DATASETS = ["adult", "census", "child", "covtype", "credit", "insurance", "health_insurance", 
#                        "intrusion", "drugs", "loan", "pums"]
# Build one frame per dataset and concatenate once, instead of growing a frame
# from an empty DataFrame inside the loop (quadratic copying, and concatenating
# an empty frame is deprecated behaviour in recent pandas).
std_tabular_df = pd.concat(
    [
        pd.DataFrame(get_scores_df(exp_dataset, ALL_TABULAR_MODELS, "tabular", False))
        for exp_dataset in ALL_TABULAR_DATASETS
    ],
    ignore_index=True,
)
std_tabular_df

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
0,adult,actgan_gretel,tabular,0.89,0.99,0.49,1.00,0.79,0.92,0.97,0.87,0.88,0.02,0.24,1.00
1,adult,ctgan_sdv,tabular,0.87,0.99,0.79,1.00,0.87,0.91,0.98,0.82,0.87,0.02,0.22,1.00
2,adult,tvae_sdv,tabular,0.79,0.99,0.68,1.00,0.88,0.94,0.97,0.88,0.91,0.02,0.13,1.00
3,adult,gaussian_copula_sdv,tabular,0.92,0.95,0.41,1.00,0.70,0.80,0.99,0.73,0.76,0.07,0.29,1.00
4,adult,ctgan_syn,tabular,0.91,0.99,0.49,1.00,0.76,0.79,0.98,0.74,0.78,0.02,0.34,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,loan,gan_dp_betterdata,tabular,0.99,0.98,0.35,1.00,0.92,0.95,0.91,0.73,0.85,0.02,0.16,1.00
99,pums,actgan_gretel,tabular,0.81,0.99,0.60,0.98,0.85,0.98,0.97,0.91,0.92,0.01,0.20,0.99
100,pums,ctgan_sdv,tabular,0.82,0.99,0.67,1.00,0.88,0.99,0.99,0.94,0.95,0.01,0.19,1.00
101,pums,tvae_sdv,tabular,0.80,0.97,0.63,1.00,0.81,0.92,0.95,0.82,0.86,0.04,0.29,1.00


# Imputation Evaluation -- imputed data and generated synthetic data (with imputation)

In [13]:
ALL_HYPERIMPUTE_MODELS = {
    "sdv": ["ctgan"]
}

IMPUTERS = ["missforest", "ice", "hyperimpute", "simple"]

# Each imputer is evaluated as two cases, "<imputer>" followed by "only_<imputer>".
impute_cases = [
    case
    for imputer in IMPUTERS
    for case in (imputer, "only_" + imputer)
]

# Single concat over all cases instead of growing the frame inside the loop.
tabular_impute_df = pd.concat(
    [
        pd.DataFrame(get_scores_df("drugs", ALL_HYPERIMPUTE_MODELS, case, True))
        for case in impute_cases
    ],
    ignore_index=True,
)

tabular_impute_df

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
0,drugs,ctgan_sdv,missforest,0.92,0.99,0.0,0.42,0.95,0.88,0.92,0.17,0.53,0.01,0.07,-99
1,drugs,ctgan_sdv,only_missforest,0.93,1.0,1.0,0.42,0.98,0.91,0.97,0.19,0.55,0.01,0.05,-99
2,drugs,ctgan_sdv,ice,0.96,0.93,0.0,0.42,0.9,0.89,0.95,0.17,0.53,0.09,0.22,-99
3,drugs,ctgan_sdv,only_ice,0.96,1.0,1.0,0.42,0.98,0.92,0.97,0.19,0.56,0.01,0.05,-99
4,drugs,ctgan_sdv,hyperimpute,0.95,0.92,1.0,0.42,0.85,0.9,0.96,0.17,0.53,0.12,0.27,-99
5,drugs,ctgan_sdv,only_hyperimpute,0.95,1.0,1.0,0.42,0.98,0.92,0.97,0.19,0.56,0.01,0.05,-99
6,drugs,ctgan_sdv,simple,0.9,0.98,0.0,0.42,0.94,0.89,0.96,0.17,0.53,0.03,0.1,-99
7,drugs,ctgan_sdv,only_simple,0.91,1.0,1.0,0.42,0.98,0.92,0.97,0.19,0.56,0.01,0.05,-99


# Hyperparameter Optimisation

In [14]:
# exp_dataset = "adult"
HPO_DATASETS = ["adult", "loan"]

# `get_scores_df` only reads the "syn" library for the "hpo" case, so passing
# the full model registry is fine. One concat over all datasets.
hpo_df = pd.concat(
    [
        pd.DataFrame(get_scores_df(exp_dataset, ALL_TABULAR_MODELS, "hpo", False))
        for exp_dataset in HPO_DATASETS
    ],
    ignore_index=True,
)
hpo_df

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
0,adult,ctgan_syn,hpo,0.94,0.99,0.59,1.0,0.8,0.83,0.98,0.76,0.81,0.01,0.27,1.0
1,adult,tvae_syn,hpo,0.92,0.99,0.57,1.0,0.9,0.85,0.99,0.8,0.85,0.01,0.19,1.0
2,adult,arf_syn,hpo,0.98,1.0,0.81,1.0,0.9,0.92,0.99,0.87,0.9,0.01,0.16,1.0
3,adult,ddpm_syn,hpo,0.99,1.0,0.72,1.0,0.98,0.97,0.99,0.94,0.96,0.01,0.07,1.0
4,adult,rtvae_syn,hpo,0.96,0.98,0.95,1.0,0.84,0.8,0.96,0.74,0.79,0.03,0.23,1.0
5,loan,ctgan_syn,hpo,0.97,0.99,0.4,1.0,0.91,0.98,0.96,0.93,0.94,0.03,0.17,1.0
6,loan,tvae_syn,hpo,0.95,0.98,0.35,1.0,0.93,0.98,0.96,0.92,0.94,0.02,0.19,1.0
7,loan,arf_syn,hpo,1.0,0.99,0.46,1.0,0.9,0.99,0.98,0.93,0.94,0.02,0.14,1.0
8,loan,ddpm_syn,hpo,1.0,0.94,0.45,1.0,0.95,0.97,0.98,0.88,0.93,0.03,0.1,1.0
9,loan,rtvae_syn,hpo,0.87,0.97,0.35,1.0,0.87,0.98,0.94,0.81,0.88,0.04,0.47,1.0


# Sequential Evaluation

In [15]:
# TODO: pums-dgan is still running, so it has no metrics yet.
# One frame per sequential dataset, concatenated once.
seq_df = pd.concat(
    [
        pd.DataFrame(get_scores_df(exp_dataset, ALL_SEQUENTIAL_MODELS, "sequential", False))
        for exp_dataset in ALL_SEQUENTIAL_DATASETS
    ],
    ignore_index=True,
)
seq_df

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
0,taxi,dgan_gretel,sequential,,-99.0,0.17,0.88,,0.98,0.92,0.61,0.66,0.19,0.69,-99.0
1,taxi,par_sdv,sequential,,-99.0,0.11,0.98,,0.96,0.94,0.72,0.82,0.37,0.49,-99.0
2,nasdaq,dgan_gretel,sequential,0.32,0.95,0.82,0.97,0.65,0.36,0.88,0.38,0.55,0.05,0.36,1.0
3,nasdaq,par_sdv,sequential,0.37,0.96,0.43,0.99,0.89,0.57,0.89,0.46,0.69,0.03,0.21,1.0


In [286]:
# tabular_bm_df.to_csv("tabular_benchmarks.csv")

# Machine Learning Metrics

In [18]:
ML_CLASSIFICATION_TASK_DATASETS = [
    "adult", "census", "credit", "covtype", "loan", "intrusion"]
ML_REGRESSION_TASK_DATASETS = ["health_insurance"]

# Sentinel stored in place of any score that could not be computed.
ERROR_VAL = -99


def get_ml_scores_df(exp_dataset, report, metric, models_list=None):
    """Append the real-vs-synthetic ML efficacy gap of every model to ``report``.

    For each model whose tabular metrics file can be read, one entry is appended
    to every list in ``report``: the dataset name, the ``"<model>_<lib>"`` label
    and, for every other key ``k``, ``abs(synthetic - real)`` of ``metric``
    taken from ``metrics["ml_efficacy"][k]`` (rounded to 2 places), or
    ``ERROR_VAL`` when that score is unavailable.

    Args:
        exp_dataset: Dataset name, used to build the metrics file path.
        report: Dict of lists with the keys ``"dataset"`` and ``"model"`` plus
            one key per ML task. It is MUTATED IN PLACE: calling this repeatedly
            with the same dict accumulates rows, so build the DataFrame once
            after the last call.
        metric: Score suffix, e.g. ``"f1"`` reads ``synthetic_f1`` / ``real_f1``.
        models_list: Mapping of library key to model names. Defaults to
            ``ALL_TABULAR_MODELS``.

    Returns:
        The same ``report`` object that was passed in.
    """
    if models_list is None:
        models_list = ALL_TABULAR_MODELS

    score_columns = [column for column in report if column not in ("dataset", "model")]

    for lib, models in models_list.items():
        for model in models:
            metrics_file = f"../metrics_out/tabular/{model}_{lib}/{exp_dataset}/{exp_dataset}_{model}_metrics.json"

            try:
                with open(metrics_file, 'r') as file:
                    metrics_scores = json.load(file)
            except Exception:
                # No readable metrics for this model/dataset: skip it.
                continue

            report["dataset"].append(exp_dataset)
            report["model"].append(f"{model}_{lib}")

            for column in score_columns:
                try:
                    scores = metrics_scores["ml_efficacy"][column]
                    gap = round(abs(scores[f"synthetic_{metric}"] - scores[f"real_{metric}"]), 2)
                except Exception:
                    # Task missing from the metrics file, or a non-numeric score.
                    gap = ERROR_VAL
                report[column].append(gap)

    return report

## Classification

In [19]:
final_report_ml_cls = {
    "dataset": [],
    "model": [],
    "adaboost_classification": [],
    "decision_tree_classification": [],
    "logistic_classification": [],
    "mlp_classification": [],
}
ml_cls_metric = "f1"

# `get_ml_scores_df` appends to the dict it is given, so the report accumulates
# across datasets. Build the frame ONCE after the loop: converting and
# concatenating inside the loop repeated every earlier dataset's rows.
for exp_dataset in ML_CLASSIFICATION_TASK_DATASETS:
    get_ml_scores_df(exp_dataset, final_report_ml_cls, ml_cls_metric)

tabular_ml_cls_df = pd.DataFrame(final_report_ml_cls)

tabular_ml_cls_df

Unnamed: 0,dataset,model,adaboost_classification,decision_tree_classification,logistic_classification,mlp_classification
0,adult,actgan_gretel,0.03,0.01,0.05,0.02
1,adult,ctgan_sdv,0.01,0.03,0.05,0.01
2,adult,tvae_sdv,0.01,0.02,0.05,0.03
3,adult,gaussian_copula_sdv,0.02,0.11,0.06,0.03
4,adult,ctgan_syn,0.04,0.08,0.06,0.02


## Regression

In [20]:
final_report_ml_regress = {
    "dataset": [],
    "model": [],
    "linear_regression": [],
    "mlp_regression": []
}

ml_regress_metric = "r2"

# `get_ml_scores_df` appends to the dict it is given; build the frame once after
# the loop so adding a second regression dataset cannot duplicate rows.
for exp_dataset in ML_REGRESSION_TASK_DATASETS:
    get_ml_scores_df(exp_dataset, final_report_ml_regress, ml_regress_metric)

tabular_ml_reg_df = pd.DataFrame(final_report_ml_regress)

tabular_ml_reg_df

Unnamed: 0,dataset,model,linear_regression,mlp_regression
0,health_insurance,actgan_gretel,0.55,0.01
1,health_insurance,ctgan_sdv,0.9,0.01
2,health_insurance,tvae_sdv,0.12,0.01
3,health_insurance,gaussian_copula_sdv,0.31,0.01
4,health_insurance,ctgan_syn,0.0,0.01
5,health_insurance,tvae_syn,0.05,0.01
6,health_insurance,goggle_syn,0.44,0.02
7,health_insurance,arf_syn,0.0,0.01
8,health_insurance,ddpm_syn,0.27,0.01
9,health_insurance,nflow_syn,0.1,0.01


# Performance Evaluation

In [21]:
# Sentinel stored in place of any value that could not be read.
ERROR_VAL = -99


def get_execution_scores_df(exp_dataset, case="tabular", models_list=None):
    """Collect the training/sampling execution statistics of every model for one dataset.

    Despite the name, the result is a dict of equally long lists (one entry per
    model whose execution-scores JSON could be read), ready for ``pd.DataFrame``.

    Args:
        exp_dataset: Dataset name, used to build the file path.
        case: Currently unused; the path always points at the
            ``"<lib>_tabular"`` output directory.
        models_list: Mapping of library key to model names. Defaults to
            ``ALL_TABULAR_MODELS``.

    Returns:
        Dict of lists. ``"model"`` holds ``"<model>_<lib>"``; every other key is
        copied from the JSON file, or set to ``ERROR_VAL`` when the file lacks it.
    """
    final_report = {
        "dataset": [],
        "lib": [],
        "model": [],
        "num_rows": [],
        "num_cols": [],
        "num_sampled_rows": [],
        "device": [],
        "num_epochs": [],
        "train_time_sec": [],
        "sample_time_sec": [],
        "peak_memory_mb": [],
        "synthesizer_size": [],
        "synthetic_dataset_size_mb_deep": [],
        "train_dataset_size_mb_deep": [],
        "synthetic_dataset_size_mb": [],
        "train_dataset_size_mb": []
    }

    if models_list is None:
        models_list = ALL_TABULAR_MODELS

    for lib, models in models_list.items():
        for model in models:
            execution_scores_path = (
                f"../final_outs/{lib}_tabular/{model}/{exp_dataset}/"
                f"{exp_dataset}_{model}_execution_scores.json"
            )

            try:
                with open(execution_scores_path, 'r') as file:
                    execution_scores = json.load(file)
            except Exception:
                # No readable execution scores for this model/dataset: skip it.
                continue

            final_report["model"].append(f"{model}_{lib}")
            for column in final_report:
                if column == "model":
                    continue
                # Use the numeric sentinel (not the string "-99") so numeric
                # columns do not end up with mixed str/number values.
                value = execution_scores[column] if column in execution_scores else ERROR_VAL
                final_report[column].append(value)

    return final_report

In [22]:
# One frame per dataset, concatenated once instead of growing a frame in the loop.
execution_scores_df = pd.concat(
    [
        pd.DataFrame(get_execution_scores_df(exp_dataset, case="tabular"))
        for exp_dataset in ALL_TABULAR_DATASETS
    ],
    ignore_index=True,
)
execution_scores_df

Unnamed: 0,dataset,lib,model,num_rows,num_cols,num_sampled_rows,device,num_epochs,train_time_sec,sample_time_sec,peak_memory_mb,synthesizer_size,synthetic_dataset_size_mb_deep,train_dataset_size_mb_deep,synthetic_dataset_size_mb,train_dataset_size_mb
0,adult,GRETEL_0.20.0,actgan_gretel,26048,16,26048,GPU,300,844.420231,2.135426,112.538464,37.958943,16.898896,17.094568,3.334272,3.542528
1,adult,SDV_1.2.1,ctgan_sdv,26048,15,32561,CPU,300,1019.838562,2.317747,101.742050,35.808308,20.880857,16.886184,3.907448,3.334144
2,adult,SDV_1.2.1,tvae_sdv,26048,15,32561,CPU,300,378.850136,0.980367,101.740078,0.417060,20.859236,16.886184,3.907448,3.334144
3,adult,SDV_1.2.1,gaussian_copula_sdv,26048,15,32561,CPU,0,11.544246,1.632146,32.554806,0.135811,20.769536,16.886184,3.907448,3.334144
4,adult,synthcity==0.2.9,ctgan_syn,26048,15,32561,cpu,300,5387.746336,9.032208,161.648593,38.916708,21.105475,16.886184,4.167808,3.334144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,loan,GREAT,great_llm,5000,14,5000,GPU,100,2870.947590,35.642844,203.695731,327.700000,-99,-99,-99,-99
94,pums,GRETEL_0.20.0,actgan_gretel,828161,36,828161,GPU,300,50401.469090,198.961060,9927.379896,3308.896283,284.691334,291.35225,238.510496,245.135656
95,pums,SDV_1.2.1,ctgan_sdv,828161,36,-99,GPU,300,34275.731180,190.307225,8131.216867,2639.754827,355.908334,291.35225,298.138304,245.135656
96,pums,SDV_1.2.1,tvae_sdv,828161,36,-99,GPU,300,22887.041172,153.788952,8131.214587,0.997506,355.923838,291.35225,298.138304,245.135656


# Analysis

In [191]:
# Merge the standard, imputation, HPO and sequential results into one frame.
# A single concat replaces the incremental loop that started from an empty DataFrame.
all_df = pd.concat(
    [std_tabular_df, tabular_impute_df, hpo_df, seq_df],
    ignore_index=True,
)

# std_tabular_df
# tabular_impute_df
# hpo_df
# seq_df

# execution_scores_df

# tabular_ml_reg_df
# tabular_ml_cls_df  

In [36]:
all_df.to_csv("tabular_benchmarks.csv")

In [40]:
all_df.columns 

Index(['dataset', 'model', 'case', 'domain_cov', 'stats_cov', 'outliers_cov',
       'missing_cov', 'ks_sim', 'tv_sim', 'corr_sim', 'contin_sim',
       'sdv_quality_report', 'wass_dist', 'js_dist', 'new_row_synthesis'],
      dtype='object')

In [39]:
all_df["case"].unique()

array(['tabular', 'missforest', 'only_missforest', 'ice', 'only_ice',
       'hyperimpute', 'only_hyperimpute', 'simple', 'only_simple', 'hpo',
       'sequential'], dtype=object)

In [41]:
all_df["dataset"].unique()

array(['adult', 'census', 'child', 'covtype', 'credit', 'insurance',
       'health_insurance', 'intrusion', 'drugs', 'loan', 'pums', 'taxi',
       'nasdaq'], dtype=object)

In [54]:
all_df[all_df["dataset"] == "adult"]

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
0,adult,actgan_gretel,tabular,0.89,0.99,0.49,1.0,0.79,0.92,0.97,0.87,0.88,0.02,0.24,1.0
1,adult,ctgan_sdv,tabular,0.87,0.99,0.79,1.0,0.87,0.91,0.98,0.82,0.87,0.02,0.22,1.0
2,adult,tvae_sdv,tabular,0.79,0.99,0.68,1.0,0.88,0.94,0.97,0.88,0.91,0.02,0.13,1.0
3,adult,gaussian_copula_sdv,tabular,0.92,0.95,0.41,1.0,0.7,0.8,0.99,0.73,0.76,0.07,0.29,1.0
4,adult,ctgan_syn,tabular,0.91,0.99,0.49,1.0,0.76,0.79,0.98,0.74,0.78,0.02,0.34,1.0
5,adult,tvae_syn,tabular,0.86,0.99,0.65,1.0,0.81,0.89,0.99,0.81,0.84,0.01,0.3,1.0
6,adult,goggle_syn,tabular,0.33,0.95,0.27,1.0,0.62,0.67,0.97,0.49,0.59,0.06,0.54,1.0
7,adult,arf_syn,tabular,0.99,1.0,0.81,1.0,0.88,0.91,0.99,0.87,0.9,0.01,0.16,1.0
8,adult,ddpm_syn,tabular,0.99,1.0,0.68,1.0,0.98,0.97,0.98,0.95,0.97,0.01,0.06,1.0
9,adult,nflow_syn,tabular,0.97,0.98,0.7,1.0,0.89,0.77,0.97,0.69,0.77,0.02,0.23,1.0


In [55]:
all_df[all_df["model"] == "ctgan_syn"]

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
4,adult,ctgan_syn,tabular,0.91,0.99,0.49,1.0,0.76,0.79,0.98,0.74,0.78,0.02,0.34,1.0
22,child,ctgan_syn,tabular,1.0,-99.0,-99.0,1.0,-99.0,0.94,,0.9,0.92,-99.0,-99.0,0.86
40,credit,ctgan_syn,tabular,0.56,1.0,0.8,1.0,0.93,1.0,0.97,0.64,0.94,0.01,0.08,0.99
47,insurance,ctgan_syn,tabular,1.0,-99.0,-99.0,1.0,-99.0,0.96,,0.93,0.94,-99.0,-99.0,0.91
58,health_insurance,ctgan_syn,tabular,0.92,0.94,0.32,1.0,0.86,0.88,0.98,0.8,0.85,0.07,0.21,1.0
71,intrusion,ctgan_syn,tabular,0.96,0.9,0.65,1.0,0.78,0.96,0.98,0.91,0.91,0.02,0.17,1.0
78,drugs,ctgan_syn,tabular,-99.0,-99.0,-99.0,-99.0,0.89,0.85,0.93,0.14,0.5,-99.0,-99.0,-99.0
89,loan,ctgan_syn,tabular,0.96,0.96,0.49,1.0,0.87,0.96,0.98,0.85,0.9,0.05,0.21,1.0
111,adult,ctgan_syn,hpo,0.94,0.99,0.59,1.0,0.8,0.83,0.98,0.76,0.81,0.01,0.27,1.0
116,loan,ctgan_syn,hpo,0.97,0.99,0.4,1.0,0.91,0.98,0.96,0.93,0.94,0.03,0.17,1.0


In [158]:
# def highlight_top3(s, skip_cols):
#     '''
#     Highlight the top 3 unique numerical values in each column with different colors.
#     Skip specified columns.
#     '''
#     if s.name not in ['dataset', 'model', 'case']:
#         # print(s.name)
#         # Find the top 3 unique values
#         unique_top_values = s.drop_duplicates().nlargest(3)

#         return ['background-color: #ffd700' if v == unique_top_values.iloc[0] else
#                 'background-color: #c0c0c0' if v == unique_top_values.iloc[1] else
#                 'background-color: #cd7f32' if v == unique_top_values.iloc[2] else ''
#                 for v in s]
#     else:
#         return ['' for _ in s]  # Return empty styling for skipped columns
    
    
ORANGE = "#FFB347"
GREEN = "#77dd77"
YELLOW = "#fdfd96"


def highlight_top3_min3(s, skip_cols, min_cols):
    '''
    Build per-cell background styles that mark the three best values of a column.

    Columns named in ``min_cols`` rank their three smallest unique values;
    columns named in ``skip_cols`` get no styling; every other column ranks its
    three largest unique values. Rank 1 is green, rank 2 orange, rank 3 yellow.
    Cells that tie on a ranked value all receive that rank's colour.

    Columns with fewer than three distinct values, or containing NaN, are
    handled: only the ranks that exist are coloured and NaN cells stay unstyled.

    NOTE(review): sentinel values such as -99 are ranked like any other number,
    so in a ``min_cols`` column they are highlighted as the "best" value.
    Filter them out before styling if that is not wanted.

    Args:
        s: One column of the styled frame (a ``pd.Series`` with a ``name``).
        skip_cols: Column names to leave unstyled.
        min_cols: Column names for which smaller is better.

    Returns:
        A list of CSS strings, one per element of ``s``.
    '''
    if s.name in min_cols:
        ranked = s.drop_duplicates().nsmallest(3)
    elif s.name not in skip_cols:
        ranked = s.drop_duplicates().nlargest(3)
    else:
        return ['' for _ in s]  # Return empty styling for skipped columns

    # `zip` stops at the shorter input, so a column with only one or two
    # distinct (non-NaN) values simply gets fewer colours instead of raising
    # IndexError on a missing rank.
    palette = list(zip(ranked.tolist(), (GREEN, ORANGE, YELLOW)))

    def style_for(value):
        for ranked_value, colour in palette:
            if value == ranked_value:
                return f'background-color: {colour}'
        return ''

    return [style_for(v) for v in s]

In [171]:
# Column-name lists handed to highlight_top3_min3 by the styling cells below.
# skip_cols: columns that are never highlighted.
skip_cols = ["dataset", "model", "case"]
# min_cols: columns whose three smallest unique values are highlighted.
min_cols = ['wass_dist', 'js_dist']

## Does HPO perform better?

### ctgan | loan & adult

In [161]:
# All ctgan_syn runs on the loan dataset, with the best values per column highlighted.
subset = all_df.loc[(all_df["dataset"] == "loan") & (all_df["model"] == "ctgan_syn")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
89,loan,ctgan_syn,tabular,0.96,0.96,0.49,1.0,0.87,0.96,0.98,0.85,0.9,0.05,0.21,1.0
116,loan,ctgan_syn,hpo,0.97,0.99,0.4,1.0,0.91,0.98,0.96,0.93,0.94,0.03,0.17,1.0


In [164]:
# All ctgan_syn runs on the adult dataset, with the best values per column highlighted.
subset = all_df.loc[(all_df["dataset"] == "adult") & (all_df["model"] == "ctgan_syn")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
4,adult,ctgan_syn,tabular,0.91,0.99,0.49,1.0,0.76,0.79,0.98,0.74,0.78,0.02,0.34,1.0
111,adult,ctgan_syn,hpo,0.94,0.99,0.59,1.0,0.8,0.83,0.98,0.76,0.81,0.01,0.27,1.0


### ddpm | loan & adult

In [165]:
# All ddpm_syn runs on the loan dataset, with the best values per column highlighted.
subset = all_df.loc[(all_df["dataset"] == "loan") & (all_df["model"] == "ddpm_syn")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
93,loan,ddpm_syn,tabular,1.0,0.94,0.44,1.0,0.96,0.97,0.96,0.89,0.93,0.03,0.09,1.0
119,loan,ddpm_syn,hpo,1.0,0.94,0.45,1.0,0.95,0.97,0.98,0.88,0.93,0.03,0.1,1.0


In [166]:
# All ddpm_syn runs on the adult dataset, with the best values per column highlighted.
subset = all_df.loc[(all_df["dataset"] == "adult") & (all_df["model"] == "ddpm_syn")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
8,adult,ddpm_syn,tabular,0.99,1.0,0.68,1.0,0.98,0.97,0.98,0.95,0.97,0.01,0.06,1.0
114,adult,ddpm_syn,hpo,0.99,1.0,0.72,1.0,0.98,0.97,0.99,0.94,0.96,0.01,0.07,1.0


### arf | loan & adult

In [172]:
# All arf_syn runs on the loan dataset, with the best values per column highlighted.
subset = all_df.loc[(all_df["dataset"] == "loan") & (all_df["model"] == "arf_syn")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
92,loan,arf_syn,tabular,1.0,0.99,0.33,1.0,0.89,0.98,0.97,0.91,0.93,0.02,0.15,1.0
118,loan,arf_syn,hpo,1.0,0.99,0.46,1.0,0.9,0.99,0.98,0.93,0.94,0.02,0.14,1.0


In [173]:
# All arf_syn runs on the adult dataset, with the best values per column highlighted.
subset = all_df.loc[(all_df["dataset"] == "adult") & (all_df["model"] == "arf_syn")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
7,adult,arf_syn,tabular,0.99,1.0,0.81,1.0,0.88,0.91,0.99,0.87,0.9,0.01,0.16,1.0
113,adult,arf_syn,hpo,0.98,1.0,0.81,1.0,0.9,0.92,0.99,0.87,0.9,0.01,0.16,1.0


### rtvae | loan & adult

In [175]:
# All rtvae_syn runs on the loan dataset, with the best values per column highlighted.
subset = all_df.loc[(all_df["dataset"] == "loan") & (all_df["model"] == "rtvae_syn")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
95,loan,rtvae_syn,tabular,0.86,0.97,0.4,1.0,0.85,0.96,0.95,0.81,0.87,0.04,0.48,1.0
120,loan,rtvae_syn,hpo,0.87,0.97,0.35,1.0,0.87,0.98,0.94,0.81,0.88,0.04,0.47,1.0


In [174]:
# All rtvae_syn runs on the adult dataset, with the best values per column highlighted.
subset = all_df.loc[(all_df["dataset"] == "adult") & (all_df["model"] == "rtvae_syn")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
10,adult,rtvae_syn,tabular,0.73,0.98,0.39,1.0,0.77,0.74,0.98,0.65,0.73,0.02,0.48,1.0
115,adult,rtvae_syn,hpo,0.96,0.98,0.95,1.0,0.84,0.8,0.96,0.74,0.79,0.03,0.23,1.0


### tvae | loan & adult

In [177]:
# All tvae_syn runs on the loan dataset, with the best values per column highlighted.
subset = all_df.loc[(all_df["dataset"] == "loan") & (all_df["model"] == "tvae_syn")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
90,loan,tvae_syn,tabular,0.93,0.97,0.36,1.0,0.88,0.97,0.96,0.89,0.92,0.04,0.27,1.0
117,loan,tvae_syn,hpo,0.95,0.98,0.35,1.0,0.93,0.98,0.96,0.92,0.94,0.02,0.19,1.0


In [178]:
# All tvae_syn runs on the adult dataset, with the best values per column highlighted.
subset = all_df.loc[(all_df["dataset"] == "adult") & (all_df["model"] == "tvae_syn")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
5,adult,tvae_syn,tabular,0.86,0.99,0.65,1.0,0.81,0.89,0.99,0.81,0.84,0.01,0.3,1.0
112,adult,tvae_syn,hpo,0.92,0.99,0.57,1.0,0.9,0.85,0.99,0.8,0.85,0.01,0.19,1.0


## Hyperimpute benchmarking

In [199]:
## TODO: Need to run on TAXI -- mostly numerical missing values 

In [193]:

# subset = all_df[(all_df["model"] == "ctgan_sdv") & (all_df["dataset"] == "drugs")]

# drugs rows whose case label contains 'only_'.
subset = all_df.loc[
    (all_df["dataset"] == "drugs")
    & all_df['case'].str.contains('only_')
]

df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
104,drugs,ctgan_sdv,only_missforest,0.93,1.0,1.0,0.42,0.98,0.91,0.97,0.19,0.55,0.01,0.05,-99.0
106,drugs,ctgan_sdv,only_ice,0.96,1.0,1.0,0.42,0.98,0.92,0.97,0.19,0.56,0.01,0.05,-99.0
108,drugs,ctgan_sdv,only_hyperimpute,0.95,1.0,1.0,0.42,0.98,0.92,0.97,0.19,0.56,0.01,0.05,-99.0
110,drugs,ctgan_sdv,only_simple,0.91,1.0,1.0,0.42,0.98,0.92,0.97,0.19,0.56,0.01,0.05,-99.0


In [198]:
# ctgan_sdv rows on drugs, excluding the cases whose label contains 'only_'.
subset = all_df.loc[
    (all_df["dataset"] == "drugs")
    & (all_df["model"] == "ctgan_sdv")
    & ~all_df['case'].str.contains('only_')
]

df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
75,drugs,ctgan_sdv,tabular,0.89,0.96,1.0,0.95,0.89,0.92,0.95,0.83,0.88,0.06,0.14,-99.0
103,drugs,ctgan_sdv,missforest,0.92,0.99,0.0,0.42,0.95,0.88,0.92,0.17,0.53,0.01,0.07,-99.0
105,drugs,ctgan_sdv,ice,0.96,0.93,0.0,0.42,0.9,0.89,0.95,0.17,0.53,0.09,0.22,-99.0
107,drugs,ctgan_sdv,hyperimpute,0.95,0.92,1.0,0.42,0.85,0.9,0.96,0.17,0.53,0.12,0.27,-99.0
109,drugs,ctgan_sdv,simple,0.9,0.98,0.0,0.42,0.94,0.89,0.96,0.17,0.53,0.03,0.1,-99.0


# Models Benchmarking 

In [None]:
# ALL_TABULAR_DATASETS = ["adult", "census", "child", "covtype", "credit", "insurance", "health_insurance", 
#                        "intrusion", "drugs", "loan", "pums"]

In [201]:
# Every model's 'tabular' run on the adult dataset, best values highlighted per column.
subset = all_df.loc[(all_df["case"] == "tabular") & (all_df["dataset"] == "adult")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
0,adult,actgan_gretel,tabular,0.89,0.99,0.49,1.0,0.79,0.92,0.97,0.87,0.88,0.02,0.24,1.0
1,adult,ctgan_sdv,tabular,0.87,0.99,0.79,1.0,0.87,0.91,0.98,0.82,0.87,0.02,0.22,1.0
2,adult,tvae_sdv,tabular,0.79,0.99,0.68,1.0,0.88,0.94,0.97,0.88,0.91,0.02,0.13,1.0
3,adult,gaussian_copula_sdv,tabular,0.92,0.95,0.41,1.0,0.7,0.8,0.99,0.73,0.76,0.07,0.29,1.0
4,adult,ctgan_syn,tabular,0.91,0.99,0.49,1.0,0.76,0.79,0.98,0.74,0.78,0.02,0.34,1.0
5,adult,tvae_syn,tabular,0.86,0.99,0.65,1.0,0.81,0.89,0.99,0.81,0.84,0.01,0.3,1.0
6,adult,goggle_syn,tabular,0.33,0.95,0.27,1.0,0.62,0.67,0.97,0.49,0.59,0.06,0.54,1.0
7,adult,arf_syn,tabular,0.99,1.0,0.81,1.0,0.88,0.91,0.99,0.87,0.9,0.01,0.16,1.0
8,adult,ddpm_syn,tabular,0.99,1.0,0.68,1.0,0.98,0.97,0.98,0.95,0.97,0.01,0.06,1.0
9,adult,nflow_syn,tabular,0.97,0.98,0.7,1.0,0.89,0.77,0.97,0.69,0.77,0.02,0.23,1.0


In [217]:
# Every model's 'tabular' run on the census dataset, best values highlighted per column.
subset = all_df.loc[(all_df["case"] == "tabular") & (all_df["dataset"] == "census")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
14,census,actgan_gretel,tabular,0.92,0.97,0.41,1.0,0.87,0.91,0.98,0.88,0.89,0.02,0.2,0.98
15,census,tvae_sdv,tabular,0.91,0.98,0.38,1.0,0.93,0.95,0.98,0.92,0.93,0.02,0.1,0.9
16,census,gaussian_copula_sdv,tabular,0.93,0.89,0.29,1.0,0.5,0.77,0.98,0.64,0.68,0.11,0.42,1.0
17,census,gan_betterdata,tabular,0.93,0.97,0.45,1.0,0.96,0.92,0.97,0.53,0.73,0.02,0.07,1.0


In [202]:
# Every model's 'tabular' run on the loan dataset, best values highlighted per column.
subset = all_df.loc[(all_df["case"] == "tabular") & (all_df["dataset"] == "loan")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
85,loan,actgan_gretel,tabular,0.99,0.96,0.4,1.0,0.83,0.89,0.92,0.85,0.87,0.05,0.16,1.0
86,loan,ctgan_sdv,tabular,0.97,0.97,0.37,1.0,0.85,0.93,0.91,0.88,0.89,0.04,0.15,1.0
87,loan,tvae_sdv,tabular,0.91,0.95,0.2,1.0,0.82,0.85,0.96,0.79,0.83,0.07,0.25,1.0
88,loan,gaussian_copula_sdv,tabular,0.94,0.98,0.39,1.0,0.89,0.97,0.97,0.91,0.93,0.03,0.19,1.0
89,loan,ctgan_syn,tabular,0.96,0.96,0.49,1.0,0.87,0.96,0.98,0.85,0.9,0.05,0.21,1.0
90,loan,tvae_syn,tabular,0.93,0.97,0.36,1.0,0.88,0.97,0.96,0.89,0.92,0.04,0.27,1.0
91,loan,goggle_syn,tabular,0.75,0.89,0.19,1.0,0.58,0.77,0.91,0.58,0.66,0.13,0.55,1.0
92,loan,arf_syn,tabular,1.0,0.99,0.33,1.0,0.89,0.98,0.97,0.91,0.93,0.02,0.15,1.0
93,loan,ddpm_syn,tabular,1.0,0.94,0.44,1.0,0.96,0.97,0.96,0.89,0.93,0.03,0.09,1.0
94,loan,nflow_syn,tabular,0.99,0.97,0.38,1.0,0.88,0.95,0.94,0.89,0.91,0.04,0.17,1.0


In [203]:
# Every model's 'tabular' run on the health_insurance dataset, best values highlighted per column.
subset = all_df.loc[(all_df["case"] == "tabular") & (all_df["dataset"] == "health_insurance")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
54,health_insurance,actgan_gretel,tabular,1.0,0.95,0.33,1.0,0.85,0.88,0.91,0.82,0.85,0.07,0.19,1.0
55,health_insurance,ctgan_sdv,tabular,0.98,0.89,0.35,1.0,0.75,0.88,0.89,0.81,0.82,0.13,0.24,1.0
56,health_insurance,tvae_sdv,tabular,0.91,0.95,0.67,1.0,0.81,0.82,0.98,0.74,0.8,0.07,0.2,1.0
57,health_insurance,gaussian_copula_sdv,tabular,0.93,0.95,0.11,1.0,0.85,0.87,0.94,0.8,0.84,0.07,0.18,1.0
58,health_insurance,ctgan_syn,tabular,0.92,0.94,0.32,1.0,0.86,0.88,0.98,0.8,0.85,0.07,0.21,1.0
59,health_insurance,tvae_syn,tabular,0.93,0.98,0.33,1.0,0.89,0.96,0.96,0.86,0.9,0.03,0.26,1.0
60,health_insurance,goggle_syn,tabular,0.56,0.9,0.0,1.0,0.63,0.73,0.92,0.51,0.63,0.1,0.55,1.0
61,health_insurance,arf_syn,tabular,1.0,0.97,0.57,1.0,0.9,0.94,0.95,0.88,0.91,0.04,0.12,0.96
62,health_insurance,ddpm_syn,tabular,1.0,0.95,0.67,1.0,0.89,0.91,0.92,0.83,0.87,0.07,0.21,1.0
63,health_insurance,nflow_syn,tabular,0.95,0.98,0.27,1.0,0.9,0.87,0.96,0.82,0.86,0.02,0.15,1.0


In [204]:
# Every model's 'tabular' run on the drugs dataset, best values highlighted per column.
subset = all_df.loc[(all_df["case"] == "tabular") & (all_df["dataset"] == "drugs")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
74,drugs,actgan_gretel,tabular,0.9,0.96,0.0,0.95,0.87,0.92,0.94,0.82,0.87,0.05,0.18,-99.0
75,drugs,ctgan_sdv,tabular,0.89,0.96,1.0,0.95,0.89,0.92,0.95,0.83,0.88,0.06,0.14,-99.0
76,drugs,tvae_sdv,tabular,0.77,0.93,0.0,0.95,0.83,,0.99,0.76,0.8,0.06,0.18,-99.0
77,drugs,gaussian_copula_sdv,tabular,0.87,1.0,0.0,0.92,0.92,0.88,0.97,0.73,0.8,0.01,0.08,-99.0
78,drugs,ctgan_syn,tabular,-99.0,-99.0,-99.0,-99.0,0.89,0.85,0.93,0.14,0.5,-99.0,-99.0,-99.0
79,drugs,tvae_syn,tabular,-99.0,-99.0,-99.0,-99.0,0.89,0.86,0.94,0.15,0.5,-99.0,-99.0,-99.0
80,drugs,goggle_syn,tabular,-99.0,-99.0,-99.0,-99.0,0.85,0.74,0.95,0.08,0.41,-99.0,-99.0,-99.0
81,drugs,arf_syn,tabular,-99.0,-99.0,-99.0,-99.0,0.91,0.86,0.94,0.15,0.51,-99.0,-99.0,-99.0
82,drugs,ddpm_syn,tabular,-99.0,-99.0,-99.0,-99.0,0.93,0.9,0.93,0.18,0.54,-99.0,-99.0,-99.0
83,drugs,nflow_syn,tabular,-99.0,-99.0,-99.0,-99.0,0.9,0.85,0.96,0.15,0.5,-99.0,-99.0,-99.0


In [205]:
# Every model's 'tabular' run on the intrusion dataset, best values highlighted per column.
subset = all_df.loc[(all_df["case"] == "tabular") & (all_df["dataset"] == "intrusion")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
67,intrusion,actgan_gretel,tabular,0.74,0.89,0.36,1.0,0.83,0.91,0.96,0.78,0.84,0.07,0.18,0.98
68,intrusion,ctgan_sdv,tabular,0.67,0.86,0.36,1.0,0.77,0.88,0.92,0.74,0.8,0.12,0.22,0.98
69,intrusion,tvae_sdv,tabular,0.63,0.99,0.15,1.0,0.91,0.97,0.93,0.87,0.91,0.02,0.15,0.96
70,intrusion,gaussian_copula_sdv,tabular,0.72,0.85,0.54,1.0,0.59,0.63,0.95,0.48,0.56,0.28,0.39,1.0
71,intrusion,ctgan_syn,tabular,0.96,0.9,0.65,1.0,0.78,0.96,0.98,0.91,0.91,0.02,0.17,1.0
72,intrusion,tvae_syn,tabular,0.76,0.95,0.35,1.0,0.81,0.95,0.97,0.87,0.89,0.02,0.18,0.99
73,intrusion,arf_syn,tabular,0.98,0.91,0.69,1.0,0.8,0.93,0.98,0.88,0.89,0.06,0.14,0.97


In [206]:
# Every model's 'tabular' run on the covtype dataset, best values highlighted per column.
subset = all_df.loc[(all_df["case"] == "tabular") & (all_df["dataset"] == "covtype")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
29,covtype,actgan_gretel,tabular,0.98,0.99,0.8,1.0,0.94,0.98,0.97,0.95,0.96,0.02,0.11,1.0
30,covtype,ctgan_sdv,tabular,0.84,0.97,0.68,1.0,0.88,0.96,0.97,0.91,0.93,0.04,0.15,1.0
31,covtype,tvae_sdv,tabular,0.76,0.97,0.42,1.0,0.84,0.98,0.96,0.93,0.94,0.04,0.22,1.0
32,covtype,gaussian_copula_sdv,tabular,0.75,0.99,0.56,1.0,0.95,0.17,0.98,0.17,0.26,0.01,0.1,1.0
33,covtype,arf_syn,tabular,1.0,1.0,0.79,1.0,0.97,1.0,0.99,0.99,0.99,0.01,0.06,1.0
34,covtype,ddpm_syn,tabular,0.99,1.0,0.88,1.0,0.99,1.0,0.99,0.98,0.99,0.0,0.06,1.0
35,covtype,rtvae_syn,tabular,0.86,0.99,0.29,1.0,0.88,0.98,0.97,0.91,0.94,0.02,0.7,1.0


In [208]:
# Every model's 'tabular' run on the pums dataset, best values highlighted per column.
subset = all_df.loc[(all_df["case"] == "tabular") & (all_df["dataset"] == "pums")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
99,pums,actgan_gretel,tabular,0.81,0.99,0.6,0.98,0.85,0.98,0.97,0.91,0.92,0.01,0.2,0.99
100,pums,ctgan_sdv,tabular,0.82,0.99,0.67,1.0,0.88,0.99,0.99,0.94,0.95,0.01,0.19,1.0
101,pums,tvae_sdv,tabular,0.8,0.97,0.63,1.0,0.81,0.92,0.95,0.82,0.86,0.04,0.29,1.0
102,pums,gaussian_copula_sdv,tabular,0.77,0.96,0.52,1.0,0.73,0.73,0.97,0.62,0.69,0.05,0.35,1.0


In [209]:
# Every model's 'tabular' run on the credit dataset, best values highlighted per column.
subset = all_df.loc[(all_df["case"] == "tabular") & (all_df["dataset"] == "credit")]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
36,credit,actgan_gretel,tabular,0.22,1.0,0.77,1.0,0.9,0.42,0.96,0.28,0.9,0.01,0.12,0.96
37,credit,ctgan_sdv,tabular,0.19,1.0,0.96,1.0,0.98,1.0,0.97,0.55,0.96,0.0,0.06,0.92
38,credit,tvae_sdv,tabular,0.15,0.99,0.37,1.0,0.93,1.0,0.96,0.54,0.93,0.01,0.14,0.72
39,credit,gaussian_copula_sdv,tabular,0.25,1.0,0.62,1.0,0.88,0.0,0.98,0.1,0.88,0.01,0.15,1.0
40,credit,ctgan_syn,tabular,0.56,1.0,0.8,1.0,0.93,1.0,0.97,0.64,0.94,0.01,0.08,0.99
41,credit,tvae_syn,tabular,0.47,1.0,0.63,1.0,0.93,1.0,0.98,0.62,0.94,0.01,0.08,0.98
42,credit,arf_syn,tabular,0.75,1.0,0.8,1.0,0.96,1.0,0.98,0.64,0.96,0.0,0.04,0.91
43,credit,rtvae_syn,tabular,0.37,1.0,0.61,1.0,0.92,1.0,0.97,0.55,0.93,0.01,0.11,1.0


In [216]:
# insurance, child
# subset = all_df[(all_df["dataset"] == "child") & (all_df["case"] == "tabular")]

# # subset = all_df[(all_df["dataset"] == "child") & (all_df["case"] == "tabular")]
# df_style = subset.style.apply(lambda s: highlight_top3_min3(s, skip_cols=skip_cols, min_cols=min_cols))
# df_style


## Similar models across libraries 

In [224]:
# 'tabular' runs of ctgan_syn, ctgan_sdv and actgan_gretel on the adult dataset.
subset = all_df.loc[
    (all_df["dataset"] == "adult")
    & (all_df["case"] == "tabular")
    & all_df["model"].isin(["ctgan_syn", "ctgan_sdv", "actgan_gretel"])
]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

# adult, child, health_insurance, intrusion, drugs, loan, credit

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
0,adult,actgan_gretel,tabular,0.89,0.99,0.49,1.0,0.79,0.92,0.97,0.87,0.88,0.02,0.24,1.0
1,adult,ctgan_sdv,tabular,0.87,0.99,0.79,1.0,0.87,0.91,0.98,0.82,0.87,0.02,0.22,1.0
4,adult,ctgan_syn,tabular,0.91,0.99,0.49,1.0,0.76,0.79,0.98,0.74,0.78,0.02,0.34,1.0


In [226]:
# 'tabular' runs of ctgan_syn, ctgan_sdv and actgan_gretel on the health_insurance dataset.
subset = all_df.loc[
    (all_df["dataset"] == "health_insurance")
    & (all_df["case"] == "tabular")
    & all_df["model"].isin(["ctgan_syn", "ctgan_sdv", "actgan_gretel"])
]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
54,health_insurance,actgan_gretel,tabular,1.0,0.95,0.33,1.0,0.85,0.88,0.91,0.82,0.85,0.07,0.19,1.0
55,health_insurance,ctgan_sdv,tabular,0.98,0.89,0.35,1.0,0.75,0.88,0.89,0.81,0.82,0.13,0.24,1.0
58,health_insurance,ctgan_syn,tabular,0.92,0.94,0.32,1.0,0.86,0.88,0.98,0.8,0.85,0.07,0.21,1.0


In [227]:
# 'tabular' runs of ctgan_syn, ctgan_sdv and actgan_gretel on the intrusion dataset.
subset = all_df.loc[
    (all_df["dataset"] == "intrusion")
    & (all_df["case"] == "tabular")
    & all_df["model"].isin(["ctgan_syn", "ctgan_sdv", "actgan_gretel"])
]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
67,intrusion,actgan_gretel,tabular,0.74,0.89,0.36,1.0,0.83,0.91,0.96,0.78,0.84,0.07,0.18,0.98
68,intrusion,ctgan_sdv,tabular,0.67,0.86,0.36,1.0,0.77,0.88,0.92,0.74,0.8,0.12,0.22,0.98
71,intrusion,ctgan_syn,tabular,0.96,0.9,0.65,1.0,0.78,0.96,0.98,0.91,0.91,0.02,0.17,1.0


In [229]:
# 'tabular' runs of ctgan_syn, ctgan_sdv and actgan_gretel on the credit dataset.
subset = all_df.loc[
    (all_df["dataset"] == "credit")
    & (all_df["case"] == "tabular")
    & all_df["model"].isin(["ctgan_syn", "ctgan_sdv", "actgan_gretel"])
]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
36,credit,actgan_gretel,tabular,0.22,1.0,0.77,1.0,0.9,0.42,0.96,0.28,0.9,0.01,0.12,0.96
37,credit,ctgan_sdv,tabular,0.19,1.0,0.96,1.0,0.98,1.0,0.97,0.55,0.96,0.0,0.06,0.92
40,credit,ctgan_syn,tabular,0.56,1.0,0.8,1.0,0.93,1.0,0.97,0.64,0.94,0.01,0.08,0.99


In [230]:
# 'tabular' runs of ctgan_syn, ctgan_sdv and actgan_gretel on the loan dataset.
subset = all_df.loc[
    (all_df["dataset"] == "loan")
    & (all_df["case"] == "tabular")
    & all_df["model"].isin(["ctgan_syn", "ctgan_sdv", "actgan_gretel"])
]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
85,loan,actgan_gretel,tabular,0.99,0.96,0.4,1.0,0.83,0.89,0.92,0.85,0.87,0.05,0.16,1.0
86,loan,ctgan_sdv,tabular,0.97,0.97,0.37,1.0,0.85,0.93,0.91,0.88,0.89,0.04,0.15,1.0
89,loan,ctgan_syn,tabular,0.96,0.96,0.49,1.0,0.87,0.96,0.98,0.85,0.9,0.05,0.21,1.0


#### TVAE

In [234]:
# 'tabular' runs of rtvae_syn, tvae_sdv and tvae_syn on the adult dataset.
subset = all_df.loc[
    (all_df["dataset"] == "adult")
    & (all_df["case"] == "tabular")
    & all_df["model"].isin(["rtvae_syn", "tvae_sdv", "tvae_syn"])
]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

# adult, child, health_insurance, intrusion, drugs, loan, credit

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
2,adult,tvae_sdv,tabular,0.79,0.99,0.68,1.0,0.88,0.94,0.97,0.88,0.91,0.02,0.13,1.0
5,adult,tvae_syn,tabular,0.86,0.99,0.65,1.0,0.81,0.89,0.99,0.81,0.84,0.01,0.3,1.0
10,adult,rtvae_syn,tabular,0.73,0.98,0.39,1.0,0.77,0.74,0.98,0.65,0.73,0.02,0.48,1.0


In [236]:
# 'tabular' runs of rtvae_syn, tvae_sdv and tvae_syn on the health_insurance dataset.
subset = all_df.loc[
    (all_df["dataset"] == "health_insurance")
    & (all_df["case"] == "tabular")
    & all_df["model"].isin(["rtvae_syn", "tvae_sdv", "tvae_syn"])
]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
56,health_insurance,tvae_sdv,tabular,0.91,0.95,0.67,1.0,0.81,0.82,0.98,0.74,0.8,0.07,0.2,1.0
59,health_insurance,tvae_syn,tabular,0.93,0.98,0.33,1.0,0.89,0.96,0.96,0.86,0.9,0.03,0.26,1.0
64,health_insurance,rtvae_syn,tabular,0.83,0.96,0.0,1.0,0.78,0.96,0.9,0.73,0.82,0.07,0.53,1.0


In [244]:
# subset = all_df[all_df["model"].isin(["rtvae_syn", "tvae_sdv", "tvae_syn"])  & (all_df["case"] == "tabular") & (all_df["dataset"] == "drugs")]
# df_style = subset.style.apply(lambda s: highlight_top3_min3(s, skip_cols=skip_cols, min_cols=min_cols))
# df_style

In [241]:
# 'tabular' runs of rtvae_syn, tvae_sdv and tvae_syn on the loan dataset.
subset = all_df.loc[
    (all_df["dataset"] == "loan")
    & (all_df["case"] == "tabular")
    & all_df["model"].isin(["rtvae_syn", "tvae_sdv", "tvae_syn"])
]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
87,loan,tvae_sdv,tabular,0.91,0.95,0.2,1.0,0.82,0.85,0.96,0.79,0.83,0.07,0.25,1.0
90,loan,tvae_syn,tabular,0.93,0.97,0.36,1.0,0.88,0.97,0.96,0.89,0.92,0.04,0.27,1.0
95,loan,rtvae_syn,tabular,0.86,0.97,0.4,1.0,0.85,0.96,0.95,0.81,0.87,0.04,0.48,1.0


In [243]:
# 'tabular' runs of rtvae_syn, tvae_sdv and tvae_syn on the intrusion dataset.
subset = all_df.loc[
    (all_df["dataset"] == "intrusion")
    & (all_df["case"] == "tabular")
    & all_df["model"].isin(["rtvae_syn", "tvae_sdv", "tvae_syn"])
]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
69,intrusion,tvae_sdv,tabular,0.63,0.99,0.15,1.0,0.91,0.97,0.93,0.87,0.91,0.02,0.15,0.96
72,intrusion,tvae_syn,tabular,0.76,0.95,0.35,1.0,0.81,0.95,0.97,0.87,0.89,0.02,0.18,0.99


In [242]:
# 'tabular' runs of rtvae_syn, tvae_sdv and tvae_syn on the credit dataset.
subset = all_df.loc[
    (all_df["dataset"] == "credit")
    & (all_df["case"] == "tabular")
    & all_df["model"].isin(["rtvae_syn", "tvae_sdv", "tvae_syn"])
]
df_style = subset.style.apply(highlight_top3_min3, skip_cols=skip_cols, min_cols=min_cols)
df_style

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
38,credit,tvae_sdv,tabular,0.15,0.99,0.37,1.0,0.93,1.0,0.96,0.54,0.93,0.01,0.14,0.72
41,credit,tvae_syn,tabular,0.47,1.0,0.63,1.0,0.93,1.0,0.98,0.62,0.94,0.01,0.08,0.98
43,credit,rtvae_syn,tabular,0.37,1.0,0.61,1.0,0.92,1.0,0.97,0.55,0.93,0.01,0.11,1.0


## Execution Benchmarking

In [44]:
# List the columns available in the execution benchmark table.
execution_scores_df.columns

Index(['dataset', 'lib', 'model', 'num_rows', 'num_cols', 'num_sampled_rows',
       'device', 'num_epochs', 'train_time_sec', 'sample_time_sec',
       'peak_memory_mb', 'synthesizer_size', 'synthetic_dataset_size_mb_deep',
       'train_dataset_size_mb_deep', 'synthetic_dataset_size_mb',
       'train_dataset_size_mb'],
      dtype='object')

In [45]:
# Persist the execution benchmark table as a CSV file (path is relative to the
# current working directory).
# NOTE(review): to_csv writes the row index by default, so re-reading this file
# yields an extra unnamed first column unless index=False / index_col is used.
execution_scores_df.to_csv("execution_scores.csv")

In [32]:
# ctgan_sdv rows across datasets, with each column's maximum highlighted.
(
    std_tabular_df
    .loc[std_tabular_df["model"] == "ctgan_sdv"]
    .style
    .highlight_max(color='#ffb6c1')
)

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
1,adult,ctgan_sdv,tabular,0.87,0.99,0.79,1.0,0.87,0.91,0.98,0.82,0.87,0.02,0.22,1.0
19,child,ctgan_sdv,tabular,0.97,-99.0,-99.0,0.98,-99.0,0.9,,0.84,0.87,-99.0,-99.0,0.89
30,covtype,ctgan_sdv,tabular,0.84,0.97,0.68,1.0,0.88,0.96,0.97,0.91,0.93,0.04,0.15,1.0
37,credit,ctgan_sdv,tabular,0.19,1.0,0.96,1.0,0.98,1.0,0.97,0.55,0.96,0.0,0.06,0.92
55,health_insurance,ctgan_sdv,tabular,0.98,0.89,0.35,1.0,0.75,0.88,0.89,0.81,0.82,0.13,0.24,1.0
68,intrusion,ctgan_sdv,tabular,0.67,0.86,0.36,1.0,0.77,0.88,0.92,0.74,0.8,0.12,0.22,0.98
75,drugs,ctgan_sdv,tabular,0.89,0.96,1.0,0.95,0.89,0.92,0.95,0.83,0.88,0.06,0.14,-99.0
86,loan,ctgan_sdv,tabular,0.97,0.97,0.37,1.0,0.85,0.93,0.91,0.88,0.89,0.04,0.15,1.0
100,pums,ctgan_sdv,tabular,0.82,0.99,0.67,1.0,0.88,0.99,0.99,0.94,0.95,0.01,0.19,1.0


In [27]:


# Style the adult rows of std_tabular_df column by column with highlight_top3.
# NOTE(review): the only definitions of highlight_top3 visible in this part of
# the notebook are commented out; confirm it is still defined by an earlier cell,
# otherwise this cell raises NameError on Restart & Run All.
# df.style.apply(highlight_top3)
std_tabular_df[std_tabular_df["dataset"] == "adult"].style.apply(highlight_top3)

Unnamed: 0,dataset,model,case,domain_cov,stats_cov,outliers_cov,missing_cov,ks_sim,tv_sim,corr_sim,contin_sim,sdv_quality_report,wass_dist,js_dist,new_row_synthesis
0,adult,actgan_gretel,tabular,0.89,0.99,0.49,1.0,0.79,0.92,0.97,0.87,0.88,0.02,0.24,1.0
1,adult,ctgan_sdv,tabular,0.87,0.99,0.79,1.0,0.87,0.91,0.98,0.82,0.87,0.02,0.22,1.0
2,adult,tvae_sdv,tabular,0.79,0.99,0.68,1.0,0.88,0.94,0.97,0.88,0.91,0.02,0.13,1.0
3,adult,gaussian_copula_sdv,tabular,0.92,0.95,0.41,1.0,0.7,0.8,0.99,0.73,0.76,0.07,0.29,1.0
4,adult,ctgan_syn,tabular,0.91,0.99,0.49,1.0,0.76,0.79,0.98,0.74,0.78,0.02,0.34,1.0
5,adult,tvae_syn,tabular,0.86,0.99,0.65,1.0,0.81,0.89,0.99,0.81,0.84,0.01,0.3,1.0
6,adult,goggle_syn,tabular,0.33,0.95,0.27,1.0,0.62,0.67,0.97,0.49,0.59,0.06,0.54,1.0
7,adult,arf_syn,tabular,0.99,1.0,0.81,1.0,0.88,0.91,0.99,0.87,0.9,0.01,0.16,1.0
8,adult,ddpm_syn,tabular,0.99,1.0,0.68,1.0,0.98,0.97,0.98,0.95,0.97,0.01,0.06,1.0
9,adult,nflow_syn,tabular,0.97,0.98,0.7,1.0,0.89,0.77,0.97,0.69,0.77,0.02,0.23,1.0


In [28]:
# def highlight_top3(s):
#     '''
#     Highlight the top 3 numerical values in each column with different colors.
#     Skip non-numerical columns.
#     '''
#     if s.name not in ["dataset", "lib", "model", "device", "num_sampled_rows", "synthetic_dataset_size_mb_deep", 
#                      "train_dataset_size_mb_deep", "synthetic_dataset_size_mb", "train_dataset_size_mb"]:
#     # in ['domain_cov', 'stats_cov', 'outliers_cov', 'missing_cov', 'ks_sim', 'tv_sim', 'corr_sim', 'contin_sim', 'sdv_quality_report', 'wass_dist', 'js_dist', 'new_row_synthesis']:
#         print(s.name)
#         top1 = s.nlargest(1)
#         top2 = s.nlargest(2).drop(top1.index)
#         top3 = s.nlargest(3).drop(top1.index).drop(top2.index)

#         return ['background-color: gold' if v in top1.values else
#                 'background-color: pink' if v in top2.values else
#                 'background-color: grey' if v in top3.values else ''
#                 for v in s]
#     else:
#         return ['' for _ in s]  # Return empty styling for non-numerical columns

# 

# TODOs

In [None]:
# Sequential Sequences Analysis

In [None]:
# HPO time analysis 

In [None]:
# Hyperimpute time analysis 

In [None]:
# Visualisation