In [1]:
import json
import os
import pandas as pd

In [11]:
from sdv.datasets.demo import get_available_demos, download_demo

In [2]:
from ydata_profiling import ProfileReport

  def hasna(x: np.ndarray) -> bool:


In [5]:
# Query the SDV demo catalogue for the single-table modality.
datasets_df = get_available_demos(modality='single_table')
# Bare expression: rendered as this cell's output.
datasets_df

Unnamed: 0,dataset_name,size_MB,num_tables
0,KRK_v1,0.07,1
1,adult,3.91,1
2,alarm,4.52,1
3,asia,1.28,1
4,census,98.17,1
5,census_extended,4.95,1
6,child,3.2,1
7,covtype,255.65,1
8,credit,68.35,1
9,expedia_hotel_logs,0.2,1


In [6]:
# Plain Python list of the dataset names found in the single-table catalogue.
single_table_dataset_names = datasets_df["dataset_name"].tolist()

In [9]:
def get_dataset_with_sdv(modality, dataset_name):
    """Download one SDV demo dataset and return it.

    Args:
        modality: Modality passed through to ``download_demo``
            (this notebook uses 'single_table' and 'sequential').
        dataset_name: Name of the demo dataset to download.

    Returns:
        The ``(real_data, metadata)`` pair produced by ``download_demo``.
    """
    real_data, metadata = download_demo(
        modality=modality,
        dataset_name=dataset_name,
    )
    # The downloaded objects were previously discarded (implicit ``return None``),
    # which made the helper useless to callers.
    return real_data, metadata

In [3]:
def get_minimal_profile(variables_dict):
    """Reduce a profiling report dict to a compact per-variable summary.

    Args:
        variables_dict: Report dictionary with the keys "table", "alerts" and
            "variables" (in this notebook: the parsed output of
            ``ProfileReport.to_json()``). Each entry of "variables" must carry
            a "type" key.

    Returns:
        A dict with:
          - "table" and "alerts": taken over unchanged from the input,
          - "variables": for Categorical, Numeric and Boolean variables a small
            dict holding "type" plus a fixed set of statistics; for any other
            type the original, unreduced variable dict.

    Raises:
        KeyError: if "table", "alerts", "variables" or a variable's "type"
            is missing.

    Notes:
        A statistic that is absent from the report is stored as ``None``
        instead of raising ``KeyError``, so one missing field no longer aborts
        the whole dataset. Presumably this happens when the report was built
        with a reduced configuration (e.g. ``minimal=True``) -- TODO confirm
        which statistics that mode omits.
    """
    # For each reducible type: output key -> key to read from the report.
    # Insertion order defines the key order of the emitted dicts.
    fields_by_type = {
        "Categorical": {
            "n_distinct": "n_distinct",
            "n_missing": "n_missing",
            "freq_counts": "word_counts",
            "imbalance": "imbalance",
        },
        "Numeric": {
            "n_distinct": "n_distinct",
            "n_missing": "n_missing",
            "kurtosis": "kurtosis",
            "skewness": "skewness",
            "monotonic": "monotonic",
        },
        "Boolean": {
            "n_distinct": "n_distinct",
            "n_missing": "n_missing",
            "imbalance": "imbalance",
        },
    }

    profiles = {
        "table": variables_dict["table"],
        "alerts": variables_dict["alerts"],
        "variables": {},
    }

    for name, stats in variables_dict["variables"].items():
        dtype = stats["type"]
        fields = fields_by_type.get(dtype)

        if fields is None:
            # Types without a reduction rule are kept in full and reported.
            print("Not a valid type : ", dtype)
            profiles["variables"][name] = stats
            continue

        var_profile = {"type": dtype}
        for out_key, report_key in fields.items():
            # .get(): tolerate statistics that are missing from the report.
            var_profile[out_key] = stats.get(report_key)
        profiles["variables"][name] = var_profile

    return profiles

In [None]:
# Profile every single-table dataset and store, per dataset:
#   <name>.json          - reduced profile from get_minimal_profile
#   <name>_detailed.json - full profiling report
#   <name>.html          - rendered profiling report
for dataset_name in single_table_dataset_names:
    print("-*-"*10)
    print(" "*4, dataset_name)
    print("-*-"*10)

    output_path = f"dataset_profiles/{dataset_name}"
    # exist_ok=True replaces the separate exists() check: no check-then-create
    # race, and a no-op when the directory is already there.
    os.makedirs(output_path, exist_ok=True)

    df = pd.read_csv(f"data/tabular/{dataset_name}.csv")
    profile = ProfileReport(df, minimal=False)

    # Serialise the report to a JSON string, then parse it into a plain dict
    # so it can be reduced.
    json_data = profile.to_json()
    json_dict = json.loads(json_data)
    var_profiles = get_minimal_profile(json_dict)

    # Save the reduced profile as a JSON file
    with open(f"{output_path}/{dataset_name}.json", "w") as json_file:
        json.dump(var_profiles, json_file)

    # Save the full report in both JSON and HTML form
    profile.to_file(f"{output_path}/{dataset_name}_detailed.json")
    profile.to_file(f"{output_path}/{dataset_name}.html")

In [3]:
# Query the SDV demo catalogue for the sequential modality.
seq_datasets_df = get_available_demos(modality='sequential')
# Kept as a pandas Series (not converted to a list); it is only iterated later.
seq_table_dataset_names = seq_datasets_df["dataset_name"]

In [4]:
# Bare expression: render the sequential catalogue as this cell's output.
seq_datasets_df

Unnamed: 0,dataset_name,size_MB,num_tables
0,ArticularyWordRecognition,8.61,1
1,AtrialFibrillation,0.92,1
2,BasicMotions,0.64,1
3,CharacterTrajectories,19.19,1
4,Cricket,17.24,1
5,DuckDuckGeese,291.38,1
6,ERing,1.25,1
7,EchoNASDAQ,4.16,1
8,EigenWorms,372.63,1
9,Epilepsy,3.17,1


In [None]:
# Profile every sequential dataset (minimal profiling mode) and store the
# reduced profile, the full JSON report and the HTML report per dataset.
# Best effort: a failure on one dataset is printed and the loop moves on.
for dataset_name in seq_table_dataset_names:
    print("-*-"*10)
    print(" "*4, dataset_name)
    print("-*-"*10)

    try:
        output_path = f"dataset_profiles/sequential/{dataset_name}"
        html_report_path = f"{output_path}/{dataset_name}.html"

        # Skip datasets that were already profiled. The HTML report is the last
        # artifact written below, so its presence means a previous run finished.
        # Checking only for the directory (as before) skipped datasets whose
        # earlier run had failed after the directory was created.
        if os.path.exists(html_report_path):
            continue
        os.makedirs(output_path, exist_ok=True)

        df = pd.read_csv(f"data/sequential/{dataset_name}.csv")
        profile = ProfileReport(df, minimal=True)

        # Serialise the report to a JSON string, then parse it into a dict
        json_data = profile.to_json()
        json_dict = json.loads(json_data)
        var_profiles = get_minimal_profile(json_dict)

        # Save the reduced profile as a JSON file
        with open(f"{output_path}/{dataset_name}.json", "w") as json_file:
            json.dump(var_profiles, json_file)

        # Save the full report in both JSON and HTML form (HTML last: it doubles
        # as the completion marker checked above)
        profile.to_file(f"{output_path}/{dataset_name}_detailed.json")
        profile.to_file(html_report_path)
    except Exception as e:
        print(e)

In [None]:
# seq_data_path = "/Users/anshusingh/DPPCC/whitespace/benchmarking-synthetic-data-generators/all_sample_datasets/seq"

# df = pd.read_csv(f"{seq_data_path}/taxi.csv")
# profile = ProfileReport(df, minimal=True)
# json_data = profile.to_json()

# json_dict = json.loads(json_data)
# var_profiles = get_minimal_profile(json_dict)

# # Save the dictionary as a JSON file
# with open(f"taxi.json", "w") as json_file:
#     json.dump(var_profiles, json_file)

In [35]:
# Dataset names iterated by the comparison cell below, which loads one saved
# profile JSON per name.
all_exp_datasets = ["adult", "census", "child", "covtype", "credit", "insurance", 
                    "intrusion", "health_insurance", "drugs", "loan", "nasdaq", "taxi", "pums"]

# Single-dataset override, left disabled.
# all_exp_datasets = ["pums"]

In [53]:
# Comparison table in column form: one list per column, one entry per dataset
# whose profile could be loaded. All lists must stay the same length so the
# dict can be turned into a DataFrame.
cmp_datasets = {
    "name": [],
    "memory_size": [],
    "record_size": [],
    "p_duplicates": [],
    "p_cells_missing": [],
    "n_rows": [],
    "n_cols": [],
    "n_numeric": [],
    "n_categoric": [],
    "n_boolean": [],
    "n_datetime": [],
    "n_text": []
}

# Per dataset: {categorical column name: number of distinct values}.
n_distinct_cats = {}

# cmp_datasets column -> type label used as key in the profile's table["types"].
# A type that does not occur in a dataset is counted as 0.
type_count_columns = {
    "n_numeric": "Numeric",
    "n_categoric": "Categorical",
    "n_boolean": "Boolean",
    "n_datetime": "DateTime",
    "n_text": "Text",
}

for dn in all_exp_datasets:
    try:
        # Load the saved profile JSON into a dictionary.
        # NOTE(review): the profiling cells write to
        # dataset_profiles/<name>/<name>.json, while this reads
        # ../dataset_profiles/<name>.json -- confirm the intended location.
        with open(f"../dataset_profiles/{dn}.json", "r") as json_file:
            data_dict = json.load(json_file)
        tab_data = data_dict["table"]
        type_counts = tab_data["types"]

        # Build the complete row first; nothing is appended to cmp_datasets
        # until every value is available, so a failure half-way through can
        # no longer leave the column lists with different lengths.
        row = {
            "name": dn,
            "memory_size": tab_data["memory_size"],
            "record_size": tab_data["record_size"],
            "p_cells_missing": tab_data["p_cells_missing"],
            "n_rows": tab_data["n"],
            "n_cols": tab_data["n_var"],
        }

        if "p_duplicates" in tab_data:
            row["p_duplicates"] = tab_data["p_duplicates"]
        else:
            # Some profiles carry no duplicate statistic; keep the row anyway.
            row["p_duplicates"] = "NA"
            print(f"{dn}: 'p_duplicates' missing from table profile, recorded as NA")

        for column, type_label in type_count_columns.items():
            row[column] = type_counts.get(type_label, 0)

        data_cat_cols = {
            column: details["n_distinct"]
            for column, details in data_dict["variables"].items()
            if details["type"] == "Categorical"
        }
    except Exception as e:
        # Best effort: report the dataset that could not be processed
        # (e.g. profile file not found) and continue with the next one.
        print(f"{dn}: {e}")
        continue

    # Commit the row atomically.
    for column, value in row.items():
        cmp_datasets[column].append(value)
    n_distinct_cats[dn] = data_cat_cols

with open("n_distinct_cats.json", "w") as json_file:
    json.dump(n_distinct_cats, json_file)

'p_duplicates'
'p_duplicates'
'p_duplicates'


In [40]:

# Turn the per-column lists into a DataFrame with one row per dataset.
cmp_datasets_df = pd.DataFrame(cmp_datasets)

In [41]:
# Bare expression: render the comparison table as this cell's output.
cmp_datasets_df

Unnamed: 0,name,memory_size,record_size,p_duplicates,p_cells_missing,n_rows,n_cols,n_numeric,n_categoric,n_boolean,n_datetime,n_text
0,adult,4167936,128.003931,0.0,0.0,32561,16,7,9,0,0,0
1,census,100559888,336.000428,0.0,0.0001017504,299285,42,10,31,0,0,1
2,child,3360128,168.0064,0.0,0.01982619,20000,21,1,14,6,0,0
3,covtype,260293504,448.00022,0.0,0.0,581012,56,12,44,0,0,0
4,credit,70632264,248.000449,0.0,0.0,284807,31,30,1,0,0,0
5,insurance,3500128,175.0064,0.0,0.05205357,20000,28,1,20,7,0,0
6,intrusion,165991184,336.000259,0.0,0.0,494021,42,14,27,0,0,1
7,health_insurance,75056,56.095665,0.000747,0.0,1338,7,4,2,1,0,0
8,drugs,2443008,320.016767,0.0,0.5617108,7634,40,1,9,21,1,8
9,loan,560128,112.0256,0.0,0.0,5000,14,7,7,0,0,0
