In [1]:
import json
import os
import pandas as pd

In [11]:
from sdv.datasets.demo import get_available_demos, download_demo

In [5]:
from ydata_profiling import ProfileReport

  def hasna(x: np.ndarray) -> bool:


In [5]:
# Fetch the catalogue of SDV single-table demo datasets and show it as the
# cell's output.
datasets_df = get_available_demos(modality="single_table")
datasets_df

Unnamed: 0,dataset_name,size_MB,num_tables
0,KRK_v1,0.07,1
1,adult,3.91,1
2,alarm,4.52,1
3,asia,1.28,1
4,census,98.17,1
5,census_extended,4.95,1
6,child,3.2,1
7,covtype,255.65,1
8,credit,68.35,1
9,expedia_hotel_logs,0.2,1


In [6]:
# Plain Python list of the catalogue's dataset names.
single_table_dataset_names = datasets_df["dataset_name"].tolist()

In [9]:
def get_dataset_with_sdv(modality, dataset_name):
    """Download one SDV demo dataset and return it to the caller.

    Args:
        modality: Forwarded unchanged to ``download_demo`` as ``modality``.
        dataset_name: Forwarded unchanged to ``download_demo`` as
            ``dataset_name``.

    Returns:
        The ``(real_data, metadata)`` pair unpacked from ``download_demo``.
    """
    real_data, metadata = download_demo(
        modality=modality,
        dataset_name=dataset_name,
    )
    # Previously both values were dropped and the function returned None,
    # which made the download unusable from the calling cell.
    return real_data, metadata

In [6]:
def get_minimal_profile(variables_dict):
    """Reduce a parsed profiling report to a small per-variable summary.

    Args:
        variables_dict: Report dictionary. It must contain the keys
            ``"table"``, ``"alerts"`` and ``"variables"``, and every entry of
            ``"variables"`` must be a dict with a ``"type"`` key.

    Returns:
        A dict with three keys:

        * ``"table"`` and ``"alerts"``: the same objects as in the input.
        * ``"variables"``: for each variable name, a reduced dict. Variables
          typed ``"Categorical"``, ``"Numeric"`` or ``"Boolean"`` keep only
          ``"type"`` plus the fields listed for that type below. Any other
          type is passed through unchanged and a message is printed.

        A selected statistic that is absent from the report is stored as
        ``None`` instead of raising ``KeyError``.

    Raises:
        KeyError: If ``"table"``, ``"alerts"``, ``"variables"`` or a
            variable's ``"type"`` is missing.
    """
    # Per type: (output key, key in the report) in output order.
    # NOTE(review): "freq_counts" is fed from the report's "word_counts";
    # confirm that is the intended statistic rather than value counts.
    fields_by_type = {
        "Categorical": (
            ("n_distinct", "n_distinct"),
            ("n_missing", "n_missing"),
            ("freq_counts", "word_counts"),
            ("imbalance", "imbalance"),
        ),
        "Numeric": (
            ("n_distinct", "n_distinct"),
            ("n_missing", "n_missing"),
            ("kurtosis", "kurtosis"),
            ("skewness", "skewness"),
            ("monotonic", "monotonic"),
        ),
        "Boolean": (
            ("n_distinct", "n_distinct"),
            ("n_missing", "n_missing"),
            ("imbalance", "imbalance"),
        ),
    }

    profiles = {
        "table": variables_dict["table"],
        "alerts": variables_dict["alerts"],
        "variables": {},
    }

    for name, stats in variables_dict["variables"].items():
        dtype = stats["type"]
        fields = fields_by_type.get(dtype)
        if fields is None:
            # Unhandled type: keep the full entry so nothing is lost.
            print("None a valid type : ", dtype)
            var_profile = stats
        else:
            var_profile = {"type": dtype}
            for out_key, report_key in fields:
                # .get() tolerates reports that omit a statistic (presumably
                # the case for reduced/minimal reports -- TODO confirm), so a
                # single missing field no longer aborts the whole dataset.
                var_profile[out_key] = stats.get(report_key)
        profiles["variables"][name] = var_profile

    return profiles

In [None]:
# Profile every single-table dataset and write three artefacts per dataset
# under dataset_profiles/<name>/: the reduced JSON summary, the full JSON
# report and the HTML report.
for dataset_name in single_table_dataset_names:
    print("-*-"*10)
    print(" "*4, dataset_name)
    print("-*-"*10)

    output_path = f"dataset_profiles/{dataset_name}"
    # exist_ok=True replaces the check-then-create pair: it is a no-op when
    # the directory is already there and cannot race with another creator.
    os.makedirs(output_path, exist_ok=True)

    df = pd.read_csv(f"data/tabular/{dataset_name}.csv")
    profile = ProfileReport(df, minimal=False)

    # Serialise the report to a JSON string, then parse it back into a plain
    # dict so it can be trimmed by get_minimal_profile.
    json_data = profile.to_json()
    json_dict = json.loads(json_data)
    var_profiles = get_minimal_profile(json_dict)

    # Reduced summary.
    with open(f"{output_path}/{dataset_name}.json", "w") as json_file:
        json.dump(var_profiles, json_file)

    # Full report, once per output file extension.
    profile.to_file(f"{output_path}/{dataset_name}_detailed.json")
    profile.to_file(f"{output_path}/{dataset_name}.html")

In [3]:
# Catalogue of SDV sequential demo datasets, plus its name column.
# Note: the names are kept as a pandas Series here (not converted to a list).
seq_datasets_df = get_available_demos(modality="sequential")
seq_table_dataset_names = seq_datasets_df["dataset_name"]

In [4]:
# Display the sequential dataset catalogue as this cell's output.
seq_datasets_df

Unnamed: 0,dataset_name,size_MB,num_tables
0,ArticularyWordRecognition,8.61,1
1,AtrialFibrillation,0.92,1
2,BasicMotions,0.64,1
3,CharacterTrajectories,19.19,1
4,Cricket,17.24,1
5,DuckDuckGeese,291.38,1
6,ERing,1.25,1
7,EchoNASDAQ,4.16,1
8,EigenWorms,372.63,1
9,Epilepsy,3.17,1


In [None]:
# Profile every sequential dataset (minimal report) and write three artefacts
# per dataset under dataset_profiles/seq/<name>/. Datasets that were already
# fully processed are skipped; failures are reported and the loop moves on.
for dataset_name in seq_table_dataset_names:
    print("-*-"*10)
    print(" "*4, dataset_name)
    print("-*-"*10)

    try:
        output_path = f"dataset_profiles/seq/{dataset_name}"
        # The HTML report is the last file written below, so its presence
        # means the dataset was fully processed. Testing the directory
        # instead (as before) permanently skipped any dataset whose earlier
        # run failed after the directory had been created.
        if os.path.exists(f"{output_path}/{dataset_name}.html"):
            continue
        os.makedirs(output_path, exist_ok=True)

        df = pd.read_csv(f"data/sequential/{dataset_name}.csv")
        profile = ProfileReport(df, minimal=True)

        # Serialise the report to a JSON string, then parse it back into a
        # plain dict so it can be trimmed by get_minimal_profile.
        json_data = profile.to_json()
        json_dict = json.loads(json_data)
        var_profiles = get_minimal_profile(json_dict)

        # Reduced summary.
        with open(f"{output_path}/{dataset_name}.json", "w") as json_file:
            json.dump(var_profiles, json_file)

        # Full report, once per output file extension.
        profile.to_file(f"{output_path}/{dataset_name}_detailed.json")
        profile.to_file(f"{output_path}/{dataset_name}.html")
    except Exception as e:
        # Deliberate best-effort batch: report the problem and continue with
        # the next dataset rather than aborting the whole run.
        print(e)