# Economicos

In [28]:
# Version tag of this experiment; it is combined with a run number below.
DATASET_VERSION="a"
# One identifier per repeated run: "a-1", "a-2", "a-3". Each identifier becomes
# the suffix of a separate synthetic output folder.
DATASET_VERSIONS=[f"{DATASET_VERSION}-{i+1}" for i in range(3)]
# Dataset name reused in the output path, the table captions and the LaTeX labels.
DATASET_NAME = "economicos"
import pandas as pd
import numpy as np

from syntheticml.data.synthetic import Synthetic, MODELS


In [29]:
def get_syn_a(data_version):
    """Run a ``Synthetic`` experiment on the rows that have no missing values.

    The deduplicated economicos parquet file is loaded, every row containing a
    missing value is dropped, the free-text columns are cast to ``str`` and
    ``publication_date`` is replaced by the number of days elapsed since
    2017-12-01. The frame is then passed to ``Synthetic`` and both
    ``process()`` and ``process_scores()`` are called on the result.

    Args:
        data_version: Suffix of the output folder
            (``../datasets/economicos/synth-{data_version}``).

    Returns:
        The ``Synthetic`` instance after both processing calls.
    """
    string_columns = ("description", "price", "title", "address", "owner")
    # "m_built" and "m_size" used to be listed here as well; they are no longer
    # declared as categorical.
    categorical_columns = (
        "property_type", "transaction_type", "state", "county",
        "rooms", "bathrooms", "source",
    )

    raw = pd.read_parquet('../datasets/economicos/raw/full_dedup_economicos_step0.parquet')
    df_converted = raw.dropna().astype(dict.fromkeys(string_columns, 'str'))
    print(df_converted.shape)

    # Swap the timestamp for an integer day offset. Popping and re-assigning
    # also moves the column to the last position of the frame.
    reference_day = pd.Timestamp('2017-12-01')
    published = df_converted.pop("publication_date")
    df_converted["publication_date"] = published.apply(
        lambda stamp: (stamp - reference_day).days
    )

    # NOTE(review): get_syn_b uses steps=300000 with otherwise identical
    # settings -- confirm that the tenfold value here is intentional.
    tddpm_settings = {
        "batch_size": 3750,
        "steps": 3000000,
        "num_timesteps": 100,
        "lr": 5e-4,
        "model_params": {
            "rtdl_params": {
                "dropout": 0.0,
                "d_layers": [1024, 512, 256],
            },
        },
    }

    syn = Synthetic(
        df_converted,
        id="url",
        category_columns=categorical_columns,
        text_columns=string_columns + ("source", "url"),
        exclude_columns=(),
        synthetic_folder=f"../datasets/economicos/synth-{data_version}",
        models=["copulagan", "tvae", "gaussiancopula", "ctgan", "smote-enc", 'tddpm_mlp'],
        n_sample=len(df_converted),
        target_column="_price",
        max_cpu_pool=1,
        model_parameters={"tddpm_mlp": tddpm_settings},
    )

    syn.process()
    syn.process_scores()
    return syn

In [30]:
def get_syn_b(data_version):
    """Run a ``Synthetic`` experiment keeping every row, with missing values filled.

    The deduplicated economicos parquet file is loaded and missing values are
    replaced by sentinels (``"None"`` for the listed string-like columns,
    ``-1`` for the listed numeric ones and for anything still missing
    afterwards). The free-text columns are cast to ``str`` and
    ``publication_date`` is replaced by the number of days elapsed since
    2017-12-01. The frame is then passed to ``Synthetic`` and both
    ``process()`` and ``process_scores()`` are called on the result.

    Args:
        data_version: Suffix of the output folder
            (``../datasets/economicos/synth-{data_version}``).

    Returns:
        The ``Synthetic`` instance after both processing calls.
    """
    string_columns = ("description", "price", "title", "address", "owner")
    categorical_columns = (
        "property_type", "transaction_type", "state", "county",
        "rooms", "bathrooms", "source",
    )
    missing_markers = {
        "property_type": "None",
        "transaction_type": "None",
        "state": "None",
        "county": "None",
        "rooms": -1,
        "bathrooms": -1,
        "m_built": -1,
        "m_size": -1,
        "source": "None",
    }

    raw = pd.read_parquet('../datasets/economicos/raw/full_dedup_economicos_step0.parquet')
    filled = raw.fillna(missing_markers).fillna(-1)  # second pass catches every other column
    df_converted = filled.astype(dict.fromkeys(string_columns, 'str'))
    print(df_converted.shape)

    # Swap the timestamp for an integer day offset. Popping and re-assigning
    # also moves the column to the last position of the frame.
    reference_day = pd.Timestamp('2017-12-01')
    published = df_converted.pop("publication_date")
    df_converted["publication_date"] = published.apply(
        lambda stamp: (stamp - reference_day).days
    )

    # A previous configuration that was tried and disabled: batch_size=5000,
    # steps=10000000, num_timesteps=10, lr=2e-6 (same rtdl_params).
    tddpm_settings = {
        "batch_size": 3750,
        "steps": 300000,
        "num_timesteps": 100,
        "lr": 5e-4,
        "model_params": {
            "rtdl_params": {
                "dropout": 0.0,
                "d_layers": [1024, 512, 256],
            },
        },
    }

    syn = Synthetic(
        df_converted,
        id="url",
        category_columns=categorical_columns,
        text_columns=string_columns + ("source", "url"),
        exclude_columns=(),
        synthetic_folder=f"../datasets/economicos/synth-{data_version}",
        models=["copulagan", "tvae", "gaussiancopula", "ctgan", "smote-enc", 'tddpm_mlp'],
        n_sample=len(df_converted),
        target_column="_price",
        max_cpu_pool=1,
        model_parameters={"tddpm_mlp": tddpm_settings},
    )

    syn.process()
    syn.process_scores()
    return syn

In [31]:
# One Synthetic run per dataset version, using the fill-missing-values
# preprocessing (get_syn_b). Every call runs syn.process() and
# syn.process_scores(), so this cell is the expensive one in the notebook.
# All table-building cells below read from this list.
syns = [get_syn_b(data_version) for data_version in DATASET_VERSIONS]


(682338, 17)
{'columns': {'url': {'sdtype': 'text'}, 'description': {'sdtype': 'text'}, 'price': {'sdtype': 'text'}, 'property_type': {'sdtype': 'categorical'}, 'transaction_type': {'sdtype': 'categorical'}, 'state': {'sdtype': 'categorical'}, 'county': {'sdtype': 'categorical'}, 'rooms': {'sdtype': 'categorical'}, 'bathrooms': {'sdtype': 'categorical'}, 'm_built': {'sdtype': 'numerical'}, 'm_size': {'sdtype': 'numerical'}, 'source': {'sdtype': 'text'}, 'title': {'sdtype': 'text'}, 'address': {'sdtype': 'text'}, 'owner': {'sdtype': 'text'}, '_price': {'sdtype': 'numerical'}, 'publication_date': {'sdtype': 'numerical'}}, 'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1'}
Fitting Models
Params
{'metadata': {
    "columns": {
        "url": {
            "sdtype": "text"
        },
        "description": {
            "sdtype": "text"
        },
        "price": {
            "sdtype": "text"
        },
        "property_type": {
            "sdtype": "categorical"
        },
        "transactio



Starting to generate data with model in:../datasets/economicos/synth-a-1/checkpoint/copulagan.ckp
Starting to generate data with model in:../datasets/economicos/synth-a-1/checkpoint/tvae.ckp
Starting to generate data with model in:../datasets/economicos/synth-a-1/checkpoint/gaussiancopula.ckp
Starting to generate data with model in:../datasets/economicos/synth-a-1/checkpoint/ctgan.ckp
Starting to generate data with model in:../datasets/economicos/synth-a-1/checkpoint/smote-enc.ckp
Starting to generate data with model in:../datasets/economicos/synth-a-1/checkpoint/tddpm_mlp.ckp
(682338, 17)
{'columns': {'url': {'sdtype': 'text'}, 'description': {'sdtype': 'text'}, 'price': {'sdtype': 'text'}, 'property_type': {'sdtype': 'categorical'}, 'transaction_type': {'sdtype': 'categorical'}, 'state': {'sdtype': 'categorical'}, 'county': {'sdtype': 'categorical'}, 'rooms': {'sdtype': 'categorical'}, 'bathrooms': {'sdtype': 'categorical'}, 'm_built': {'sdtype': 'numerical'}, 'm_size': {'sdtype': 'n



Starting to generate data with model in:../datasets/economicos/synth-a-2/checkpoint/copulagan.ckp
Starting to generate data with model in:../datasets/economicos/synth-a-2/checkpoint/tvae.ckp
Starting to generate data with model in:../datasets/economicos/synth-a-2/checkpoint/gaussiancopula.ckp
Starting to generate data with model in:../datasets/economicos/synth-a-2/checkpoint/ctgan.ckp
Starting to generate data with model in:../datasets/economicos/synth-a-2/checkpoint/smote-enc.ckp
Starting to generate data with model in:../datasets/economicos/synth-a-2/checkpoint/tddpm_mlp.ckp
(682338, 17)
{'columns': {'url': {'sdtype': 'text'}, 'description': {'sdtype': 'text'}, 'price': {'sdtype': 'text'}, 'property_type': {'sdtype': 'categorical'}, 'transaction_type': {'sdtype': 'categorical'}, 'state': {'sdtype': 'categorical'}, 'county': {'sdtype': 'categorical'}, 'rooms': {'sdtype': 'categorical'}, 'bathrooms': {'sdtype': 'categorical'}, 'm_built': {'sdtype': 'numerical'}, 'm_size': {'sdtype': 'n



Starting to generate data with model in:../datasets/economicos/synth-a-3/checkpoint/copulagan.ckp
Starting to generate data with model in:../datasets/economicos/synth-a-3/checkpoint/tvae.ckp
Starting to generate data with model in:../datasets/economicos/synth-a-3/checkpoint/gaussiancopula.ckp
Starting to generate data with model in:../datasets/economicos/synth-a-3/checkpoint/ctgan.ckp
Starting to generate data with model in:../datasets/economicos/synth-a-3/checkpoint/smote-enc.ckp
Starting to generate data with model in:../datasets/economicos/synth-a-3/checkpoint/tddpm_mlp.ckp


In [32]:
# Root folder for the generated thesis artefacts of this dataset; the table
# writers below save their .tex files under f"{base_path}/tables/".
# The dataset name is normalised (spaces removed, lower-cased) before use.
base_path = f"../docs/tesis/datasets/{DATASET_NAME.replace(' ','').lower()}-b"

In [33]:
def highlight_max_custom(series):
    """Styler callback that marks the cell(s) holding the largest mean in bold.

    Every entry of ``series`` is expected to look like ``"<mean>±<error>"``.
    Only the mean takes part in the comparison; the error is parsed but not
    used. Entries that do not match the pattern are never highlighted.

    Args:
        series: A pandas Series of strings (one table row or column).

    Returns:
        A list with one style string per entry: ``'bfseries:;'`` where the
        mean equals the maximum mean, ``''`` elsewhere. Ties are all marked.
    """
    parsed = series.str.extract(r'([0-9.e+-]+)±([0-9.e+-]+)').astype(float)
    means = parsed[0]
    largest = means.max()
    return ['bfseries:;' if is_largest else '' for is_largest in means.eq(largest)]

def highlight_min_custom(series):
    """Styler callback that shades the cell(s) holding the smallest mean.

    Every entry of ``series`` is expected to look like ``"<mean>±<error>"``.
    Only the mean takes part in the comparison; the error is parsed but not
    used. Entries that do not match the pattern are never highlighted.

    Args:
        series: A pandas Series of strings (one table row or column).

    Returns:
        A list with one style string per entry:
        ``'cellcolor:[rgb]{0.9, 0.54, 0.52};'`` where the mean equals the
        minimum mean, ``''`` elsewhere. Ties are all marked.
    """
    parsed = series.str.extract(r'([0-9.e+-]+)±([0-9.e+-]+)').astype(float)
    means = parsed[0]
    smallest = means.min()
    return [
        'cellcolor:[rgb]{0.9, 0.54, 0.52};' if is_smallest else ''
        for is_smallest in means.eq(smallest)
    ]


In [34]:
def scores_syns(syns):
    """Build, save and return the LaTeX summary table of scores per model.

    The ``scores`` frames of all runs are stacked, each (type, name) pair is
    summarised as ``"mean±std"`` over the runs, and the result is pivoted so
    that every score type becomes a column. The best value of each column is
    set in bold. The table is written to
    ``{base_path}/tables/table-score-<dataset>-b.tex``.

    Args:
        syns: Sequence of objects exposing a ``scores`` DataFrame with at
            least the ``type`` and ``score`` fields and a ``name`` key.

    Returns:
        The LaTeX source of the table as a string.
    """
    def _mean_std(values):
        # Same "mean±std" cell format that the highlight helpers parse.
        return f"{np.mean(values):.2e}±{np.std(values):.2e}"

    per_run = pd.concat([
        syn.scores.sort_values("score", ascending=False).assign(syni=i)
        for i, syn in enumerate(syns)
    ])

    # NOTE(review): sort_values("avg") runs on the formatted "mean±std"
    # strings, so the ordering is lexicographic, not numeric. It coincides
    # with numeric order only while all averages share the same exponent.
    score_table = (
        per_run
        .groupby(["type", "name"])
        .agg({"score": _mean_std})
        .reset_index()
        .pivot(index="name", columns=["type"], values="score")
        .sort_values("avg", ascending=False)
        .rename(columns={'avg': 'Score'})
        .loc[:, ["Column Pair Trends", "Column Shapes", "Coverage",
                 "Boundaries", "Synthesis", "Score"]]
        .reset_index()
        .rename(columns={"name": "Model Name", "Score": "\\textbf{Score}"})
    )

    styler = (
        score_table.style
        .hide(axis="index")
        .format(precision=3)
        # Prefix the first column with \hline so every row is ruled.
        .format("\\hline {}", score_table.columns[0], escape="latex")
        .set_table_styles([
            {'selector': 'toprule', 'props': ':hline\n \\rowcolor[gray]{0.8};'},
            {'selector': 'bottomrule', 'props': ':hline;'}
        ], overwrite=False)
        .apply(highlight_max_custom, subset=score_table.columns[1:])
    )

    formated_table = styler.to_latex(
        column_format=f"|l|{'r|'*len(score_table.columns[1:])}",
        position="H",
        position_float="centering",
        caption=f"Evaluación de Métricas de Rendimiento para Diversos Modelos de Aprendizaje Automático, {DATASET_NAME.capitalize()}",
        label=f"table-score-{DATASET_NAME.lower()}-b",
        clines=None
    ).replace("\\centering", "\\centering\n\\fontsize{7}{14}\\selectfont")

    # The table contains non-ASCII characters ("±", accented letters), so the
    # encoding is fixed instead of depending on the platform default.
    target = f"{base_path}/tables/table-score-{DATASET_NAME.lower()}-b.tex"
    with open(target, "w", encoding="utf-8") as stext:
        stext.write(formated_table)
    return formated_table
# Regenerates the score table file and echoes the LaTeX source. print() is
# used so the backslashes are shown verbatim rather than as an escaped repr.
print(scores_syns(syns))


\begin{table}[H]
\centering
\fontsize{7}{14}\selectfont
\caption{Evaluación de Métricas de Rendimiento para Diversos Modelos de Aprendizaje Automático, Economicos}
\label{table-score-economicos-b}
\begin{tabular}{|l|r|r|r|r|r|r|}
\hline
 \rowcolor[gray]{0.8}
Model Name & Column Pair Trends & Column Shapes & Coverage & Boundaries & Synthesis & \textbf{Score} \\
\hline tddpm\_mlp & \bfseries 9.72e-01±1.50e-03 & \bfseries 9.83e-01±1.09e-03 & \bfseries 8.12e-01±1.89e-02 & \bfseries 1.00e+00±0.00e+00 & 9.90e-01±8.52e-04 & \bfseries 9.77e-01±6.88e-04 \\
\hline smote-enc & 9.59e-01±1.20e-03 & 9.76e-01±4.34e-04 & 6.27e-01±1.31e-02 & \bfseries 1.00e+00±0.00e+00 & 9.24e-01±1.97e-03 & 9.67e-01±8.19e-04 \\
\hline copulagan & 7.60e-01±1.58e-02 & 8.02e-01±2.69e-02 & 6.80e-01±6.95e-03 & \bfseries 1.00e+00±0.00e+00 & \bfseries 1.00e+00±0.00e+00 & 7.81e-01±2.03e-02 \\
\hline ctgan & 7.43e-01±1.27e-02 & 6.49e-01±1.69e-02 & 6.76e-01±7.85e-04 & \bfseries 1.00e+00±0.00e+00 & \bfseries 1.00e+00±0.00e+00 & 6

In [35]:
def coverage(syns):
    """Build, save and return the LaTeX coverage table for smote-enc and tddpm_mlp.

    For every run and for each of the two models, the ``diagnostic`` /
    ``coverage`` frame returned by ``get_details()`` is collected. The
    ``Diagnostic Score`` of every (column, metric) pair is summarised as
    ``"mean±std"`` over the runs, with one table column per model. Within each
    row the best model is set in bold; within each model column the lowest
    value is shaded. The table is written to
    ``{base_path}/tables/table-coverage-<dataset>-b.tex``.

    Args:
        syns: Sequence of objects exposing ``get_details()``.

    Returns:
        The LaTeX source of the table as a string.
    """
    compared_models = ["smote-enc", "tddpm_mlp"]

    frames = []
    for i, syn in enumerate(syns):
        details = syn.get_details()  # fetched once per run, shared by both models
        for model in compared_models:
            frames.append(
                details[model]['diagnostic']['coverage'].assign(syni=i, model=model)
            )

    coverage_score = (
        pd.concat(frames)
        .groupby(["model", "Column", "Metric"])
        .agg({"Diagnostic Score": lambda x: f"{np.mean(x):.2e}±{np.std(x):.2e}"})
        .reset_index()
        .pivot(index=["Column", "Metric"], values="Diagnostic Score", columns="model")
        .sort_values("smote-enc", ascending=False)
        .reset_index()
        .rename(columns={"Column": "Columna", "Metric": "Metrica"})
    )

    styler = (
        coverage_score.sort_values("Columna").style
        .hide(axis="index")
        # Prefix the first column with \hline so every row is ruled.
        .format("\\hline {}", coverage_score.columns[0:1], escape="latex")
        .format_index("{}", escape="latex", axis=1)
        .set_table_styles([
            {'selector': 'toprule', 'props': ':hline\n\\rowcolor[gray]{0.8};'},
            {'selector': 'bottomrule', 'props': ':hline;'}
        ], overwrite=False)
        # Bold: best model per row. Shading: lowest value per model column.
        .apply(highlight_max_custom, subset=coverage_score.columns[2:], axis=1)
        .apply(highlight_min_custom, subset=coverage_score.columns[2:])
    )

    formated_coverage = styler.to_latex(
        column_format=f"|l|l|{'r|'*len(coverage_score.columns[2:])}",
        position="H",
        position_float="centering",
        caption=f"Cobertura Categoría/Rango para Modelos Smote y Tddpm, {DATASET_NAME.capitalize()}",
        label=f"table-coverage-{DATASET_NAME.lower()}-b",
        clines=None
    )

    # The table contains non-ASCII characters ("±", accented letters), so the
    # encoding is fixed instead of depending on the platform default.
    target = f"{base_path}/tables/table-coverage-{DATASET_NAME.lower()}-b.tex"
    with open(target, "w", encoding="utf-8") as stext:
        stext.write(formated_coverage)
    return formated_coverage

# Regenerates the coverage table file and echoes the LaTeX source verbatim.
print(coverage(syns))

\begin{table}[H]
\centering
\caption{Cobertura Categoría/Rango para Modelos Smote y Tddpm, Economicos}
\label{table-coverage-economicos-b}
\begin{tabular}{|l|l|r|r|}
\hline
\rowcolor[gray]{0.8}
Columna & Metrica & smote-enc & tddpm\_mlp \\
\hline \_price & RangeCoverage & 8.10e-01±1.34e-01 & \bfseries 9.11e-01±1.37e-02 \\
\hline bathrooms & CategoryCoverage & \bfseries 8.63e-01±5.00e-02 & 6.67e-01±1.39e-02 \\
\hline county & CategoryCoverage & 5.90e-01±3.05e-03 & \bfseries 7.99e-01±2.20e-02 \\
\hline m\_built & RangeCoverage & 3.18e-01±1.01e-01 & \bfseries 7.54e-01±1.77e-01 \\
\hline m\_size & RangeCoverage & \cellcolor[rgb]{0.9, 0.54, 0.52} 3.45e-02±1.98e-03 & \bfseries \cellcolor[rgb]{0.9, 0.54, 0.52} 4.00e-01±1.51e-01 \\
\hline property\_type & CategoryCoverage & 6.30e-01±5.24e-02 & \bfseries 9.07e-01±5.24e-02 \\
\hline publication\_date & RangeCoverage & 9.77e-01±6.18e-03 & \bfseries 9.88e-01±4.44e-03 \\
\hline rooms & CategoryCoverage & 7.56e-01±3.98e-02 & \bfseries 7.97e-01±3.04e

In [36]:
def union_mean_std(x):
    """Summarise a group of values as the string ``"<mean>±<std>"``.

    Both numbers come from NumPy (``np.mean`` / ``np.std`` with their default
    arguments) and are rendered in scientific notation with two decimals.

    Args:
        x: Array-like of numeric values.

    Returns:
        The formatted summary, e.g. ``"2.00e+00±1.00e+00"``.
    """
    centre, spread = np.mean(x), np.std(x)
    return "{:.2e}±{:.2e}".format(centre, spread)

# Maps the suffix used in the score column names (e.g. "DCR ST 5th") to the
# Spanish wording inserted into the table captions. The keys must match the
# column suffixes exactly, so they are kept as-is (including "1th").
PCT = {
    "5th": "percentil 5",
    "1th": "percentil 1",
    "min": "mínimo"
}

def shape(syns):
    """Build, save and return the LaTeX column-shape table for smote-enc and tddpm_mlp.

    For every run and for each of the two models, the ``report`` /
    ``column_shape`` frame returned by ``get_details()`` is collected. The
    ``Quality Score`` of every (column, metric) pair is summarised as
    ``"mean±std"`` over the runs, with one table column per model. Within each
    row the best model is set in bold; within each model column the lowest
    value is shaded. The table is written to
    ``{base_path}/tables/table-shape-<dataset>-b.tex``.

    Args:
        syns: Sequence of objects exposing ``get_details()``.

    Returns:
        The LaTeX source of the table as a string.
    """
    compared_models = ["smote-enc", "tddpm_mlp"]

    frames = []
    for i, syn in enumerate(syns):
        details = syn.get_details()  # fetched once per run, shared by both models
        for model in compared_models:
            frames.append(
                details[model]['report']['column_shape'].assign(syni=i, model=model)
            )

    shape_score = (
        pd.concat(frames)
        .groupby(["model", "Column", "Metric"])
        .agg({"Quality Score": union_mean_std})
        .reset_index()
        .pivot(index=["Column", "Metric"], values="Quality Score", columns="model")
        .sort_values("smote-enc", ascending=False)
        .reset_index()
        .rename(columns={"Column": "Columna", "Metric": "Metrica"})
    )

    styler = (
        shape_score.sort_values("Columna").style
        .hide(axis="index")
        # Prefix the first column with \hline so every row is ruled.
        .format("\\hline {}", shape_score.columns[0:1], escape="latex")
        .format_index("{}", escape="latex", axis=1)
        .set_table_styles([
            {'selector': 'toprule', 'props': ':hline\n\\rowcolor[gray]{0.8};'},
            {'selector': 'bottomrule', 'props': ':hline;'}
        ], overwrite=False)
        # Bold: best model per row. Shading: lowest value per model column.
        .apply(highlight_max_custom, subset=shape_score.columns[2:], axis=1)
        .apply(highlight_min_custom, subset=shape_score.columns[2:])
    )

    formated_shape = styler.to_latex(
        column_format=f"|l|l|{'r|'*len(shape_score.columns[2:])}",
        position="H",
        position_float="centering",
        caption=f"Evaluación de Similitud de Distribución para Modelos Smote y Tddpm, {DATASET_NAME.capitalize()}",
        label=f"table-shape-{DATASET_NAME.lower()}-b",
        clines=None
    )

    # The table contains non-ASCII characters ("±", accented letters), so the
    # encoding is fixed instead of depending on the platform default.
    target = f"{base_path}/tables/table-shape-{DATASET_NAME.lower()}-b.tex"
    with open(target, "w", encoding="utf-8") as stext:
        stext.write(formated_shape)
    return formated_shape

# Regenerates the column-shape table file and echoes the LaTeX source verbatim.
print(shape(syns))

\begin{table}[H]
\centering
\caption{Evaluación de Similitud de Distribución para Modelos Smote y Tddpm, Economicos}
\label{table-shape-economicos-b}
\begin{tabular}{|l|l|r|r|}
\hline
\rowcolor[gray]{0.8}
Columna & Metrica & smote-enc & tddpm\_mlp \\
\hline \_price & KSComplement & \bfseries 9.91e-01±3.85e-04 & 9.84e-01±3.53e-03 \\
\hline bathrooms & TVComplement & \bfseries 9.94e-01±6.66e-04 & 9.87e-01±2.15e-03 \\
\hline county & TVComplement & \cellcolor[rgb]{0.9, 0.54, 0.52} 9.22e-01±9.28e-04 & \bfseries \cellcolor[rgb]{0.9, 0.54, 0.52} 9.66e-01±2.10e-03 \\
\hline m\_built & KSComplement & \bfseries 9.87e-01±2.14e-03 & \bfseries 9.87e-01±1.11e-03 \\
\hline m\_size & KSComplement & 9.72e-01±7.43e-04 & \bfseries 9.84e-01±3.22e-03 \\
\hline property\_type & TVComplement & 9.67e-01±1.33e-03 & \bfseries 9.82e-01±9.49e-04 \\
\hline publication\_date & KSComplement & 9.80e-01±1.61e-03 & \bfseries 9.85e-01±1.61e-03 \\
\hline rooms & TVComplement & 9.77e-01±2.28e-03 & \bfseries 9.81e-01±3.18

In [37]:
def dcr_score(syns, pct="5th"):
    """Build, save and return the LaTeX table of DCR values per model.

    From every run, the rows of ``scores`` whose ``type`` is ``"avg"`` are
    taken together with the three ``DCR ST/SH/TH <pct>`` columns and
    ``score``. Values are summarised per model as ``"mean±std"`` over the
    runs and the models are listed in a fixed order. In each column the
    highest value is set in bold and the lowest is shaded. The table is
    written to ``{base_path}/tables/table-dcr-<dataset>-b-<pct>.tex``.

    Args:
        syns: Sequence of objects exposing a ``scores`` DataFrame.
        pct: Suffix selecting the DCR columns; must be a key of ``PCT``
            (``"5th"``, ``"1th"`` or ``"min"``).

    Returns:
        The LaTeX source of the table as a string.
    """
    wanted = [f"DCR ST {pct}", f"DCR SH {pct}", f"DCR TH {pct}", "score"]
    model_order = ["tddpm_mlp", "smote-enc", "ctgan", "copulagan", "gaussiancopula", "tvae"]

    per_run = pd.concat([
        syn.scores[syn.scores["type"] == "avg"]
        .sort_values("score", ascending=False)
        .loc[:, wanted]
        for syn in syns
    ])

    # Named dcr_table so the local no longer shadows this function's own name.
    dcr_table = (
        per_run
        .sort_values("score", ascending=False)
        .reset_index()
        .groupby("name")
        .agg(union_mean_std)
        .loc[model_order, :]
        .reset_index()
        .rename(columns={
            'name': "Modelo",
            "score": "\\textbf{Score}",
            f"DCR ST {pct}": "DCR ST",
            f"DCR SH {pct}": "DCR SH",
            f"DCR TH {pct}": "DCR TH",
        })
    )

    styler = (
        dcr_table.style
        .hide(axis="index")
        .format(precision=3)
        # Prefix the first column with \hline so every row is ruled.
        .format("\\hline {}", dcr_table.columns[0], escape="latex")
        .set_table_styles([
            {'selector': 'toprule', 'props': ':hline\n\\rowcolor[gray]{0.8};'},
            {'selector': 'bottomrule', 'props': ':hline;'}
        ], overwrite=False)
        .apply(highlight_max_custom, subset=dcr_table.columns[1:], axis=0)
        .apply(highlight_min_custom, subset=dcr_table.columns[1:], axis=0)
    )

    formated_dcr = styler.to_latex(
        # One "l" for the model name plus one "r" per value column. The former
        # "|l|l|" prefix declared one column more than the table contains.
        column_format=f"|l|{'r|'*len(dcr_table.columns[1:])}",
        position="H",
        position_float="centering",
        caption=f"Distancia de registros más cercanos entre conjuntos Sinteticos, {PCT[pct]}, {DATASET_NAME.capitalize()}",
        label=f"table-dcr-{DATASET_NAME.lower()}-b-{pct}",
        clines=None
    ).replace("\\centering", "\\centering\n\\fontsize{10}{14}\\selectfont")

    # The table contains non-ASCII characters ("±", accented letters), so the
    # encoding is fixed instead of depending on the platform default.
    target = f"{base_path}/tables/table-dcr-{DATASET_NAME.lower()}-b-{pct}.tex"
    with open(target, "w", encoding="utf-8") as stext:
        stext.write(formated_dcr)
    return formated_dcr

# Emit the DCR table for every summary level (the keys of PCT). "min" has to
# be passed to dcr_score itself: handing it to print() instead regenerated the
# 5th-percentile table and never produced the "min" one.
print(dcr_score(syns))
print(dcr_score(syns, "1th"))
print(dcr_score(syns, "min"))

\begin{table}[H]
\centering
\fontsize{10}{14}\selectfont
\caption{Distancia de registros más cercanos entre conjuntos Sinteticos, percentil 5, Economicos}
\label{table-dcr-economicos-b-5th}
\begin{tabular}{|l|l|r|r|r|r|}
\hline
\rowcolor[gray]{0.8}
Modelo & DCR ST & DCR SH & DCR TH & \textbf{Score} \\
\hline tddpm\_mlp & 4.29e-09±2.16e-10 & \cellcolor[rgb]{0.9, 0.54, 0.52} 3.50e-08±1.92e-09 & \bfseries \cellcolor[rgb]{0.9, 0.54, 0.52} 1.28e-08±0.00e+00 & \bfseries 9.77e-01±6.88e-04 \\
\hline smote-enc & \cellcolor[rgb]{0.9, 0.54, 0.52} 2.90e-11±1.13e-12 & 4.41e-08±2.36e-09 & \bfseries \cellcolor[rgb]{0.9, 0.54, 0.52} 1.28e-08±0.00e+00 & 9.67e-01±8.19e-04 \\
\hline ctgan & \bfseries 7.59e-06±5.75e-06 & \bfseries 1.91e-05±2.01e-05 & \bfseries \cellcolor[rgb]{0.9, 0.54, 0.52} 1.28e-08±0.00e+00 & 6.96e-01±1.00e-02 \\
\hline copulagan & 1.27e-06±3.04e-07 & 2.73e-06±5.89e-07 & \bfseries \cellcolor[rgb]{0.9, 0.54, 0.52} 1.28e-08±0.00e+00 & 7.81e-01±2.03e-02 \\
\hline gaussiancopula & 5.11e-06

\begin{table}[H]
\centering
\fontsize{10}{14}\selectfont
\caption{Distancia de registros más cercanos entre conjuntos Sinteticos, percentil 1, Economicos}
\label{table-dcr-economicos-b-1th}
\begin{tabular}{|l|l|r|r|r|r|}
\hline
\rowcolor[gray]{0.8}
Modelo & DCR ST & DCR SH & DCR TH & \textbf{Score} \\
\hline tddpm\_mlp & 1.44e-10±6.01e-12 & \cellcolor[rgb]{0.9, 0.54, 0.52} 1.40e-09±1.05e-10 & \bfseries \cellcolor[rgb]{0.9, 0.54, 0.52} 0.00e+00±0.00e+00 & \bfseries 9.77e-01±6.88e-04 \\
\hline smote-enc & \cellcolor[rgb]{0.9, 0.54, 0.52} 0.00e+00±0.00e+00 & 1.41e-09±4.21e-10 & \bfseries \cellcolor[rgb]{0.9, 0.54, 0.52} 0.00e+00±0.00e+00 & 9.67e-01±8.19e-04 \\
\hline ctgan & \bfseries 2.20e-06±1.50e-06 & \bfseries 3.24e-06±1.58e-06 & \bfseries \cellcolor[rgb]{0.9, 0.54, 0.52} 0.00e+00±0.00e+00 & 6.96e-01±1.00e-02 \\
\hline copulagan & 2.04e-07±2.85e-08 & 4.37e-07±6.38e-08 & \bfseries \cellcolor[rgb]{0.9, 0.54, 0.52} 0.00e+00±0.00e+00 & 7.81e-01±2.03e-02 \\
\hline gaussiancopula & 8.04e-07

In [38]:
def nndr_score(syns, pct="5th"):
    """Build, save and return the LaTeX table of NNDR values per model.

    From every run, the rows of ``scores`` whose ``type`` is ``"avg"`` are
    taken together with the three ``NNDR ST/SH/TH <pct>`` columns and
    ``score``. Values are summarised per model as ``"mean±std"`` over the
    runs and the models are listed in a fixed order. The table is written to
    ``{base_path}/tables/table-nndr-<dataset>-b-<pct>.tex``.

    Args:
        syns: Sequence of objects exposing a ``scores`` DataFrame.
        pct: Suffix selecting the NNDR columns; must be a key of ``PCT``
            (``"5th"``, ``"1th"`` or ``"min"``).

    Returns:
        The LaTeX source of the table as a string.
    """
    wanted = [f"NNDR ST {pct}", f"NNDR SH {pct}", f"NNDR TH {pct}", "score"]
    model_order = ["tddpm_mlp", "smote-enc", "ctgan", "copulagan", "gaussiancopula", "tvae"]

    per_run = pd.concat([
        syn.scores[syn.scores["type"] == "avg"]
        .sort_values("score", ascending=False)
        .loc[:, wanted]
        for syn in syns
    ])

    nndr_table = (
        per_run
        .reset_index()
        .sort_values("score", ascending=False)
        .groupby("name")
        .agg(union_mean_std)
        .loc[model_order, :]
        .reset_index()
        .rename(columns={
            'name': "Modelo",
            "score": "\\textbf{Score}",
            f"NNDR ST {pct}": "NNDR ST",
            f"NNDR SH {pct}": "NNDR SH",
            f"NNDR TH {pct}": "NNDR TH",
        })
    )

    # NOTE(review): highlight_max_i_custom and highlight_min_i_custom are not
    # defined in any cell visible in this notebook, so on a fresh kernel this
    # function raises NameError. Restore their definitions (or import them)
    # before relying on "Restart & Run All".
    styler = (
        nndr_table.style
        .hide(axis="index")
        .format(precision=3)
        # Prefix the first column with \hline so every row is ruled.
        .format("\\hline {}", nndr_table.columns[0], escape="latex")
        .set_table_styles([
            {'selector': 'toprule', 'props': ':hline\n\\rowcolor[gray]{0.8};'},
            {'selector': 'bottomrule', 'props': ':hline;'}
        ], overwrite=False)
        .apply(highlight_max_i_custom, subset=nndr_table.columns[1:], axis=0)
        .apply(highlight_min_i_custom, subset=nndr_table.columns[1:], axis=0)
    )

    formated_nndr = styler.to_latex(
        # One "l" for the model name plus one "r" per value column. The former
        # "|l|l|" prefix declared one column more than the table contains.
        column_format=f"|l|{'r|'*len(nndr_table.columns[1:])}",
        position="H",
        position_float="centering",
        caption=f"Proporción entre el más cercano y el segundo más cercano, {PCT[pct]}, {DATASET_NAME.capitalize()}",
        label=f"table-nndr-{DATASET_NAME.lower()}-b-{pct}",
        clines=None
    ).replace("\\centering", "\\centering\n\\fontsize{10}{14}\\selectfont")

    # The table contains non-ASCII characters ("±", accented letters), so the
    # encoding is fixed instead of depending on the platform default.
    target = f"{base_path}/tables/table-nndr-{DATASET_NAME.lower()}-b-{pct}.tex"
    with open(target, "w", encoding="utf-8") as stext:
        stext.write(formated_nndr)
    return formated_nndr

# Emit the NNDR table for every summary level (the keys of PCT); each call
# also rewrites the corresponding .tex file.
print(nndr_score(syns))
print(nndr_score(syns, "1th"))
print(nndr_score(syns, "min"))

\begin{table}[H]
\centering
\fontsize{10}{14}\selectfont
\caption{Proporción entre el más cercano y el segundo más cercano, percentil 5, Economicos}
\label{table-nndr-economicos-b-5th}
\begin{tabular}{|l|l|r|r|r|r|}
\hline
\rowcolor[gray]{0.8}
Modelo & NNDR ST & NNDR SH & NNDR TH & \textbf{Score} \\
\hline tddpm\_mlp & 6.79e-02±7.37e-04 & \bfseries 1.00e-01±2.26e-03 & \cellcolor[rgb]{0.9, 0.54, 0.52} \bfseries 1.31e-02±0.00e+00 & \cellcolor[rgb]{0.9, 0.54, 0.52} 9.77e-01±6.88e-04 \\
\hline smote-enc & \bfseries 7.15e-04±7.49e-06 & 1.14e-01±4.79e-03 & 1.31e-02±0.00e+00 & 9.67e-01±8.19e-04 \\
\hline ctgan & 2.57e-01±8.81e-03 & 3.27e-01±4.72e-02 & 1.31e-02±0.00e+00 & 6.96e-01±1.00e-02 \\
\hline copulagan & 2.01e-01±1.27e-02 & 2.23e-01±5.47e-02 & 1.31e-02±0.00e+00 & 7.81e-01±2.03e-02 \\
\hline gaussiancopula & \cellcolor[rgb]{0.9, 0.54, 0.52} 3.07e-01±0.00e+00 & 2.76e-01±0.00e+00 & 1.31e-02±0.00e+00 & 6.91e-01±6.41e-17 \\
\hline tvae & 3.02e-01±6.15e-03 & \cellcolor[rgb]{0.9, 0.54, 0.52} 3