# King County

In [None]:
%%capture
# Environment bootstrap. %%capture silences everything this cell prints.
import sys
# Link the sibling `syntheticml` directory into the working directory so the
# `from syntheticml...` imports in later cells resolve.
!ln -s ../syntheticml .
# Install through the interpreter that runs this kernel (sys.executable)
# rather than whichever `pip` happens to be first on PATH.
# NOTE(review): none of these installs pin a version.
!{sys.executable} -m pip install -r ../requirements.txt
# presumably needed for the static image export (fig.write_image) used further down — TODO confirm
!{sys.executable} -m pip install -U kaleido
!{sys.executable} -m pip install git+https://github.com/mostly-ai/virtualdatalab.git

In [None]:
import pandas as pd

# Raw King County house-sales table; this frame is handed to Synthetic below.
df = pd.read_csv('../datasets/kingcounty/raw/kc_house_data.csv')
# Fixed seed so the preview shows the same three rows on every
# Restart Kernel -> Run All instead of a different random draw each time.
df.sample(3, random_state=0)

In [None]:
%%capture
from syntheticml.data.synthetic import Synthetic, MODELS
syn = Synthetic(df,
                    id="id",
                    #category_columns=("condition", "floors", "grade", "view", "waterfront", "yr_built", "yr_renovated", "zipcode", "bathrooms", "bedrooms",),
                    category_columns=("condition", "floors", "grade", "view",
                                      "waterfront", "zipcode", "bathrooms", "bedrooms",),
                    synthetic_folder="../datasets/kingcounty/synth",
                    models=MODELS.keys(),
                    n_sample=21613,
                    max_cpu_pool=1,
                    target_column="price"
                    )

In [None]:
# Run the synthesizer on the configuration built above.
# NOTE(review): presumably trains/samples every configured model — confirm in syntheticml.
syn.process()

# Presumably fills `syn.scores`, which the Scores section reads — TODO confirm.
syn.process_scores()


: 

## Métricas
### Continuas

In [None]:
# Per-column metrics table. Show the non-categorical rows, leaving out
# every metric column that is empty for all of them.
metrics = syn.current_metrics()
metrics.loc[~metrics["is_categorical"]].dropna(axis="columns", how="all")

In [None]:
# Plain-text dump of the first non-categorical row, transposed so that each
# metric sits on its own line (append .style.to_latex() for a LaTeX version).
continuous_metrics = metrics.loc[~metrics["is_categorical"]].dropna(axis="columns", how="all")
print(continuous_metrics.head(1).T)

### Categóricas

In [None]:
# Markdown rendering of the first categorical row, one metric per line.
categorical_metrics = metrics.loc[metrics["is_categorical"]].dropna(axis="columns", how="all")
categorical_metrics.head(1).T.to_markdown()

In [None]:
# Total number of non-empty metric columns over both groups; each group's
# count excludes one column.
continuous_cols = metrics.loc[~metrics["is_categorical"]].dropna(axis="columns", how="all").columns
categorical_cols = metrics.loc[metrics["is_categorical"]].dropna(axis="columns", how="all").columns
(len(continuous_cols) - 1) + (len(categorical_cols) - 1)

## Scores

In [None]:
best_model = "tddpm_mlp_21613"
second_best_model = "smote-enc_21613"

# Rows of the score table whose type is "avg".
avg = syn.scores[syn.scores["type"] == "avg"]

# Models in the order they are reported, and the score columns to show.
ranked_models = [
    "tddpm_mlp_21613",
    "smote-enc_21613",
    "gaussiancopula_noise_21613",
    "tvae_21613",
    "tvae_noise_21613",
    "gaussiancopula_21613",
    "copulagan_noise_21613",
    "copulagan_21613",
    "ctgan_noise_21613",
    "ctgan_21613",
]
score_columns = ["score", "DCR ST 5th", "DCR SH 5th", "DCR TH 5th"]
avg.sort_values("score", ascending=False).loc[ranked_models, score_columns]

In [None]:
# LaTeX version of the averaged-score table, with underscores escaped so the
# model names compile in LaTeX.
ranked_models = [
    "tddpm_mlp_21613",
    "smote-enc_21613",
    "gaussiancopula_noise_21613",
    "tvae_21613",
    "tvae_noise_21613",
    "gaussiancopula_21613",
    "copulagan_noise_21613",
    "copulagan_21613",
    "ctgan_noise_21613",
    "ctgan_21613",
]
score_columns = ["score", "DCR ST 5th", "DCR SH 5th", "DCR TH 5th"]
avg_latex = (
    avg.sort_values("score", ascending=False)
    .loc[ranked_models, score_columns]
    .style.to_latex()
)
# Raw string keeps the backslash literal; a plain "\_" is an invalid escape
# sequence and raises a SyntaxWarning on recent Python versions.
print(avg_latex.replace("_", r"\_"))


In [None]:
# Distinct values of the "type" column in the score table.
syn.scores["type"].unique()

In [None]:
# Score breakdown as LaTeX: one row per model, one column per score type,
# with the "avg" column renamed to "Score" and used as the row ordering.
ranked_models = [
    "tddpm_mlp_21613",
    "smote-enc_21613",
    "gaussiancopula_noise_21613",
    "tvae_21613",
    "tvae_noise_21613",
    "gaussiancopula_21613",
    "copulagan_noise_21613",
    "copulagan_21613",
    "ctgan_noise_21613",
    "ctgan_21613",
]
report_columns = ["Synthesis", "Column Pair Trends", "Column Shapes", "Coverage", "Boundaries", "Score"]
score_breakdown = (
    syn.scores.sort_values("score", ascending=False)
    .loc[ranked_models, ["type", "score"]]
    .reset_index()
    .pivot(index="name", columns=["type"], values="score")
    .sort_values("avg", ascending=False)
    .rename(columns={'avg': 'Score'})
    .loc[:, report_columns]
)
# Raw string keeps the backslash literal; a plain "\_" is an invalid escape
# sequence and raises a SyntaxWarning on recent Python versions.
print(score_breakdown.style.to_latex().replace("_", r"\_"))
# Alternative per-type view including the DCR columns, kept for reference:
#syn.scores[syn.scores["type"] != "avg"].sort_values("score", ascending=False).loc[["tddpm_mlp_21613","smote-enc_21613"],["type", "score", "DCR ST 5th", "DCR SH 5th", "DCR TH 5th"]]

In [None]:
from syntheticml.data.charts import Charts
import os

# Export every chart of the best model as SVG and preview it inline as PNG.
folder_path = f"../docs/tesis/imagenes/kingcounty/{best_model}"
# makedirs(exist_ok=True) also creates missing parent folders and is safe to
# re-run; os.mkdir raised when a parent directory did not exist yet.
os.makedirs(folder_path, exist_ok=True)
# The set is get_charts' second argument (presumably the columns to leave out
# of the charts — TODO confirm).
for fig in syn.get_charts(best_model, {'date', 'id', 'zipcode', 'lat', 'long', 'yr_renovated'}):
    if fig:
        # File name from the chart title: colons removed, spaces -> "_", lower-cased.
        file_name = f'{fig.layout.title.text.replace(":","").replace(" ","_").lower()}.svg'
        fig.write_image(f"{folder_path}/{file_name}")
        # fig.show() renders the figure itself and returns None, so wrapping it
        # in display() only added a stray "None" to the output.
        fig.show("png")

In [None]:
import os

# Export the comparison charts for the two best models plus copulagan as SVG
# and preview each one inline as PNG.
folder_path = "../docs/tesis/imagenes/kingcounty/top2+1"
# makedirs(exist_ok=True) also creates missing parent folders and is safe to
# re-run; os.mkdir raised when a parent directory did not exist yet.
os.makedirs(folder_path, exist_ok=True)
# The set is get_multiple_charts' second argument (presumably the columns to
# leave out of the charts — TODO confirm).
for fig in syn.get_multiple_charts([best_model,second_best_model,"copulagan_21613"], {'date', 'id', 'zipcode', 'lat', 'long', 'yr_renovated'}):
    if fig:
        # File name from the chart title: colons removed, spaces -> "_", lower-cased.
        file_name = f'{fig.layout.title.text.replace(":","").replace(" ","_").lower()}.svg'
        fig.write_image(f"{folder_path}/{file_name}")
        # fig.show() renders the figure itself and returns None, so wrapping it
        # in display() only added a stray "None" to the output.
        fig.show("png")


In [None]:
from syntheticml.data.charts import Charts
import os

# Export every chart of the second-best model as SVG and preview it inline as PNG.
folder_path = f"../docs/tesis/imagenes/kingcounty/{second_best_model}"
# makedirs(exist_ok=True) also creates missing parent folders and is safe to
# re-run; os.mkdir raised when a parent directory did not exist yet.
os.makedirs(folder_path, exist_ok=True)
# The set is get_charts' second argument (presumably the columns to leave out
# of the charts — TODO confirm).
for fig in syn.get_charts(second_best_model, {'date', 'id', 'zipcode', 'lat', 'long', 'yr_renovated'}):
    if fig:
        # File name from the chart title: colons removed, spaces -> "_", lower-cased.
        file_name = f'{fig.layout.title.text.replace(":","").replace(" ","_").lower()}.svg'
        fig.write_image(f"{folder_path}/{file_name}")
        # fig.show() renders the figure itself and returns None, so wrapping it
        # in display() only added a stray "None" to the output.
        fig.show("png")

In [None]:
# pair_corr chart built from syn.df and the best model's entry in
# syn.fake_data, widened to 1000 px and rendered as a static PNG.
pair_corr_fig = syn.charts.pair_corr(
    syn.df,
    syn.fake_data[best_model],
    {'id', 'waterfront', 'yr_renovated'},
    "price",
)
pair_corr_fig.update_layout(dict(width=1000)).show("png")

In [None]:
# Metrics table that the comparison cells below label as "Real".
current_metrics = syn.current_metrics()
# Metrics for the synthetic data; the comparison cells index this by model name.
fake_metrics = syn.get_metrics_fake()

In [None]:
# Column names selected for the filtered metrics view.
columns = [
    "bathrooms",
    "sqft_lot",
    "sqft_above",
    "price",
    "sqft_lot15",
    "sqft_living",
    "sqft_basement",
    "yr_built",
    "sqft_living15",
    "grade",
]

In [None]:
# Categorical metric rows for the selected columns, without the metric
# columns that are empty for all of them.
selected_categorical = current_metrics["name"].isin(columns) & current_metrics["is_categorical"]
current_metrics.loc[selected_categorical, :].dropna(axis="columns", how="all")

In [None]:
import numpy as np

# Side-by-side view of the categorical summary (top-5 values and their
# probabilities) for the real data and the two selected models.
models = ["smote-enc_21613", "tddpm_mlp_21613"]
prop_cat = ["name", "top5", "top5_prob"]
columns = list(current_metrics.name.unique())


def _categorical_summary(frame, label):
    """Return the categorical rows of ``frame`` limited to ``prop_cat``.

    Args:
        frame: Metrics table with ``name`` and ``is_categorical`` columns.
        label: Value written to the added ``model`` column.

    Returns:
        The matching rows with all-empty columns dropped and a ``model``
        column set to ``label``.
    """
    keep = frame["name"].isin(columns) & frame["is_categorical"]
    return frame.loc[keep, prop_cat].dropna(axis=1, how='all').assign(model=label)


dfs = [_categorical_summary(current_metrics, "Real")]
for model_name in models:
    dfs.append(_categorical_summary(fake_metrics[model_name], model_name))

diffdf = (
    pd.concat(dfs)
    .sort_values(["name", "model"], ascending=[True, True])
    .loc[:, ["model", "name"] + prop_cat[1:]]
)

# TODO: earlier, disabled attempts replaced each model's top5_prob with the
# summed absolute difference from the "Real" row of the same column; that
# per-name loop did nothing and has been removed.
diffdf
# LaTeX export: print(diffdf.style.to_latex().replace("_", r"\_"))

In [None]:
import numpy as np

# Side-by-side view of the range/percentile summary of the non-categorical
# columns for the real data and the two selected models.
models = ["smote-enc_21613", "tddpm_mlp_21613"]
prop_cat = ["name", "range", "min", "1.0%", "mean", "99.0%", "max"]
columns = list(current_metrics["name"].unique())

# The real data goes first, tagged "Real".
real_rows = current_metrics["name"].isin(columns) & ~current_metrics["is_categorical"]
dfs = [
    current_metrics.loc[real_rows, prop_cat]
    .dropna(axis="columns", how="all")
    .assign(model="Real")
]

# Then one frame per synthetic model, tagged with the model name.
for model_name in models:
    model_metrics = fake_metrics[model_name]
    model_rows = model_metrics["name"].isin(columns) & ~model_metrics["is_categorical"]
    dfs.append(
        model_metrics.loc[model_rows, prop_cat]
        .dropna(axis="columns", how="all")
        .assign(model=model_name)
    )

ordered_columns = ["model", "name"] + prop_cat[1:]
diffdf = pd.concat(dfs).sort_values(["name", "model"], ascending=[True, True]).loc[:, ordered_columns]
diffdf

In [None]:
# Per-model details. The cells below index this by model name, then by
# 'report' or 'diagnostic', then by section name (e.g. 'column_pair_trends').
k = syn.get_details()

In [None]:
# Keys of the details mapping (the cells below look entries up by model name).
k.keys()

In [None]:
# 'column_pair_trends' section of the report for model 'smote-enc_21613'.
k['smote-enc_21613']['report']['column_pair_trends']
# Swap in the line below to inspect the 'column_shape' section instead.
#k['smote-enc_21613']['report']['column_shape']

In [None]:
# Mean pair-trend quality score per "Column 1", one column per model,
# ordered by the smote-enc score (highest first).
pair_trend_means = []
for model_name in models:
    pair_scores = k[model_name]['report']['column_pair_trends'].loc[:, ["Column 1", "Quality Score"]]
    pair_trend_means.append(
        pair_scores.groupby("Column 1").mean(numeric_only=True).assign(model=model_name)
    )
(
    pd.concat(pair_trend_means)
    .reset_index()
    .pivot(index=["Column 1"], values="Quality Score", columns="model")
    .sort_values("smote-enc_21613", ascending=False)
)

In [None]:
# Column-shape quality score per (column, metric) pair, one column per model,
# ordered by the smote-enc score (highest first).
# Chain .style.to_latex().replace("_", r"\_") onto the pivot for a LaTeX export.
column_shapes = pd.concat(
    [k[model_name]['report']['column_shape'].assign(model=model_name) for model_name in models]
)
display(
    column_shapes
    .pivot(index=["Column", "Metric"], values="Quality Score", columns="model")
    .sort_values("smote-enc_21613", ascending=False)
)

In [None]:
# Coverage diagnostic score per (column, metric) pair, one column per model,
# ordered by the smote-enc score (highest first).
# Chain .style.to_latex().replace("_", r"\_") onto the pivot for a LaTeX export.
coverage = pd.concat(
    [k[model_name]['diagnostic']['coverage'].assign(model=model_name) for model_name in models]
)
display(
    coverage
    .pivot(index=["Column", "Metric"], values="Diagnostic Score", columns="model")
    .sort_values("smote-enc_21613", ascending=False)
)

In [None]:
# Keys of the details mapping, materialised as a list.
list(k.keys())

In [None]:
# Pair-trend rows of the smote-enc report whose first column is "price",
# the target column configured for the synthesizer.
pair_trends = k['smote-enc_21613']['report']['column_pair_trends']
pair_trends.loc[pair_trends['Column 1'] == "price"]