# Economicos

In [1]:
%%capture
# Environment bootstrap. %%capture silences everything this cell prints.
import sys
# Symlink the sibling `syntheticml` directory into the working directory
# (presumably so the `from syntheticml...` import in the next cell resolves — confirm).
!ln -s ../syntheticml .
# Install/upgrade the requirements using the interpreter that runs this kernel
# (sys.executable), so packages land in the kernel's own environment.
!{sys.executable} -m pip install -U -r ../requirements.txt
# NOTE(review): kaleido is presumably needed for the static image export
# (write_image / show("png")) used later in the notebook — confirm.
!{sys.executable} -m pip install -U kaleido

In [2]:
import pandas as pd
import numpy as np
from syntheticml.data.synthetic import Synthetic, MODELS

# Load the raw Economicos parquet file and preview three random rows.
df = pd.read_parquet(
    "../datasets/economicos/raw/full_dedup_economicos_step0.parquet"
)
df.sample(3)

Unnamed: 0,url,description,price,property_type,transaction_type,state,county,publication_date,rooms,bathrooms,m_built,m_size,source,title,address,owner,_price
48351,https://www.economicos.cl/propiedades/vendo-ca...,"Casa sólida de 1 piso en Villa Alemana, solea...",$ 57.000.000,Casa,Venta,Valparaíso,Villa Alemana,2018-02-27 19:29:54,3.0,1.0,70.0,326.0,,,,,2117.471337
403314,https://www.economicos.cl/propiedades/departam...,"67.000.000 Barrio Universitario, dormitorio, s...",$ 67.000.000,Departamento,Venta,,Santiago,2018-04-22 00:00:00,,,,,El Mercurio,Departamento en Venta en Santiago,"Santiago,",,2482.377896
616227,https://www.economicos.cl/propiedades/local-o-...,"Fabuloso Local Comercial, Manuel Montt, incluy...",55 UF,Local o Casa comercial,Arriendo,Metropolitana de Santiago,Providencia,2019-11-30 00:08:00,,2.0,,,El Mercurio,Local o Casa comercial en Arriendo en Providen...,"Providencia, Metropolitana de Santiago",,55.0


In [3]:
%%capture
category_columns=("property_type", "transaction_type", "state", "county", "rooms", "bathrooms", "source", )
df_converted = df.dropna().astype({k: 'str' for k in ("description", "price", "title", "address", "owner",)})
print(df_converted.shape)
basedate = pd.Timestamp('2017-12-01')
dtime = df_converted.pop("publication_date")
df_converted["publication_date"] = dtime.apply(lambda x: (x - basedate).days)
syn = Synthetic(df_converted, 
        id="url", 
        category_columns=category_columns,
        text_columns=("description", "price", "title", "address", "owner", ),
        exclude_columns=tuple(),
        synthetic_folder = "../datasets/economicos/synth-a",
        models=['copulagan', 'tvae', 'gaussiancopula', 'ctgan', 'smote-enc', 'tddpm_mlp'],
        n_sample = df_converted.shape[0],
        target_column="_price",
        max_cpu_pool=1,
        use_noise=False
)


In [4]:
%%capture
# Run Synthetic.process() followed by process_scores(); both are defined in
# syntheticml (not shown here). Later cells read syn.scores and syn.fake_data.
# %%capture hides whatever these calls print.
syn.process()
syn.process_scores()


In [5]:
# Identifiers of the two models that the comparison cells below focus on.
best_model, second_best_model = "tddpm_mlp_27574", "smote-enc_27574"

## Métricas
### Continuas

In [6]:
# Metrics of the current (real) data, restricted to the rows flagged as
# non-categorical; statistics that are empty for all of those rows are dropped.
metrics = syn.current_metrics()
metrics.loc[~metrics["is_categorical"]].dropna(axis="columns", how="all")

m_built
publication_date
_price
m_size


Unnamed: 0,name,top5,top5_freq,top5_prob,is_categorical,nobs,missing,mean,std_err,upper_ci,...,mode_freq,median,0.1%,1.0%,5.0%,25.0%,75.0%,95.0%,99.0%,99.9%
4,m_built,"[140.0, 60.0, 120.0, 50.0, 70.0]","[700, 467, 444, 431, 415]","[0.031733079468697585, 0.021170497302688246, 0...",False,22059.0,0.0,1771.336159,664.364677,3073.466999,...,0.031733,107.0,2.0,23.0,33.0,60.0,200.0,490.0,1200.0,37946.564
6,publication_date,"[1545, 1693, 1546, 1549, 721]","[10883, 6103, 895, 320, 125]","[0.4933587197969083, 0.2766671199963734, 0.040...",False,22059.0,0.0,1470.755338,2.056273,1474.78556,...,0.493359,1545.0,450.696,531.0,628.9,1545.0,1693.0,1693.0,1693.0,1693.0
7,_price,"[12500.0, 10500.0, 11500.0, 8500.0, 9000.0]","[104, 99, 91, 86, 85]","[0.00471462894963507, 0.00448796409628723, 0.0...",False,22059.0,0.0,110378.900259,32745.67551,174559.244908,...,0.004715,5083.869574,0.262868,6.270122,11.759796,2040.766084,12000.0,32000.0,58942.0,262695.428
9,m_size,"[5000.0, 50.0, 60.0, 200.0, 70.0]","[601, 342, 321, 285, 281]","[0.027245115372410353, 0.015503875968992248, 0...",False,22059.0,0.0,146269.353488,105454.123661,352955.637885,...,0.027245,145.0,2.0,22.0,35.0,66.0,406.5,5000.0,10200.0,70000.0


In [7]:
# NOTE(review): in the recorded run this call printed "url" and then raised
# "ValueError: Selecting numericand categorical results in an empty DataFrame",
# so `fake_metrics` was never bound and a later cell failed with NameError.
fake_metrics = syn.get_metrics_fake()

url


ValueError: Selecting numericand categorical results in an empty DataFrame

: 

In [None]:
# Side-by-side view of the top-5 categories (and their probabilities) of every
# categorical column, for the real data and for the two selected models.
current_metrics = syn.current_metrics()
fake_metrics = syn.get_metrics_fake()

models = [best_model, second_best_model]
prop_cat = ["name", "top5", "top5_prob"]
columns = list(current_metrics.name.unique())


def _categorical_rows(frame, label, names, props):
    """Return the categorical rows of `frame` whose name is in `names`.

    Only the `props` columns are kept, columns that are entirely empty are
    dropped, and a `model` column holding `label` is appended.
    """
    keep = frame.name.isin(names) & frame.is_categorical
    return frame.loc[keep, props].dropna(axis=1, how='all').assign(model=label)


# The real data comes first, followed by one frame per selected model.
dfs = [_categorical_rows(current_metrics, "Real", columns, prop_cat)]
dfs.extend(
    _categorical_rows(fake_metrics[model_name], model_name, columns, prop_cat)
    for model_name in models
)

diffdf = (
    pd.concat(dfs)
    .sort_values(["name", "model"], ascending=[True, True])
    .loc[:, ["model", "name"] + prop_cat[1:]]
)
diffdf

ValueError: Selecting numericand categorical results in an empty DataFrame

In [None]:
# Side-by-side view of range/percentile statistics of every non-categorical
# column, for the real data and for the two selected models.
# The model identifiers come from best_model / second_best_model so this table
# looks up the same models as the rest of the notebook.
models = [best_model, second_best_model]
prop_cat = ["name", "range", "min", "1.0%", "mean", "99.0%", "max"]
columns = list(current_metrics.name.unique())


def _continuous_rows(frame, label, names, props):
    """Return the non-categorical rows of `frame` whose name is in `names`.

    Only the `props` columns are kept, columns that are entirely empty are
    dropped, and a `model` column holding `label` is appended.
    """
    keep = frame.name.isin(names) & ~frame.is_categorical
    return frame.loc[keep, props].dropna(axis=1, how='all').assign(model=label)


# The real data comes first, followed by one frame per selected model.
dfs = [_continuous_rows(current_metrics, "Real", columns, prop_cat)]
dfs.extend(
    _continuous_rows(fake_metrics[model_name], model_name, columns, prop_cat)
    for model_name in models
)

diffdf = (
    pd.concat(dfs)
    .sort_values(["name", "model"], ascending=[True, True])
    .loc[:, ["model", "name"] + prop_cat[1:]]
)
diffdf

NameError: name 'fake_metrics' is not defined

### Categóricas

In [None]:
# Rows of `metrics` flagged as categorical, without the statistics that are
# empty for all of them.
metrics.loc[metrics["is_categorical"]].dropna(axis="columns", how="all")

In [None]:
# Distinct index labels of the score table.
# NOTE(review): presumably these are the model identifiers (e.g. the values
# stored in best_model / second_best_model) — confirm against syntheticml.
syn.scores.index.unique()

## Scores

In [None]:
# Keep only the score rows whose "type" is "avg" and list them from the
# highest to the lowest score together with the three DCR columns.
avg = syn.scores.loc[syn.scores["type"] == "avg"]
avg.sort_values(by="score", ascending=False)[["score", "DCR ST 5th", "DCR SH 5th", "DCR TH 5th"]]

In [None]:
def formater(x):
    """Format `x` in scientific notation with two digits after the point."""
    return f"{x:.2e}"


# Column name -> formatter mapping applied to the three DCR columns.
sci_format = dict.fromkeys(["DCR ST 5th", "DCR SH 5th", "DCR TH 5th"], formater)

# Print the ranked average-score table as LaTeX, with the DCR columns rendered
# through `sci_format`. Underscores are escaped for LaTeX; the replacement is
# spelled "\\_" because "\_" is not a valid Python escape sequence and
# triggers a warning on recent Python versions.
print(
    avg.sort_values("score", ascending=False)
    .loc[:, ["score", "DCR ST 5th", "DCR SH 5th", "DCR TH 5th"]]
    .style.format(sci_format)
    .to_latex()
    .replace("_", "\\_")
)


In [None]:
import os

# Write one SVG per chart comparing the two selected models and show a PNG
# rendering of each chart inline.
folder_path = "../docs/tesis/imagenes/economicos/top2"
# makedirs(exist_ok=True) also creates missing parent folders and does not
# fail when the folder already exists.
os.makedirs(folder_path, exist_ok=True)
# NOTE(review): the second argument looks like a set of column names to leave
# out, but none of these names appear among this dataset's columns as shown in
# the data preview near the top of the notebook — confirm it is intended.
for fig in syn.get_multiple_charts([best_model,second_best_model], {'date', 'id', 'zipcode', 'lat', 'long', 'yr_renovated'}):
    if fig:
        # File name derived from the chart title: colons removed, spaces -> "_", lower-cased.
        file_name = f'{fig.layout.title.text.replace(":","").replace(" ","_").lower()}.svg'
        fig.write_image(f"{folder_path}/{file_name}")
        # show() renders the figure itself and returns None, so wrapping it in
        # display() would only add a stray "None" to the output.
        fig.show("png")


In [None]:
import os

# Write one SVG per chart comparing the two selected models plus
# "copulagan_27574", and show a PNG rendering of each chart inline.
folder_path = "../docs/tesis/imagenes/economicos/top2+1"
# makedirs(exist_ok=True) also creates missing parent folders and does not
# fail when the folder already exists.
os.makedirs(folder_path, exist_ok=True)
# NOTE(review): the second argument looks like a set of column names to leave
# out, but none of these names appear among this dataset's columns as shown in
# the data preview near the top of the notebook — confirm it is intended.
for fig in syn.get_multiple_charts([best_model,second_best_model,"copulagan_27574"], {'date', 'id', 'zipcode', 'lat', 'long', 'yr_renovated'}):
    if fig:
        # File name derived from the chart title: colons removed, spaces -> "_", lower-cased.
        file_name = f'{fig.layout.title.text.replace(":","").replace(" ","_").lower()}.svg'
        fig.write_image(f"{folder_path}/{file_name}")
        # show() renders the figure itself and returns None, so wrapping it in
        # display() would only add a stray "None" to the output.
        fig.show("png")


In [None]:
# Pair-correlation chart for `best_model`, rendered as a 1000-px-wide PNG.
# First argument: real rows limited to the synthetic frame's columns, NaNs dropped.
# Second argument: the synthetic rows with four numeric columns cast to float64.
syn.charts.pair_corr(
    df.loc[:, syn.fake_data[best_model].columns].dropna(),
    syn.fake_data[best_model].astype(
        dict.fromkeys(('bathrooms', 'rooms', 'm_built', 'm_size'), np.float64)
    ),
    {'id', 'waterfront', 'yr_renovated'},
    "_price",
).update_layout({"width": 1000}).show("png")

In [None]:
# Pair-correlation chart for `second_best_model`, rendered as a 1000-px-wide PNG.
# The model is looked up through `second_best_model`, the name defined earlier
# in the notebook (`second_best` is never defined and would raise NameError).
# First argument: real rows limited to the synthetic frame's columns, NaNs dropped.
# Second argument: the synthetic rows with four numeric columns cast to float64.
syn.charts.pair_corr(
    df.loc[:, syn.fake_data[second_best_model].columns].dropna(),
    syn.fake_data[second_best_model].astype(
        {k: np.float64 for k in ('bathrooms', 'rooms', 'm_built', 'm_size')}
    ),
    {'id', 'waterfront', 'yr_renovated'},
    "_price",
).update_layout(dict(width=1000)).show("png")