# Economicos

In [1]:
%%capture
import sys
!ln -s ../syntheticml .
!{sys.executable} -m pip install -U -r ../requirements.txt
!{sys.executable} -m pip install -U kaleido

In [2]:
import pandas as pd
import numpy as np
from syntheticml.data.synthetic import Synthetic, MODELS

df = pd.read_parquet('../datasets/economicos/raw/full_dedup_economicos_step0.parquet');
df.sample(3)

Unnamed: 0,url,description,price,property_type,transaction_type,state,county,publication_date,rooms,bathrooms,m_built,m_size,source,title,address,owner,_price
202672,https://www.economicos.cl/propiedades/departam...,Descripción detallada\nExcelente oportunidad!...,$ 55.000.000,Departamento,Venta,Biobío,Concepción,2019-03-20 21:14:28,3.0,2.0,68.0,,,Departamento Central en Concepción,"Concepción centro Concepción, Biobío",,1995.228881
411927,https://www.economicos.cl/propiedades/departam...,385.000 Arriendo cómodo departamento a 4 cuad...,$ 385.000,Departamento,Arriendo,Metropolitana de Santiago,Santiago,2018-04-10 00:00:03,,2.0,,,El Mercurio,Departamento en Arriendo en Santiago 2 baños,"Santiago, Metropolitana de Santiago",,14.275814
138294,https://www.economicos.cl/propiedades/casa-cen...,"Vendo casa aislada de 1 piso, 84 mts2 construi...",4000 UF,Casa,Venta,Coquimbo,La Serena,2019-12-30 11:11:21,3.0,2.0,84.0,232.0,,,,,4000.0


In [3]:
%%capture
category_columns=("property_type", "transaction_type", "state", "county", "rooms", "bathrooms", "source", )
df_converted = df.fillna(dict(
        property_type = "None",
        transaction_type = "None",
        state = "None",
        county = "None",
        rooms = -1,
        bathrooms = -1,
        m_built = -1,
        m_size = -1,
        source = "None"
)).fillna(-1).astype({k: 'str' for k in ("description", "price", "title", "address", "owner",)})
print(df_converted.shape)
basedate = pd.Timestamp('2017-12-01')
dtime = df_converted.pop("publication_date")
df_converted["publication_date"] = dtime.apply(lambda x: (x - basedate).days)
syn = Synthetic(df_converted, 
        id="url", 
        category_columns=category_columns,
        text_columns=("description", "price", "title", "address", "owner", ),
        exclude_columns=tuple(),
        synthetic_folder = "../datasets/economicos/synth-b",
        models=['copulagan', 'tvae', 'gaussiancopula', 'smote-enc', 
                'tddpm_mlp'
                ],
        n_sample = df_converted.shape[0],
        target_column="_price",
        max_cpu_pool=1,
        use_noise=False
)

In [4]:
%%capture
syn.process()
syn.process_scores()


## Metricas
### Continuas

In [5]:
metrics = syn.current_metrics()
metrics[~metrics.is_categorical].dropna(axis=1, how='all')

Unnamed: 0,name,top5,top5_freq,top5_prob,is_categorical,nobs,missing,mean,std_err,upper_ci,...,mode_freq,median,0.1%,1.0%,5.0%,25.0%,75.0%,95.0%,99.0%,99.9%
2,publication_date,"[1545, 1693, 1392, 1492, 1408]","[19744, 11666, 10260, 3838, 2445]","[0.036169784014508946, 0.021371388792203273, 0...",False,545870.0,0.0,702.7108,0.6245781,703.9349,...,0.03617,609.0,42.0,56.0,100.0,322.0,1037.0,1545.0,1693.0,1693.0
3,m_size,"[-1.0, 5000.0, 200.0, 60.0, 50.0]","[245062, 19573, 6932, 6312, 5748]","[0.44893839192481727, 0.0358565226152747, 0.01...",False,545870.0,0.0,2.035506e+16,2.035487e+16,6.024986e+16,...,0.448938,36.0,-1.0,-1.0,-1.0,-1.0,180.0,5000.0,50000.0,4920000.0
5,_price,"[0.0, 3500.0, 5500.0, 6500.0, 4500.0]","[17989, 865, 767, 763, 740]","[0.03295473281184165, 0.001584626376243428, 0....",False,545870.0,0.0,7098298000.0,7021735000.0,20860650000.0,...,0.032955,174.02451,0.0,0.0,1.26969,12.28306,3550.0,14400.0,47000.0,1906676.0
9,m_built,"[-1.0, 60.0, 50.0, 70.0, 100.0]","[188514, 13831, 11796, 11648, 9716]","[0.3453459614926631, 0.025337534577829886, 0.0...",False,545870.0,0.0,2271416000.0,2043921000.0,6277428000.0,...,0.345346,50.0,-1.0,-1.0,-1.0,-1.0,98.0,400.0,8350.51,550000.0


### Categoricas

In [6]:
metrics[metrics.is_categorical].dropna(axis=1, how='all')

Unnamed: 0,name,top5,top5_freq,top5_prob,is_categorical,nobs,missing
0,property_type,"[Departamento, Casa, Sitio o Terreno, Parcela ...","[211405, 142054, 31393, 30020, 27415]","[0.38728085441588656, 0.26023412167732246, 0.0...",True,545870.0,545870.0
1,source,"[None, El Mercurio, [El Mercurio de Valparaiso...","[371221, 135613, 13594, 6979, 4132]","[0.6800538589774122, 0.24843460897283237, 0.02...",True,545870.0,545870.0
4,transaction_type,"[Venta, Arriendo, Busco arriendo, Compro, None]","[282495, 258300, 3031, 1901, 86]","[0.5175133273490025, 0.473189587264367, 0.0055...",True,545870.0,545870.0
6,rooms,"[-1.0, 3.0, 2.0, 1.0, 4.0]","[196417, 125902, 97220, 54183, 44539]","[0.35982376756370565, 0.23064465898473996, 0.1...",True,545870.0,545870.0
7,state,"[Metropolitana de Santiago, Valparaíso, Biobío...","[272808, 108197, 29379, 21581, 16533]","[0.49976734387308336, 0.1982101965669482, 0.05...",True,545870.0,545870.0
8,bathrooms,"[-1.0, 2.0, 1.0, 3.0, 4.0]","[206916, 136581, 134963, 43884, 14719]","[0.3790572847014857, 0.2502079249638192, 0.247...",True,545870.0,545870.0
10,county,"[Santiago, Viña del Mar, Las Condes, Providenc...","[65125, 33263, 32327, 27981, 24863]","[0.11930496272006155, 0.06093575393408687, 0.0...",True,545870.0,545870.0


In [7]:
syn.scores.index.unique()

Index(['copulagan_682338', 'gaussiancopula_682338', 'smote-enc_682338',
       'tddpm_mlp_682338', 'tvae_682338'],
      dtype='object', name='name')

## Scores

In [8]:
#best_model = "smote-enc_noise_682338"
best_model = "tddpm_mlp_27574"
second_best_model= "smote-enc_27574"
avg = syn.scores[syn.scores["type"] == "avg"]
avg.sort_values("score", ascending=False).loc[:,["score", "DCR ST 5th", "DCR SH 5th", "DCR TH 5th"]]

Unnamed: 0_level_0,score,DCR ST 5th,DCR SH 5th,DCR TH 5th
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tddpm_mlp_682338,0.966218,8.050782e-15,8.062921e-15,9.000000000000001e-17
smote-enc_682338,0.947804,7.093774e-15,8.920271e-15,9.000000000000001e-17
tvae_682338,0.766746,1.274006e-14,1.233074e-14,9.000000000000001e-17
gaussiancopula_682338,0.714859,0.0004833396,0.0004836043,9.000000000000001e-17
copulagan_682338,0.670387,2.724884e-10,2.705633e-10,9.000000000000001e-17


In [9]:
def formater(x):
    return '{:.2e}'.format(x)
sci_format = { k: formater for k in ["DCR ST 5th", "DCR SH 5th", "DCR TH 5th"]}

print(avg.sort_values("score", ascending=False).loc[:,["score", "DCR ST 5th", "DCR SH 5th", "DCR TH 5th"]].style.format(sci_format).to_latex().replace("_", "\_"))


\begin{tabular}{lrrrr}
 & score & DCR ST 5th & DCR SH 5th & DCR TH 5th \\
name &  &  &  &  \\
tddpm\_mlp\_682338 & 0.966218 & 8.05e-15 & 8.06e-15 & 9.00e-17 \\
smote-enc\_682338 & 0.947804 & 7.09e-15 & 8.92e-15 & 9.00e-17 \\
tvae\_682338 & 0.766746 & 1.27e-14 & 1.23e-14 & 9.00e-17 \\
gaussiancopula\_682338 & 0.714859 & 4.83e-04 & 4.84e-04 & 9.00e-17 \\
copulagan\_682338 & 0.670387 & 2.72e-10 & 2.71e-10 & 9.00e-17 \\
\end{tabular}



In [10]:
import os
folder_path = f"../docs/tesis/imagenes/economicos/top2-b"
if not os.path.exists(folder_path):
    os.mkdir(folder_path)
for fig in syn.get_multiple_charts([best_model,second_best_model], {'date', 'id', 'zipcode', 'lat', 'long', 'yr_renovated'}):
    if fig:
        file_name = f'{fig.layout.title.text.replace(":","").replace(" ","_").lower()}.svg'
        fig.write_image(f"{folder_path}/{file_name}")
        display(fig.show("png"))


KeyError: 'tddpm_mlp_27574'

In [None]:
import os
folder_path = f"../docs/tesis/imagenes/economicos/top2+1-b"
if not os.path.exists(folder_path):
    os.mkdir(folder_path)
for fig in syn.get_multiple_charts([best_model,second_best_model,"copulagan_27574"], {'date', 'id', 'zipcode', 'lat', 'long', 'yr_renovated'}):
    if fig:
        file_name = f'{fig.layout.title.text.replace(":","").replace(" ","_").lower()}.svg'
        fig.write_image(f"{folder_path}/{file_name}")
        display(fig.show("png"))


In [None]:
syn.charts.pair_corr(df.loc[:, syn.fake_data[best_model].columns].dropna(), syn.fake_data[best_model].astype({k:np.float64 for k in ('bathrooms','rooms','m_built','m_size')}), {'id', 'waterfront', 'yr_renovated'}, "_price").update_layout(dict(width=1000)).show("png")

In [None]:
syn.charts.pair_corr(df.loc[:, syn.fake_data[second_best].columns].dropna(), syn.fake_data[second_best].astype({k:np.float64 for k in ('bathrooms','rooms','m_built','m_size')}), {'id', 'waterfront', 'yr_renovated'}, "_price").update_layout(dict(width=1000)).show("png")