In [None]:
from pathlib import Path

import duckdb as duck
import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa

# Resolve the DuckDB file from the current user's home directory instead of a
# hard-coded /home/<user> prefix, matching the `~/...` paths used by the SQL cells.
DB_PATH = Path.home() / 'Documentos/real-state-prices/data/processed/real_state.duckdb'
con = duck.connect(database=str(DB_PATH))

## JSON > Parquet
Before we start cleaning and transforming the data for our analysis, we're going to convert the files into the ```.parquet``` format so that we're always dealing with performance-optimized datasets, no matter the situation.  

For that, we're going to start by loading our data into [Pola.rs](https://pola.rs 'Most Efficient DataFrame Lib for Python') DataFrames and get some first information about our dataset.

In [None]:
# Locations of the raw JSON exports and of their Parquet copies.
RAW_DIR = Path.home() / 'Documentos/real-state-prices/data/raw'
PROCESSED_DIR = Path.home() / 'Documentos/real-state-prices/data/processed'

sp_dt = pl.read_json(RAW_DIR / 'sp_properties.json')
rj_dt = pl.read_json(RAW_DIR / 'rj_properties.json')
pa_dt = pl.read_json(RAW_DIR / 'pa_properties.json')
bh_dt = pl.read_json(RAW_DIR / 'bh_properties.json')

# Write each Parquet copy only when it is missing: the next cell reads these
# files, and skipping existing ones keeps a full re-run cheap and non-destructive.
for prefix, frame in (('sp', sp_dt), ('rj', rj_dt), ('pa', pa_dt), ('bh', bh_dt)):
    parquet_path = PROCESSED_DIR / f'{prefix}_properties.parquet'
    if not parquet_path.exists():
        frame.write_parquet(parquet_path)

In [None]:
# File/table prefix -> city label stored in the new `city` column.
city_sources = {
    'sp': 'Sao_Paulo',
    'rj': 'Rio_de_Janeiro',
    'pa': 'Porto_Alegre',
    'bh': 'Belo_Horizonte',
}

city_frames = {}
for prefix, city_name in city_sources.items():
    # CREATE OR REPLACE makes the cell re-runnable: the connection points at a
    # database file, so a plain CREATE TABLE fails once the table already exists.
    # The interpolated values are the fixed constants above, not user input.
    con.execute(f'''
        CREATE OR REPLACE TABLE {prefix}_tbl AS
        SELECT * FROM '~/Documentos/real-state-prices/data/processed/{prefix}_properties.parquet';
        ALTER TABLE {prefix}_tbl
        ADD COLUMN city VARCHAR DEFAULT '{city_name}'
    ''')
    city_frames[prefix] = con.table(f'{prefix}_tbl').df()
    display(city_frames[prefix])

# Keep the per-city pandas frames available under their original names.
sp_df = city_frames['sp']
rj_df = city_frames['rj']
pa_df = city_frames['pa']
bh_df = city_frames['bh']

## Dtypes

Now that we have converted the files from ```.json``` to ```.parquet``` and added the `city` feature to each dataset, we're going to **add all the tables together and define the dtypes of our data**.  


After that, we're going to **change all the dtypes of our dataset** so that it stays tidy for our cleaning, analysis and modeling.

In [None]:
def set_data_type(self: pl.DataFrame) -> pl.DataFrame:
    """Return a Polars property frame with its columns cast to compact dtypes.

    Parameters
    ----------
    self : pl.DataFrame
        Frame holding the `type`, `address`, `neighborhood`, `footage`,
        `doorms`, `garages` and `price` columns.

    Returns
    -------
    pl.DataFrame
        The frame produced by `with_columns`, with the three text columns as
        Categorical, `footage` as Int16, `doorms`/`garages` as Int8 and
        `price` as Float32.
    """
    return self.with_columns(
        [
            pl.col('type').cast(pl.Categorical),
            pl.col('address').cast(pl.Categorical),
            pl.col('neighborhood').cast(pl.Categorical),
            pl.col('footage').cast(pl.Int16),
            pl.col('doorms').cast(pl.Int8),
            pl.col('garages').cast(pl.Int8),
            pl.col('price').cast(pl.Float32),
        ]
    )


# Preview the typed Sao Paulo frame without rebinding `sp_dt`, so re-running
# this cell never changes the state seen by other cells.
set_data_type(sp_dt).describe()

In [None]:
cities = ['Sao_Paulo', 'Rio_de_Janeiro', 'Porto_Alegre', 'Belo_Horizonte']

# The source frames are Polars DataFrames, which reject item assignment
# (`frame['city'] = ...`) and have no `.info()`. Convert each one to pandas and
# tag it with its city so the pandas-style inspection and casts can run on it.
dfs = [
    frame.to_pandas().assign(city=city_name)
    for frame, city_name in zip([sp_dt, rj_dt, pa_dt, bh_dt], cities)
]

for frame in dfs:
    display(frame)
    # DataFrame.info() prints its report and returns None, so it is called
    # directly; wrapping it in display() would add a stray "None" to the output.
    frame.info()
    display(frame.describe())

In [None]:
def set_dtype(self):
    """Cast the property columns of a pandas-style DataFrame in place.

    Each listed column is replaced by its `astype` conversion, in the order
    given below, and the same (now modified) frame object is returned.
    """
    target_dtypes = {
        'type': 'category',
        'address': 'category',
        'neighborhood': 'category',
        'footage': 'int16',
        'doorms': 'int8',
        'garages': 'int8',
        'city': 'category',
        'price': 'float64',
    }
    for column, dtype in target_dtypes.items():
        self[column] = self[column].astype(dtype)
    return self

# Cast every city frame (set_dtype modifies the frame it is given) and inspect it.
for frame in dfs:
    set_dtype(frame)
    display(frame.describe())
    # DataFrame.info() prints its report and returns None, so it is called
    # directly; wrapping it in display() would add a stray "None" to the output.
    frame.info()
    display(frame)


In [None]:
# Stack the four per-city tables into one `properties` table.
# CREATE OR REPLACE keeps this cell re-runnable: the connection points at a
# database file, so a plain CREATE TABLE fails once `properties` already exists.
sql = '''
    CREATE OR REPLACE TABLE properties AS
    SELECT * FROM sp_tbl 
    UNION ALL 
    SELECT * FROM rj_tbl 
    UNION ALL 
    SELECT * FROM pa_tbl 
    UNION ALL 
    SELECT * FROM bh_tbl
'''
con.execute(sql)