# Notebook for data cleaning

In [9]:
import pandas as pd
from unidecode import unidecode

In [8]:
# Main listings table: read through pyarrow, asking pandas for its
# nullable ("numpy_nullable") dtype backend.
df_main_raw = pd.read_parquet(
    path="data/properties_main.parquet",
    engine="pyarrow",
    dtype_backend="numpy_nullable",
)

# Property characteristics table, read with the pandas defaults.
df_features = pd.read_parquet(path="data/properties_characteristics.parquet")


In [6]:
# Quick visual check of the raw listings table right after loading it.
df_main_raw

Unnamed: 0,property_code,property_reference,title,description,property_type,property_subtype,postal_code,address,locality,latitude,...,has_pantry,image_count,publisher_code,publisher_name,publisher_phone,price_per_sqm_sale,price_per_sqm_rent,total_monthly_cost,size_category,amenity_score
0,85001-L,85001-L,Apartamento Mobiliado para alugar com 1 quarto...,Apartamento Mobiliado para alugar com 1 quarto...,Apartamento,Padrão,14026591,"Jardim Nova Alianca, Ribeirao Preto","Jardim Nova Aliança,Ribeirão Preto,São Paulo,B...",-21.22099,...,False,16,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,,49.019608,2500.0,medium,0
1,84999-S,84999-S,Cobertura Penthouse com 3 suites no bairro San...,Cobertura Penthouse com 3 suites no bairro San...,Apartamento,Cobertura,14020700,"Santa Cruz Do Jose Jacques, Ribeirao Preto","Santa Cruz Do José Jacques,Ribeirão Preto,São ...",,...,True,47,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,4934.210526,,,extra_large,6
2,84998-S,84998-S,Belissima Casa no Condominio do Jardim Nova Al...,Casa Sobrado a Venda – Residencial Nova Alianc...,Casa,Casa de Condomínio,14026551,"Jardim Nova Alianca, Ribeirao Preto","Jardim Nova Aliança,Ribeirão Preto,São Paulo,B...",-21.21515,...,False,22,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,5301.587302,,,extra_large,1
3,84996-L,84996-L,"Predio Comercial no Jardim Botanico, em Ribeir...","Predio Comercial no Jardim Botanico, em Ribeir...",Comercial,Prédio Inteiro,14021593,"Jardim Botanico, Ribeirao Preto","Jardim Botânico,Ribeirão Preto,São Paulo,Brasil",-21.21753,...,False,10,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,,57.142857,16000.0,extra_large,0
4,84990-S,84990-S,Apartamento de 3 dormitorios na Quinta da Prim...,Encontre o imovel dos seus sonhos neste lindo ...,Apartamento,Padrão,14022100,"Quinta da Primavera, Ribeirao Preto","Quinta da Primavera,Ribeirão Preto,São Paulo,B...",-21.23351,...,False,36,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,8987.341772,,,medium,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11921,182-S,182-S,Terreno Padrão em São Carlos,", Área total 540,00 m²",Terreno,Terreno Padrão,13562060,"Planalto Paraiso, Sao Carlos","Planalto Paraíso,São Carlos,São Paulo,Brasil",-22.0076,...,False,4,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,,,,,0
11922,162-S,162-S,"Casa Padrao a venda no Jardim Bethania, Sao Ca...",Linda casa padrao a venda no bairro Jardim Bet...,Casa,Padrão,13561060,"Jardim Bethania, Sao Carlos","Jardim Bethânia,São Carlos,São Paulo,Brasil",-22.00854,...,False,3,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,2792.55625,,,large,0
11923,114-S,114-S,"Apartamento Padrao a Venda em Miguel Abdelnur,...",Conheca este lindo apartamento com 2 dormitori...,Apartamento,Padrão,13571385,"Distrito Industrial Miguel Abdelnur, Sao Carlos","Distrito Industrial Miguel Abdelnur,São Carlos...",-22.04302,...,False,15,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,5555.555556,,,small,0
11924,6-S,6-S,Terreno a venda no bairro Parque Santa Felicia...,Encontre o terreno dos seus sonhos no bairro P...,Terreno,Terreno Padrão,13563307,"Santa Felicia, Sao Carlos","Santa Felícia,São Carlos,São Paulo,Brasil",-21.99665,...,False,4,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,,,,,0


In [13]:
# `locality` is a comma-separated string whose first field is the neighborhood
# and whose second field is the city
# (e.g. "Santa Felícia,São Carlos,São Paulo,Brasil").
# The `.str` accessor is NA-aware: a missing or malformed locality yields NaN
# for the derived columns instead of raising inside a Python lambda.
locality_parts = df_main_raw["locality"].str.split(",")

df_main = df_main_raw.drop(columns=["locality"]).assign(
    # Strip so that stray whitespace around the comma cannot break the city match below.
    city=locality_parts.str[1].str.strip(),
    # Neighborhoods are normalised to accent-free upper case so that spelling
    # variants of the same name collapse into a single value.
    neighborhood=(
        locality_parts.str[0]
        .map(unidecode, na_action="ignore")
        .str.upper()
        .str.strip()
    ),
)

# Explicit copy: `sao_carlos` is reused by later cells and should not be a
# view-like slice of `df_main`.
sao_carlos = df_main[df_main["city"] == "São Carlos"].copy()
sao_carlos["neighborhood"].value_counts()

neighborhood
CENTRO                               847
VILA PRADO                           272
SANTA FELICIA                        199
JARDIM SAO CARLOS                    189
CIDADE JARDIM                        178
                                    ... 
UIRAPURU                               1
GRANJA SAO JUDAS TADEU                 1
LOTEAMENTO MUNICIPAL SAO CARLOS 4      1
IDALINA POZZI MARGARIDO                1
VILA MONTEIRO (GLEBA I)                1
Name: count, Length: 229, dtype: int64

In [14]:
# Dropping the columns that will not be fed to the model, starting from the
# Sao Carlos subset only. `property_code` is not in the list, so it stays
# available as an id for re-joining later if needed.
columns_not_modelled = [
    "property_reference",
    "title",
    "description",
    "postal_code",
    "address",
    "latitude",
    "longitude",  # a more complex model may use the coordinates to calculate distance to uptown; not used at first
    "city",
    "neighborhood",  # over 200 distinct neighborhoods: may introduce too much sparsity in the model
    "show_map",
    "has_sale_price",
    "has_rent_price",  # already visible through the NaNs
    "image_count",
    "publisher_code",
    "publisher_name",
    "publisher_phone",
    "price_per_sqm_rent",
    "price_per_sqm_sale",
]

selected = sao_carlos.drop(columns=columns_not_modelled)

In [25]:
# List the distinct property types present in the selected data.
selected.loc[:, "property_type"].unique()

['Casa', 'Apartamento', 'Comercial', 'Terreno', 'Rural']
Categories (5, object): ['Apartamento', 'Casa', 'Comercial', 'Rural', 'Terreno']

In [30]:
# Only residential listings are used in the model: houses ("Casa") and
# apartments ("Apartamento"). Every other property type is left out.
residential_types = ["Casa", "Apartamento"]
is_residential = selected["property_type"].isin(residential_types)
residencial_df = selected[is_residential]


In [36]:
# Proportion of missing values per column, highest first, shown as one wide row.
missing_counts = residencial_df.isnull().sum()
missing_share = missing_counts.divide(len(residencial_df))
pd.DataFrame(missing_share.sort_values(ascending=False)).T

Unnamed: 0,rent_price,total_monthly_cost,condominium_fee,suites,property_tax,sale_price,parking_spaces,bathrooms,area_total,size_category,...,has_bbq,has_gym,has_sports_court,has_party_room,has_24h_security,has_laundry,has_office,has_closet,has_pantry,amenity_score
0,0.824895,0.824895,0.475233,0.46975,0.355876,0.175105,0.085176,0.042405,0.024858,0.006215,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Most columns have small proportions of missing values, so the affected rows can be safely discarded (they also seem to be input errors or commercial rooms listed as houses). Among the columns with high proportions:
- sale_price/rent_price -> Self-explanatory; the data will be split into two datasets
- condominium_fee -> non-condominium houses, can be set to 0
- property_tax -> needs exploration, may be removed
- total_monthly_cost -> way too many NaNs, will be removed
- suites -> Probably equivalent to 0 or just not filled in (because half of all properties don't have it), may be removed

In [53]:
# Go back to the Sao Carlos frame (before the column drop) to inspect the
# houses and apartments that have no condominium fee recorded.
residential_mask = sao_carlos["property_type"].isin(["Casa", "Apartamento"])
exploration_df = sao_carlos[residential_mask]
exploration_df[exploration_df["condominium_fee"].isna()]

Unnamed: 0,property_code,property_reference,title,description,property_type,property_subtype,postal_code,address,latitude,longitude,...,publisher_code,publisher_name,publisher_phone,price_per_sqm_sale,price_per_sqm_rent,total_monthly_cost,size_category,amenity_score,city,neighborhood
7,84987-L,84987-L,"Casa de Fundos para Alugar em Santa Felicia, S...",Conheca essa charmosa casa de fundos para alug...,Casa,Padrão,13563300,"Santa Felicia, Sao Carlos",-21.99261,-47.92462,...,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,,23.714286,830.0,small,0,São Carlos,SANTA FELICIA
9,84981-S,84981-S,Casa padrao a venda no bairro Parque Industria...,Excelente oportunidade! Casa com 02 dormitorio...,Casa,Padrão,13564590,"Parque Industrial, Sao Carlos",-21.9877,-47.90301,...,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,3425.287356,,,medium,0,São Carlos,PARQUE INDUSTRIAL
10,84976-S,84976-S,"Casa a venda no Jardim Mercedes, Sao Carlos - ...",Conheca essa linda casa padrao no Jardim Merce...,Casa,Padrão,13570501,"Jardim Mercedes, Sao Carlos",-22.03106,-47.88149,...,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,3058.252427,,,extra_large,0,São Carlos,JARDIM MERCEDES
12,84972-L,84972-L,"Casa para alugar no Jardim Embare, Sao Carlos",Casa para alugar com dois quartos sendo uma su...,Casa,Padrão,13563844,"Jardim Embare, Sao Carlos",-21.96997,-47.93221,...,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,,23.814286,1667.0,medium,1,São Carlos,JARDIM EMBARE
15,84962-S,84962-S,Charmosa Casa de 2 Dormitorios no Jardim Bande...,Encontre a casa dos seus sonhos no Jardim Band...,Casa,Padrão,13562150,"Jardim Bandeirantes, Sao Carlos",-22.00072,-47.90805,...,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,2727.272727,,,large,0,São Carlos,JARDIM BANDEIRANTES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11912,676-L,676-L,Casa de 3 dormitorios no Centro de Sao Carlos ...,Conheca essa incrivel casa no coracao de Sao C...,Casa,Padrão,13560042,"CENTRO, Sao Carlos",-22.01407,-47.89289,...,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,,37.5,6000.0,large,0,São Carlos,CENTRO
11913,566-S,566-S,Casa Padrão em São Carlos,"CASA - 3/4 - AV. ARARAQUARA, 188 - VILA COSTA ...",Casa,Padrão,13566770,"Vila Costa do Sol, Sao Carlos",-22.00418,-47.8806,...,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,2633.333333,,,large,0,São Carlos,VILA COSTA DO SOL
11919,376-S,376-S,"Casa a venda no Jardim Paraiso, Sao Carlos: 3 ...",Viva com conforto e estilo nesta casa no Jardi...,Casa,Padrão,13561140,"Jardim Paraiso, Sao Carlos",-22.01065,-47.90019,...,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,3472.222222,,,extra_large,0,São Carlos,JARDIM PARAISO
11920,224-S,224-S,Casa Padrão em São Carlos,"CASA - 3/4 - MARECHAL DEODORO, 1467 - CENTRO",Casa,Padrão,13560200,"CENTRO, Sao Carlos",-22.01502,-47.89626,...,2,Roca Administradora de Imóveis Eireli,(16) 3373-5000,4746.835443,,,large,0,São Carlos,CENTRO


Decisions:
- Excluded columns: total_monthly_cost, suites, property_tax. Complex mixture of input errors and systematic missingness; best to avoid using them.
- Drop rows with NA in: bathrooms, bedrooms, area_util, area_total, size_category, parking_spaces. These are input errors or invalid data (commercial rooms); the sub-5% proportion of NaNs (8% for parking_spaces) allows us to drop these rows.
- Dataset split: rent_price, sale_price.
- Fill: condominium_fee. Most are houses or kitnets, so we just fill with 0. This will certainly introduce some wrong values, but the rate of input errors is surely much smaller than the share of places with no fee.

In [None]:
# The EDA notebook showed nonsensical outliers in `area_util`;
# keep only the rows whose value is above 10.
has_plausible_area = residencial_df["area_util"].gt(10)
residencial_df = residencial_df[has_plausible_area]

In [63]:
# Apply the cleaning decisions in a single chain:
#   1. drop rows missing any of the core descriptive columns,
#   2. treat a missing condominium fee as "no fee" (0),
#   3. discard the columns that were judged unusable.
required_columns = ["bathrooms", "bedrooms", "area_total", "area_util", "size_category", "parking_spaces"]
unused_columns = ["total_monthly_cost", "suites", "property_tax"]

clean_data = (
    residencial_df
    .dropna(subset=required_columns)
    .assign(condominium_fee=lambda frame: frame["condominium_fee"].fillna(0))
    .drop(columns=unused_columns)
)

# One dataset per target: each keeps the rows where its own price is present
# and drops the other price column.
clean_data_sell = clean_data[clean_data["sale_price"].notna()].drop(columns=["rent_price"])
clean_data_rent = clean_data[clean_data["rent_price"].notna()].drop(columns=["sale_price"])
len(clean_data_rent), len(clean_data_sell)

(745, 3903)

In [65]:
# Write both cleaned datasets to CSV, leaving the DataFrame index out of the files.
for output_path, frame in (
    ("data/clean_data_rent.csv", clean_data_rent),
    ("data/clean_data_sell.csv", clean_data_sell),
):
    frame.to_csv(output_path, index=False)