# IMPORTS

In [222]:
import math
import warnings
import numpy  as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from IPython.core.display import HTML
import re
import unicodedata

warnings.filterwarnings( 'ignore' )

##  Helper Functions

In [223]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [16, 8]
#    plt.rcParams['font.size'] = 24   
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.set_option( 'display.expand_frame_repr', False )
    pd.set_option('max_colwidth',200)
    pd.set_option('display.max_rows',2000)
    pd.set_option('display.max_columns',50)
    pd.set_option('display.float_format',lambda x: '{:,.2f}'.format(x) )   

    
def remove_special_characters(text):
    """
    Strip accents and special characters from ``text``.

    Accented letters common in Brazilian Portuguese are reduced to their
    plain Latin base letter (e.g. 'ç' -> 'c', 'ã' -> 'a'); every remaining
    character that is not an ASCII letter, a digit, a space or a backslash
    is removed. Based on http://stackoverflow.com/a/517974/3464573

    Non-string values (None, NaN, numbers) are returned unchanged so the
    function can be applied to columns that contain missing values.
    """
    if not isinstance(text, str):
        return text

    # NFKD splits each accented character into base letter + combining mark
    # (and maps compatibility forms such as '²' to '2').
    nfkd = unicodedata.normalize('NFKD', text)
    without_accents = ''.join(c for c in nfkd if not unicodedata.combining(c))

    # Raw string: the character class keeps letters, digits, spaces and the
    # backslash; a non-raw literal here would contain an invalid escape.
    return re.sub(r'[^a-zA-Z0-9 \\]', '', without_accents)


def classe_imovel(x):
    """Map a listing title to a property-type label.

    The title is searched for Portuguese keywords in a fixed priority order
    and the label of the first keyword found is returned:
    'House', 'Allotment', 'Apartment', 'Country house' or 'Commercial'.
    Titles with no known keyword, and non-string values (e.g. NaN), are
    labelled 'Outro'.
    """
    if not isinstance(x, str):
        return 'Outro'

    # Order matters: the first keyword contained in the title wins.
    keyword_to_type = (
        ('Casa', 'House'),
        ('Sobrado', 'House'),
        ('Lote', 'Allotment'),
        ('Apartamento', 'Apartment'),
        ('Chacara', 'Country house'),
        ('Sala', 'Commercial'),
        ('Comercial', 'Commercial'),
    )
    for keyword, label in keyword_to_type:
        if keyword in x:
            return label
    return 'Outro'

In [224]:
# Apply the notebook-wide plotting and pandas display settings.
jupyter_settings();

Populating the interactive namespace from numpy and matplotlib


## Loading data

In [206]:
# Real-estate listings to be cleaned.
df_imoveis_raw = pd.read_csv('../data/aldeia_realestate.csv')
# Lookup table relating condominium names to street names ('condo' sheet).
df_condo_raw = pd.read_excel('../data/condominium_x_streets.xlsx', sheet_name='condo')

In [207]:
# Work on a copy so the raw listings stay untouched.
df1 = df_imoveis_raw.copy(deep=True)

In [208]:
# Keep only the first listing for each url.
df1 = df1.drop_duplicates(subset='url', keep='first')

In [209]:
# Remove accents and special characters from every text column except
# 'address' (its ',' and '-' separators are needed when it is parsed later)
# and 'url', which are left untouched.
text_cols = df1.select_dtypes(include='object').columns
string_cols = text_cols[~text_cols.isin(['address', 'url'])]
for col in string_cols:
    # Pass the function itself, not a list of functions: a list makes
    # Series.apply return a one-column DataFrame instead of a Series.
    df1[col] = df1[col].apply(remove_special_characters)

In [210]:
# Classify each real-estate listing by type, derived from its title, and
# discard the listings that could not be classified ('Outro').
df1['type'] = df1['title'].map(classe_imovel)
df1 = df1.loc[df1['type'].ne('Outro')]

In [211]:
# Strip the text labels that surround the numbers in the numeric fields.
# For each column the labels are removed in the listed order (plural before
# singular), trimming surrounding whitespace after every removal.
unit_labels = {
    'area_m2': ('Area', 'm2'),
    'bedrooms': ('Quartos', 'Quarto'),
    'en_suites': ('Suites', 'Suite'),
    'price': ('R', 'Sob Consulta', 'A partir de'),
    'bathrooms': ('Banheiros', 'Banheiro'),
    'parking_spaces': ('Vagas', 'Vaga'),
}

for column, labels in unit_labels.items():
    cleaned = df1[column]
    for label in labels:
        cleaned = cleaned.str.replace(label, '').str.strip()
    df1[column] = cleaned

In [212]:
# Keep only the listings whose address mentions one of the three cities
# possible for the Aldeia da Serra region.
region_cities = ('Barueri', 'Santana', 'Itapevi')
filter0 = df1['address'].str.contains(region_cities[0])
for city_name in region_cities[1:]:
    filter0 = filter0 | df1['address'].str.contains(city_name)
df1 = df1[filter0]

In [213]:
# Clean up the address and prepare a working copy ('address_tmp') that can
# later be split into street_name, street_number, condo_name and city.
df1['address'] = df1['address'].str.replace('\n', '')
address_tmp = df1['address'].str.replace('\n', '').str.strip()

# Literal substitutions, applied in this exact order.
address_substitutions = [
    # the city separator changes from ',' to '@'
    (', Barueri', '@ Barueri'),
    (', Santana', '@ Santana'),
    (', Itapevi', '@ Itapevi'),
    # drop the region name and the dashes
    ('Aldeia da Serra', ''),
    ('-', ''),
    # drop 'Residencial' to standardize the condo name
    ('Residencial', ''),
    # mark the start of the condo name with '@'
    ('Morada', '@Morada'),
]
for old_text, new_text in address_substitutions:
    address_tmp = address_tmp.str.replace(old_text, new_text)

df1['address_tmp'] = address_tmp

In [214]:
# Count the '@' and ',' separators in each prepared address; the pair of
# counts identifies which address pattern a row follows when splitting.
tmp_address = df1['address_tmp']
df1['count_at'] = tmp_address.str.count('@')
df1['count_comma'] = tmp_address.str.count(',')

In [215]:
# Desired format: street_name '@' street_number '@' condo_name '@' city
#
# Each supported (count_at, count_comma) pattern lists the literal
# replacements, applied in order, that bring it to the desired format.
address_patterns = [
    # found: street_name '@' city
    (1, 0, [('@', '@ @ @')]),
    # found: street_name ',' <number> '@' city
    (1, 1, [('@', ' @ @ '), (',', ' @ ')]),
    # found: street_name '@' condo_name '@' city
    (2, 0, [('@Morada', ' @ @Morada ')]),
    # found: street_name ',' street_number '@' condo_name '@' city
    (2, 1, [(',', ' @ ')]),
]

normalized_parts = []
for n_at, n_comma, replacements in address_patterns:
    pattern_rows = (df1['count_at'] == n_at) & (df1['count_comma'] == n_comma)
    addresses = df1.loc[pattern_rows, 'address_tmp']
    for old_text, new_text in replacements:
        addresses = addresses.str.replace(old_text, new_text)
    normalized_parts.append(addresses)

# Split every normalized address into its four components.
new_address = pd.concat(normalized_parts, axis=0).str.split('@', n=3, expand=True)
new_address.columns = ['street_name', 'street_number', 'condo_name', 'city']

# Trim the strings with the vectorized accessor, which leaves missing
# values as missing instead of raising on them.
for col in new_address.columns:
    new_address[col] = new_address[col].str.strip()

# Concatenate the new features into the original dataset (aligned on the
# index; rows that matched no pattern get missing values).
df1 = pd.concat([df1, new_address], axis=1)
df1['street_number'] = df1['street_number'].str.replace('sn', '')


In [216]:
# Show one random row of the condominium/street lookup table.
df_condo_raw.sample(1)

Unnamed: 0,condo_name,street_name
81,Morada dos Pinheiros,Alameda das Sameleiras


In [217]:
# Look up the condo name of each listing through its street_name.
# Both frames carry a 'condo_name' column, so after the merge the one parsed
# from the address is 'condo_name_x' and the looked-up one is 'condo_name_y'.
df1 = pd.merge(df1, df_condo_raw, how='left', on='street_name')


In [218]:
#df1['condo_name_y'].isna().sum()
#df1['condo_name_y'].value_counts()

In [219]:
#df1['condo_name_y'] = df1['condo_name_y'].apply(lambda x: df1['condo_name_x'] if pd.isnull(x) else x )

In [226]:
# Discard the condo name parsed from the address; only 'condo_name_y' remains.
df1 = df1.drop(columns=['condo_name_x'])

In [230]:
# Inspect five random listings whose looked-up condo name is 'Altavis'.
df1[df1['condo_name_y'] == 'Altavis'].sample(5)

Unnamed: 0,page_seq,house_seq,title,url,address,area_m2,bedrooms,en_suites,price,bathrooms,parking_spaces,type,address_tmp,count_at,count_comma,street_name,street_number,city,condo_name_y
452,13,2,LoteTerreno a Venda 520m2,www.vivareal.com.br/imovel/lote-terreno-aldeia-da-serra-bairros-santana-de-parnaiba-520m2-venda-RS480000-id-2462825873/,"Estrada Marechal Mascarenhas de Moraes - Aldeia da Serra, Santana de Parnaíba - SP",520,,,480000,,,Allotment,Estrada Marechal Mascarenhas de Moraes @ Santana de Parnaíba SP,1,0,Estrada Marechal Mascarenhas de Moraes,,Santana de Parnaíba SP,Altavis
1304,4,28,LoteTerreno a Venda 724m2,www.vivareal.com.br/imovel/lote-terreno-quintas-do-ingai-bairros-santana-de-parnaiba-724m2-venda-RS723980-id-2452614680/?__vt=plp:b,"Estrada Marechal Mascarenhas de Moraes - Morada dos Pássaros, Santana de Parnaíba - SP",724,,,723980,,,Allotment,Estrada Marechal Mascarenhas de Moraes @Morada dos Pássaros@ Santana de Parnaíba SP,2,0,Estrada Marechal Mascarenhas de Moraes,,Santana de Parnaíba SP,Altavis
1417,7,35,LoteTerreno a Venda 573m2,www.vivareal.com.br/imovel/lote-terreno-quintas-do-ingai-bairros-santana-de-parnaiba-573m2-venda-RS543917-id-91951304/?__vt=plp:b,"Estrada Marechal Mascarenhas de Moraes - Morada dos Pássaros, Santana de Parnaíba - SP",573,,,543917,,,Allotment,Estrada Marechal Mascarenhas de Moraes @Morada dos Pássaros@ Santana de Parnaíba SP,2,0,Estrada Marechal Mascarenhas de Moraes,,Santana de Parnaíba SP,Altavis
1392,7,10,LoteTerreno a Venda 4723m2,www.vivareal.com.br/imovel/lote-terreno-quintas-do-ingai-bairros-santana-de-parnaiba-4723m2-venda-RS472640-id-2452612841/?__vt=plp:b,"Estrada Marechal Mascarenhas de Moraes - Morada dos Pássaros, Santana de Parnaíba - SP",4723,,,472640,,,Allotment,Estrada Marechal Mascarenhas de Moraes @Morada dos Pássaros@ Santana de Parnaíba SP,2,0,Estrada Marechal Mascarenhas de Moraes,,Santana de Parnaíba SP,Altavis
429,12,15,LoteTerreno a Venda 501m2,www.vivareal.com.br/imovel/lote-terreno-aldeia-da-serra-bairros-santana-de-parnaiba-501m2-venda-RS396000-id-2432515612/,"Estrada Marechal Mascarenhas de Moraes - Aldeia da Serra, Santana de Parnaíba - SP",501,,,396000,,,Allotment,Estrada Marechal Mascarenhas de Moraes @ Santana de Parnaíba SP,1,0,Estrada Marechal Mascarenhas de Moraes,,Santana de Parnaíba SP,Altavis


In [None]:
# Cast the cleaned text columns to numeric dtypes. Every column is converted
# from itself, so 'bathrooms' is no longer filled with the 'bedrooms' values.
numeric_cols = ['area_m2', 'bedrooms', 'bathrooms', 'en_suites', 'parking_spaces', 'price']
for col in numeric_cols:
    df1[col] = pd.to_numeric(df1[col])

# The street number '483/503' is reduced to its first number so the column
# can be converted as well.
df1['street_number'] = df1['street_number'].str.replace('483/503', '483')
df1['street_number'] = pd.to_numeric(df1['street_number'])

In [None]:
# Show one example row for each supported address pattern, identified by its
# (count_at, count_comma) pair.
# After the merge with the condo lookup the condo column is 'condo_name_y';
# a plain 'condo_name' column no longer exists in df1.
address_cols = ['address', 'address_tmp', 'street_name', 'street_number', 'condo_name_y', 'city']
pattern_counts = [(1, 0), (1, 1), (2, 0), (2, 1)]

sample_addr_types = pd.concat([
    df1.loc[(df1['count_at'] == n_at) & (df1['count_comma'] == n_comma), address_cols].head(1)
    for n_at, n_comma in pattern_counts
])

sample_addr_types

In [None]:
# Check column dtypes and non-null counts after the numeric conversion.
df1.info()

In [None]:
# Remove the helper columns that were only needed to parse the address.
helper_cols = ['count_at', 'count_comma', 'address_tmp']
df1 = df1.drop(columns=helper_cols)

In [None]:
# Look at five random rows of the cleaned dataset.
df1.sample(5)

In [None]:
# Number of missing values per column.
df1.isna().sum()

In [None]:
# Final check of the column dtypes and non-null counts before exporting.
df1.info()

In [None]:
# Save the cleaned dataset without writing the row index.
df1.to_csv('../data/cleaned_aldeia_realestate.csv', index=False)