## Importando as bibliotecas

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

## Lendo as fontes de dados

In [2]:
# Listings -> source table for df_anuncios and df_amenities

# Columns kept from the raw listings file; everything else is discarded.
selected_columns = [
    'host_name', 'host_response_time', 'host_response_rate', 'host_is_superhost',
    'host_identity_verified', 'neighbourhood', 'zipcode', 'latitude', 'longitude',
    'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
    'bed_type', 'amenities', 'price', 'security_deposit', 'cleaning_fee',
    'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights',
    'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value', 'instant_bookable',
    'cancellation_policy', 'calculated_host_listings_count', 'reviews_per_month',
]

# Read the CSV with the listing id as the index, then keep only the selected
# columns, in the order given above.
df_listings = pd.read_csv(
    'listings.csv',
    index_col='id',
    engine='python',
    sep=',',
    encoding="utf-8",
)[selected_columns]

print('Total de registros carregados:', len(df_listings))
df_listings.head()

Total de registros carregados: 33715


Unnamed: 0_level_0,host_name,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,latitude,longitude,property_type,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,calculated_host_listings_count,reviews_per_month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17878,Matthias,within an hour,100%,t,t,Copacabana,22020-050,-22.96592,-43.17896,Condominium,...,9.0,10.0,10.0,10.0,10.0,9.0,t,strict_14_with_grace_period,1,2.13
25026,Viviane,within a day,91%,f,t,Copacabana,22060-020,-22.97712,-43.19045,Apartment,...,9.0,10.0,9.0,10.0,10.0,9.0,f,strict_14_with_grace_period,3,2.04
31560,Renata,within an hour,100%,t,t,Ipanema,22410-003,-22.98302,-43.21427,Apartment,...,10.0,10.0,10.0,10.0,10.0,10.0,t,strict_14_with_grace_period,1,2.38
35636,Patricia,within an hour,100%,t,t,Ipanema,22081-020,-22.98816,-43.19359,Apartment,...,10.0,9.0,10.0,10.0,10.0,9.0,f,strict_14_with_grace_period,1,2.28
35764,Patricia Miranda & Paulo,within an hour,100%,t,t,Copacabana,21031-300,-22.98127,-43.19046,Loft,...,10.0,10.0,10.0,10.0,10.0,10.0,f,strict_14_with_grace_period,1,2.84


In [3]:
# Square-metre value -> source for df_metro_quadrado

# Estimated value of the square metre per neighbourhood.
# Source: Rio de Janeiro city hall - http://www2.rio.rj.gov.br/smf/siam/logradouro.asp
#
# NOTE(review): a few keys ('Estacio', 'Barros filho', 'Vincente de Carvalho')
# differ from the usual spelling of those neighbourhoods; presumably they mirror
# the values found in the listings' `neighbourhood` column - confirm before
# normalising them, since the keys are data.

metro_quadrado = {
    'Ipanema': 20908, 'Copacabana': 12169, 'Leme': 12211, 'Catete': 10486, 'Lapa': 3277,
    'Barra da Tijuca': 9334, 'Inhaúma': 1484, 'Leblon': 23625, 'Vila Kosmos': 1541, 'Botafogo': 11378,
    'Recreio dos Bandeirantes': 6120, 'Tijuca': 6193, 'Glória': 9264, 'Laranjeiras': 9687, 'Santa Teresa': 7869,
    'Engenho de Dentro': 2049, 'Centro': 7660, 'Jardim Botânico': 14269, 'Joá': 7973, 'Vidigal': 1749,
    'Flamengo': 11099, 'Gávea': 13814, 'Rio Comprido': 1670, 'Engenho Novo': 3553, 'Humaitá': 11684,
    'São Cristóvão': 1812, 'Cosme Velho': 8460, 'Vila Isabel': 5085, 'Maracanã': 2297, 'Andaraí': 4961,
    'Saúde': 1813, 'Benfica': 1272, 'Lagoa': 15616, 'Méier': 2727, 'Marechal Hermes': 1213,
    'Penha': 1734, 'São Conrado': 12211, 'Urca': 11661, 'Todos os Santos': 2281, 'Grajaú': 4995,
    'Lins de Vasconcelos': 863, 'Parque Anchieta': 855, 'Del Castilho': 1621, 'Estacio': 1688, 'Cachambi': 1475,
    'Praça da Bandeira': 2285, 'Gamboa': 1666, 'Vaz Lobo': 1184, 'Irajá': 1204, 'Olaria': 1813,
    'Madureira': 1573, 'Bonsucesso': 1815, 'Rocha': 1740, 'Quintino Bocaiúva': 1372, 'Vila da Penha': 1987,
    'Tomás Coelho': 1199, 'Bento Ribeiro': 1499, 'Abolição': 1635, 'Brás de Pina': 1341, 'Riachuelo': 1797,
    'Guadalupe': 1171, 'Parada de Lucas': 1062, 'Grumari': 2601, 'Rocinha': 1531, 'Encantado': 1641,
    'Cordovil': 1249, 'Barra de Guaratiba': 1401, 'Cidade Nova': 2422, 'Maria da Graça': 1738, 'Piedade': 1547,
    'Oswaldo Cruz': 1305, 'Santo Cristo': 1526, 'Penha Circular': 1566, 'Higienópolis': 1831, 'Catumbi': 1158,
    'Engenho da Rainha': 1169, 'Pilares': 1577, 'Ramos': 1512, 'Cascadura': 1501, 'Sampaio': 1698,
    'Barros filho': 744, 'Jacaré': 1229, 'Rocha Miranda': 1331, 'Caju': 866, 'Colégio': 1070,
    'Coelho Neto': 1084, 'Mangueira': 490, 'Ricardo de Albuquerque': 1010, 'Anchieta': 965, 'Manguinhos': 439,
    'Pavuna': 2193, 'Vigário Geral': 1025, 'Vincente de Carvalho': 1272, 'Honório Gurgel': 3063, 'Cavalcante': 981,
    'Deodoro': 999, 'Complexo do Alemão': 544,
}

# Preview only: show the first (neighbourhood, m2) pairs as a two-column table.
pd.DataFrame.from_records(
    list(metro_quadrado.items()),
    columns=['neighbourhood', 'm2'],
).head()

Unnamed: 0,neighbourhood,m2
0,Ipanema,20908
1,Copacabana,12169
2,Leme,12211
3,Catete,10486
4,Lapa,3277


## Gerando os dataframes

In [4]:
# Build df_anuncios: the listings table without the raw 'amenities' column
# (amenities get their own dataframe, df_amenities).

df_anuncios = df_listings.drop(columns=['amenities'])
df_anuncios.head()

Unnamed: 0_level_0,host_name,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,latitude,longitude,property_type,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,calculated_host_listings_count,reviews_per_month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17878,Matthias,within an hour,100%,t,t,Copacabana,22020-050,-22.96592,-43.17896,Condominium,...,9.0,10.0,10.0,10.0,10.0,9.0,t,strict_14_with_grace_period,1,2.13
25026,Viviane,within a day,91%,f,t,Copacabana,22060-020,-22.97712,-43.19045,Apartment,...,9.0,10.0,9.0,10.0,10.0,9.0,f,strict_14_with_grace_period,3,2.04
31560,Renata,within an hour,100%,t,t,Ipanema,22410-003,-22.98302,-43.21427,Apartment,...,10.0,10.0,10.0,10.0,10.0,10.0,t,strict_14_with_grace_period,1,2.38
35636,Patricia,within an hour,100%,t,t,Ipanema,22081-020,-22.98816,-43.19359,Apartment,...,10.0,9.0,10.0,10.0,10.0,9.0,f,strict_14_with_grace_period,1,2.28
35764,Patricia Miranda & Paulo,within an hour,100%,t,t,Copacabana,21031-300,-22.98127,-43.19046,Loft,...,10.0,10.0,10.0,10.0,10.0,10.0,f,strict_14_with_grace_period,1,2.84


In [5]:
# Build df_amenities: one 0/1 indicator column per amenity, indexed like df_listings.

# Turn each raw amenities string into a list of tokens: strip the surrounding
# braces, replace spaces with underscores, remove double quotes, split on commas.
# NOTE(review): the split is a plain comma split, so an amenity whose quoted
# name itself contains a comma would be broken into several tokens - confirm
# this cannot happen in the source data.
amenity_lists = (
    df_listings['amenities']
    .fillna('')  # a missing value becomes "no amenities" instead of crashing the binarizer
    .str.strip('}{')
    .str.replace(' ', '_', regex=False)
    .str.replace('"', '', regex=False)
    .str.split(',')
)

mlb = MultiLabelBinarizer()
df_amenities = pd.DataFrame(
    mlb.fit_transform(amenity_lists),
    index=amenity_lists.index,
    columns=mlb.classes_,
).drop(
    # An empty amenities string splits into [''], which creates a spurious ''
    # column. errors='ignore' keeps the cell working when no such row exists.
    columns=[''],
    errors='ignore',
)
df_amenities.head()

Unnamed: 0_level_0,24-hour_check-in,Accessible-height_bed,Accessible-height_toilet,Air_conditioning,Amazon_Echo,BBQ_grill,Baby_bath,Baby_monitor,Babysitter_recommendations,Balcony,...,Wide_entrance,Wide_entrance_for_guests,Wide_entryway,Wide_hallways,Wifi,Window_guards,Wine_cooler,_toilet,translation_missing:_en.hosting_amenity_49,translation_missing:_en.hosting_amenity_50
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17878,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
25026,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
31560,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,1
35636,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
35764,0,1,1,1,0,0,0,0,0,0,...,0,1,1,0,1,0,0,0,0,1


In [6]:
# Build df_metro_quadrado: a single 'm2' column indexed by neighbourhood name,
# with rows in the same order as the metro_quadrado dict.

df_metro_quadrado = (
    pd.DataFrame
    .from_records(list(metro_quadrado.items()), columns=['neighbourhood', 'm2'])
    .set_index('neighbourhood')
)
df_metro_quadrado.head()

Unnamed: 0_level_0,m2
neighbourhood,Unnamed: 1_level_1
Ipanema,20908
Copacabana,12169
Leme,12211
Catete,10486
Lapa,3277


## Salvando os dataframes resultantes para as próximas etapas

In [7]:
# Persist the listings table (without amenities) for the next steps; the id index is written as the first column.
df_anuncios.to_csv('anuncios.csv', index=True)

In [8]:
# Persist the amenity indicator table for the next steps; the index is written as the first column.
df_amenities.to_csv('amenities.csv', index=True)

In [9]:
# Persist the per-neighbourhood square-metre table for the next steps; the neighbourhood index is written as the first column.
df_metro_quadrado.to_csv('metro_quadrado.csv', index=True)