## Importando as bibliotecas

In [1]:
import pandas as pd
import numpy as np

## Instanciando os dataframes

In [2]:
# Load the three input CSV files with the same parser settings and print the
# shape (rows, columns) of each frame right after it is read.
csv_options = dict(engine='python', sep=',', encoding="utf-8")

df_anuncios = pd.read_csv('anuncios.csv', index_col='id', **csv_options)
print('df_anuncios: {}'.format(df_anuncios.shape))

df_amenities = pd.read_csv('amenities.csv', index_col='id', **csv_options)
print('df_amenities: {}'.format(df_amenities.shape))

# This file is indexed by neighbourhood name rather than by listing id.
df_metro_quadrado = pd.read_csv('metro_quadrado.csv', index_col='neighbourhood', **csv_options)
print('df_metro_quadrado: {}'.format(df_metro_quadrado.shape))

df_anuncios: (33715, 35)
df_amenities: (33715, 168)
df_metro_quadrado: (97, 1)


## Agregando os dados num único dataframe

In [3]:
# Place the listing columns and the amenity columns side by side.
# pd.concat with a column axis aligns the two frames on their index ('id').
frames_by_id = [df_anuncios, df_amenities]
df_airbnb = pd.concat(frames_by_id, axis='columns')

In [4]:
# Attach the columns of df_metro_quadrado to every listing, matching the
# listing's 'neighbourhood' value against the index of df_metro_quadrado.
# Left join: listings with no matching neighbourhood are kept (values become NaN).
df_airbnb = df_airbnb.join(
    other=df_metro_quadrado,
    how='left',
    on='neighbourhood',
)

## Limpando os dados

In [5]:
# Descriptive statistics of the merged frame, computed before any cleaning
# (rows with missing values have not been dropped yet at this point).

df_airbnb.describe()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,guests_included,minimum_nights,maximum_nights,number_of_reviews,...,Wide_entrance_for_guests,Wide_entryway,Wide_hallways,Wifi,Window_guards,Wine_cooler,_toilet,translation_missing:_en.hosting_amenity_49,translation_missing:_en.hosting_amenity_50,m2
count,33715.0,33715.0,33715.0,33661.0,33673.0,33667.0,33715.0,33715.0,33715.0,33715.0,...,33715.0,33715.0,33715.0,33715.0,33715.0,33715.0,33715.0,33715.0,33715.0,31747.0
mean,-22.965208,-43.254228,4.199941,1.694765,1.646126,2.574242,1.718434,4.789826,957.1741,9.374344,...,0.072876,0.045855,0.069465,0.888863,0.03144,0.000119,0.01661,0.089367,0.137565,11363.364601
std,0.035244,0.097488,2.625252,1.508228,1.075649,2.12149,1.585645,22.640328,54736.14,24.815311,...,0.259936,0.209174,0.254246,0.314307,0.174506,0.010892,0.127806,0.285277,0.344448,5170.018781
min,-23.0734,-43.73709,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,439.0
25%,-22.98471,-43.32336,2.0,1.0,1.0,1.0,1.0,1.0,30.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,9334.0
50%,-22.97085,-43.20022,4.0,1.0,1.0,2.0,1.0,2.0,1125.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,11378.0
75%,-22.946725,-43.187085,6.0,2.0,2.0,3.0,2.0,4.0,1125.0,6.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12169.0
max,-22.75038,-43.10406,160.0,200.0,20.0,69.0,25.0,1123.0,10000000.0,372.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,23625.0


In [6]:
# Keep only complete records: every row that has at least one missing value
# in any column is discarded.
df_airbnb = df_airbnb.dropna(axis='index', how='any')

In [7]:
# Format the attributes: turn percentage / currency strings into floats, cast
# 'bedrooms' to int and round the score and coordinate columns.
#
# Each string conversion is skipped when the column is already numeric, so the
# cell can be re-run without raising on `.str` or dividing the rate by 100 twice.

is_numeric = pd.api.types.is_numeric_dtype

# 'NN%' strings -> fraction (e.g. '91%' -> 0.91).
if not is_numeric(df_airbnb['host_response_rate']):
    df_airbnb['host_response_rate'] = (
        df_airbnb['host_response_rate'].str.rstrip('%').astype('float') / 100
    )

# '$1,234.00' strings -> 1234.0 (drop the leading '$' and the thousands separator).
money_columns = ['price', 'security_deposit', 'cleaning_fee', 'extra_people']
for money_column in money_columns:
    if not is_numeric(df_airbnb[money_column]):
        df_airbnb[money_column] = (
            df_airbnb[money_column]
            .str.lstrip('$')
            .str.replace(',', '', regex=False)
            .astype('float')
        )

df_airbnb['bedrooms'] = df_airbnb['bedrooms'].astype(int)

# DataFrame.round returns a NEW frame, so the result must be assigned back;
# otherwise the rounding is only displayed and then lost.
# Keys that are not columns of the frame are silently ignored by round(), which
# is why the previous misspelling of 'review_scores_communication' went unnoticed.
df_airbnb = df_airbnb.round({
    "review_scores_rating": 2,
    "review_scores_accuracy": 2,
    "review_scores_cleanliness": 2,
    "review_scores_checkin": 2,
    "review_scores_communication": 2,
    "review_scores_location": 2,
    "review_scores_value": 2,
    "reviews_per_month": 2,
    "latitude": 6,
    "longitude": 6,
})

# Last expression of the cell: render the formatted frame.
df_airbnb

Unnamed: 0_level_0,host_name,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,latitude,longitude,property_type,...,Wide_entrance_for_guests,Wide_entryway,Wide_hallways,Wifi,Window_guards,Wine_cooler,_toilet,translation_missing:_en.hosting_amenity_49,translation_missing:_en.hosting_amenity_50,m2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17878,Matthias,within an hour,1.00,t,t,Copacabana,22020-050,-22.96592,-43.17896,Condominium,...,0,0,0,1,0,0,0,0,0,12169.0
25026,Viviane,within a day,0.91,f,t,Copacabana,22060-020,-22.97712,-43.19045,Apartment,...,0,0,0,1,0,0,0,0,0,12169.0
31560,Renata,within an hour,1.00,t,t,Ipanema,22410-003,-22.98302,-43.21427,Apartment,...,0,0,0,1,0,0,0,1,1,20908.0
35636,Patricia,within an hour,1.00,t,t,Ipanema,22081-020,-22.98816,-43.19359,Apartment,...,0,0,0,1,0,0,0,0,0,20908.0
35764,Patricia Miranda & Paulo,within an hour,1.00,t,t,Copacabana,21031-300,-22.98127,-43.19046,Loft,...,1,1,0,1,0,0,0,0,1,12169.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40009653,Lya,within a few hours,1.00,f,f,Copacabana,22081-000,-22.98304,-43.19569,Apartment,...,0,0,0,1,0,0,0,0,0,12169.0
40025707,Marcos Antonio,within an hour,1.00,f,t,Copacabana,22040-001,-22.97132,-43.18929,Apartment,...,0,0,0,1,0,0,0,0,0,12169.0
40033232,Priscila,within a few hours,0.92,f,t,Santa Teresa,20241-080,-22.91599,-43.17894,Apartment,...,0,0,0,1,0,0,0,0,0,7869.0
40050039,Roberto,within a few hours,0.88,f,f,Copacabana,22040-010,-22.96768,-43.18330,Apartment,...,0,0,0,1,0,0,0,0,0,12169.0


## Exibindo o dataframe após o processamento

In [8]:
# Display df_airbnb as it stands after the cleaning cells (last expression is rendered by Jupyter).
df_airbnb

Unnamed: 0_level_0,host_name,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,latitude,longitude,property_type,...,Wide_entrance_for_guests,Wide_entryway,Wide_hallways,Wifi,Window_guards,Wine_cooler,_toilet,translation_missing:_en.hosting_amenity_49,translation_missing:_en.hosting_amenity_50,m2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17878,Matthias,within an hour,1.00,t,t,Copacabana,22020-050,-22.96592,-43.17896,Condominium,...,0,0,0,1,0,0,0,0,0,12169.0
25026,Viviane,within a day,0.91,f,t,Copacabana,22060-020,-22.97712,-43.19045,Apartment,...,0,0,0,1,0,0,0,0,0,12169.0
31560,Renata,within an hour,1.00,t,t,Ipanema,22410-003,-22.98302,-43.21427,Apartment,...,0,0,0,1,0,0,0,1,1,20908.0
35636,Patricia,within an hour,1.00,t,t,Ipanema,22081-020,-22.98816,-43.19359,Apartment,...,0,0,0,1,0,0,0,0,0,20908.0
35764,Patricia Miranda & Paulo,within an hour,1.00,t,t,Copacabana,21031-300,-22.98127,-43.19046,Loft,...,1,1,0,1,0,0,0,0,1,12169.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40009653,Lya,within a few hours,1.00,f,f,Copacabana,22081-000,-22.98304,-43.19569,Apartment,...,0,0,0,1,0,0,0,0,0,12169.0
40025707,Marcos Antonio,within an hour,1.00,f,t,Copacabana,22040-001,-22.97132,-43.18929,Apartment,...,0,0,0,1,0,0,0,0,0,12169.0
40033232,Priscila,within a few hours,0.92,f,t,Santa Teresa,20241-080,-22.91599,-43.17894,Apartment,...,0,0,0,1,0,0,0,0,0,7869.0
40050039,Roberto,within a few hours,0.88,f,f,Copacabana,22040-010,-22.96768,-43.18330,Apartment,...,0,0,0,1,0,0,0,0,0,12169.0


In [9]:
# Descriptive statistics again, now on the cleaned frame, for comparison with the earlier summary.
df_airbnb.describe()

Unnamed: 0,host_response_rate,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price,security_deposit,cleaning_fee,...,Wide_entrance_for_guests,Wide_entryway,Wide_hallways,Wifi,Window_guards,Wine_cooler,_toilet,translation_missing:_en.hosting_amenity_49,translation_missing:_en.hosting_amenity_50,m2
count,9860.0,9860.0,9860.0,9860.0,9860.0,9860.0,9860.0,9860.0,9860.0,9860.0,...,9860.0,9860.0,9860.0,9860.0,9860.0,9860.0,9860.0,9860.0,9860.0,9860.0
mean,0.918131,-22.970369,-43.230659,4.232657,1.597363,1.568154,2.639249,345.83712,654.215416,145.861562,...,0.134787,0.083874,0.124949,0.941176,0.057302,0.000406,0.030122,0.094726,0.138844,12595.971805
std,0.176144,0.028152,0.080292,2.334539,0.888044,0.99513,1.963967,677.173787,1360.142918,102.36473,...,0.341513,0.277213,0.330678,0.235306,0.232431,0.020138,0.170931,0.292851,0.345801,5178.58597
min,0.0,-23.07262,-43.56764,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,490.0
25%,0.9,-22.98462,-43.22483,2.0,1.0,1.0,1.0,139.0,0.0,90.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,9334.0
50%,1.0,-22.97511,-43.19281,4.0,1.0,1.0,2.0,210.0,398.0,150.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12169.0
75%,1.0,-22.96272,-43.18508,5.0,2.0,2.0,3.0,361.0,800.0,190.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12169.0
max,1.0,-22.80657,-43.15908,25.0,15.0,15.0,35.0,31398.0,20991.0,1469.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,23625.0


## Salvando o dataframe resultante para as próximas etapas

In [10]:
# Persist the cleaned frame for the next stages; the 'id' index is written as
# the first column and the file is encoded as UTF-8 (the same encoding used for reading).
df_airbnb.to_csv('airbnb.csv', index=True, encoding="utf-8")