In [2]:
import os
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import nltk


from itertools import chain

from branca.colormap import linear, LinearColormap
from statsmodels.formula.api import ols
from IPython.display import Markdown, IFrame
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from matplotlib.colors import rgb2hex

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import GammaRegressor
from sklearn.metrics import mean_squared_error, mean_gamma_deviance, mean_absolute_percentage_error
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMRegressor

from scipy.stats import expon

# A predição do preço foi escolhida para modelagem

## Bases

Existem 5 diferentes bases, mas a base lisitings é a mais completa enquanto as outras possuem poucas informações. Apenas ela será utilizada.

### Base listings

In [165]:
listings=pd.read_csv('http://data.insideairbnb.com/brazil/rj/rio-de-janeiro/2020-12-23/data/listings.csv.gz')

In [99]:
listings.head(3)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,17878,https://www.airbnb.com/rooms/17878,20201223235510,2020-12-30,"Very Nice 2Br in Copacabana w. balcony, fast WiFi",Discounts for long term stays. <br />- Large b...,This is the one of the bests spots in Rio. Bec...,https://a0.muscache.com/pictures/65320518/3069...,68997,https://www.airbnb.com/users/show/68997,...,10.0,10.0,9.0,,t,1,1,0,0,2.03
1,25026,https://www.airbnb.com/rooms/25026,20201223235510,2020-12-24,Beautiful Modern Decorated Studio in Copa,"Our apartment is a little gem, everyone loves ...",Copacabana is a lively neighborhood and the ap...,https://a0.muscache.com/pictures/3003965/68ebb...,3746246,https://www.airbnb.com/users/show/3746246,...,10.0,10.0,9.0,,f,11,11,0,0,1.85
2,35636,https://www.airbnb.com/rooms/35636,20201223235510,2020-12-24,Cosy flat close to Ipanema beach,This cosy apartment is just a few steps away ...,The apartment street is very quiet and safe ....,https://a0.muscache.com/pictures/20009355/38b6...,153232,https://www.airbnb.com/users/show/153232,...,10.0,10.0,9.0,,f,1,1,0,0,2.07


In [171]:
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25784 entries, 0 to 25783
Data columns (total 74 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            25784 non-null  int64  
 1   listing_url                                   25784 non-null  object 
 2   scrape_id                                     25784 non-null  int64  
 3   last_scraped                                  25784 non-null  object 
 4   name                                          25755 non-null  object 
 5   description                                   24423 non-null  object 
 6   neighborhood_overview                         14109 non-null  object 
 7   picture_url                                   25784 non-null  object 
 8   host_id                                       25784 non-null  int64  
 9   host_url                                      25784 non-null 

### Variáveis de interesse:

Diversas variáveis podem influir no preço, mas apenas algumas foram selecionadas numa tentativa de simplificar a modelagem. Essa triagem foi feita pela experiência do autor. Ex. número de banheiros é mais relevante na montagem do preço do que se o host tem foto.

Lista de variáveis escolhidas:

* bathrooms_text
* property_type
* room_type
* host_verifications
* accommodates
* amenities
* minimum_nights
* maximum_nights
* minimum_minimum_nights
* maximum_minimum_nights
* minimum_maximum_nights
* maximum_maximum_nights
* minimum_nights_avg_ntm
* maximum_nights_avg_ntm
* calculated_host_listings_count
* calculated_host_listings_count_entire_homes
* calculated_host_listings_count_private_rooms
* calculated_host_listings_count_shared_rooms
* reviews_per_month
* has_availability
* availability_30
* availability_60
* availability_90
* availability_365

In [182]:
listings_red=listings.iloc[:,[31,32,33,35,36,37,38,39,40,41,42,43,44,45,46,47,49,50,51,52,53,73]]

In [183]:
listings_red.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25784 entries, 0 to 25783
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   property_type           25784 non-null  object 
 1   room_type               25784 non-null  object 
 2   accommodates            25784 non-null  int64  
 3   bathrooms_text          25714 non-null  object 
 4   bedrooms                24058 non-null  float64
 5   beds                    25546 non-null  float64
 6   amenities               25784 non-null  object 
 7   price                   25784 non-null  object 
 8   minimum_nights          25784 non-null  int64  
 9   maximum_nights          25784 non-null  int64  
 10  minimum_minimum_nights  25784 non-null  int64  
 11  maximum_minimum_nights  25784 non-null  int64  
 12  minimum_maximum_nights  25784 non-null  int64  
 13  maximum_maximum_nights  25784 non-null  int64  
 14  minimum_nights_avg_ntm  25784 non-null

# Tratamento das variáveis objeto

## Variável preço

### Conversão em variável numérica

In [184]:
listings_red['price']=listings_red['price'].str.replace('[$,]','').astype(float)

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### Verificação de outliers

Criação da variável preço por pessoa, que é uma métrica mais verossímel para algumas análises

In [185]:
listings_red['price_person']=listings_red['price'].div(listings_red['accommodates'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [186]:
listings_red.price_person.quantile(.25)

66.66666666666667

In [187]:
listings_red.price_person.quantile(.75)

188.0

In [188]:
listings_red.price_person.max()

118653.2

Aparentemente existem valores estranhos. Eles serão filtrados usando tuckey fences.

In [190]:
q_low=listings_red["price_person"].quantile(0.25)
q_hi=listings_red["price_person"].quantile(0.75)
iqr=q_hi-q_low

list_o=listings_red[((listings_red['price_person']) > (q_low-iqr)) & ((listings_red['price_person']) < (q_hi+iqr))]

Variável list_o criada contemplando os dados sem outliers

In [191]:
list_o.price.max()

4643.0

In [192]:
i_i,z=list_o.shape

In [195]:
i_o,z=listings_red.shape

In [196]:
1-i_i/i_o

0.10715947874650944

Apenas 10.7 % dos dados foram perdidos, o que não justificar imputar tais dados.

In [199]:
list_o.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23021 entries, 0 to 25783
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   property_type           23021 non-null  object 
 1   room_type               23021 non-null  object 
 2   accommodates            23021 non-null  int64  
 3   bathrooms_text          22974 non-null  object 
 4   bedrooms                21407 non-null  float64
 5   beds                    22814 non-null  float64
 6   amenities               23021 non-null  object 
 7   price                   23021 non-null  float64
 8   minimum_nights          23021 non-null  int64  
 9   maximum_nights          23021 non-null  int64  
 10  minimum_minimum_nights  23021 non-null  int64  
 11  maximum_minimum_nights  23021 non-null  int64  
 12  minimum_maximum_nights  23021 non-null  int64  
 13  maximum_maximum_nights  23021 non-null  int64  
 14  minimum_nights_avg_ntm  23021 non-null

## Variável tipo de propriedade

## Variável banheiro

### Conversão em variável numérica

In [197]:
list_o.bathrooms_text.head(8)

0            1 bath
1            1 bath
2         1.5 baths
3            1 bath
4            1 bath
6    1 private bath
7         3.5 baths
8            1 bath
Name: bathrooms_text, dtype: object

In [162]:
list_o.bathrooms_text.describe()

count      22974
unique        44
top       1 bath
freq        9523
Name: bathrooms_text, dtype: object

A variável banheiro é um objeto com 44 categorias. No tratamento o valor numérico será considerado.

In [220]:
baths = list_o['bathrooms_text'].value_counts().to_frame()

In [221]:
baths['baths'] = baths.index.str.extract('([0-9\.]{1,3}[0-9]?)').values

In [222]:
list_o['baths'] = list_o['bathrooms_text'].map(baths['baths'].to_dict()).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Os valores faltantes de n_baths serão imputados usando a mediana.

In [223]:
list_o['baths']

0        1.0
1        1.0
2        1.5
3        1.0
4        1.0
        ... 
25777    2.5
25778    1.5
25780    1.0
25781    2.0
25783    1.0
Name: baths, Length: 23021, dtype: float64

### Verificação de outliers

In [230]:
list_o['baths'].min()

0.0

In [231]:
list_o['baths'].max()

20.0

Aparentemente não há outliers na variável baths