# Análise da Web com Python, Beautiful Soup e Selenium

## 1. Requests, BeautifulSoup e Chrome DevTools

In [1]:
import requests
from bs4 import BeautifulSoup

##### 1. biblioteca `Requests`
    * para obter a página
##### 2. BeautifulSoup
    * para encontrar elementos específicos
##### 3. Chrome DevTools
    * localizar os elementos

<img src='https://raw.githubusercontent.com/fredericmenezes/analise-do-airbnb/main/img-airbnb/img01.png'>

In [2]:
def get_listings(search_page):
    answer = requests.get(search_page, timeout=7)
    content = answer.content
    soup = BeautifulSoup(content, 'html.parser')
    listings = soup.find_all('div', 'cy5jw6o') # div de cada locacao

    return listings

<img src='https://raw.githubusercontent.com/fredericmenezes/analise-do-airbnb/main/img-airbnb/img02.png'>

Opções selecionadas para busca:
* São Luís - MA
* Check-in: 02/01/2023
* Checkout: 08/01/2023

<img src='https://raw.githubusercontent.com/fredericmenezes/analise-do-airbnb/main/img-airbnb/img03.png'>

Link copiado para ser utilizado na variável `airbnb_url`.

In [3]:
airbnb_url = 'https://www.airbnb.com.br/s/S%C3%A3o-Lu%C3%ADs-~-MA/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&query=S%C3%A3o%20Lu%C3%ADs%20-%20MA&place_id=ChIJIW1_b_CP9gcRR96jWeQCMZg&date_picker_type=calendar&checkin=2023-01-02&checkout=2023-01-08&source=structured_search_input_header&search_type=autocomplete_click'

In [4]:
listings = get_listings(airbnb_url)
len(listings)

20

In [5]:
print(listings[0].prettify())

<div aria-labelledby="title_683467078155547534" class="cy5jw6o dir dir-ltr" role="group">
 <a aria-labelledby="title_683467078155547534" class="bn2bl2p dir dir-ltr" href="/rooms/683467078155547534?check_in=2023-01-02&amp;check_out=2023-01-08&amp;previous_page_section_name=1000" rel="noopener noreferrer nofollow" target="listing_683467078155547534">
 </a>
 <div class="lwy0wad l1tup9az dir dir-ltr">
  <div class="g1qv1ctd cb4nyux dir dir-ltr">
   <div class="t1jojoys dir dir-ltr" id="title_683467078155547534">
    Condomínio em São Luís
   </div>
   <div class="f15liw5s s1cjsi4j dir dir-ltr">
   </div>
   <div class="nquyp1l s1cjsi4j dir dir-ltr">
    <span class="t6mzqp7 dir dir-ltr" lang="pt">
     Espaço aconchegante estilo industrial
    </span>
   </div>
   <div class="f15liw5s s1cjsi4j dir dir-ltr">
    <span class="dir dir-ltr">
     2 camas de solteiro
    </span>
   </div>
   <div class="phbjkf1 dir dir-ltr">
    <div style="--pricing-guest-display-price-alignment:flex-start;--p

##### Extraindo os dados

<img src='https://raw.githubusercontent.com/fredericmenezes/analise-do-airbnb/main/img-airbnb/img04.png'>

In [29]:
RULES_SEARCH_PAGE = {
    'url': {'tag': 'a', 'get': 'href'},
    'header': {'tag': 'div', 'class': 't1jojoys'},
    'name': {'tag': 'div', 'class': 'nquyp1l'},
    'rooms': {'tag': 'div', 'class': 'f15liw5s', 'order': 1},
    'rating_n_reviews': {'tag': 'span', 'class': 'r1dxllyb'},
    'price_1': {'tag': 'span', 'class': '_tyxjp1'},
    'price_2': {'tag': 'span', 'class': '_1y74zjx'},
    'badge': {'tag': 'div', 'class': 't1mwk1n0'},
}

In [30]:
def extract_element(listing_html, params):
    # 1. Encontra a etiqueta certa
    if 'class' in params:
        elements_found = listing_html.find_all(params['tag'], params['class'])
    else:
        elements_found = listing_html.find_all(params['tag'])

    # 2. Extrai o elemento certo
    tag_order = params.get('order', 0)
    element = elements_found[tag_order]
    
    # 3. Obter texto
    if 'get' in params:
        output = element.get(params['get'])
    else:
        output = element.get_text()

    return output

In [35]:
# o que acontece se o tiver a etiqueta na imagem (badge)
listagem = listings[4]

print(f"Titulo: {extract_element(listagem, RULES_SEARCH_PAGE['url'])}")
print(f"Titulo: {extract_element(listagem, RULES_SEARCH_PAGE['header'])}")
print(f"Descricao: {extract_element(listagem, RULES_SEARCH_PAGE['name'])}")
print(f"Camas: {extract_element(listagem, RULES_SEARCH_PAGE['rooms'])}")
print(f"Classificacao: {extract_element(listagem, RULES_SEARCH_PAGE['rating_n_reviews'])}")
print(f"Preco: {extract_element(listagem, RULES_SEARCH_PAGE['price_1'])}")
print(f"Preco: {extract_element(listagem, RULES_SEARCH_PAGE['price_2'])}")
print(f"Etiqueta: {extract_element(listagem, RULES_SEARCH_PAGE['badge'])}")

Titulo: /rooms/779692147118777188?check_in=2023-01-02&check_out=2023-01-08&previous_page_section_name=1000
Titulo: Apartamento em São Luís
Descricao: Flat Cult Frente Mar Litorânea
Camas: 2 camas
Classificacao: Novo


IndexError: list index out of range

In [36]:
# corrigindo a questao da falta de algum dos elementos
def extract_page_features(soup, rules):
    features_dict = {}
    for feature in rules:
        try:
            features_dict[feature] = extract_element(soup, rules[feature])
        except:
            features_dict[feature] = 'empty'
    
    return features_dict

In [37]:
extract_page_features(listings[4], RULES_SEARCH_PAGE)

{'url': '/rooms/779692147118777188?check_in=2023-01-02&check_out=2023-01-08&previous_page_section_name=1000',
 'header': 'Apartamento em São Luís',
 'name': 'Flat Cult Frente Mar Litorânea',
 'rooms': '2 camas',
 'rating_n_reviews': 'Novo',
 'price_1': 'empty',
 'price_2': 'R$286\xa0',
 'badge': 'Superhost'}

##### Paginação

<img src='https://miro.medium.com/max/564/1*Q9iBSu5nniBwc8Wt2-8Ujw.png'>

In [15]:
# funcao para pegar as divs de cada pagina
def get_url_pages(search_page):
    soup = BeautifulSoup(requests.get(search_page).content, 'html.parser')
    listings = soup.find('div', '_jro6t0')

    return listings

In [16]:
# funcao para pegar todas as urls das 15 paginas da pesquisa
def build_urls(main_url, pages_per_location=15):
    url_list = []
    url_list.append(main_url)
    new_url = main_url
    airbnb_url = 'https://www.airbnb.com.br'
    

    for i in range(pages_per_location-1):
        url_list.append(new_url)
        url_pages=get_url_pages(new_url)
        next_page = url_pages.find('a', {'class':'_1bfat5l'}).get('href') # pega a proxima pagina
        new_url = airbnb_url + next_page
        

    return url_list

In [17]:
url_list = build_urls(airbnb_url)

In [18]:
len(url_list)

15

In [19]:
url_list

['https://www.airbnb.com.br/s/S%C3%A3o-Lu%C3%ADs-~-MA/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&query=S%C3%A3o%20Lu%C3%ADs%20-%20MA&place_id=ChIJIW1_b_CP9gcRR96jWeQCMZg&date_picker_type=calendar&checkin=2023-01-02&checkout=2023-01-08&source=structured_search_input_header&search_type=autocomplete_click',
 'https://www.airbnb.com.br/s/S%C3%A3o-Lu%C3%ADs-~-MA/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&query=S%C3%A3o%20Lu%C3%ADs%20-%20MA&place_id=ChIJIW1_b_CP9gcRR96jWeQCMZg&date_picker_type=calendar&checkin=2023-01-02&checkout=2023-01-08&source=structured_search_input_header&search_type=autocomplete_click',
 'https://www.airbnb.com.br/s/São-Luís-~-MA/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=6&q

##### Raspando as páginas de pesquisa

In [20]:
def process_search_pages(url_list):
    features_list = []
    for page in url_list:
        listings = get_listings(page)
        for listing in listings:
            features = extract_page_features(listing, RULES_SEARCH_PAGE)
            features_list.append(features)

    return features_list

In [21]:
# fazendo em uma página
base_features = process_search_pages(url_list[4:5])

In [22]:
base_features

[{'url': '/rooms/46915030?check_in=2023-01-02&check_out=2023-01-08&previous_page_section_name=1000',
  'header': 'Quarto inteiro em Jardim Renascença',
  'name': 'Suíte casal em Apto compartilhado com anfitriã',
  'rooms': '1 cama de casal',
  'rating_n_reviews': '5,0 (7)',
  'price': 'R$163\xa0',
  'badge': 'empty'},
 {'url': '/rooms/37743324?check_in=2023-01-02&check_out=2023-01-08&previous_page_section_name=1000',
  'header': 'Apartamento em Calhau',
  'name': 'Apartamento em frente a melhor praia de São Luís!',
  'rooms': '2 camas de casal',
  'rating_n_reviews': '4,43 (84)',
  'price': 'R$535\xa0',
  'badge': 'Raridade'},
 {'url': '/rooms/591912254031427169?check_in=2023-01-02&check_out=2023-01-08&previous_page_section_name=1000',
  'header': 'Apartamento em Ponta D Areia',
  'name': 'Apartamento na melhor localização de São Luís.',
  'rooms': '3 camas',
  'rating_n_reviews': '4,5 (4)',
  'price': 'R$233\xa0',
  'badge': 'empty'},
 {'url': '/rooms/53921766?check_in=2023-01-02&chec

## 2. Páginas dinâmicas

Vamos inspecionar uma página de detalhes e tentar extrair um dos elementos

In [23]:
# listing name: div, _b8stb0
detail_url = 'https://airbnb.com' + base_features[0]['url']


answer = requests.get(detail_url)
detail_soup = BeautifulSoup(answer.content)

In [24]:
#detail_soup.find_all('div', '_b8stb0')
detail_soup.find_all('div', '_11jhslp')

[]

In [25]:
# algumas funções JS dentro
detail_soup

<!DOCTYPE html>
<html class="scrollbar-gutter" data-hyperloop-version="1" data-is-hyperloop="true" dir="ltr" lang="en"><head><meta charset="utf-8"/><meta content="en" name="locale"/><meta content="notranslate" name="google"/><meta content="authenticity_token" id="csrf-param-meta-tag" name="csrf-param"/><meta content="" id="csrf-token-meta-tag" name="csrf-token"/><meta content="" id="english-canonical-url"/><meta content="on" name="twitter:widgets:csp"/><meta content="yes" name="mobile-web-app-capable"/><meta content="yes" name="apple-mobile-web-app-capable"/><meta content="Airbnb" name="application-name"/><meta content="Airbnb" name="apple-mobile-web-app-title"/><meta content="#ffffff" name="theme-color"/><meta content="#ffffff" name="msapplication-navbutton-color"/><meta content="black-translucent" name="apple-mobile-web-app-status-bar-style"/><meta content="/?utm_source=homescreen" name="msapplication-starturl"/><script>(function() {
  var pgRequest = new XMLHttpRequest();
  var diff

### Selenium

Temos que instalar um chromedriver

In [26]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

servico = Service(ChromeDriverManager().install())

driver = webdriver.Chrome(service=servico)


[WDM] - Downloading: 100%|████████████████████████████████████████████████████████| 6.46M/6.46M [00:01<00:00, 3.55MB/s]


In [27]:
# Abrindo a página
driver.get(detail_url)

In [28]:
# obtendo html
page_detailed = driver.page_source

# fechando o driver
driver.quit()

# BS
detail_soup = BeautifulSoup(page_detailed)

In [29]:
detail_soup.find_all('div', '_b8stb0')
#detail_soup.find_all('div', '_11jhslp')

[<div class="_b8stb0"><span class="_1n81at5"><h1 class="_fecoyn4" elementtiming="LCP-target" tabindex="-1">Suíte casal em Apto compartilhado com anfitriã</h1></span></div>]

### Botões, tempo de carregamento

In [30]:
# podemos clicar nos botões
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains, ScrollOrigin

driver = webdriver.Chrome(service=servico)

driver.get(detail_url)

In [31]:
#driver.implicitly_wait(3)

In [40]:
# fechar mensagem no inicio

#fechar_msg = driver.find_element(By.CLASS_NAME,'_1piuevz') 

In [33]:
#element.click()

In [39]:
# ou pode fazer tudo de uma vez

#fechar_msg = driver.find_element(By.CLASS_NAME, "_1piuevz").click()

In [35]:
# encontrar o botão de comodidades

element = driver.find_element(By.CLASS_NAME,'b65jmrv') 

In [None]:
# clicar no botão
element.click()

In [42]:
# não esquecer de parar o driver
driver.quit()

##### Agora fazendo tudo de uma vez

In [43]:
# botao comodidades
driver = webdriver.Chrome(service=servico)

driver.get(detail_url)
driver.implicitly_wait(600)

#fechar_msg = driver.find_element(By.CLASS_NAME, "_1piuevz").click()

driver.implicitly_wait(150)
botao = driver.find_element(By.CLASS_NAME, "b65jmrv")
scroll_origin = ScrollOrigin.from_element(botao)
ActionChains(driver)\
    .scroll_from_origin(scroll_origin, 0, 500)\
    .perform()

driver.implicitly_wait(150)
driver.find_element(By.CLASS_NAME,'b65jmrv').click()

ElementClickInterceptedException: Message: element click intercepted: Element <button type="button" class="b65jmrv v7aged4 dir dir-ltr">...</button> is not clickable at point (191, 51). Other element would receive the click: <div class="_11g6x33">...</div>
  (Session info: chrome=107.0.5304.108)
Stacktrace:
Backtrace:
	Ordinal0 [0x0065ACD3+2075859]
	Ordinal0 [0x005EEE61+1633889]
	Ordinal0 [0x004EB7BD+571325]
	Ordinal0 [0x00521499+791705]
	Ordinal0 [0x0051F4AC+783532]
	Ordinal0 [0x0051D0AB+774315]
	Ordinal0 [0x0051BD37+769335]
	Ordinal0 [0x00511C76+728182]
	Ordinal0 [0x0053731C+881436]
	Ordinal0 [0x005115BF+726463]
	Ordinal0 [0x00537534+881972]
	Ordinal0 [0x0054B56A+963946]
	Ordinal0 [0x00537136+880950]
	Ordinal0 [0x0050FEFD+720637]
	Ordinal0 [0x00510F3F+724799]
	GetHandleVerifier [0x0090EED2+2769538]
	GetHandleVerifier [0x00900D95+2711877]
	GetHandleVerifier [0x006EA03A+521194]
	GetHandleVerifier [0x006E8DA0+516432]
	Ordinal0 [0x005F682C+1665068]
	Ordinal0 [0x005FB128+1683752]
	Ordinal0 [0x005FB215+1683989]
	Ordinal0 [0x00606484+1729668]
	BaseThreadInitThunk [0x7513FEF9+25]
	RtlGetAppContainerNamedObjectPath [0x771A7BBE+286]
	RtlGetAppContainerNamedObjectPath [0x771A7B8E+238]


In [47]:
driver.quit()

##### Temos que esperar até que apareça

In [45]:
import time

In [48]:
driver = webdriver.Chrome(service=servico)

driver.get(detail_url)
time.sleep(10)

#fechar_msg = driver.find_element(By.CLASS_NAME, "_1piuevz").click()

time.sleep(3)
botao = driver.find_element(By.CLASS_NAME, "b65jmrv")
scroll_origin = ScrollOrigin.from_element(botao)
ActionChains(driver)\
    .scroll_from_origin(scroll_origin, 0, 200)\
    .perform()

time.sleep(5)
driver.find_element(By.CLASS_NAME,'b65jmrv').click()

In [49]:
page_detailed = driver.page_source
driver.quit()
detail_soup_clicked = BeautifulSoup(page_detailed)

In [50]:
# sem comodidades antes
comodidades = detail_soup.find_all('div', {'class': '_11jhslp'})
len(comodidades)

0

In [51]:
# have them now
comodidades = detail_soup_clicked.find_all('div', {'class': '_11jhslp'})
len(comodidades)

12

### Economizando algum tempo
##### Precisamos de imagens?
Regards to Oguz ;)

In [52]:
from selenium.webdriver.chrome.options import Options

In [53]:
options = Options()
options.add_argument('--blink-settings=imagesEnabled=false')
driver = webdriver.Chrome(options=options, service=servico)

driver.get(detail_url)

In [54]:
driver.quit()

## 3. Clicando em mais botões

<img src='https://miro.medium.com/max/700/1*8b78NMFeRidmZDz35HTfvA.png'>

In [55]:
detail_url

'https://airbnb.com/rooms/46915030?check_in=2023-01-02&check_out=2023-01-08&previous_page_section_name=1000'

In [57]:
driver = webdriver.Chrome(service=servico)

#detail_url = 'https://www.airbnb.com/rooms/31741201?adults=4&check_in=2021-04-06&check_out=2021-04-13&federated_search_id=7941b65b-bf17-47dc-8fe0-bce247d0657e&source_impression_id=p3_1613151799_D%2BvOz7MMKLyJexNa&guests=1'
driver.get(detail_url)

In [None]:
#time.sleep(10)

# caso aparece mensagem no inicio habilitar o codigo abaixo
#fechar_msg = driver.find_element(By.CLASS_NAME, "_1piuevz").click()

In [58]:
# procurando o elemento do botão

element = driver.find_element(By.CLASS_NAME, "b65jmrv")

    se nenhum erro -> elemento está presente

In [59]:
# that doesn't work (in some cases)
element.click()

##### Outra maneira de clicar em um botão - podemos usar cadeias de ação (ActionChains)

In [60]:
from selenium.webdriver import ActionChains

In [61]:
# vamos tentar sem ver o botão
actions = ActionChains(driver)
actions.move_to_element(element)
actions.click().perform()

##### O elemento deve estar em uma viewport

In [62]:
# vamos rolar manualmente
actions = ActionChains(driver)
actions.move_to_element(element)
actions.click().perform()

### Rolagem com Selenium

In [63]:
actions = ActionChains(driver)
driver.execute_script("arguments[0].scrollIntoView(true);", element)

    Eu geralmente consigo com 4 tentativas

In [64]:
# finalmente clicando
actions.move_to_element(element)
actions.click().perform()

In [65]:
# ou
element.click()

In [66]:
driver.quit()

## 4. Raspando uma página de detalhes
### Modificando a função de extração

    Há uma quantidade arbitrária de alguns itens.

    Então, vamos apenas raspá-los todos e concatená-los.

    Nós nos preocuparemos com a limpeza de dados depois

In [67]:
# Próxima geração :)
def extract_element(listing_html, params):
    # 1. Encontre a tag certa
    if 'class' in params:
        elements_found = listing_html.find_all(params['tag'], params['class'])
    else:
        elements_found = listing_html.find_all(params['tag'])

    # 2. Extraia o texto dessas tags
    if 'get' in params:
        element_texts = [el.get(params['get']) for el in elements_found]
    else:
        element_texts = [el.get_text() for el in elements_found]
    
    # 2. Extraia o elemento certo
    tag_order = params.get('order', 0)
    element = elements_found[tag_order]
        
    # 3. Selecione um determinado texto ou concatene todos eles
    tag_order = params.get('order', 0)
    if tag_order == -1:
        output = '**__**'.join(element_texts)
    else:
        output = element_texts[tag_order]

    return output

In [68]:
extract_element(detail_soup_clicked, {'tag': 'div', 'class': '_gw4xx4', 'order': 0})

'Secador de cabelo'

In [69]:
extract_element(detail_soup_clicked, {'tag': 'div', 'class': '_gw4xx4', 'order': -1})

'Secador de cabelo**__**Produtos de limpeza**__**Água quente**__**Máquina de Lavar**__**Secadora**__**Básico**__**Cabides**__**Roupa de cama**__**Cobertores e travesseiros extras**__**Blackout nas cortinas**__**Ferro de passar**__**Varal para secar roupas**__**Cofre**__**Local para guardar as roupas**__**Conexão à Ethernet**__**HDTV com TV a cabo, HBO Max**__**Ar-condicionado central**__**Tranca na porta do quarto**__**Wi-Fi**__**Espaço de trabalho exclusivo**__**Cozinha**__**Refrigerador**__**Microondas**__**Itens básicos de cozinha**__**Louças e talheres**__**Freezer**__**Fogão**__**Forno**__**Cafeteira: Nespresso, cafeteira com coador**__**Taças de vinho**__**Utensílios para churrasco**__**Mesa de jantar**__**Lavanderia nas proximidades**__**Pátio ou varanda (Compartilhada)**__**Rede**__**Churrasqueira**__**Estacionamento incluído**__**Piscina compartilhada**__**Elevador**__**É permitido deixar as malas**__**Estadias de longa duração são permitidas**__**Self check-in**__**Funcionári

### Processando comodidades

In [70]:
#amenities[0]
comodidades[0]

<div class="_11jhslp"><div class="_ak5d0on"><h3 class="_14i3z6h" elementtiming="LCP-target" tabindex="-1">Banheiro</h3></div><div><div class="rcem0st dir dir-ltr" id="pdp_v3_bathroom_45_46915030-0"><div class="_jro6t0"><div class="_7ytxmo"><div class="_oyq8vf"><svg aria-hidden="true" focusable="false" role="presentation" style="display: block; height: 24px; width: 24px; fill: currentcolor;" viewbox="0 0 32 32" xmlns="http://www.w3.org/2000/svg"><path d="M14 27l-.005.2a4 4 0 0 1-3.789 3.795L10 31H4v-2h6l.15-.005a2 2 0 0 0 1.844-1.838L12 27zM10 1c.536 0 1.067.047 1.58.138l.38.077 17.448 3.64a2 2 0 0 1 1.585 1.792l.007.166v6.374a2 2 0 0 1-1.431 1.917l-.16.04-13.554 2.826 1.767 6.506a2 2 0 0 1-1.753 2.516l-.177.008H11.76a2 2 0 0 1-1.879-1.315l-.048-.15-1.88-6.769A9 9 0 0 1 10 1zm5.692 24l-1.799-6.621-1.806.378a8.998 8.998 0 0 1-1.663.233l-.331.008L11.76 25zM10 3a7 7 0 1 0 1.32 13.875l.331-.07L29 13.187V6.813L11.538 3.169A7.027 7.027 0 0 0 10 3zm0 2a5 5 0 1 1 0 10 5 5 0 0 1 0-10zm0 2a3 3 0 

* Cada comodidade tem um cabeçalho: class="_ak5d0on"
* Dentro de cada comodidade existem várias classes "_gw4xx4"

In [71]:
comodidades[0].find('div', '_ak5d0on').get_text()

'Banheiro'

In [72]:
comodidades[0].find_all('div', '_gw4xx4')

[<div class="_gw4xx4" id="pdp_v3_bathroom_45_46915030-0-row-title">Secador de cabelo</div>,
 <div class="_gw4xx4" id="pdp_v3_bathroom_665_46915030-0-row-title">Produtos de limpeza</div>,
 <div class="_gw4xx4" id="pdp_v3_bathroom_77_46915030-0-row-title">Água quente</div>]

In [73]:
# às vezes há mais elementos dentro
[a.get_text() for a in comodidades[-1].find_all('div', '_gw4xx4')]

['Indisponível: Câmeras de segurança na propriedadeCâmeras de segurança na propriedade',
 'Indisponível: Detector de fumaçaDetector de fumaça',
 'Indisponível: Alarme de monóxido de carbonoAlarme de monóxido de carbono',
 'Indisponível: XampuXampu',
 'Indisponível: Entrada privadaEntrada privada',
 'Indisponível: Aquecimento CentralAquecimento Central']

In [74]:
# não vamos nos aprofundar
[a.find(text=True) for a in comodidades[-1].find_all('div', '_gw4xx4')]

['Indisponível: Câmeras de segurança na propriedade',
 'Indisponível: Detector de fumaça',
 'Indisponível: Alarme de monóxido de carbono',
 'Indisponível: Xampu',
 'Indisponível: Entrada privada',
 'Indisponível: Aquecimento Central']

##### Colocando em uma função

In [75]:
import json

def extract_amenities(soup):
    amenities = soup.find_all('div', {'class': '_11jhslp'})
    
    amenities_dict = {}
    for amenity in amenities:
        header = amenity.find('div', {'class': '_ak5d0on'}).get_text()
        values = amenity.find_all('div', {'class': '_gw4xx4'})
        values = [v.find(text=True) for v in values]
        
        amenities_dict['amenity_' + header] = values
        
    return json.dumps(amenities_dict)

In [76]:
extract_amenities(detail_soup_clicked)

'{"amenity_Banheiro": ["Secador de cabelo", "Produtos de limpeza", "\\u00c1gua quente"], "amenity_Quarto e lavanderia": ["M\\u00e1quina de Lavar", "Secadora", "B\\u00e1sico", "Cabides", "Roupa de cama", "Cobertores e travesseiros extras", "Blackout nas cortinas", "Ferro de passar", "Varal para secar roupas", "Cofre", "Local para guardar as roupas"], "amenity_Entretenimento": ["Conex\\u00e3o \\u00e0 Ethernet", "HDTV com TV a cabo, HBO Max"], "amenity_Climatiza\\u00e7\\u00e3o": ["Ar-condicionado central"], "amenity_Seguran\\u00e7a dom\\u00e9stica": ["Tranca na porta do quarto"], "amenity_Internet e escrit\\u00f3rio": ["Wi-Fi", "Espa\\u00e7o de trabalho exclusivo"], "amenity_Cozinha e sala de jantar": ["Cozinha", "Refrigerador", "Microondas", "Itens b\\u00e1sicos de cozinha", "Lou\\u00e7as e talheres", "Freezer", "Fog\\u00e3o", "Forno", "Cafeteira: Nespresso, cafeteira com coador", "Ta\\u00e7as de vinho", "Utens\\u00edlios para churrasco", "Mesa de jantar"], "amenity_Caracter\\u00edsticas

In [77]:
string = json.loads(extract_amenities(detail_soup_clicked))

In [78]:
type(string)

dict

In [79]:
string

{'amenity_Banheiro': ['Secador de cabelo',
  'Produtos de limpeza',
  'Água quente'],
 'amenity_Quarto e lavanderia': ['Máquina de Lavar',
  'Secadora',
  'Básico',
  'Cabides',
  'Roupa de cama',
  'Cobertores e travesseiros extras',
  'Blackout nas cortinas',
  'Ferro de passar',
  'Varal para secar roupas',
  'Cofre',
  'Local para guardar as roupas'],
 'amenity_Entretenimento': ['Conexão à Ethernet',
  'HDTV com TV a cabo, HBO Max'],
 'amenity_Climatização': ['Ar-condicionado central'],
 'amenity_Segurança doméstica': ['Tranca na porta do quarto'],
 'amenity_Internet e escritório': ['Wi-Fi', 'Espaço de trabalho exclusivo'],
 'amenity_Cozinha e sala de jantar': ['Cozinha',
  'Refrigerador',
  'Microondas',
  'Itens básicos de cozinha',
  'Louças e talheres',
  'Freezer',
  'Fogão',
  'Forno',
  'Cafeteira: Nespresso, cafeteira com coador',
  'Taças de vinho',
  'Utensílios para churrasco',
  'Mesa de jantar'],
 'amenity_Características da localização': ['Lavanderia nas proximidades'

### Todas as características

In [None]:
RULES_DETAIL_PAGE = {
    'location': {'tag': 'span', 'class': '_9xiloll'},
    
    'specialties_1': {'tag': 'div', 'class': '_1qsawv5', 'order': -1},
    #'specialties_2': {'tag': 'div', 'class': '_1qsawv5', 'order': -1},

    #'price_per_night': {'tag': 'div', 'class': '_ymq6as'},
    'price_per_night': {'tag': 'span', 'class': '_tyxjp1'},
    
    'refundables': {'tag': 'div', 'class': '_cexc0g', 'order': -1},
        
    'prices_1': {'tag': 'li', 'class': '_ryvszj', 'order': -1},
    'prices_2': {'tag': 'li', 'class': '_adhikmk', 'order': -1},
    
    'listing_ratings': {'tag': 'span', 'class': '_4oybiu', 'order': -1},
    
    'host_joined': {'tag': 'div', 'class': '_1fg5h8r', 'order': 1},
    'host_feats': {'tag': 'span', 'class': '_pog3hg', 'order': -1},
    
    'lang_responses': {'tag': 'li', 'class': '_1q2lt74', 'order': -1},
    'house_rules': {'tag': 'div', 'class': '_u827kd', 'order': -1},
}

In [None]:
def extract_soup_js(listing_url, waiting_time=[3, 1]):
    """Extracts HTML from JS pages: open, wait, click, wait, extract"""

    options = Options()
    options.add_argument('--headless')
    options.add_argument('--blink-settings=imagesEnabled=false')
    driver = webdriver.Chrome(options=options)

    driver.get(listing_url)
    time.sleep(waiting_time[0])
        
    # looking for price details
    price_dropdown = 0
    try:
        element = driver.find_element_by_class_name('_gby1jkw')
        price_dropdown = 1
    except:
        pass

    # if the element is present - click on it
    if price_dropdown == 1:
        for i in range(10): # 10 attempts to scroll to the price button
            try:
                actions = ActionChains(driver)
                driver.execute_script("arguments[0].scrollIntoView(true);", element);
                actions.move_to_element_with_offset(element, 5, 5)
                actions.click().perform()
                break
            except:
                pass
        
    driver.execute_script("window.scrollTo(0, 0);")
    try:
        driver.find_element_by_class_name('_13e0raay').click()
    except:
        pass # amentities button not found

    time.sleep(waiting_time[1])

    detail_page = driver.page_source

    driver.quit()

    return BeautifulSoup(detail_page, features='html.parser')

In [None]:
# Scrape single detail page
def process_detail_page(url):
    soup = extract_soup_js(url, waiting_time=[3, 1])
    
    features_list = []
    features = extract_page_features(soup, RULES_DETAIL_PAGE)
    features['amenities'] = extract_amenities(soup)
    features_list.append(features)

    return features_list

##### Measuring time

In [None]:
t0 = time.time()
detail_features = process_detail_page(detail_url)
print(time.time() - t0)

In [None]:
detail_features

## 5. Let's parallelize

* ~10 second * 300 listings = almost 1 hour
* CPU load = 3-8%

We could:
- open multiple Chrome windows at once
- mock clicks there and extract the elements

... All in parallel

In [None]:
# CPU intensive process -> use multiprocessing :)
from multiprocessing import Pool

Should find an optimal number of pools:
- Too few - not fast enough -> slow
- Too many - will lack resources for scraping -> missing data

In [None]:
# typically we could set "n" to the number of cpu's
import os
os.cpu_count()

### Experiment with pools and waiting times. Look at script time and missing data

In [None]:
listings_urls = ['https://www.airbnb.com'+l['url'] for l in base_features]

In [None]:
len(listings_urls)

In [None]:
# check the ratio of empty values
def check_empty(features):
    # -2 as we have 2 prices (-1) and 2 specialties (-1)
    cnt, cnt_empty = -2, -2
    for listing in features:
        for key in listing[0]:
            cnt += 1
            if listing[0][key] == 'empty':
                cnt_empty += 1
    return cnt_empty/cnt

In [None]:
for n_pools in [4,8]:
    t0 = time.time()

    with Pool(n_pools) as pool:
        result = pool.map(process_detail_page, listings_urls)
    pool.close()
    pool.join()

    print(f"n_pool={n_pools}\n\ttime={round(time.time() - t0, 2)}\n\tempty_ratio={round(check_empty(result), 2)}")

##### Waiting times
Our waiting times were [3, 1]

We could try to be more patient: [5, 2]

In [None]:
def process_detail_page(url):
    soup = extract_soup_js(url, waiting_time=[5, 2])
    
    features_list = []
    features = extract_page_features(soup, RULES_DETAIL_PAGE)
    features['amenities'] = extract_amenities(soup)
    features_list.append(features)

    return features_list

In [None]:
# and repeat for 8 pools only
for n_pools in [8]:
    t0 = time.time()

    with Pool(n_pools) as pool:
        result = pool.map(process_detail_page, listings_urls)
    pool.close()
    pool.join()

    print(f"n_pool={n_pools}\n\ttime={round(time.time() - t0, 2)}\n\tempty_ratio={round(check_empty(result), 2)}")

##### Was there an improvement?
* 8 cores are faster than 4, but scrape less data
* adjusting waiting time help

## Summary

##### The script
https://github.com/x-technology/airbnb-analytics/blob/main/Part%201%20-%20Web%20Scraping/airbnb_parser.py

##### Some data
https://github.com/x-technology/airbnb-analytics/blob/main/Part%201%20-%20Web%20Scraping/data_sample.csv

### Issues to consider
- missing data
- dirty data
- A/B tests
- new names for classes or new page structure
- blocking our scraper
- ...

### Next steps in a project
To build ML models we must have clean data

In [None]:
rooms_dirty = '7 guests · 4 bedrooms · 4 beds · 3 baths'

In [None]:
rooms_dirty.split(' · ')

In [None]:
lang_responses = 'Languages: English, Deutsch**__**Response rate: 100%**__**Response time: within an hour'

In [None]:
lang_responses.split('**__**')

Steps to implement:
- clean the data
- build features
- think of filling the empty values

___
### All imports in one cell (just in case)

In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import ActionChains

import json
import time

import pandas as pd

from multiprocessing import Pool

import os