In [611]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from time import sleep
from lxml import html
import pandas as pd
import requests
import whisper
import pdb

In [612]:
def initial_information(return_indexes=False):
    options = webdriver.ChromeOptions()
    #options.add_argument("--headless=new")
    ua = UserAgent()
    user_agent = ua.random
    options.add_argument(f'--user-agent={user_agent}')

    navegador = webdriver.Chrome(options=options)

    navegador.get('https://sisaps.saude.gov.br/sisvan/relatoriopublico/index')

    checkbox = navegador.find_elements(By.CSS_SELECTOR, "a.showSingle[target='1']")
    checkbox[0].click()

    year_selector = navegador.find_elements(By.CSS_SELECTOR, "select[id='nuAno']")
    year_select = Select(year_selector[0])
    year_select.select_by_visible_text('2023')
    sleep(0.1)

    month_selector = navegador.find_elements(By.CSS_SELECTOR, "select[id='nuMes']")
    month_select = Select(month_selector[0])
    month_select.select_by_visible_text('TODOS')
    sleep(0.1)

    group_selector = navegador.find_elements(By.CSS_SELECTOR, "select[name='tpFiltro']")
    group_select = Select(group_selector[0])
    group_select.select_by_value('M')
    sleep(0.5)

    uf_selector = navegador.find_elements(By.CSS_SELECTOR, "select[id='coUfIbge']")
    uf_select = Select(uf_selector[0])
    uf_select.select_by_visible_text('CE')
    sleep(0.5)

    # Selecionar todos os municípios
    p_selector = navegador.find_elements(By.CSS_SELECTOR, "select[id='coMunicipioIbge']")
    p_select = Select(p_selector[0])
    p_select.select_by_visible_text('TODOS')

    life_selector = navegador.find_elements(By.CSS_SELECTOR, "select[name='nu_ciclo_vida']")
    life_select = Select(life_selector[0])
    life_select.select_by_visible_text('CRIANÇA')
    sleep(0.5)

    age1_selector = navegador.find_elements(By.CSS_SELECTOR, "select[id='nu_idade_inicio']")
    age1_select = Select(age1_selector[0])
    age1_select.select_by_visible_text('0')
    sleep(0.5)

    age2_selector = navegador.find_elements(By.CSS_SELECTOR, "select[id='nu_idade_fim']")
    age2_select = Select(age2_selector[0])
    age2_select.select_by_visible_text('< 5 anos')

    if return_indexes == True:
        # Obter lista de índices
        idx_selector = navegador.find_elements(By.CSS_SELECTOR, "select[id='nu_indice_cri']")
        idx_select = Select(idx_selector[0])
        indices = []
        for option in idx_select.options:
            indices.append(option.text)
        return navegador, indices

    return navegador 
  

In [613]:
navegador, indices = initial_information(return_indexes=True)
navegador.quit()

The chromedriver version (117.0.5938.149) detected in PATH at /bin/chromedriver might not be compatible with the detected chrome version (118.0.5993.70); currently, chromedriver 118.0.5993.70 is recommended for chrome 118.*, so it is advised to delete the driver in PATH and retry


In [614]:
def index_selection(navegador, index):
    index_selector = navegador.find_elements(By.CSS_SELECTOR, "select[id='nu_indice_cri']")
    index_select = Select(index_selector[0])
    index_select.select_by_visible_text(index)

In [615]:
model = whisper.load_model("base")

In [616]:
def transcribe(url):
    with open('.temp', 'wb') as f:
        f.write(requests.get(url).content)
    result = model.transcribe('.temp')
    return result["text"].strip()

def click_checkbox(navegador):
    navegador.switch_to.default_content()
    navegador.switch_to.frame(navegador.find_element(By.XPATH, ".//iframe[@title='reCAPTCHA']"))
    sleep(4)
    
    navegador.find_element(By.ID, "recaptcha-anchor-label").click()
    navegador.switch_to.default_content()

def request_audio_version(navegador):
    navegador.switch_to.default_content()
    
    wait = WebDriverWait(navegador, 5)

    try:
        wait.until(EC.visibility_of_element_located((By.XPATH, "//*[contains(text(), 'Confirme que você não é um robô.')]")))
        navegador.switch_to.frame(navegador.find_element(By.XPATH, ".//iframe[@title='recaptcha challenge expires in two minutes']"))
        navegador.find_element(By.ID, "recaptcha-audio-button").click()
        return True 
    except TimeoutException:
        return False

def solve_audio_captcha(navegador):
    text = transcribe(navegador.find_element(By.ID, "audio-source").get_attribute('src'))
    navegador.find_element(By.ID, "audio-response").send_keys(text)
    navegador.find_element(By.ID, "recaptcha-verify-button").click()
    navegador.switch_to.default_content()


In [617]:
def get_table(navegador, id, data):
    title = navegador.find_element(By.CSS_SELECTOR, "thead > tr > th").text.strip()

    head_rows = navegador.find_elements(By.CSS_SELECTOR, "thead > tr")
    subtitle = head_rows[1].find_elements(By.CSS_SELECTOR, "th")[5].text.strip()
    subtitle = subtitle.replace('\n', ' ')

    body_rows = navegador.find_elements(By.CSS_SELECTOR, "tbody > tr")

    municipios = []
    qnt = []

    for rw in body_rows:
        if id == 0:
            municipios.append(rw.find_elements(By.CSS_SELECTOR, "td")[4].text.strip())
        qnt.append(rw.find_elements(By.CSS_SELECTOR, "td")[6].text.strip())


    if id == 0:
        municipios = municipios[:-3]
        data['Município'] = municipios
    
    qnt = qnt[:-3]         
    data[title + ' / ' + subtitle] = qnt

    return data

In [619]:
data = pd.DataFrame()

for id, indice in enumerate(indices):
    navegador = initial_information()

    index_selection(navegador, indice)

    click_checkbox(navegador)
    request = request_audio_version(navegador)
    sleep(1)

    if request == True:
        solve_audio_captcha(navegador)
        sleep(10)

    navegador.find_elements(By.CSS_SELECTOR, "button#verTela[type='submit']")[0].click()

    WebDriverWait(navegador, 10).until(
    EC.number_of_windows_to_be(2)
    )   
    child = navegador.window_handles[1]
    navegador.switch_to.window(child)

    WebDriverWait(navegador, 999999999).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, "table.table-striped"))
    )

    navegador.switch_to.window(navegador.window_handles[1])

    data = get_table(navegador, id, data)

    navegador.quit()
    

The chromedriver version (117.0.5938.149) detected in PATH at /bin/chromedriver might not be compatible with the detected chrome version (118.0.5993.70); currently, chromedriver 118.0.5993.70 is recommended for chrome 118.*, so it is advised to delete the driver in PATH and retry
The chromedriver version (117.0.5938.149) detected in PATH at /bin/chromedriver might not be compatible with the detected chrome version (118.0.5993.70); currently, chromedriver 118.0.5993.70 is recommended for chrome 118.*, so it is advised to delete the driver in PATH and retry
The chromedriver version (117.0.5938.149) detected in PATH at /bin/chromedriver might not be compatible with the detected chrome version (118.0.5993.70); currently, chromedriver 118.0.5993.70 is recommended for chrome 118.*, so it is advised to delete the driver in PATH and retry
The chromedriver version (117.0.5938.149) detected in PATH at /bin/chromedriver might not be compatible with the detected chrome version (118.0.5993.70); cur

In [620]:
data

Unnamed: 0,Município,PESO X IDADE / Peso Muito Baixo para a Idade,PESO X ALTURA / Magreza acentuada,ALTURA X IDADE / Altura Muito Baixa para a Idade,IMC X IDADE / Magreza acentuada
0,ABAIARA,0.69%,2.27%,4.86%,4.17%
1,ACARAPE,0.54%,2.97%,7.84%,5.26%
2,ACARAU,0.44%,0.91%,4.4%,1.41%
3,ACOPIARA,0.87%,2.75%,5.66%,4.05%
4,AIUABA,0.47%,1.43%,5.85%,3.95%
...,...,...,...,...,...
182,VARZEA ALEGRE,0.59%,1.54%,3.07%,2.42%
183,VICOSA DO CEARA,0.62%,1.7%,5.06%,2.2%
184,1.93%,86.67%,57.07%,88.42%,54.93%
185,2.79%,87.4%,62.04%,87.71%,59.38%


In [621]:
data.to_csv("sisvan.csv", index=False)