In [1]:
from selenium import webdriver
from ray.util.multiprocessing import Pool
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from time import sleep
from lxml import html
import pandas as pd
import numpy as np
import threading
import requests
import pickle
import pdb
import ray

In [2]:
def initialize_webdriver():
    service = Service(executable_path='../../../../usr/bin/chromedriver')
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless=new")
    navegador = webdriver.Chrome(service=service, options=options)

    navegador.get('https://cecad.cidadania.gov.br/tab_cad.php')

    # Marcar opção "Com marcação PBF (Agosto 2023)"
    checkbox = navegador.find_elements(By.CSS_SELECTOR, "input[value='PBF']")
    checkbox[0].click()

    return navegador

In [3]:
navegador = initialize_webdriver()

In [4]:
# Marcar opção "Com marcação PBF (Agosto 2023)"
checkbox = navegador.find_elements(By.CSS_SELECTOR, "input[value='PBF']")
checkbox[0].click()

In [5]:
# Obter lista de estados do Brasil
uf_selector = navegador.find_elements(By.CSS_SELECTOR, "select[name='uf_ibge']")
uf_select = Select(uf_selector[0])
states = []
for option in uf_select.options:
    states.append(option.text)
states = states[1:-6]    

In [6]:
# Obter lista de métricas
metric_selector = navegador.find_elements(By.CSS_SELECTOR, "select[name='var1']")
metric_select = Select(metric_selector[0])
metrics = []
for option in metric_select.options:
    metrics.append(option.text)
metrics = metrics[1:]      

In [7]:
navegador.close()

In [8]:
# Obter lista de municípios do estado
def get_municipalities(navegador):
    WebDriverWait(navegador, 20).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, "div#selector_municipiosSAGIUF"))
    )

    p_selector = navegador.find_elements(By.CSS_SELECTOR, "select[name='p_ibge']")
    p_select = Select(p_selector[0])
    municipios = []
    for option in p_select.options:
        municipios.append(option.text)
    municipios = municipios[2:]
    return municipios   

In [9]:
# Selecionar um estado
def state_selection(state, navegador):
    p_selector = navegador.find_elements(By.CSS_SELECTOR, "select[name='uf_ibge']")
    p_select = Select(p_selector[0])
    p_select.select_by_visible_text(state)

In [10]:
# Selecionar um município
def municipality_selection(municipality, navegador):
    
    WebDriverWait(navegador, 20).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, "div#selector_municipiosSAGIUF"))
    )

    p_selector = navegador.find_elements(By.CSS_SELECTOR, "select[name='p_ibge']")
    p_select = Select(p_selector[0])
    p_select.select_by_visible_text(municipality)

In [11]:
# Selecionar uma variável
def variable_selection(variable, navegador):
    var_selector = navegador.find_elements(By.CSS_SELECTOR, "select[name='var1']")
    var_select = Select(var_selector[0])
    var_select.select_by_visible_text(variable)

In [12]:
# Clicar no botão %Total e esperar até a tabela ser carregada
def search_table(navegador):
    total_button = navegador.find_elements(By.CSS_SELECTOR, "button.btn-success")
    total_button[1].click()
    
    try:
        WebDriverWait(navegador, 90).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, "table.table-striped"))
        )

        return True
    except:
        return False


In [13]:
def get_table(navegador):
    table = navegador.find_elements(By.CSS_SELECTOR, "tbody")
    table = table[-1]
    table2 = table.get_attribute('outerHTML')
    table2 = html.fromstring(table2)
    rows = table2.xpath('.//tr')

    table = []

    for row in rows[:-2]:
        cols_th = row.xpath('.//th')
        cols_td = row.xpath('.//td')
        cols1 = cols_th + cols_td

        cols = []

        for col in cols1[:-1]:
            if col.text != None:
                cols.append(col.text.strip())
        table.append(cols)
    
    #pdb.set_trace()

    table[-1] = table[-1][1:-1]

    for i, val in enumerate(table[1]):
        table[1][i] = str(table[0][0]) + ' / ' + str(val)

    table = table[1:]
    return table

In [14]:
def process_municipality(idx, municipality,  id_state, state, metric, navegador):
    result = False

    while not result:
        #print(str(idx) + '   '+ str(j))

        variable_selection(metric, navegador)
        result = search_table(navegador)

        if result:
            table = get_table(navegador)

            if idx == 0 and id_state == 0:
                data_columns = table[0]
            else:
                data_columns = None   

            municipality_values = table[-1]
            
            return municipality_values, data_columns, navegador
        
        else:
            navegador.close()
            navegador = initialize_webdriver()
            state_selection(state, navegador)
            municipality_selection(municipality, navegador)

    
    


In [229]:
navegador = initialize_webdriver()
data_values = []
data_columns = ['Município']
threads = []

for id_state, state in enumerate(states):
    state_selection(state, navegador)
    municipalities = get_municipalities(navegador)

    for idx, municipality in enumerate(municipalities):
        municipality_selection(municipality, navegador)

        # Nomes dos municípios + sigla do estado
        municipality_values = [municipality + ' - ' + state.split(' - ')[0]]

        for j, metric in enumerate(metrics):
            mv, dc, navegador =  process_municipality(idx, municipality,  id_state, state, metric, navegador)
            municipality_values += mv

            if dc != None:
                data_columns += dc

        data_values.append(municipality_values)



In [145]:
data_columns

['Município',
 'Estado cadastral da família / Sem Registro Civil',
 'Estado cadastral da família / Cadastrado',
 'Faixa da renda total da família / Até 1 S.M.',
 'Faixa da renda total da família / Entre 1 e 2 S.M.',
 'Faixa da renda total da família / Entre 2 e 3 S.M.',
 'Faixa da renda total da família / Acima de 3 S.M.',
 'Faixa da renda familiar per capita / Pobreza 1 (até R$ 109)',
 'Faixa da renda familiar per capita / Pobreza 2 (de R$ 109 a R$ 218)',
 'Faixa da renda familiar per capita / Baixa Renda',
 'Faixa da renda familiar per capita / Acima de 1/2 S.M.',
 'Forma de coleta / Informação migrada como inexistente',
 'Forma de coleta / Sem visita Domiciliar',
 'Forma de coleta / Com visita domiciliar',
 'Número de meses após a última atualização cadastral / até 12 Meses',
 'Número de meses após a última atualização cadastral / 13 a 18 Meses',
 'Número de meses após a última atualização cadastral / 19 a 24 Meses',
 'Número de meses após a última atualização cadastral / 25 a 36 Me

In [None]:
data = pd.DataFrame(data_values, columns=data_columns)
data.replace([',', '%'], ['.',''], regex=True, inplace=True)

In [237]:
data

Unnamed: 0,Município,Estado cadastral da família / Sem Registro Civil,Estado cadastral da família / Cadastrado,Faixa da renda total da família / Até 1 S.M.,Faixa da renda total da família / Entre 1 e 2 S.M.,Faixa da renda total da família / Entre 2 e 3 S.M.,Faixa da renda total da família / Acima de 3 S.M.,Faixa da renda familiar per capita / Pobreza 1 (até R$ 109),Faixa da renda familiar per capita / Pobreza 2 (de R$ 109 a R$ 218),Faixa da renda familiar per capita / Baixa Renda,...,Grupos Populacionais Tradicionais e Específicos / Familia Agricultores Familiares,Grupos Populacionais Tradicionais e Específicos / Familia Assentada da Reforma Agraria,Grupos Populacionais Tradicionais e Específicos / Familia Beneficiaria do Programa Nacional do Credito Fundiario,Grupos Populacionais Tradicionais e Específicos / Familia Acampada,Grupos Populacionais Tradicionais e Específicos / Familia Atingida por Empreendimentos de Infraestrutura,Grupos Populacionais Tradicionais e Específicos / Familia de Preso do Sistema Carcerario,Grupos Populacionais Tradicionais e Específicos / Familia Catadores de Material Reciclavel,Grupos Populacionais Tradicionais e Específicos / Nenhuma,Situação de Rua / Não,Situação de Rua / Sim
0,Acrelândia - AC,0.02,99.98,75.65,17.99,4.44,1.91,48.07,13.52,19.95,...,21.45,0.98,0.00,0.00,0.00,0.00,0.01,75.10,100.00,0.00
1,Assis Brasil - AC,0.00,100.00,84.64,11.84,2.86,0.66,64.03,10.92,17.19,...,13.78,0.22,0.02,0.00,0.00,0.27,0.03,79.38,99.99,0.01
2,Brasiléia - AC,0.00,100.00,85.45,10.79,2.55,1.21,63.70,8.23,15.48,...,29.12,0.35,0.02,0.00,0.00,0.00,0.02,68.25,100.00,0.00
3,Bujari - AC,0.00,100.00,82.47,13.78,2.79,0.95,32.38,33.44,22.32,...,5.03,2.39,0.00,0.06,0.00,0.12,0.24,89.86,100.00,0.00
4,Capixaba - AC,0.00,100.00,79.33,13.36,4.60,2.71,52.28,12.04,18.31,...,4.98,24.47,0.05,0.00,0.00,0.00,0.09,68.27,100.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,Vista Alegre do Alto - SP,0.00,100.00,42.50,33.02,14.56,9.92,1.59,22.18,39.76,...,0.08,0.00,0.00,0.00,0.00,0.00,0.16,99.72,100.00,0.00
5568,Vitória Brasil - SP,0.00,100.00,35.38,35.14,20.22,9.27,8.54,7.34,38.63,...,0.00,0.00,0.00,0.00,0.00,0.00,0.48,99.52,100.00,0.00
5569,Votorantim - SP,0.00,100.00,63.00,20.71,8.58,7.71,37.08,10.28,25.88,...,0.01,0.33,0.00,0.14,0.01,0.27,3.02,96.18,99.69,0.31
5570,Votuporanga - SP,0.00,100.00,54.53,23.67,11.02,10.78,25.50,10.59,24.02,...,0.08,0.00,0.00,0.01,0.00,0.17,0.81,98.85,99.75,0.25


In [238]:
!pip3 install unidecode
import unidecode

Defaulting to user installation because normal site-packages is not writeable


In [239]:
def remove_accents(a):
  a = a.encode('utf-8', errors='ignore')
  return unidecode.unidecode(a.decode('utf-8'))

In [240]:
for index, row in data.iterrows():
    name = row['Município']
    s = name[-2:]
    name = s + ' - ' + name[:-5]
    name = remove_accents(name)
    
    data['Município'][index] = name

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['Município'][index] = name
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['Município'][index] = name
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['Município'][index] = name
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['Município'][index] = name
A value is trying to be set on a cop

In [235]:
data.to_csv("cadunico_data_brazil.csv", index=False)