# Scraping

Script that downloads the government proposal of every mayoral candidate in a given state and city.

In [1]:
# Importing necessary libraries
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import time
import os
import sys

In [2]:
# Configuring the headless web browser
chrome_driver_path = 'E:/Projetos/Eleicoes2020/chromedriver'
chrome_options = Options()

# Configure the webdriver in order to save pdf files.
# The Windows folder is written as a raw string: in a regular literal "\P",
# "\E" and "\p" are invalid escape sequences that only work by accident and
# trigger a warning on newer Python versions. The resulting value is unchanged.
chrome_options.add_experimental_option('prefs', {
    "download.default_directory": r"E:\Projetos\Eleicoes2020\propostas",  # Change default directory for downloads
    "download.prompt_for_download": False,  # To auto download the file
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True,  # It will not show PDF directly in chrome
})

# Define the webdriver to be headless
chrome_options.add_argument('--headless')

# NOTE: this rebinds the name `webdriver` from the selenium module to the driver
# instance (later cells pass this instance around under that name). As a
# consequence this cell cannot be run a second time without first re-running
# the imports cell, because the instance has no `Chrome` attribute.
webdriver = webdriver.Chrome(
    executable_path=chrome_driver_path, options=chrome_options
)

In [3]:
# Raw string so the backslashes of the Windows path are never read as escapes
path_to_downloads = r'E:\Projetos\Eleicoes2020\propostas'

def download_wait(path_to_downloads, timeout=None, poll_interval=1):
    '''Stop code until the download is done

    The folder is polled until it contains no file ending in '.crdownload'
    (the suffix this function uses to recognise an unfinished download).
    The first check only happens after one polling interval has elapsed.

    Keyword arguments:
    path_to_downloads -- folder where the method keeps checking until the download is finished
    timeout -- maximum time to wait, in seconds; None (default) waits for as
               long as a partial file is present
    poll_interval -- seconds slept between two checks of the folder (default 1)

    Returns the time waited, in seconds (number of checks * poll_interval).
    Raises ValueError if poll_interval is not positive.
    '''
    if poll_interval <= 0:
        # A non-positive interval would never let the elapsed time reach the timeout
        raise ValueError('poll_interval must be positive')

    seconds = 0
    while timeout is None or seconds < timeout:
        time.sleep(poll_interval)
        seconds += poll_interval

        still_downloading = any(
            fname.endswith('.crdownload')
            for fname in os.listdir(path_to_downloads)
        )
        if not still_downloading:
            break
    return seconds

In [4]:
def download_proposal(state, city, webdriver, path_to_downloads):
    '''Download the government proposal of every candidate listed for a city

    Opens the list of cities of the given state, selects the city, then visits
    each row of the candidates table in turn: opens the candidate page, clicks
    its "Proposta de Governo" link, waits for the download to finish and goes
    back to the candidates list.

    Keyword arguments:
    state -- state abbreviation inserted into the listing URL (e.g. 'RN')
    city -- city name as displayed in the state's list of cities
    webdriver -- configured Selenium driver instance; it is used as a context
                 manager, so the browser is shut down when the function ends
    path_to_downloads -- folder checked by download_wait for unfinished downloads
    '''
    state_url = f'http://divulgacandcontas.tse.jus.br/divulga/#/estados/2020/2030402020/{state}/municipios'
    city_xpath = f'//*[contains(div/text(), "{city}")]'
    candidates_loaded = (By.XPATH, '//div[contains(text(), "Deferido")]')
    table_xpath = "//table[@class='table table-hover visible-xs visible-sm dvg-margin-top-10']"

    with webdriver as driver:
        # Set timeout time
        ignored_exceptions = (NoSuchElementException, StaleElementReferenceException,)
        wait = WebDriverWait(driver, 20, ignored_exceptions=ignored_exceptions)

        # Open state url
        driver.get(state_url)

        # Waits until city appears on the list; the wait hands back the located element
        city_element = wait.until(EC.presence_of_element_located((By.XPATH, city_xpath)))

        # Click done by javascript
        driver.execute_script('arguments[0].click();', city_element)

        # Wait until at least one candidate appears
        wait.until(EC.presence_of_element_located(candidates_loaded))

        # Rows of the candidates table; only their count is used below
        candidate_rows = driver.find_elements(By.XPATH, table_xpath + '/tbody/tr')

        # Iterate over each candidate. The table and row are looked up again on
        # every pass -- presumably because leaving and returning to the list
        # invalidates previously located elements.
        for i in range(len(candidate_rows)):
            table = driver.find_element(By.XPATH, table_xpath)
            row = table.find_element(By.CSS_SELECTOR, 'tr:nth-of-type({})'.format(i+1))

            # Clicks on candidate
            candidate_links = row.find_elements(By.XPATH, ".//a[@class = 'dvg-link-list-mobile']")
            driver.execute_script('arguments[0].click();', candidate_links[0])

            wait.until(EC.presence_of_element_located((By.XPATH, '//span[contains(text(), "Proposta de Governo")]')))

            # Save candidate page to return after downloading
            candidate_url_id = driver.current_window_handle

            # Goes to the government proposal
            proposal_links = driver.find_elements(By.XPATH, '//a[contains(span/text(), "Proposta de Governo")]')
            driver.execute_script('arguments[0].click();', proposal_links[0])

            # Since the file opens in a new tab, switch to it in order to download
            # NOTE(review): this picks the first handle that is not the candidate
            # tab, which may be a tab left open by a previous candidate -- confirm
            # whether the switch is actually required for the download to start.
            for window_handle in driver.window_handles:
                if window_handle != candidate_url_id:
                    driver.switch_to.window(window_handle)
                    break

            # Return to the candidate tab and block until the file is fully written
            driver.switch_to.window(candidate_url_id)
            download_wait(path_to_downloads)

            # Back to the candidates list for the next row
            driver.back()
            wait.until(EC.presence_of_element_located(candidates_loaded))

        # No explicit driver.quit() here: leaving the `with` block already
        # shuts the driver down.

In [5]:
# Target of this run: state abbreviation and city name
state, city = 'RN', 'NATAL'

# Launch the scraper with the driver and download folder set up in the cells above
download_proposal(
    state=state,
    city=city,
    webdriver=webdriver,
    path_to_downloads=path_to_downloads,
)

UnboundLocalError: local variable 'seconds' referenced before assignment