-
Notifications
You must be signed in to change notification settings - Fork 0
/
Brazilian Government Licitation Scraper.py
80 lines (62 loc) · 4.15 KB
/
Brazilian Government Licitation Scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#%%
# This a training code scraper with studies porposese, using scrapy lib. The tutorial used https://oxylabs.io/blog/python-web-scraping
# We are going to scrape some brazilian government licitation from http://comprasnet.gov.br/ConsultaLicitacoes/ConsLicitacao_Filtro.asp
# You can find some documentation on http://compras.dados.gov.br/licitacoes/v1/licitacoes.html?
# You need to install the right version of ChromeDriver according to your current browser. Link: https://chromedriver.chromium.org/downloads
# Importing libraries.
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
#%%
# Defining the objects where our data is going to be saved.
title_licitation = []
content_licitation = []
# Insert the web address where the data is storaged inside the url object.
url = "http://comprasnet.gov.br/ConsultaLicitacoes/ConsLicitacao_Filtro.asp"
# Starting the webdriver. Insert the correct path to the executable chromedriver file installed in your pc.
driver = webdriver.Chrome(executable_path="c:\windows\webdriver\chromedriver.exe")
# Passing to the webdriver the defined url.
driver.get(url)
# The list_publ_ini is an object where we insert the pairs of initial date and ending date of our licitation search.
# Its a dictionary. You can change the dates according to your desire and right now it is set from 01/01/2021 to 30/04/2021.
list_publ_ini = {"01012021":"15012021","16012021":"31012021","01022021":"15022021","16022021":"28022021","01032021":"15032021","16032021":"31032021","01042021":"15042021","16042021":"30042021"}
#%%
# FOR LOOP
# This is a loop where all the magic happens.
# First we are cleaning all information from the search form because it will work as a loop passing the dates pairs one by one.
# If we do not clean it first, all the old searching arguments will be displayed there.
# It was made like that because the government website just allows us to search with the maximum gap of 15 days.
# The second xpath argument is the initial date imputation.
# The third xpath argument is the final date imputation, passed as the key from our dictionary.
# The forth xpath argument is the selection of all licitation modalities. If you are interested in just a specific modality, just change the xpath here.
# The last xpath argument is the final click to initialize the licitation search.
# WHILE INSIDE THE FOR
# Using BeautifulSoup we can handle the HTML.
# In the first for loop we are looking after the title of the licitation.
# In the second for loop we are looking after all the content from that licitation.
# The try argument is checking if there is more results listed on another page looking for the "Próximo" buttom.
# The government website only display 10 licitation at once.
# The except argument will be actioned when there is not a "Próximo" buttom and will click on the "Nova Pesquisa" buttom to initialize a new search.
for key in list_publ_ini:
WebDriverWait(driver, 1).until(EC.element_to_be_clickable((By.XPATH, "//input[@name='Limpar']"))).click()
driver.find_element_by_xpath("//input[@name='dt_publ_ini']").send_keys(key)
driver.find_element_by_xpath("//input[@name='dt_publ_fim']").send_keys(list_publ_ini[key])
driver.find_element_by_xpath("//input[@name='chkTodos']").click()
driver.find_element_by_xpath("//input[@name='ok']").click()
while True:
content = driver.page_source
soup = BeautifulSoup(content, features="lxml")
for a in soup.findAll(attrs={'class':'td_titulo_campo', 'align':'center'}):
title_licitation.append(a.text.strip())
for b in soup.findAll(attrs={'style':'padding:10px'}):
content_licitation.append(b.text.strip())
try:
WebDriverWait(driver, 1).until(EC.element_to_be_clickable((By.XPATH,"//input[@name='btn_proximo']"))).click()
except:
WebDriverWait(driver, 1).until(EC.element_to_be_clickable((By.XPATH,"//input[@name='Pesquisa']"))).click()
break
driver.quit()
# Now we have to clean the database in futures code updates.