In [1]:
# Imports
import json

import requests
from bs4 import BeautifulSoup as bs

In [2]:
# URLs
# get_url: page requested with GET; its response is parsed to read the UF dropdown options.
# post_url: page the search payload is POSTed to; its response is parsed for the result tables and paging forms.
get_url = "https://www2.correios.com.br/sistemas/buscacep/buscaFaixaCEP.cfm"
post_url = "https://www2.correios.com.br/sistemas/buscacep/resultadoBuscaFaixaCEP.cfm"

In [3]:
# Obtaining a GET response for analysis (also done via the developer tools on the Google Chrome browser)
# The timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(get_url, timeout=30)
response.raise_for_status()
# The parser is named explicitly: when it is omitted BeautifulSoup picks whichever
# parser happens to be installed (and emits a warning), so the parsed tree could
# differ from one machine to another.
get_soup = bs(response.content, "html.parser")
print("GET response:\n")
print(get_soup)

GET response:

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html><head><meta content="IE=EDGE,chrome=1" http-equiv="X-UA-Compatible"/>
<script src="/cfscripts/cfform.js" type="text/javascript"></script>
<script src="/cfscripts/masks.js" type="text/javascript"></script>
<meta content="IE=EDGE,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="pt" http-equiv="Content-Language"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>buscaFaixaCep</title> <meta content="[page]" name="description"/> <meta content="" name="keywords"/>
<!-- AppInternalsXpert BMX Integration Begin -->
<script>
if(!RVBD_EUE){
   var RVBD_EUE={startJS:Number(new Date()),
   clientId:'',appId:1,
   collector:'apmperformance.correios.com.br',
   collectorHttpPort:80, collectorHttpsPort:443,
   sv:'0401',
   ajax:true, sync:true,
   ajaxResponseTime:true};
   (function(){
      var w=window,l=w.addEventListener,m=w.attachEvent,
  

In [4]:
# Sanity check: the page is expected to hold a single dropdown menu - the UF menu
dropdown_menus = get_soup.find_all(class_="f1col")
print("Number of dropdown menus:", len(dropdown_menus))

Number of dropdown menus: 1


In [5]:
# Acquiring all the available and valid UF options
# An option is a valid UF when every character of its text, upper-cased, is one of A-Z.
upper_letters = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
option_texts = [uf_option.string for uf_option in get_soup.find(class_="f1col").find_all("option")]
# Options without text are skipped explicitly: `.string` is None for an option with no
# single text child (calling .upper() on it would raise), and an empty string would
# otherwise pass the all(...) check vacuously.
# str() stores plain strings instead of BeautifulSoup string objects tied to the parse tree.
uf_list = [
  str(option_text)
  for option_text in option_texts
  if option_text and all(c in upper_letters for c in option_text.upper())
]
print("Available and valid UF options:", uf_list)

Available and valid UF options: ['AC', 'AL', 'AM', 'AP', 'BA', 'CE', 'DF', 'ES', 'GO', 'MA', 'MG', 'MS', 'MT', 'PA', 'PB', 'PE', 'PI', 'PR', 'RJ', 'RN', 'RO', 'RR', 'RS', 'SC', 'SE', 'SP', 'TO']


In [6]:
# Obtaining a POST response for analysis (also done via the developer tools on the Google Chrome browser)
payload = {"UF": uf_list[8]}  # sample UF picked by position, used only to inspect the response
# The timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.post(post_url, data=payload, timeout=30)
response.raise_for_status()
# The parser is named explicitly: when it is omitted BeautifulSoup picks whichever
# parser happens to be installed (and emits a warning), so results could vary by machine.
post_soup = bs(response.content, "html.parser")
print("POST response:\n")
print(post_soup)

POST response:

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html><head><meta content="IE=EDGE,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="IE=EDGE,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="pt" http-equiv="Content-Language"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>resultadoBuscaFaixaCep</title> <meta content="[page]" name="description"/> <meta content="" name="keywords"/>
<!-- AppInternalsXpert BMX Integration Begin -->
<script>
if(!RVBD_EUE){
   var RVBD_EUE={startJS:Number(new Date()),
   clientId:'',appId:1,
   collector:'apmperformance.correios.com.br',
   collectorHttpPort:80, collectorHttpsPort:443,
   sv:'0401',
   ajax:true, sync:true,
   ajaxResponseTime:true};
   (function(){
      var w=window,l=w.addEventListener,m=w.attachEvent,
      d=document,s='script',t='load',o=RVBD_EUE,
      r=(('https:'===d.location.protocol)?
      'https://apmperformance.corre

In [7]:
# Multiple result pages analysis - case study 1: results fit in a single page
payload = {"UF": uf_list[0]}
# Bounded wait + explicit HTTP status check, so a stalled or failed request is not parsed as a result page
response = requests.post(post_url, data=payload, timeout=30)
response.raise_for_status()
# Explicit parser: the default depends on which parsers are installed on the machine
post_soup = bs(response.content, "html.parser")
# Query the parsed page once and reuse the results for both the listing and the count
tables = post_soup.find_all(class_='tmptabela')
forms = post_soup.find_all("form")
table_headers = [[column_name.string for column_name in table.find_all("th")] for table in tables]
# .get() lists an unnamed form as None instead of raising KeyError
form_names = [form.get("name") for form in forms]
print("Tables found in the response page: {} ({})".format(table_headers, len(tables)))
print("POST forms found in the response page: {} ({})".format(form_names, len(forms)))

Tables found in the response page: [['UF', 'Faixa de CEP'], ['Localidade', 'Faixa de CEP', 'Situação', 'Tipo de Faixa']] (2)
POST forms found in the response page: [] (0)


In [8]:
# Multiple result pages analysis - case study 2: results span multiple pages (first page result)
payload = {"UF": uf_list[26]}
# Bounded wait + explicit HTTP status check, so a stalled or failed request is not parsed as a result page
response = requests.post(post_url, data=payload, timeout=30)
response.raise_for_status()
# Explicit parser: the default depends on which parsers are installed on the machine
post_soup = bs(response.content, "html.parser")
# Query the parsed page once and reuse the results for both the listing and the count
tables = post_soup.find_all(class_='tmptabela')
forms = post_soup.find_all("form")
table_headers = [[column_name.string for column_name in table.find_all("th")] for table in tables]
# .get() lists an unnamed form as None instead of raising KeyError
form_names = [form.get("name") for form in forms]
print("Tables found in the response page: {} ({})".format(table_headers, len(tables)))
print("POST forms found in the response page: {} ({})".format(form_names, len(forms)))

Tables found in the response page: [['UF', 'Faixa de CEP'], ['Localidade', 'Faixa de CEP', 'Situação', 'Tipo de Faixa']] (2)
POST forms found in the response page: ['Proxima'] (1)


In [9]:
# Multiple result pages analysis - case study 3: results span multiple pages (middle page result)
# pagini/pagfim select the record window of the requested page (records 51 to 100 here)
payload = {"UF": uf_list[26], "qtdrow": 50, "pagini": 51, "pagfim": 100}
# Bounded wait + explicit HTTP status check, so a stalled or failed request is not parsed as a result page
response = requests.post(post_url, data=payload, timeout=30)
response.raise_for_status()
# Explicit parser: the default depends on which parsers are installed on the machine
post_soup = bs(response.content, "html.parser")
# Query the parsed page once and reuse the results for both the listing and the count
tables = post_soup.find_all(class_='tmptabela')
forms = post_soup.find_all("form")
table_headers = [[column_name.string for column_name in table.find_all("th")] for table in tables]
# .get() lists an unnamed form as None instead of raising KeyError
form_names = [form.get("name") for form in forms]
print("Tables found in the response page: {} ({})".format(table_headers, len(tables)))
print("POST forms found in the response page: {} ({})".format(form_names, len(forms)))

Tables found in the response page: [['Localidade', 'Faixa de CEP', 'Situação', 'Tipo de Faixa']] (1)
POST forms found in the response page: ['Anterior', 'Proxima'] (2)


In [10]:
# Multiple result pages analysis - case study 4: results span multiple pages (last page result)
# pagini/pagfim select the record window of the requested page (records 101 to 150 here)
payload = {"UF": uf_list[26], "qtdrow": 50, "pagini": 101, "pagfim": 150}
# Bounded wait + explicit HTTP status check, so a stalled or failed request is not parsed as a result page
response = requests.post(post_url, data=payload, timeout=30)
response.raise_for_status()
# Explicit parser: the default depends on which parsers are installed on the machine
post_soup = bs(response.content, "html.parser")
# Query the parsed page once and reuse the results for both the listing and the count
tables = post_soup.find_all(class_='tmptabela')
forms = post_soup.find_all("form")
table_headers = [[column_name.string for column_name in table.find_all("th")] for table in tables]
# .get() lists an unnamed form as None instead of raising KeyError
form_names = [form.get("name") for form in forms]
print("Tables found in the response page: {} ({})".format(table_headers, len(tables)))
print("POST forms found in the response page: {} ({})".format(form_names, len(forms)))

Tables found in the response page: [['Localidade', 'Faixa de CEP', 'Situação', 'Tipo de Faixa']] (1)
POST forms found in the response page: ['Anterior'] (1)


In [11]:
# Based on the observations above, the following code retrieves the data record for each UF
qtdrow = 50 # As displayed on the browser, iterate through 50 records per result page
uf_records = []
localidade_records = []


def _table_rows(table):
  """Return one {column name: cell text} dict per data row of an HTML table.

  Column names are read from the table's <th> cells and values from its <td>
  cells; the values are grouped into rows of len(columns) in document order.
  Cell text is whitespace-trimmed and an empty cell yields "" (reading
  `.string` and passing it through str() would store the literal "None").

  Raises:
    ValueError: if the table has no <th> cells or its <td> count is not a
      multiple of the column count, i.e. the cells cannot be grouped into rows.
  """
  columns = [column.get_text(strip=True) for column in table.find_all("th")]
  values = [value.get_text(strip=True) for value in table.find_all("td")]
  if not columns or len(values) % len(columns) != 0:
    raise ValueError("Cannot group {} cells into rows of {} columns".format(len(values), len(columns)))
  return [
    dict(zip(columns, values[start:start+len(columns)]))
    for start in range(0, len(values), len(columns))
  ]


# Iterate UFs
for uf_idx,uf in enumerate(uf_list):
  pag_idx = 0
  # Iterate result pages
  while True:
    # pagini/pagfim select the 1-based record window of the requested page
    payload = {"UF": uf, "qtdrow": qtdrow, "pagini": qtdrow*pag_idx+1, "pagfim": qtdrow*(pag_idx+1)}
    # Bounded wait + explicit HTTP status check, so a stalled or failed request is not parsed as a result page
    response = requests.post(post_url, data=payload, timeout=30)
    response.raise_for_status()
    # Explicit parser: the default depends on which parsers are installed on the machine
    soup = bs(response.content, "html.parser")
    tables = soup.find_all(class_='tmptabela')
    # The first result page carries an additional UF table ahead of the Localidade table
    expected_tables = 2 if pag_idx == 0 else 1
    if len(tables) < expected_tables:
      raise RuntimeError("Expected {} result table(s) for UF {} on page {}, found {}".format(expected_tables, uf, pag_idx, len(tables)))
    # Handling first result page case (additional UF table)
    if pag_idx == 0:
      for row_idx,row in enumerate(_table_rows(tables[0])):
        # The UF table may hold more than one row, so the row index is part of the ID;
        # the UF index alone would be shared by every row of the same UF
        uf_records.append({"id" : '-'.join([str(uf_idx),str(row_idx)]), **row})
    # Handling Localidade table (always the last of the expected tables)
    for id_idx,row in enumerate(_table_rows(tables[expected_tables-1])):
      # Unique ID for Localidade record: UF index, page index and row index within the page
      localidade_records.append({"id" : '-'.join([str(uf_idx),str(pag_idx),str(id_idx)]), **row})
    pag_idx += 1
    # Verifying stop condition for result pages iteration: keep going only while
    # the page offers a "Proxima" (next page) form
    if not any(form.get("name") == "Proxima" for form in soup.find_all("form")):
      break

In [12]:
# Checking for duplicates in uf_records (disregarding the generated ID)
# Each record is reduced to a tuple of its (column, value) pairs. A space-joined
# string can make two distinct records look equal (values "a b", "c" and
# "a", "b c" both join to "a b c"); a tuple keeps the field boundaries.
unique_uf_keys = {
  tuple((key, value) for key, value in uf_record.items() if key != "id")
  for uf_record in uf_records
}
print("{} duplicates found!".format(len(uf_records)-len(unique_uf_keys)))

0 duplicates found!


In [13]:
# Checking for duplicates in localidade_records (disregarding the generated ID)
# Each record is reduced to a tuple of its (column, value) pairs. A space-joined
# string can make two distinct records look equal (values "a b", "c" and
# "a", "b c" both join to "a b c"); a tuple keeps the field boundaries.
unique_localidade_keys = {
  tuple((key, value) for key, value in localidade_record.items() if key != "id")
  for localidade_record in localidade_records
}
print("{} duplicates found!".format(len(localidade_records)-len(unique_localidade_keys)))

0 duplicates found!


In [14]:
# Output UF JSONL file
# json.dumps writes one valid JSON object per line; str() on a dict writes a Python
# repr (single-quoted strings) that JSON parsers reject.
# The file is opened as UTF-8 and ensure_ascii=False keeps accented characters
# as-is instead of \uXXXX escapes; the platform default encoding is not relied on.
with open("uf_records.jsonl","w",encoding="utf-8") as jsonl_file:
  for uf_record in uf_records:
    jsonl_file.write(json.dumps(uf_record, ensure_ascii=False)+'\n')

In [15]:
# Output Localidade JSONL file
# json.dumps writes one valid JSON object per line; str() on a dict writes a Python
# repr (single-quoted strings) that JSON parsers reject.
# The file is opened as UTF-8 and ensure_ascii=False keeps accented characters
# as-is instead of \uXXXX escapes; the platform default encoding is not relied on.
with open("localidade_records.jsonl","w",encoding="utf-8") as jsonl_file:
  for localidade_record in localidade_records:
    jsonl_file.write(json.dumps(localidade_record, ensure_ascii=False)+'\n')