This notebook scrapes the Brazilian parliament's speeches listed under the <a href="https://www2.camara.leg.br/atividade-legislativa/discursos-e-notas-taquigraficas" target="_blank">Discourses and Debates</a> section.

In [None]:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import glob

In [None]:
def get_html(url, timeout=30):
    """Download ``url`` and return its content parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        ``requests.get``). Without it a stalled connection would block the
        notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is never
        parsed as if it were a results page.
    requests.exceptions.RequestException
        On connection problems or when ``timeout`` expires.
    """
    # Browser-like User-Agent sent with every request
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Force the decoding used for r.text instead of relying on requests' guess
    r.encoding = 'UTF-8'

    return BeautifulSoup(r.text, 'html.parser')

The code is based on the <a href="https://github.com/estadao/bolsonaro-e-ditadura-no-congresso/blob/1b53692c67799841f5d1f5a3e0e763ef1ec351a3/code/pega-links.ipynb" target="_blank">Bolsonaro and dictatorship in the parliament</a> project developed by O Estado de S.Paulo, with minor changes to make it work correctly. Note that you need to edit the URL manually for each run, changing the following query parameters:

- **CurrentPage=**: this code scrapes one page at a time
- **dtInicio=**: DD/MM/YYYY
- **dtFim=**: DD/MM/YYYY

In [None]:
# Query parameters that have to be edited for each run
CURRENT_PAGE = 22          # CurrentPage: result page to scrape (one page per run)
DT_INICIO = '01/01/2021'   # dtInicio: start date, DD/MM/YYYY
DT_FIM = '31/12/2021'      # dtFim: end date, DD/MM/YYYY

# Search URL; every other query parameter is kept exactly as in the original request
url = (
    'https://www.camara.leg.br/internet/sitaqweb/resultadoPesquisaDiscursos.asp'
    '?CurrentPage={page}'
    '&BasePesq=plenario&txIndexacao=&txOrador=&txPartido='
    '&dtInicio={start}&dtFim={end}'
    '&txUF=&txSessao=&listaTipoSessao=&listaTipoInterv=&inFalaPres='
    '&listaTipoFala=&listaFaseSessao=&txAparteante=&listaEtapa='
    '&CampoOrdenacao=dtSessao&TipoOrdenacao=DESC&PageSize=1000&txTexto=&txSumario='
).format(page=CURRENT_PAGE, start=DT_INICIO, end=DT_FIM)

In [None]:
# Download the search-results page for `url` and keep it as a parsed BeautifulSoup tree
html = get_html(url)

In [None]:
def scrape_table(html):
    """Extract the speeches table from a parsed search-results page.

    Parameters
    ----------
    html : bs4.BeautifulSoup
        Parsed results page (any object offering the same ``find`` /
        ``find_all`` / ``text`` interface works).

    Returns
    -------
    dict
        One key per ``<th>`` header found in the table, each mapping to a list
        with one whitespace-stripped string per data row. The "Discurso" column
        holds the absolute URL of the transcript when the cell contains a link;
        otherwise it holds the cell's own text, or ``'-'`` when the cell is
        blank (including the non-breaking-space placeholder).

    Raises
    ------
    ValueError
        If the results table or its ``<tbody>`` cannot be found in ``html``.
    KeyError
        If the table headers do not include the eight expected column names.
    IndexError
        If a data row has fewer than eight cells.
    """
    base_url = "https://www.camara.leg.br/internet/sitaqweb/"

    # find the main table
    table = html.find('table', class_='table table-bordered variasColunas')
    if table is None:
        raise ValueError("results table not found in the page")

    # dict comprehension: one empty list per table header
    data = {header.text.strip(): [] for header in table.find_all('th')}

    # access only the table's body
    table_body = table.find('tbody')
    if table_body is None:
        raise ValueError("results table has no <tbody>")

    # We always expect eight cells per row: date, session, phase, discourse,
    # summary, speaker, time and publication. Index 3 (discourse) is handled
    # separately below because it carries a link rather than plain text.
    text_columns = (
        ("Data", 0),
        ("Sessão", 1),
        ("Fase", 2),
        ("Sumário", 4),  # to have the content instead: .find('a')['title']
        ("Orador", 5),
        ("Hora", 6),
        ("Publicação", 7),
    )

    for row in table_body.find_all('tr'):

        # rows with a "name" attribute are extra lines that don't carry data
        if row.has_attr("name"):
            continue

        cells = row.find_all('td')

        for column, index in text_columns:
            data[column].append(cells[index].text.strip())

        # Sometimes there is no link to a transcript. Look the link up
        # explicitly so that both a missing <a> and an <a> without an href
        # fall back to the cell's text.
        link = cells[3].find('a')
        href = link.get('href') if link is not None else None

        if href:
            # hrefs come wrapped in line breaks and tabs; drop those control
            # characters but keep regular spaces, which may be part of the query
            for char in '\r\n\t':
                href = href.replace(char, '')
            data["Discurso"].append(base_url + href.strip())
        else:
            # strip() also removes the non-breaking space used for empty cells
            data["Discurso"].append(cells[3].text.strip() or '-')

    return data

In [None]:
# Parse the results table into a dict mapping each column header to its list of cell values
data = scrape_table(html)

In [None]:
# Build a DataFrame with one column per table header and one row per scraped table row
df = pd.DataFrame(data)

In [None]:
# saving into csv
# df.to_csv('2000_04.csv', sep=';', header=True, encoding='utf-8', index=False)

In [None]:
# saving into xlsx (index=False keeps the DataFrame index out of the sheet)
# NOTE(review): the file name looks like <year>_<page>, matching dtInicio/dtFim and
# CurrentPage in the URL — confirm, and update it together with the URL on each run.
df.to_excel('2021_22.xlsx', index=False)

---

Use the glob module to find the per-page Excel files and combine them into a single file per year.

In [None]:
# Folder holding the per-page Excel files produced above
# NOTE(review): this is an absolute path — adjust it to where the files were saved.
path = r'/00_scraping_data'

# List all the Excel files in the folder. A forward slash works on every
# platform (a backslash is only a separator on Windows, and '\*' is an invalid
# escape sequence); sorting makes the order of the combined rows reproducible.
filenames = sorted(glob.glob(path + '/*.xlsx'))
print(filenames)

In [None]:
# Read every Excel file found in the folder; the first row of each sheet is the header.
# (read_excel has no use for a field separator, so none is passed.)
frames = [pd.read_excel(file, header=0) for file in filenames]

# Stack all the files into a single DataFrame with one concat call:
# DataFrame.append no longer exists in pandas 2.0, and growing a frame inside
# a loop copies it on every iteration. pd.concat raises on an empty list, so
# fall back to an empty DataFrame when no file was found.
congress = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Save the combined table as one workbook, with a header row and without the index.
# The `encoding` keyword is not passed: to_excel dropped it in pandas 2.0
# (passing it raises TypeError there).
congress.to_excel(r'21_2021.xlsx', header=True, index=False)