In [10]:
import pandas as pd
import requests
import time
from tqdm import tqdm
from bs4 import BeautifulSoup

In [11]:
# Legislature numbers; each one selects a data/deputies/deputies_<n>.csv file.
legislatures = [57, 56, 55, 54, 53, 52]

# Read one frame per legislature, in the order listed above.
dfs = [
    pd.read_csv(f'data/deputies/deputies_{number}.csv')
    for number in legislatures
]

# Stack all frames row-wise (pd.concat defaults: each file keeps its own index).
df = pd.concat(dfs)

In [12]:
def extract_discourse_links(soup):
    """Collect the speech-page links found inside the table cells of a parsed page.

    Every ``<a>`` returned by ``find_all('a')`` on each ``<td>`` is inspected.
    Its ``href`` is kept when it starts with ``'TextoHTML'``; newline,
    carriage-return and tab characters are removed from kept links.

    Args:
        soup: Parsed document exposing ``find_all`` (e.g. a ``BeautifulSoup``
            object). The anchors it yields must support ``.get('href')``.

    Returns:
        list[str]: The cleaned links, in the order they were encountered.
        Anchors without an ``href`` are skipped.
    """
    # Deletes "\n", "\r" and "\t" in a single pass.
    strip_control_chars = str.maketrans('', '', '\n\r\t')

    links = []

    for cell in soup.find_all('td'):
        for a_tag in cell.find_all('a'):
            # .get() yields None for anchors lacking an href, so no exception
            # has to be swallowed (the old bare `except:` hid every error,
            # including KeyboardInterrupt).
            link = a_tag.get('href')

            if isinstance(link, str) and link.startswith('TextoHTML'):
                links.append(link.translate(strip_control_chars))

    return links

In [None]:
# ---------------------------------------------------------------------------
# Scrape the plenary speeches listed for every deputy name in `df`.
#
# For each deputy: page through the search results collecting speech links
# (via extract_discourse_links), download each speech page, pull out its text,
# date and phase, and store one record per speech. The CSV is rewritten every
# CHECKPOINT_EVERY speeches and once more at the end.
# ---------------------------------------------------------------------------
SITAQ_BASE_URL = "https://www.camara.leg.br/internet/sitaqweb/"
SPEECHES_CSV = "data/speeches/speeches.csv"
SPEECH_COLUMNS = ["deputy", "discourse", "discourse_link", "date", "phase"]
REQUEST_TIMEOUT = 60     # seconds before a single HTTP request is abandoned
MAX_PAGE_ATTEMPTS = 5    # tries per search-results page before giving up on it
PAGE_DELAY = 5           # seconds between consecutive search-results pages
DISCOURSE_DELAY = 0      # seconds between speech pages (the pause was disabled originally)
CHECKPOINT_EVERY = 100   # rewrite the CSV each time this many more speeches are collected


def fetch_page(url, label, attempts=1):
    """GET ``url`` with a timeout, retrying up to ``attempts`` times.

    Args:
        url: Address to request.
        label: Human-readable description used in failure messages.
        attempts: Maximum number of tries (at least one request is made
            when this is >= 1).

    Returns:
        The ``requests.Response`` of the first successful (``response.ok``)
        try, or ``None`` when every try failed or raised a
        ``requests.RequestException``.
    """
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as error:
            # Network-level failure (timeout, connection reset, ...): log and retry.
            outcome = repr(error)
        else:
            if response.ok:
                return response
            outcome = response

        print(f"Failed to get {label} - {outcome} (attempt {attempt}/{attempts})")

        if attempt < attempts:
            # Wait a little longer after each failure before trying again.
            time.sleep(PAGE_DELAY * attempt)

    return None


def parse_discourse_page(html):
    """Extract ``(text, date, phase)`` from the HTML of one speech page.

    ``text`` is the text of every ``<p>`` joined by single spaces. ``date``
    and ``phase`` come from right-aligned ``<td>`` cells containing "Data" /
    "Fase": the value is the part between the first and second colon,
    stripped. Either is ``None`` when no usable cell is found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    text = " ".join(p.text for p in soup.find_all('p'))

    date, phase = None, None
    for td in soup.find_all('td', align='right'):
        fields = td.text.split(":")
        if len(fields) < 2:
            # No "label: value" shape; indexing fields[1] would raise IndexError.
            continue
        if "Data" in td.text:
            date = fields[1].strip()
        elif "Fase" in td.text:
            phase = fields[1].strip()

    return text, date, phase


# Missing names (NaN) are dropped: they cannot be turned into a search query.
deputies_names = set(df["nome"].dropna())

# Speeches are accumulated as plain dicts and turned into a DataFrame only when
# saving; growing a DataFrame with pd.concat per row copies it every time.
records = []
data = pd.DataFrame(columns=SPEECH_COLUMNS)

# Sorted so that the scraping order is the same on every run.
for deputy in sorted(deputies_names):
    current_page = 1
    all_discourse_links = []
    deputy_name = deputy.replace(" ", "+")

    while True:
        url_table = (
            "https://www.camara.leg.br/internet/sitaqweb/resultadoPesquisaDiscursos.asp"
            f"?CurrentPage={current_page}&txOrador={deputy_name}"
            "&txPartido=&txUF=&dtInicio=&dtFim=&txTexto=&txSumario="
            "&basePesq=plenario&CampoOrdenacao=dtSessao&PageSize=100"
            "&TipoOrdenacao=DESC&btnPesq=Pesquisar"
        )
        response = fetch_page(
            url_table,
            label=f"{deputy} - page {current_page}",
            attempts=MAX_PAGE_ATTEMPTS,
        )

        if response is None:
            # Give up on this deputy's remaining pages rather than retrying the
            # same failing URL forever; links gathered so far are still used.
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        page_discourse_links = extract_discourse_links(soup)

        if not page_discourse_links:
            # A page without speech links marks the end of the results.
            break

        print(f"Extracted {len(page_discourse_links)} discourse links for {deputy} - page {current_page}")
        all_discourse_links.extend(page_discourse_links)

        current_page += 1

        time.sleep(PAGE_DELAY)

    # Inner tqdm for discourse links
    for discourse in tqdm(all_discourse_links, desc=f"Processing Discourses of {deputy}", leave=False):
        url_discourse = f"{SITAQ_BASE_URL}{discourse}"
        response = fetch_page(url_discourse, label=f"{url_discourse} of deputy {deputy}")

        if response is None:
            continue

        discourse_text, date, phase = parse_discourse_page(response.text)

        records.append({
            "deputy": deputy,
            "discourse": discourse_text,
            "discourse_link": url_discourse,
            "date": date,
            "phase": phase,
        })

        # Periodic checkpoint so an interrupted run loses at most
        # CHECKPOINT_EVERY - 1 speeches.
        if len(records) % CHECKPOINT_EVERY == 0:
            data = pd.DataFrame(records, columns=SPEECH_COLUMNS)
            data.to_csv(SPEECHES_CSV, index=False)

        if DISCOURSE_DELAY:
            time.sleep(DISCOURSE_DELAY)

    print(f"Finished {deputy_name}")

# Save final result
data = pd.DataFrame(records, columns=SPEECH_COLUMNS)
data.to_csv(SPEECHES_CSV, index=False)