In [1]:
import os
import time

import pandas as pd
import requests
from tqdm import tqdm

In [2]:
# one shared requests.Session, so every API call issued by the cells below
# goes through the same session object
session = requests.Session()

In [3]:
# legislature ids to process, listed in descending order; each id is used as the
# idLegislatura query parameter and as the suffix of the per-legislature CSV files
legislatures = [57, 56, 55, 54, 53, 52]

In [4]:
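# NOTE: this cell is disabled (commented out). It writes the
# data/deputies/deputies_{legislature}.csv files that the next cell reads,
# so uncomment and run it once if those files do not exist yet.
# for legislature in legislatures: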
    
#     # request data from the Chamber of Deputies API
#     url = f'https://dadosabertos.camara.leg.br/api/v2/deputados?idLegislatura={legislature}'
#     response = requests.get(url)
    
#     # parse json
#     data = response.json()
#     deputados = data['dados']
#     df = pd.DataFrame(deputados)
    
#     # get only unique deputies (changes in the same legislature, such as party or name, are not considered)
#     df.drop_duplicates(subset='id', inplace=True)
    
#     # save data
#     df.to_csv(f'data/deputies/deputies_{legislature}.csv', index=False)
    
#     # print number of (unique) saved deputies
#     print(f' Legislature {legislature}: {len(df)} saved deputies')

In [None]:
# Download the speeches ("discursos") of every deputy, one legislature at a time.
#
# For each legislature this cell:
#   1. reads the deputy ids from data/deputies/deputies_{legislature}.csv,
#   2. walks the paginated /deputados/{id}/discursos endpoint for every id,
#   3. writes the speeches to data/speeches/speeches_{legislature}.csv
#      (with a checkpoint save every CHECKPOINT_EVERY deputies),
#   4. writes the deputies whose download failed to
#      data/errors/extraction_errors_{legislature}.csv.
#
# A deputy whose request fails (HTTP error, network error, unparsable body) is
# logged, recorded in `errors` and skipped; pages already fetched for that
# deputy are kept.

REQUEST_TIMEOUT = 60    # seconds; without a timeout a stalled connection hangs the run
PAUSE_SECONDS = 2       # pause between consecutive requests
CHECKPOINT_EVERY = 50   # save partial results every N deputies

# make sure the output folders exist before any long-running work starts
os.makedirs('data/speeches', exist_ok=True)
os.makedirs('data/errors', exist_ok=True)

for legislature in legislatures:

    # deputies data
    deputies = pd.read_csv(f'data/deputies/deputies_{legislature}.csv')
    ids = set(deputies['id'].tolist())

    # speeches data: one dict per speech, turned into a DataFrame only when saving
    # (growing a DataFrame with pd.concat on every page is quadratic)
    records = []

    # counter of deputies whose download failed
    error_count = 0

    # dictionary to store errors -> 'id': 'url that caused the error'
    errors = {}

    # print number of deputies
    print(f'Legislature {legislature}: {len(ids)} deputies\n')

    for id_count, id_ in enumerate(ids, start=1):

        url = f'https://dadosabertos.camara.leg.br/api/v2/deputados/{id_}/discursos?idLegislatura={legislature}&itens=50'

        while True:
            # request one page from the Chamber of Deputies API; any failure is
            # reduced to a message so it can be handled in a single place below
            failure = None
            try:
                response = session.get(url, timeout=REQUEST_TIMEOUT)
                if response.ok:
                    # parse the body once and reuse it for both 'dados' and 'links'
                    payload = response.json()
                    data = payload['dados']
                else:
                    failure = response.text
            except (requests.RequestException, ValueError, KeyError) as exc:
                # network problem, timeout, invalid JSON or missing 'dados' key
                failure = repr(exc)

            if failure is not None:
                # print error message, stop paging and go to the next deputy
                print(f'❌ {id_count}/{len(ids)} - ID: {id_} - Error: {failure} - URL: {url}')
                error_count += 1
                errors[id_] = url
                break

            # print progress
            print(f'✅ {id_count}/{len(ids)} - Deputy {id_} - {len(data)} speeches - URL: {url}')

            # tag every speech with the deputy it belongs to
            records.extend({**speech, 'id': id_} for speech in data)

            # a link whose 'rel' is 'next' means there is another page
            url = next(
                (link['href'] for link in payload.get('links', []) if link['rel'] == 'next'),
                None,
            )
            if url is None:
                break
            time.sleep(PAUSE_SECONDS)

        # checkpoint: save what has been collected so far
        if id_count % CHECKPOINT_EVERY == 0:
            pd.DataFrame(records).to_csv(f'data/speeches/speeches_{legislature}.csv', index=False)

        time.sleep(PAUSE_SECONDS)

    df = pd.DataFrame(records)

    print(f'\nLegislature {legislature}: {len(ids)} deputies - {len(df)} speeches - {error_count} errors\n')

    # save data for the current legislature
    df.to_csv(f'data/speeches/speeches_{legislature}.csv', index=False)

    # save errors for troubleshooting
    errors_df = pd.DataFrame(list(errors.items()), columns=['id', 'url'])
    errors_df.to_csv(f'data/errors/extraction_errors_{legislature}.csv', index=False)

In [8]:
# close the shared session now that the extraction cell above is done with it
session.close()