In [None]:
#Importing libraries
import requests
import pandas as pd
import re

In [None]:
# Downloading and saving the dataset with the information of the deputies
URL = 'http://dadosabertos.camara.leg.br/arquivos/deputados/csv/deputados.csv'
# A timeout keeps the cell from hanging forever if the server never answers
response = requests.get(URL, timeout=60)
# Stop here on an HTTP error instead of saving an error page as deputados.csv
response.raise_for_status()

with open("deputados.csv", "wb") as file:
    file.write(response.content)

In [None]:
# Load the deputies dataset (a semicolon-separated file) into a DataFrame
dep = pd.read_csv('deputados.csv', sep=';')

In [None]:
#Function to save the speeches and the info about the deputies
def get_speeches(database, checkpoint, session=None):
    """
    Download the speeches of every deputy and save them to ``speeches.csv``.

    For each URI in ``database['uri']`` the ``/discursos`` endpoint is queried
    for speeches between 1988-01-01 and 2022-08-01, ordered by start time.
    Result pages are followed through the ``next`` link of each response, so
    all pages of a deputy are collected, not only the first one. A failure
    while processing one deputy is printed and the loop moves on to the next
    deputy; rows already collected are kept.

    Parameters:
        database (pd.DataFrame): DataFrame containing information about the
            deputies; only its ``uri`` column is read.
        checkpoint (int): Number of collected rows between intermediate
            saves. Every time the total number of rows reaches a multiple of
            this value, all rows gathered so far are written to
            ``speeches<rows>.csv``. Each save rewrites every row collected so
            far, so very small values are expensive. A non-positive value
            disables intermediate saves.
        session: Optional object with a ``get(url, timeout=...)`` method
            (for example a ``requests.Session``) used to perform the
            requests. Defaults to the ``requests`` module itself.

    Returns:
        None. The output is the ``speeches.csv`` file (plus the checkpoint
        files) with the columns ``deputy_code``, ``keywords``, ``start_time``
        and ``transcription``.
    """
    http = requests if session is None else session
    columns = ['deputy_code', 'keywords', 'start_time', 'transcription']
    save_every = checkpoint if checkpoint and checkpoint > 0 else None

    # One dict per speech: a row is only stored once all of its fields were
    # read, so a malformed item can never leave the columns misaligned.
    rows = []
    next_checkpoint = save_every

    # Downloading the speeches
    for uri in database['uri'].tolist():
        try:
            # The deputy code is the last path segment of the URI; taking it
            # this way does not depend on the length of the scheme/host part.
            deputy_code = uri.rstrip('/').rsplit('/', 1)[-1]

            url = f'{uri}/discursos?dataInicio=1988-01-01&dataFim=2022-08-01&ordenarPor=dataHoraInicio&ordem=ASC'
            visited = set()  # guards against a "next" link that loops back

            while url and url not in visited:
                visited.add(url)
                response = http.get(url, timeout=60)
                # Report HTTP errors as such instead of as a JSON/KeyError
                response.raise_for_status()
                payload = response.json()

                # Looping over the speeches of this page
                for speech in payload['dados']:
                    rows.append({
                        'deputy_code': deputy_code,
                        'keywords': speech['keywords'],
                        'start_time': speech['dataHoraInicio'],
                        'transcription': speech['transcricao'],
                    })

                    # Saving .csv checkpoint every `checkpoint` rows, counted
                    # over all deputies and without touching the argument
                    if save_every is not None and len(rows) == next_checkpoint:
                        df = pd.DataFrame(rows, columns=columns)
                        df.to_csv(f'speeches{next_checkpoint}.csv', index=False)
                        print(next_checkpoint)
                        next_checkpoint += save_every

                # Results are paginated; the response lists the following
                # page (when there is one) as the link whose rel is "next".
                url = next(
                    (link.get('href')
                     for link in (payload.get('links') or [])
                     if link.get('rel') == 'next'),
                    None,
                )
        except Exception as e:
            print(f"Error downloading speech for URI {uri}: {e}")
            continue

    # Saving the data to a .csv file after the loop
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv("speeches.csv", index=False)

#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

def get_deputy_info(database, checkpoint, session=None):
    """
    Download the details of every deputy and save them to ``deputy_info.csv``.

    Each URI in ``database['uri']`` is requested (with a trailing ``/``) and
    the legislature id, civil name, party abbreviation, state abbreviation
    and party URI are read from the ``dados`` object of the JSON response.
    A deputy whose request or parsing fails is reported with ``print`` and
    skipped as a whole, so the output columns always stay aligned.

    Parameters:
        database (pd.DataFrame): DataFrame with a ``uri`` column holding the
            deputy URIs, in the format used by the Camara dos Deputados API.
        checkpoint (int): Number of collected rows between intermediate
            saves. Every time the number of rows reaches a multiple of this
            value, all rows gathered so far are written to
            ``deputy_info_<rows>.csv``. A non-positive value disables
            intermediate saves.
        session: Optional object with a ``get(url, timeout=...)`` method
            (for example a ``requests.Session``) used to perform the
            requests. Defaults to the ``requests`` module itself.

    Returns:
        None. The output is the ``deputy_info.csv`` file (plus the checkpoint
        files).
    """
    http = requests if session is None else session
    columns = ['deputy_code', 'legislature_id', 'full_name',
               'party_abbreviation', 'state_abbreviation', 'party_uri']
    save_every = checkpoint if checkpoint and checkpoint > 0 else None

    # One dict per deputy, stored only after every field was read
    records = []
    next_checkpoint = save_every

    # Download the information
    for deputy_uri in database['uri'].values.tolist():
        try:
            response = http.get(f'{deputy_uri}/', timeout=60)
            # Report HTTP errors as such instead of as a JSON/KeyError
            response.raise_for_status()
            details = response.json()['dados']
            last_status = details['ultimoStatus']

            records.append({
                # The deputy code is the last path segment of the URI
                'deputy_code': deputy_uri.rstrip('/').rsplit('/', 1)[-1],
                'legislature_id': last_status['idLegislatura'],
                'full_name': details['nomeCivil'],
                'party_abbreviation': last_status['siglaPartido'],
                'state_abbreviation': last_status['siglaUf'],
                'party_uri': last_status['uriPartido'],
            })

            # Saving .csv checkpoint every `checkpoint` rows. Counting stored
            # rows (not loop iterations) keeps the schedule intact when a
            # deputy is skipped because of an error.
            if save_every is not None and len(records) == next_checkpoint:
                df = pd.DataFrame(records, columns=columns)
                df.to_csv(f'deputy_info_{next_checkpoint}.csv', index=False)
                print(f"Saved checkpoint of {next_checkpoint} rows")
                next_checkpoint += save_every
        except Exception as e:
            print(f"Error downloading deputy info for URI {deputy_uri}: {e}")
            continue

    # Saving the data to a .csv file after the loop
    df = pd.DataFrame(records, columns=columns)
    df.to_csv("deputy_info.csv", index=False)

In [None]:
# Running the functions
# `checkpoint` is the interval at which intermediate .csv files are saved;
# the same interval is used for both downloads.
get_speeches(database = dep, checkpoint = 1000)
get_deputy_info(database = dep, checkpoint = 1000)

In [None]:
# Load the two downloaded datasets so they can be joined
speeches, deputy_info = (
    pd.read_csv(csv_name) for csv_name in ('speeches.csv', 'deputy_info.csv')
)

In [None]:
# Get the code for each deputy: it is the last path segment of the URI.
# Splitting on the final "/" does not depend on the length of the
# scheme/host part of the address.
deputy_codes = [
    deputy_uri.rstrip('/').rsplit('/', 1)[-1]
    for deputy_uri in dep['uri'].tolist()
]

# Add the deputy codes to the database
dep['deputy_code'] = deputy_codes

# Change the format of the key variables for merging the databases
# (the key must be a string on every side of the merges)
speeches['deputy_code'] = speeches['deputy_code'].astype(int).astype(str)
deputy_info['deputy_code'] = deputy_info['deputy_code'].astype(int).astype(str)

# Merge the databases (inner joins: only deputies present in all three remain)
df1 = pd.merge(speeches, dep, on='deputy_code')
df2 = pd.merge(df1, deputy_info, on='deputy_code')

# Select the relevant variables
df2 = df2[[
    'deputy_code', 'keywords', 'start_time', 'transcription', 'full_name',
    'siglaSexo', 'dataNascimento', 'ufNascimento', 'party_abbreviation',
    'state_abbreviation',
]]

# Remove rows that don't have any transcription
df3 = df2[df2['transcription'].notna()]

In [None]:
# Write the final dataset to a .csv file, used to build the graphs
df3.to_csv(path_or_buf="df_analyses.csv", index=False)