In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the two input tables; both CSV files are decoded as UTF-8.
candidates, congresspeople = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (
        '../data/candidates/candidates.csv',
        '../data/congresspeople/congresspeople.csv',
    )
)

In [3]:
# Keep only the candidacies for federal deputy. The explicit .copy() yields an
# independent frame, so the column assignments below modify it directly instead
# of a slice of the raw data (which triggers pandas' SettingWithCopyWarning).
candidates = candidates[candidates['DS_CARGO'] == 'DEPUTADO FEDERAL'].copy()

# Turn the placeholder strings found in DS_COR_RACA into real missing values
candidates['DS_COR_RACA'] = candidates['DS_COR_RACA'].replace(['#NE', 'NA', '#NE#'], np.nan)

# Where the value is missing, borrow the one the same candidate declared in a
# later election. The rows are ordered by ANO_ELEICAO first (stable sort) so that
# the back-fill really takes the *next* election year rather than whatever order
# the CSV happens to be in; the result is aligned back to `candidates` by index.
# NOTE(review): candidates are matched by NM_CANDIDATO, so two different people
# with the same name share values -- confirm whether NR_CPF_CANDIDATO is a
# safer grouping key for this data.
candidates['DS_COR_RACA'] = (
    candidates
    .sort_values('ANO_ELEICAO', kind='mergesort')
    .groupby('NM_CANDIDATO')['DS_COR_RACA']
    .bfill()
)

# Keep only the columns used downstream and give them English names.
# rename() returns a new frame, so no in-place mutation of a slice is needed.
candidates = candidates[['ANO_ELEICAO', 'DS_OCUPACAO', 'NR_CPF_CANDIDATO',
                         'DS_GENERO', 'DS_GRAU_INSTRUCAO', 'DS_ESTADO_CIVIL', 'DS_COR_RACA']].rename(
    columns={'NR_CPF_CANDIDATO': 'cpf', 'DS_OCUPACAO': 'occupation',
             'DS_GENERO': 'gender', 'DS_GRAU_INSTRUCAO': 'education',
             'DS_ESTADO_CIVIL': 'marital_status', 'DS_COR_RACA': 'ethnicity',
             'ANO_ELEICAO': 'election_year'}
)

candidates['election_year'] = candidates['election_year'].astype('int64')

# The congresspeople frame casts its cpf column to str before the merge, so the
# candidate side must use the same representation or the keys cannot match.
# Going through a numeric parse makes ints, floats and digit strings all end up
# as the same plain digit string; values that cannot be parsed stay missing.
candidates['cpf'] = (
    pd.to_numeric(candidates['cpf'], errors='coerce')
    .dropna()
    .astype('int64')
    .astype(str)
)


# Legislature id -> year of the election that formed it
election_year = {
    51: 1998,
    52: 2002,
    53: 2006,
    54: 2010,
    55: 2014,
    56: 2018,
    57: 2022,
}

# Derive the election year from the legislature (an id missing from the table
# raises KeyError), store cpf as a string, and keep only the first row for each
# (cpf, election_year) pair so a congressperson appears once per election year.
congresspeople = (
    congresspeople
    .assign(
        election_year=lambda frame: frame['idLegislatura'].map(election_year.__getitem__),
        cpf=lambda frame: frame['cpf'].astype(str),
    )
    .drop_duplicates(subset=['cpf', 'election_year'], keep='first')
)

In [4]:
# Attach the candidate attributes to each congressperson, matching on cpf and
# election year. A left join repeats the left-hand row once per matching
# right-hand row, so the candidate side is first reduced to a single row per
# (cpf, election_year); otherwise a candidate listed more than once for the same
# election would duplicate the congressperson in the result.
congresspeople = congresspeople.merge(
    candidates.drop_duplicates(subset=['cpf', 'election_year'], keep='first'),
    on=['cpf', 'election_year'],
    how='left',
)
congresspeople

Unnamed: 0,id,nomeCivil,nomeEleitoral,ufNascimento,escolaridade,dataNascimento,sexo,cpf,redeSocial,siglaUf,siglaPartido,idLegislatura,election_year,occupation,gender,education,marital_status,ethnicity
0,65551,RICARDO WAGNER DE CARVALHO LAGO,WAGNER LAGO,MA,Superior,1944-02-15,M,1706047304,[],MA,PDT,52,2002,ADVOGADO,MASCULINO,SUPERIOR COMPLETO,CASADO(A),
1,139285,LÍDICE DA MATA E SOUZA,Lídice da Mata,BA,Superior,1956-03-12,F,14672049515,"['https://twitter.com/lidicedamata', 'https://...",BA,PSB,57,2022,DEPUTADO,FEMININO,SUPERIOR COMPLETO,DIVORCIADO(A),PARDA
2,139285,LÍDICE DA MATA E SOUZA,Lídice da Mata,BA,Superior,1956-03-12,F,14672049515,"['https://twitter.com/lidicedamata', 'https://...",BA,PSB,53,2006,DEPUTADO,FEMININO,SUPERIOR COMPLETO,DIVORCIADO(A),PARDA
3,139285,LÍDICE DA MATA E SOUZA,Lídice da Mata,BA,Superior,1956-03-12,F,14672049515,"['https://twitter.com/lidicedamata', 'https://...",BA,PSB,56,2018,SENADOR,FEMININO,SUPERIOR COMPLETO,DIVORCIADO(A),PARDA
4,196357,DEJORGE PATRICIO DA SILVA,DEJORGE PATRÍCIO,RJ,Ensino Médio,1974-06-18,M,3726063773,[],RJ,PRB,55,2014,VEREADOR,MASCULINO,ENSINO MÉDIO COMPLETO,SOLTEIRO(A),PARDA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4406,163831,FERNANDO ANTÔNIO CECILIANO JORDÃO,FERNANDO JORDÃO,RJ,Superior,1952-06-23,M,49752839720,[],RJ,PMDB,54,2010,EMPRESÁRIO,MASCULINO,SUPERIOR COMPLETO,CASADO(A),
4407,73720,César Antonio de Souza,CESAR SOUZA,SC,Secundário,1957-11-08,M,28951107949,[],SC,PSD,55,2014,EMPRESÁRIO,MASCULINO,ENSINO MÉDIO INCOMPLETO,DIVORCIADO(A),BRANCA
4408,180214,RAIMUNDO DINIZ ARAÚJO,Pastor Diniz,RR,Superior,1980-03-21,M,65822960278,[],RR,UNIÃO,57,2022,PSICÓLOGO,MASCULINO,SUPERIOR COMPLETO,CASADO(A),PARDA
4409,172029,FRANCISCO DAS CHAGAS FRANCILINO,FRANCISCO CHAGAS,SE,Superior,1956-04-21,M,76555283815,[],SP,PT,54,2010,VEREADOR,MASCULINO,SUPERIOR COMPLETO,DIVORCIADO(A),BRANCA


In [5]:
# Persist the enriched table as UTF-8 CSV, leaving the DataFrame index out of the file
congresspeople.to_csv(
    '../data/congresspeople/enriched_congresspeople.csv',
    encoding='utf-8',
    index=False,
)