 # Consolidate Entities
 All csv files under slp/__entity__/ into one single file representing all terms entities.

 Please run __Senators__ and __Congressmen__ notebooks in order to update the files to the latest update.

In [106]:
import glob # linux glob wrapper allows us to list directory using regex expressions
import pandas as pd # provides join functionality and named-column manipulation
import os
SLP_PATH = 'datasets/slp/'
AGENTS_PATH = '{:}agents/'.format(SLP_PATH)
ORGANIZATIONS_PATH = '{:}organizations/'.format(SLP_PATH)
MEMBERSHIPS_PATH = '{:}memberships/'.format(SLP_PATH)


In [107]:
def upsert_by_prov(entity_path, prov):
    '''
        Performs upsert (Update/Insert) of all files with the same provenance
        
        args:
            entity_path .: string containing the path
            prov        .: string in ('cam', 'sen')
        
    '''
    if prov not in ('cam', 'sen'):
        raise ValueError('The only provs suported are (\'cam\', \'sen\') got {:}'.format(prov))    

    df = None    
    # Gets every directory under entity path - with terms sorted asc    
    entity_terms = sorted( glob.glob('{:}*[0-9]'.format(entity_path)) )
    for et in entity_terms:
        filenames = glob.glob('{:}/*{:}.csv'.format(et, prov))
        df = pd.read_csv(filenames[0], sep= ';', header=0, index_col='slp:resource_uri', encoding='utf-8')
        
        for fn in filenames[1:]:
            _df = pd.read_csv(fn, sep= ';', header=0, index_col='slp:resource_uri', encoding='utf-8')            
            # upsert : concatenate or insert
            df = pd.concat([df[~df.index.isin(_df.index)], _df])

    return df                        

 ## 1. Agents
 
 ### 1.1 Upsert camara

In [108]:
dataframes = [] 
df = upsert_by_prov(AGENTS_PATH, 'cam')
print('Table camara shape:', df.shape)
columns = df.columns
dataframes.append(df)
df.head()

Table camara shape: (606, 5)


Unnamed: 0_level_0,cam:dataFalecimento,cam:dataNascimento,cam:ideCadastro,cam:nomeCivil,cam:nomeParlamentarAtual
slp:resource_uri,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
64965d06-916a-4ef9-a8c7-886055dc979e,,1976-10-23,178836,ALAN RICK MIRANDA,ALAN RICK
908bcac7-6fc7-4957-9ae9-00a46ad10a37,,1969-02-13,160527,AGUINALDO VELLOSO BORGES RIBEIRO,AGUINALDO RIBEIRO
5ff9cfe9-2fee-47b6-b7ac-a1cb282236be,,1987-02-10,160582,JOSÉ ALBERTO OLIVEIRA VELOSO FILHO,ALBERTO FILHO
e4e93a22-8177-400a-8231-2e8ae9d06a12,,1960-10-15,160508,AFONSO BANDEIRA FLORENCE,AFONSO FLORENCE
0a39e693-7990-4118-8637-278d778df124,,1956-02-05,178903,ADILTON DOMINGOS SACHETTI,ADILTON SACHETTI


 ### 1.2 Upsert senado

In [109]:
df  = upsert_by_prov(AGENTS_PATH, 'sen')
print('Table senado shape:', df.shape)
columns = df.columns.union(columns)
dataframes.append(df)
df.head()

Table senado shape: (252, 3)


Unnamed: 0_level_0,sen:CodigoParlamentar,sen:NomeCompletoParlamentar,sen:NomeParlamentar
slp:resource_uri,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
828a0e78-b457-47de-8112-a7cce8d24563,5573,Abel Rebouças São José,Abel Rebouças
59888f36-89fc-40e9-adf7-e95bb4906eb2,739,Ciro Nogueira Lima Filho,Ciro Nogueira
94be5f0e-2cc9-488b-bb32-335a44eb4f1b,5108,José Aparecido dos Santos,Cidinho Santos
40260ab9-9261-46b9-8ab3-3c569dfa8b24,5136,Cesar Antonio de Souza,Cesar Antonio de Souza
ad36f1e7-f24a-4f51-9f44-c45468e0d4a1,5623,Christopher Belchior Goulart,Christopher Goulart


 ### 1.3 Integrate

In [110]:
for _df in dataframes:
    _df = _df.reindex(columns=columns, fill_value=None)

df = pd.concat(dataframes, axis=0)    
print('Table agents', _df.shape)
df.head()

Table agents (252, 8)


Unnamed: 0_level_0,cam:dataFalecimento,cam:dataNascimento,cam:ideCadastro,cam:nomeCivil,cam:nomeParlamentarAtual,sen:CodigoParlamentar,sen:NomeCompletoParlamentar,sen:NomeParlamentar
slp:resource_uri,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
64965d06-916a-4ef9-a8c7-886055dc979e,,1976-10-23,178836.0,ALAN RICK MIRANDA,ALAN RICK,,,
908bcac7-6fc7-4957-9ae9-00a46ad10a37,,1969-02-13,160527.0,AGUINALDO VELLOSO BORGES RIBEIRO,AGUINALDO RIBEIRO,,,
5ff9cfe9-2fee-47b6-b7ac-a1cb282236be,,1987-02-10,160582.0,JOSÉ ALBERTO OLIVEIRA VELOSO FILHO,ALBERTO FILHO,,,
e4e93a22-8177-400a-8231-2e8ae9d06a12,,1960-10-15,160508.0,AFONSO BANDEIRA FLORENCE,AFONSO FLORENCE,,,
0a39e693-7990-4118-8637-278d778df124,,1956-02-05,178903.0,ADILTON DOMINGOS SACHETTI,ADILTON SACHETTI,,,


 ### 1.4 Save

In [111]:
filename = '{:}agents.csv'.format(SLP_PATH)
df.to_csv(filename, sep=';', index=True, encoding='utf-8')

 ## 2. Memberships
 ### 2.1 Upsert camara

In [112]:
dataframes=[]
df = upsert_by_prov(MEMBERSHIPS_PATH, 'cam')
columns = df.columns
dataframes.append(df)
print(df.shape)
df.head()

(954, 7)


Unnamed: 0_level_0,cam:dataFim,cam:dataInicio,cam:siglaUFRepresentacao,org:member,org:post,org:postIn,org:role
slp:resource_uri,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9589fe93-2270-4940-be3c-706111bf6b69,,2015-02-01,AC,64965d06-916a-4ef9-a8c7-886055dc979e,66884075-6a00-485e-9dc9-792f9b34e035,c8f660fd-7d0b-48e5-8513-2f7f0bb5f91c,b27beba7-ca02-4041-a9e0-1793bcd141fe
e896acb8-0da3-4e4a-b235-355c1a01f7cb,,2015-02-01,PB,908bcac7-6fc7-4957-9ae9-00a46ad10a37,a8b6f47f-5ff8-489b-8c27-f6880bc0fb54,c8f660fd-7d0b-48e5-8513-2f7f0bb5f91c,b27beba7-ca02-4041-a9e0-1793bcd141fe
30eb6da8-3308-45a0-bfd8-0f6f9884133c,,2015-02-01,BA,e4e93a22-8177-400a-8231-2e8ae9d06a12,826e5566-26d8-4dfd-ad92-c81039336f0e,c8f660fd-7d0b-48e5-8513-2f7f0bb5f91c,b27beba7-ca02-4041-a9e0-1793bcd141fe
76e294dc-7b1a-48d5-8f15-6888f59c42d2,2017-10-30,2015-02-01,MT,0a39e693-7990-4118-8637-278d778df124,fca05d26-d58d-4289-b0d9-ecffccf678d4,c8f660fd-7d0b-48e5-8513-2f7f0bb5f91c,b27beba7-ca02-4041-a9e0-1793bcd141fe
39df2d4d-6954-4abf-909f-7ab27d22870f,,2015-02-01,RS,abf442cf-ef29-40f6-8929-9a4066570ed2,70fd1e05-d6d4-46ca-98bf-82a207d04d7c,c8f660fd-7d0b-48e5-8513-2f7f0bb5f91c,b27beba7-ca02-4041-a9e0-1793bcd141fe


 ### 2.2 Upsert senado

In [113]:
df  = upsert_by_prov(MEMBERSHIPS_PATH, 'sen')
print('Table senado shape:', df.shape)
columns = df.columns.union(columns)
dataframes.append(df)
df.head()

Table senado shape: (856, 12)


Unnamed: 0_level_0,DataFim,DataInicio,NumeroLegislatura,org:member,org:post,org:postIn,org:role,sen:CodigoMandato,sen:DataDesfiliacao,sen:DataFiliacao,sen:SiglaPartido,sen:UfParlamentar
slp:resource_uri,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
61e28c9c-c93b-40fc-a73d-3a963632df24,2019-01-31,2015-02-01,55.0,828a0e78-b457-47de-8112-a7cce8d24563,557ddf29-a108-4f60-9bea-08c9eb164aab,81311052-e5b6-46fe-87ba-83865fa0ffb0,d69fcc38-21a6-41ee-8c0f-90be24aff805,492.0,,,,BA
669716c0-cf09-4d77-896b-8497c3943b34,2023-01-31,2019-02-01,56.0,828a0e78-b457-47de-8112-a7cce8d24563,804a0dd6-37d9-448c-9a1b-fd47737a6684,81311052-e5b6-46fe-87ba-83865fa0ffb0,d69fcc38-21a6-41ee-8c0f-90be24aff805,492.0,,,,BA
869d897d-3ab6-448a-ba58-8c18a2aa4421,2015-01-31,2011-02-01,54.0,59888f36-89fc-40e9-adf7-e95bb4906eb2,d1f2997c-db50-458e-a53b-07df441ae6d0,81311052-e5b6-46fe-87ba-83865fa0ffb0,d69fcc38-21a6-41ee-8c0f-90be24aff805,458.0,,,,PI
155310d6-40c8-4c04-9234-6f312193b877,2019-01-31,2015-02-01,55.0,59888f36-89fc-40e9-adf7-e95bb4906eb2,47fe43d9-8020-471f-90fc-9432a6227492,81311052-e5b6-46fe-87ba-83865fa0ffb0,d69fcc38-21a6-41ee-8c0f-90be24aff805,458.0,,,,PI
3e9c9056-8f75-4d44-b5a3-da45f12caa33,2015-01-31,2011-02-01,54.0,94be5f0e-2cc9-488b-bb32-335a44eb4f1b,d0fc7898-63b2-454e-8a3a-882652c63d14,81311052-e5b6-46fe-87ba-83865fa0ffb0,d69fcc38-21a6-41ee-8c0f-90be24aff805,456.0,,,,MT


 ### 2.3 Integrate

In [117]:
for _df in dataframes:
    _df = _df.reindex(columns=columns, fill_value=None)

df = pd.concat(dataframes, axis=0)    
print('Table memberships', df.shape)
df.head()

Table memberships (1810, 15)


Unnamed: 0_level_0,DataFim,DataInicio,NumeroLegislatura,cam:dataFim,cam:dataInicio,cam:siglaUFRepresentacao,org:member,org:post,org:postIn,org:role,sen:CodigoMandato,sen:DataDesfiliacao,sen:DataFiliacao,sen:SiglaPartido,sen:UfParlamentar
slp:resource_uri,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
9589fe93-2270-4940-be3c-706111bf6b69,,,,,2015-02-01,AC,64965d06-916a-4ef9-a8c7-886055dc979e,66884075-6a00-485e-9dc9-792f9b34e035,c8f660fd-7d0b-48e5-8513-2f7f0bb5f91c,b27beba7-ca02-4041-a9e0-1793bcd141fe,,,,,
e896acb8-0da3-4e4a-b235-355c1a01f7cb,,,,,2015-02-01,PB,908bcac7-6fc7-4957-9ae9-00a46ad10a37,a8b6f47f-5ff8-489b-8c27-f6880bc0fb54,c8f660fd-7d0b-48e5-8513-2f7f0bb5f91c,b27beba7-ca02-4041-a9e0-1793bcd141fe,,,,,
30eb6da8-3308-45a0-bfd8-0f6f9884133c,,,,,2015-02-01,BA,e4e93a22-8177-400a-8231-2e8ae9d06a12,826e5566-26d8-4dfd-ad92-c81039336f0e,c8f660fd-7d0b-48e5-8513-2f7f0bb5f91c,b27beba7-ca02-4041-a9e0-1793bcd141fe,,,,,
76e294dc-7b1a-48d5-8f15-6888f59c42d2,,,,2017-10-30,2015-02-01,MT,0a39e693-7990-4118-8637-278d778df124,fca05d26-d58d-4289-b0d9-ecffccf678d4,c8f660fd-7d0b-48e5-8513-2f7f0bb5f91c,b27beba7-ca02-4041-a9e0-1793bcd141fe,,,,,
39df2d4d-6954-4abf-909f-7ab27d22870f,,,,,2015-02-01,RS,abf442cf-ef29-40f6-8929-9a4066570ed2,70fd1e05-d6d4-46ca-98bf-82a207d04d7c,c8f660fd-7d0b-48e5-8513-2f7f0bb5f91c,b27beba7-ca02-4041-a9e0-1793bcd141fe,,,,,


In [118]:
filename = '{:}memberships.csv'.format(SLP_PATH)
df.to_csv(filename, sep=';', index=True, encoding='utf-8')