# Senators notebook

Parses datasets/senado/senators_with_memberships-**termid**.json creating / updating the following polare entites:
* Agents
* Memberships (Senate, Parties)
* Organizations (Senate)
* Posts (Senate)
* Roles (Afiliate, Senator)

## 1. Imports and Constants Declaration

In [66]:
import json   # reads scrapped data
import pandas as pd # helps with file management and visualization
import re
from collections import defaultdict
from uuid import uuid4 # creates new uuids for Posts

termid = 55 # legislatura

PROV = 'sen'
FILE_SUFFIX = termid
SUB_DIR = '{:}/'.format(termid)
JSON_PATH =  'datasets/senado/'

BASE_PATH = 'datasets/slp/'
AGENTS_PATH = 'datasets/slp/agents/'
MEMBERSHIPS_PATH = 'datasets/slp/memberships/'
ORGANIZATIONS_PATH = 'datasets/slp/organizations/'
ROLES_PATH = 'datasets/slp/roles/'
POSTS_PATH = 'datasets/slp/posts/'

SENATE_URI = '81311052-e5b6-46fe-87ba-83865fa0ffb0' 
SENATOR_URI = 'd57a29ff-c69a-4b32-b98a-3dd8f204c0a3' 
AFFILIATE_URI = '6a688541-b16a-45ca-8aa9-fa700373279f' 


 ## 2. Getting the Parties

In [67]:
filename = 'organizations.csv'
file_path = '{:}{:}'.format(BASE_PATH, filename)
df_parties = pd.read_csv(file_path, sep= ';', encoding= 'utf-8', index_col=None)
df_parties.set_index('sigla', inplace=True)


parties_d = df_parties['resource_uri'].to_dict()                        
print({k:parties_d[k]
    for k in ['PT', 'PMDB', 'DEM']})

{'PT': '7b2138a6-df61-11e7-baf1-c82a144c0a85', 'PMDB': '7b210cfa-df61-11e7-9864-c82a144c0a85', 'DEM': '7b20ed7e-df61-11e7-bb65-c82a144c0a85'}


## 3. Reads scrapped json

In [68]:
filename = 'senator_with_memberships-{:}.json'.format(FILE_SUFFIX) 
file_path = '{:}{:}'.format(JSON_PATH, filename)
with open(file_path, mode='r') as f:
    senatorsstr = f.read()
f.close()

senators_with_memberships = json.loads(senatorsstr)

## 4. Agents

In [69]:
mapping = {'skos:prefLabel': 'sen:CodigoParlamentar',
  'foaf:name':'sen:NomeCompletoParlamentar',
  'rdfs:label':'sen:NomeParlamentar',
  'resource_uri':'slp:resource_uri'           
}

agents = [] 
for swm_d in senators_with_memberships:
    agent_d = {mapping[k]: swm_d.get(k, None) for k in mapping}
    if agent_d:
        agents.append(agent_d)

filename = 'senators-{:}.csv'.format(PROV) 
file_path = '{:}{:}{:}'.format(AGENTS_PATH, SUB_DIR, filename)
df = pd.DataFrame.from_dict(agents)    
df.to_csv(file_path, sep=';', encoding='utf-8', index=None)
df.head()

Unnamed: 0,sen:CodigoParlamentar,sen:NomeCompletoParlamentar,sen:NomeParlamentar,slp:resource_uri
0,846,Aloysio Nunes Ferreira Filho,Aloysio Nunes Ferreira,9220fc38-5091-47de-816a-d20e3772d2d5
1,715,Armando de Queiroz Monteiro Neto,Armando Monteiro,8961cb60-991f-4b5b-82fe-b24a5ef596b6
2,5529,Antonio Augusto Junho Anastasia,Antonio Anastasia,4638c700-684a-4d98-9090-528e721f6f35
3,5150,Antonio Carlos Rodrigues,Antonio Carlos Rodrigues,686fc66b-e3c3-4ada-943c-e8b01207936c
4,5140,Airton Sandoval Santana,Airton Sandoval,eb2335ea-3b2f-4f6b-babd-bc4254869254


 ## 5. Senate Org
 
 ### 5.1 Membership Senator
 #### 5.1.1 Process

In [70]:
def priority_post(description):
    '''
        Computes priority to a post 
        0 is higher (Titular)
    '''
    return 0 if description == 'Titular' else int(re.sub('\D','', description))

In [71]:
mapping = {
    'code': 'sen:CodigoMandato',
    'area': 'sen:UfParlamentar',
    'startDate': 'sen:startDate', 
    'finishDate': 'sen:finishDate',
    'resource_uri':'slp:resource_uri',
    'role':'org:role'
}
dependents = defaultdict(list)
owners = {}
memberships = [] 
for swm_d in senators_with_memberships:
    member_uri = swm_d['resource_uri'] 
    identifier = int(swm_d['skos:prefLabel'])
    terms = swm_d['terms']
    if terms:
        for term_d in terms:
            memberships_d = {mapping[k]: term_d.get(k, None) for k in mapping}
            if memberships_d:
                memberships_d['org:member'] = member_uri
                memberships_d['priority'] = priority_post(term_d.get('description', None))
                memberships_d['identifier'] = identifier
                memberships.append(memberships_d)
    
    # fills dict with owners
    
    if swm_d['owner']: 
        owners[identifier] = swm_d['owner'][0]['skos:prefLabel']
            
    # fills dict with dependents
    if swm_d['dependents']: 
        for dep in swm_d['dependents']:
            dependents[identifier].append(int(dep['skos:prefLabel']))
            
df = pd.DataFrame.from_dict(memberships)    
print('Number of memberships:{:}'.format(df.shape[0]))
df.head()            

Number of memberships:179


Unnamed: 0,identifier,org:member,org:role,priority,sen:CodigoMandato,sen:UfParlamentar,sen:finishDate,sen:startDate,slp:resource_uri
0,846,9220fc38-5091-47de-816a-d20e3772d2d5,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,0,485,SP,2017-03-06,2011-02-01,b4f84a28-7712-423d-9a55-a85730a880f4
1,715,8961cb60-991f-4b5b-82fe-b24a5ef596b6,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,0,443,PE,,2016-05-09,8eb21abb-0fea-4984-8d0b-245b6b57b626
2,715,8961cb60-991f-4b5b-82fe-b24a5ef596b6,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,0,443,PE,2015-01-01,2014-11-15,7a68ad0b-66c5-49b9-bfae-0c45000f703c
3,715,8961cb60-991f-4b5b-82fe-b24a5ef596b6,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,0,443,PE,2014-07-16,2011-02-01,16035f73-1051-4297-9950-bc9f83b4ff74
4,5529,4638c700-684a-4d98-9090-528e721f6f35,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,0,495,MG,,2015-02-01,fd016ede-506d-470b-9df1-71466d1c507f


 #### 5.1.2 Save

In [72]:
_df = df.drop(['priority', 'identifier'], axis=1)
filename = 'memberships_with_senate-{:}.csv'.format(PROV, FILE_SUFFIX) 
file_path = '{:}{:}{:}'.format(MEMBERSHIPS_PATH, SUB_DIR, filename)
_df.to_csv(file_path, sep=';', encoding='utf-8', index=None)
_df.head()

Unnamed: 0,org:member,org:role,sen:CodigoMandato,sen:UfParlamentar,sen:finishDate,sen:startDate,slp:resource_uri
0,9220fc38-5091-47de-816a-d20e3772d2d5,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,485,SP,2017-03-06,2011-02-01,b4f84a28-7712-423d-9a55-a85730a880f4
1,8961cb60-991f-4b5b-82fe-b24a5ef596b6,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,443,PE,,2016-05-09,8eb21abb-0fea-4984-8d0b-245b6b57b626
2,8961cb60-991f-4b5b-82fe-b24a5ef596b6,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,443,PE,2015-01-01,2014-11-15,7a68ad0b-66c5-49b9-bfae-0c45000f703c
3,8961cb60-991f-4b5b-82fe-b24a5ef596b6,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,443,PE,2014-07-16,2011-02-01,16035f73-1051-4297-9950-bc9f83b4ff74
4,4638c700-684a-4d98-9090-528e721f6f35,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,495,MG,,2015-02-01,fd016ede-506d-470b-9df1-71466d1c507f


### 5.2 Posts

#### 5.2.1 Posts Process

In [74]:
posts_d = {}
sorted(memberships, key=lambda x: '' if x['sen:finishDate'] is None else x['sen:finishDate'])
sorted(memberships, key=lambda x: x['sen:startDate'])
d = {} 
posts_with_memberships = []
for membership in memberships:
    identifier = membership['identifier']
    if membership['priority'] == 0 and not membership['identifier'] in posts_d:
        posts_d[identifier] = str(uuid4())
    else:        
    post_uri = posts_d[identifier]

    posts_with_memberships.append({
            'resource_uri': identifier,
            '':        
    })
    

print(len(posts_d))        
posts = defaultdict(dict)
i = 0 
for idx, post_uri in posts_d.items():
    posts['resource_uri'][i] = post_uri
    posts['org:role'][i] = SENATOR_URI
    posts['org:postIn'][i] = SENATE_URI
    i += 1

df = pd.DataFrame.from_dict(posts)
df.head()
        

84


Unnamed: 0,org:postIn,org:role,resource_uri
0,81311052-e5b6-46fe-87ba-83865fa0ffb0,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,734f0886-f5d1-45d5-993d-48554ed2a1d0
1,81311052-e5b6-46fe-87ba-83865fa0ffb0,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,fca44e77-9405-4251-9a63-bdaeed9544c6
2,81311052-e5b6-46fe-87ba-83865fa0ffb0,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,c844f5c3-e5d6-46d5-9783-0d136ecd6667
3,81311052-e5b6-46fe-87ba-83865fa0ffb0,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,0f3212bb-09c6-4cdc-b91f-e20211875f9f
4,81311052-e5b6-46fe-87ba-83865fa0ffb0,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,9c565a2b-43b3-4b2a-bc26-55897cdb975c


#### 5.2.2 Posts save

In [57]:
filename = 'senator_posts-{:}.csv'.format(PROV, FILE_SUFFIX) 
file_path = '{:}{:}{:}'.format(POSTS_PATH, SUB_DIR, filename)
df.to_csv(file_path, sep=';', encoding='utf-8', index=None)

 ## 6. Parties Org

 ### 6.1 Membership
 
 #### 6.1.1 Process

In [58]:
mapping = {
    'sigla': 'sen:SiglaPartido',
    'startDate': 'sen:DataFiliacao' ,
    'finishDate': 'sen:DataDesfiliacao',
    'resource_uri': 'slp:resource_uri'
}

memberships = [] 
for swm_d in senators_with_memberships:
    affiliations = swm_d['affiliations']
    if affiliations:
        for affiliation_d in affiliations:
            if affiliation_d['sigla'] in parties_d:
                memberships_d = {mapping[k]: affiliation_d.get(k, None) for k in mapping}
                if memberships_d:
                    memberships_d['org:member'] = member_uri
                    memberships_d['org:role'] = AFFILIATE_URI
                    memberships.append(memberships_d)


df = pd.DataFrame.from_dict(memberships)    
parties = df['sen:SiglaPartido'].values
df.head(10)            

Unnamed: 0,org:member,org:role,sen:DataDesfiliacao,sen:DataFiliacao,sen:SiglaPartido,slp:resource_uri
0,65186822-1203-434c-b679-188b99f2decb,6a688541-b16a-45ca-8aa9-fa700373279f,,1999-02-01,PSDB,8f80e666-1bf9-449d-88b3-61f4b976e539
1,65186822-1203-434c-b679-188b99f2decb,6a688541-b16a-45ca-8aa9-fa700373279f,,2003-05-15,PTB,51a94990-a999-4398-9f79-21c5eb3238de
2,65186822-1203-434c-b679-188b99f2decb,6a688541-b16a-45ca-8aa9-fa700373279f,2003-05-14,1999-02-01,PMDB,c748a494-9891-4066-9029-14a1b4402ae0
3,65186822-1203-434c-b679-188b99f2decb,6a688541-b16a-45ca-8aa9-fa700373279f,,2015-02-01,PSDB,c17aee00-caf0-4544-a8e8-f2a2efb4a0ce
4,65186822-1203-434c-b679-188b99f2decb,6a688541-b16a-45ca-8aa9-fa700373279f,,2011-02-03,PR,787b212c-b6f6-4808-8b8d-4d1a471a64e5
5,65186822-1203-434c-b679-188b99f2decb,6a688541-b16a-45ca-8aa9-fa700373279f,,2011-02-03,PMDB,2fc7e762-49b8-45e8-8bd3-69e80a38943b
6,65186822-1203-434c-b679-188b99f2decb,6a688541-b16a-45ca-8aa9-fa700373279f,,2017-07-01,PODE,1dc8f0d1-b704-4ee3-a25d-92725cc6fe67
7,65186822-1203-434c-b679-188b99f2decb,6a688541-b16a-45ca-8aa9-fa700373279f,2017-06-30,2016-01-11,PV,8058503b-e8e0-42b8-80f0-3bbdb79dc154
8,65186822-1203-434c-b679-188b99f2decb,6a688541-b16a-45ca-8aa9-fa700373279f,2016-01-07,2003-08-14,PSDB,41b24430-9e84-43a4-a529-62af16f9ecc7
9,65186822-1203-434c-b679-188b99f2decb,6a688541-b16a-45ca-8aa9-fa700373279f,2003-08-13,2001-09-26,PDT,a0f91814-7387-4f8f-8318-005d4e8f3705


 #### 6.1.2 Save

In [63]:
filename = 'memberships_at_parties-{:}.csv'.format(PROV) 
file_path = '{:}{:}{:}'.format(MEMBERSHIPS_PATH, SUB_DIR, filename)
df.to_csv(file_path, sep=';', encoding='utf-8', index=None)

  ### 6.2 Roles at Parties
  #### 6.2.1 Process

In [60]:
df = df['org:role'].to_frame()
df = df.rename({'org:role':'slp:resource_uri'}, axis='columns')
df['rdfs:label'] = 'Affiliate'
df.head()

Unnamed: 0,slp:resource_uri,rdfs:label
0,6a688541-b16a-45ca-8aa9-fa700373279f,Affiliate
1,6a688541-b16a-45ca-8aa9-fa700373279f,Affiliate
2,6a688541-b16a-45ca-8aa9-fa700373279f,Affiliate
3,6a688541-b16a-45ca-8aa9-fa700373279f,Affiliate
4,6a688541-b16a-45ca-8aa9-fa700373279f,Affiliate


 #### 6.2.2 Save

In [61]:
filename = 'roles_affiliatiate-{:}.csv'.format(PROV) 
file_path = '{:}{:}{:}'.format(ROLES_PATH, SUB_DIR, filename)
df.to_csv(file_path, sep=';', encoding='utf-8', index=None)

 ### 6.3 Posts at Parties
 #### 6.3.1 Process

In [62]:
df = df.rename({'slp:resource_uri':'org:role'}, axis='columns')
df['slp:resource_uri'] = [parties_d.get(sigla, None) for sigla in parties]
df.drop('rdfs:label', axis=1, inplace=True)
df.head()

Unnamed: 0,org:role,slp:resource_uri
0,6a688541-b16a-45ca-8aa9-fa700373279f,7b212e92-df61-11e7-aaf4-c82a144c0a85
1,6a688541-b16a-45ca-8aa9-fa700373279f,7b2139c8-df61-11e7-8d64-c82a144c0a85
2,6a688541-b16a-45ca-8aa9-fa700373279f,7b210cfa-df61-11e7-9864-c82a144c0a85
3,6a688541-b16a-45ca-8aa9-fa700373279f,7b212e92-df61-11e7-aaf4-c82a144c0a85
4,6a688541-b16a-45ca-8aa9-fa700373279f,7b211a38-df61-11e7-b1b8-c82a144c0a85


 #### 6.3.2 Save

In [37]:
filename = 'posts_at_parties-{:}.csv'.format(PROV) 
file_path = '{:}{:}{:}'.format(POSTS_PATH, SUB_DIR, filename)
df.to_csv(file_path, sep=';', encoding='utf-8', index=None)