# Senators notebook

Parses datasets/senado/senators_with_memberships-**termid**.json, creating or updating the following polare entities:
* Agents
* Memberships (Senate, Parties)
* Organizations (Senate)
* Posts (Senate)
* Roles (Affiliate, Senator)

## 1. Imports and Constants Declaration

In [4]:
import json   # reads scrapped data
import pandas as pd # helps with file management and visualization
import re
from collections import defaultdict
from uuid import uuid4 # creates new uuids for Posts
import os

termid = 54 # legislature (term) number

# Suffix used in the names of the CSV files this notebook writes.
PROV = 'sen'
# Suffix of the scraped input JSON file name.
FILE_SUFFIX = termid
# Per-term sub-directory appended to the output directories below.
SUB_DIR = '{:}/'.format(termid)
# Directory that holds the scraped input JSON.
JSON_PATH =  'datasets/senado/'

# Directory from which organizations.csv (the party table) is read.
BASE_PATH = 'datasets/slp/'
# One directory per entity type; the cells in this notebook write to the
# agents, memberships and posts directories.
AGENTS_PATH = 'datasets/slnp/agents/'
MEMBERSHIPS_PATH = 'datasets/slnp/memberships/'
ORGANIZATIONS_PATH = 'datasets/slnp/organizations/'
ROLES_PATH = 'datasets/slnp/roles/'
POSTS_PATH = 'datasets/slnp/posts/'

# Fixed identifiers referenced by the generated rows:
# SENATE_URI is written as 'org:postIn' of the Senate posts,
# SENATOR_URI as their 'org:role', and AFFILIATE_URI as the 'org:role'
# of party memberships and party posts.
SENATE_URI = '81311052-e5b6-46fe-87ba-83865fa0ffb0' 
SENATOR_URI = 'd57a29ff-c69a-4b32-b98a-3dd8f204c0a3' 
AFFILIATE_URI = '6a688541-b16a-45ca-8aa9-fa700373279f' 


 ## 2. Getting the Parties

In [5]:
# Load the party table and index it by party acronym ('sigla').
filename = 'organizations.csv'
file_path = BASE_PATH + filename
df_parties = pd.read_csv(
    file_path, sep=';', encoding='utf-8', index_col=None
).set_index('sigla')

# Lookup: party acronym -> resource URI.
parties_d = df_parties['resource_uri'].to_dict()

# Spot-check a few acronyms to confirm the lookup was built.
print(dict((k, parties_d[k]) for k in ['PT', 'PMDB', 'DEM']))

{'PT': '7b2138a6-df61-11e7-baf1-c82a144c0a85', 'PMDB': '7b210cfa-df61-11e7-9864-c82a144c0a85', 'DEM': '7b20ed7e-df61-11e7-bb65-c82a144c0a85'}


## 3. Reads scraped JSON

In [6]:
# Load the scraped senators (with their terms and affiliations) for this term.
filename = 'senator_with_memberships-{:}.json'.format(FILE_SUFFIX) 
file_path = '{:}{:}'.format(JSON_PATH, filename)

# Decode explicitly as UTF-8 so accented names do not depend on the platform's
# default encoding. The `with` block closes the file on exit, so no manual
# close() is needed.
with open(file_path, mode='r', encoding='utf-8') as f:
    senatorsstr = f.read()

senators_with_memberships = json.loads(senatorsstr)

## 4. Agents

In [7]:
# Make sure the per-term agents directory exists before writing into it.
file_dir = AGENTS_PATH + SUB_DIR
if not os.path.exists(file_dir):
    os.makedirs(file_dir)

# Key in the scraped senator dict -> column name in the agents table.
mapping = {
    'skos:prefLabel': 'sen:CodigoParlamentar',
    'foaf:name': 'sen:NomeCompletoParlamentar',
    'rdfs:label': 'sen:NomeParlamentar',
    'resource_uri': 'slnp:resource_uri',
}

# One row per senator; keys missing from the scraped dict become None.
agents = [
    {column: senator.get(source_key) for source_key, column in mapping.items()}
    for senator in senators_with_memberships
]

filename = 'senators-{:}.csv'.format(PROV)
file_path = file_dir + filename
df = pd.DataFrame(agents)
df.to_csv(file_path, sep=';', encoding='utf-8', index=False)
df.head()

Unnamed: 0,sen:CodigoParlamentar,sen:NomeCompletoParlamentar,sen:NomeParlamentar,slnp:resource_uri
0,5154,Abiancy Cadoso Rosa,Abiancy Cadoso Rosa,96c3f158-e4b7-43af-a5b9-fd51d173bced
1,5108,José Aparecido dos Santos,Cidinho Santos,1aae4a8a-8e07-4d3e-b5a9-66e572c674da
2,5136,Cesar Antonio de Souza,Cesar Antonio de Souza,758da695-76e4-4b4b-9744-8c8594a00bcf
3,3398,Cristovam Ricardo Cavalcanti Buarque,Cristovam Buarque,d078c83f-fed0-4e76-abd6-17e515cb2f7b
4,4895,Clésio Soares de Andrade,Clésio Andrade,7b2ec4ca-44b3-461c-bb09-365c0bb4b872


 ## 5. Senate Org
 
 ### 5.1 Membership Senator
 #### 5.1.1 Process

In [8]:
def priority_post(description):
    '''
        Computes the priority of a post from its textual description.

        0 is the highest priority and is returned only for the exact
        description 'Titular'. Any other description yields the integer
        formed by the digits it contains (for instance a description whose
        only digit is 1 yields 1).

        Raises:
            TypeError: if description is not a string (e.g. None).
            ValueError: if description is not 'Titular' and has no digits.
    '''
    if description == 'Titular':
        return 0
    if not isinstance(description, str):
        raise TypeError(
            'post description must be a string, got {!r}'.format(description)
        )
    # Raw string: '\D' in a plain literal is an invalid escape sequence.
    digits = re.sub(r'\D', '', description)
    if not digits:
        raise ValueError(
            'cannot derive a priority from description {!r}'.format(description)
        )
    return int(digits)

In [9]:
# Key in each scraped term dict -> column name in the memberships table.
mapping = {
    'code': 'sen:CodigoMandato',
    'area': 'sen:UfParlamentar',
    'startDate': 'sen:startDate',
    'finishDate': 'sen:finishDate',
    'resource_uri': 'slnp:resource_uri',
    'role': 'org:role',
}
dependents = defaultdict(list)  # senator identifier -> identifiers of its dependents
owners = {}                     # senator identifier -> identifier of its (first) owner
memberships = []

for senator in senators_with_memberships:
    member_uri = senator['resource_uri']
    identifier = int(senator['skos:prefLabel'])

    # One membership row per term; an empty or missing-valued term list adds nothing.
    for term in senator['terms'] or []:
        row = {column: term.get(key) for key, column in mapping.items()}
        row['org:member'] = member_uri
        row['priority'] = priority_post(term.get('description'))
        row['identifier'] = identifier
        memberships.append(row)

    # Record the owner, taking the first entry when several are listed.
    senator_owners = senator['owner']
    if senator_owners:
        owners[identifier] = int(senator_owners[0]['skos:prefLabel'])

    # Record every dependent of this senator.
    senator_dependents = senator['dependents']
    if senator_dependents:
        dependents[identifier].extend(
            int(dep['skos:prefLabel']) for dep in senator_dependents
        )

df = pd.DataFrame(memberships)
print('Number of memberships:{:}'.format(df.shape[0]))
df.head()

Number of memberships:213


Unnamed: 0,identifier,org:member,org:role,priority,sen:CodigoMandato,sen:UfParlamentar,sen:finishDate,sen:startDate,slnp:resource_uri
0,5108,1aae4a8a-8e07-4d3e-b5a9-66e572c674da,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,1,456,MT,2018-04-26,2016-05-15,ea25220c-7fc7-4d2b-8569-5df19e837db0
1,5108,1aae4a8a-8e07-4d3e-b5a9-66e572c674da,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,1,456,MT,2014-07-17,2014-03-13,056b025a-191a-444b-9fc9-6977654c06c6
2,5108,1aae4a8a-8e07-4d3e-b5a9-66e572c674da,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,1,456,MT,2012-12-16,2012-08-09,9b0cbc51-cad3-4655-9e35-fe423adcf4c7
3,3398,d078c83f-fed0-4e76-abd6-17e515cb2f7b,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,0,452,DF,,2011-02-01,5adf4d36-e418-4ad6-abf9-58080453eef1
4,4895,7b2ec4ca-44b3-461c-bb09-365c0bb4b872,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,1,347,MG,2014-07-15,2011-01-12,621de215-a855-49b2-8f38-65577b32029c


 #### 5.1.2 Save

In [10]:
# 'priority' and 'identifier' are working columns used by later cells; they are
# not part of the saved memberships table.
_df = df.drop(['priority', 'identifier'], axis=1)
filename = 'memberships_with_senate-{:}.csv'.format(PROV)
file_path = '{:}{:}{:}'.format(MEMBERSHIPS_PATH, SUB_DIR, filename)

# The per-term memberships directory is not created anywhere else in the
# notebook, so ensure it exists before writing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
_df.to_csv(file_path, sep=';', encoding='utf-8', index=False)
_df.head()

Unnamed: 0,org:member,org:role,sen:CodigoMandato,sen:UfParlamentar,sen:finishDate,sen:startDate,slnp:resource_uri
0,1aae4a8a-8e07-4d3e-b5a9-66e572c674da,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,456,MT,2018-04-26,2016-05-15,ea25220c-7fc7-4d2b-8569-5df19e837db0
1,1aae4a8a-8e07-4d3e-b5a9-66e572c674da,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,456,MT,2014-07-17,2014-03-13,056b025a-191a-444b-9fc9-6977654c06c6
2,1aae4a8a-8e07-4d3e-b5a9-66e572c674da,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,456,MT,2012-12-16,2012-08-09,9b0cbc51-cad3-4655-9e35-fe423adcf4c7
3,d078c83f-fed0-4e76-abd6-17e515cb2f7b,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,452,DF,,2011-02-01,5adf4d36-e418-4ad6-abf9-58080453eef1
4,7b2ec4ca-44b3-461c-bb09-365c0bb4b872,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,347,MG,2014-07-15,2011-01-12,621de215-a855-49b2-8f38-65577b32029c


### 5.2 Posts

#### 5.2.1 Posts Process

In [11]:
# NOTE(review): the original cell called sorted(memberships, ...) three times
# and discarded the results, so the list was never reordered. Those no-op calls
# are removed here; if an ordering is actually wanted, use memberships.sort(...).

posts_d = {}  # identifier of the seat holder -> URI of the post (one per seat)
posts_with_memberships = []

for membership in memberships:
    identifier = membership['identifier']

    # Priority 0 means this member holds the seat; otherwise the post belongs
    # to the member's owner. A non-zero priority without a recorded owner
    # raises KeyError, as it did before.
    if membership['priority'] == 0:
        seat_holder = identifier
    else:
        seat_holder = owners[identifier]

    # Create the post URI the first time a seat is seen and reuse it afterwards.
    if seat_holder not in posts_d:
        posts_d[seat_holder] = str(uuid4())

    posts_with_memberships.append({
        'resource_uri': posts_d[seat_holder],
        'org:role': SENATOR_URI,
        'org:postIn': SENATE_URI,
        'org:member': identifier,
        'org:startDate': membership['sen:startDate'],
        'owner': owners.get(identifier, '')
    })

df = pd.DataFrame(posts_with_memberships)
df.head()
        

Unnamed: 0,org:member,org:postIn,org:role,org:startDate,owner,resource_uri
0,5108,81311052-e5b6-46fe-87ba-83865fa0ffb0,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,2016-05-15,111.0,1b93da67-0546-4837-a06d-9036b5b5ac59
1,5108,81311052-e5b6-46fe-87ba-83865fa0ffb0,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,2014-03-13,111.0,1b93da67-0546-4837-a06d-9036b5b5ac59
2,5108,81311052-e5b6-46fe-87ba-83865fa0ffb0,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,2012-08-09,111.0,1b93da67-0546-4837-a06d-9036b5b5ac59
3,3398,81311052-e5b6-46fe-87ba-83865fa0ffb0,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,2011-02-01,,0274b700-0d4c-455f-9977-58119f3d226a
4,4895,81311052-e5b6-46fe-87ba-83865fa0ffb0,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,2011-01-12,394.0,deac66fe-6199-4955-8e03-e1a3143c3754


#### 5.2.2 Posts save

In [13]:
# Ensure the per-term posts directory exists.
file_dir = '{:}{:}'.format(POSTS_PATH, SUB_DIR)
os.makedirs(file_dir, exist_ok=True)

# Keep only the post-level columns. The member, owner and start date are
# working columns of the membership rows and do not belong in the posts file.
_df = df.drop(['org:member', 'owner', 'org:startDate'], axis=1)
filename = 'senator_posts-{:}.csv'.format(PROV)
file_path = '{:}{:}'.format(file_dir, filename)

# Write the trimmed frame. The original wrote `df`, so the dropped columns
# ended up in the CSV anyway.
# NOTE(review): rows repeat once per membership of the same post; consider
# _df.drop_duplicates() if one row per post is expected - confirm with consumers.
_df.to_csv(file_path, sep=';', encoding='utf-8', index=False)

 ## 6. Parties Org

 ### 6.1 Membership
 
 #### 6.1.1 Process

In [179]:
# Column-name mapping for affiliations.
# NOTE(review): this mapping is defined but not applied below - the rows keep
# the scraped key names. Confirm whether a rename was intended.
mapping = {
    'sigla': 'sen:SiglaPartido',
    'startDate': 'sen:DataFiliacao' ,
    'finishDate': 'sen:DataDesfiliacao',
    'resource_uri': 'resource_uri'
}

memberships = []
for senator in senators_with_memberships:
    member_uri = senator['resource_uri']
    for affiliation in senator['affiliations'] or []:
        # Affiliations to parties missing from the party table are skipped.
        if affiliation['sigla'] not in parties_d:
            continue
        # Copy the scraped affiliation and attach the member and the role.
        memberships.append({
            **affiliation,
            'org:member': member_uri,
            'org:role': AFFILIATE_URI,
        })

df = pd.DataFrame(memberships)
parties = df['sigla'].values
df.head(10)

Unnamed: 0,finishDate,org:member,org:role,resource_uri,role,sigla,startDate
0,,c072c80f-3595-43c0-86a2-f9de4ab08a94,6a688541-b16a-45ca-8aa9-fa700373279f,a4084c1a-a2f7-438d-b4fc-1085af4b0256,6a688541-b16a-45ca-8aa9-fa700373279f,PDT,2015-02-01
1,,88776b6d-78a4-4409-b4ed-d6d7da02d5ab,6a688541-b16a-45ca-8aa9-fa700373279f,f1375dee-a50d-48bc-86d4-6d72d268dbe5,6a688541-b16a-45ca-8aa9-fa700373279f,PR,2011-01-27
2,,664ca3b7-95cf-4647-b0d9-7c5d69dd5caf,6a688541-b16a-45ca-8aa9-fa700373279f,c3b4c7d2-f81d-4e26-b924-ea1a28124502,6a688541-b16a-45ca-8aa9-fa700373279f,DEM,2011-02-03
3,,12ccdcb9-3b04-4be8-aadd-287b05819002,6a688541-b16a-45ca-8aa9-fa700373279f,10cddeff-52aa-41a5-a219-118dfa276883,6a688541-b16a-45ca-8aa9-fa700373279f,PSDB,1999-02-01
4,,ac4e1241-47d9-4b28-a926-45ecb143d59f,6a688541-b16a-45ca-8aa9-fa700373279f,670dbcd1-5c18-4f56-81e0-e84e334059e2,6a688541-b16a-45ca-8aa9-fa700373279f,PSD,2015-02-01
5,,3d8250ad-d2c7-4932-92c8-2b93a11aeed4,6a688541-b16a-45ca-8aa9-fa700373279f,60952631-58e2-44fa-bca9-8f6044638a3c,6a688541-b16a-45ca-8aa9-fa700373279f,PRB,2015-02-01
6,,8117a2a2-3d7f-4f31-a816-7eb3290fd1c6,6a688541-b16a-45ca-8aa9-fa700373279f,18189899-204a-4a05-885f-ac64d6b4a4dd,6a688541-b16a-45ca-8aa9-fa700373279f,PMDB,2011-02-03
7,,386d86c5-0783-4cb3-aae0-0201ac1a26e0,6a688541-b16a-45ca-8aa9-fa700373279f,7b390928-fb87-464b-8064-8d89ee24f228,6a688541-b16a-45ca-8aa9-fa700373279f,PTB,2003-05-15
8,2003-05-14,386d86c5-0783-4cb3-aae0-0201ac1a26e0,6a688541-b16a-45ca-8aa9-fa700373279f,2076649b-d440-4888-982d-a555ca616d06,6a688541-b16a-45ca-8aa9-fa700373279f,PMDB,1999-02-01
9,,b36d3e89-1dfd-4517-8055-1cfad349d2ed,6a688541-b16a-45ca-8aa9-fa700373279f,027fab11-7376-4858-8765-7f2916cd2c2e,6a688541-b16a-45ca-8aa9-fa700373279f,PSDB,2014-12-13


 #### 6.1.2 Save

In [180]:
# The party acronym is only needed by later cells (through `memberships`),
# so it is left out of the saved table.
_df = df.drop(['sigla'], axis=1)
filename = 'memberships_parties-{:}.csv'.format(PROV) 
file_path = '{:}{:}{:}'.format(MEMBERSHIPS_PATH, SUB_DIR, filename)

# Ensure the per-term memberships directory exists so this cell does not
# depend on the directory having been created beforehand.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
_df.to_csv(file_path, sep=';', encoding='utf-8', index=False)

 ### 6.3 Posts at Parties
 #### 6.3.1 Process

In [181]:
# One post per party membership: a fresh URI, placed in the party that matches
# the membership's acronym, with the affiliate role.
posts = [
    {
        'resource_uri': str(uuid4()),
        'org:postIn': parties_d[party_membership['sigla']],
        'org:role': AFFILIATE_URI,
    }
    for party_membership in memberships
]

df = pd.DataFrame(posts)
df.head(10)

Unnamed: 0,org:postIn,org:role,resource_uri
0,7b210070-df61-11e7-97e9-c82a144c0a85,6a688541-b16a-45ca-8aa9-fa700373279f,4fe5b1ec-1d81-498d-80b6-4da04d225447
1,7b211a38-df61-11e7-b1b8-c82a144c0a85,6a688541-b16a-45ca-8aa9-fa700373279f,0053384d-f417-4b6b-8e05-998ce51da2cd
2,7b20ed7e-df61-11e7-bb65-c82a144c0a85,6a688541-b16a-45ca-8aa9-fa700373279f,62f0bbeb-e6e9-479c-86cd-e30fe50bed78
3,7b212e92-df61-11e7-aaf4-c82a144c0a85,6a688541-b16a-45ca-8aa9-fa700373279f,3e0eb821-6e3f-4fed-88db-a5836ee2b2d7
4,7b212d98-df61-11e7-95f7-c82a144c0a85,6a688541-b16a-45ca-8aa9-fa700373279f,93ff0ddf-2927-42b7-945d-10f8f451e36b
5,7b211b34-df61-11e7-b235-c82a144c0a85,6a688541-b16a-45ca-8aa9-fa700373279f,ed2625e9-539a-4c89-961e-af01aeb5f8f8
6,7b210cfa-df61-11e7-9864-c82a144c0a85,6a688541-b16a-45ca-8aa9-fa700373279f,e4746fed-f33f-4656-969a-370e6c61f9e8
7,7b2139c8-df61-11e7-8d64-c82a144c0a85,6a688541-b16a-45ca-8aa9-fa700373279f,8ee0f79b-43fc-43e5-8ed1-fbdcf5d7e9c3
8,7b210cfa-df61-11e7-9864-c82a144c0a85,6a688541-b16a-45ca-8aa9-fa700373279f,5e42d86d-2132-44bf-9438-2d40c3d86846
9,7b212e92-df61-11e7-aaf4-c82a144c0a85,6a688541-b16a-45ca-8aa9-fa700373279f,81f3ee10-4a90-4207-b74a-a16861d8e9fb


 #### 6.3.2 Save

In [182]:
# Write the party posts next to the Senate posts, in the per-term posts directory.
filename = 'posts_parties-{:}.csv'.format(PROV)
file_path = POSTS_PATH + SUB_DIR + filename
df.to_csv(file_path, sep=';', encoding='utf-8', index=False)