# Senators notebook

Parses datasets/senado/senators_with_memberships-**termid**.json creating / updating the following polare entites:
* Agents
* Memberships (Senate, Parties)
* Organizations (Senate)
* Posts (Senate)
* Roles (Afiliate, Senator)

## 1. Imports and Constants Declaration

In [1]:
import json   # reads scrapped data
import pandas as pd # helps with file management and visualization
from uuid import uuid4 # creates new uuids for Posts

termid = 55 # legislatura

PROV = 'sen'
FILE_SUFFIX = termid
SUB_DIR = '{:}/'.format(termid)
JSON_PATH =  'datasets/senado/'

BASE_PATH = 'datasets/slp/'
AGENTS_PATH = 'datasets/slp/agents/'
MEMBERSHIPS_PATH = 'datasets/slp/memberships/'
ORGANIZATIONS_PATH = 'datasets/slp/organizations/'
ROLES_PATH = 'datasets/slp/roles/'
POSTS_PATH = 'datasets/slp/posts/'

SENATE_URI = '81311052-e5b6-46fe-87ba-83865fa0ffb0' 
SENATOR_URI = 'd57a29ff-c69a-4b32-b98a-3dd8f204c0a3' 
AFFILIATE_URI = '6a688541-b16a-45ca-8aa9-fa700373279f' 


 ## 2. Getting the Parties

In [2]:
filename = 'organizations.csv'
file_path = '{:}{:}'.format(BASE_PATH, filename)
df_parties = pd.read_csv(file_path, sep= ';', encoding= 'utf-8', index_col=None)
df_parties.set_index('sigla', inplace=True)


parties_d = df_parties['resource_uri'].to_dict()                        
print({k:parties_d[k]
    for k in ['PT', 'PMDB', 'DEM']})

{'PT': '7b2138a6-df61-11e7-baf1-c82a144c0a85', 'PMDB': '7b210cfa-df61-11e7-9864-c82a144c0a85', 'DEM': '7b20ed7e-df61-11e7-bb65-c82a144c0a85'}


## 3. Reads scrapped json

In [4]:
filename = 'senator_with_memberships-{:}.json'.format(FILE_SUFFIX) 
file_path = '{:}{:}'.format(JSON_PATH, filename)
with open(file_path, mode='r') as f:
    senatorsstr = f.read()
f.close()

senators_with_memberships = json.loads(senatorsstr)

## 4. Agents

In [6]:
mapping = {'skos:prefLabel': 'sen:CodigoParlamentar',
  'foaf:name':'sen:NomeCompletoParlamentar',
  'rdfs:label':'sen:NomeParlamentar',
  'resource_uri':'slp:resource_uri'           
}

agents = [] 
for swm_d in senators_with_memberships:
    agent_d = {mapping[k]: swm_d.get(k, None) for k in mapping}
    if agent_d:
        agents.append(agent_d)

filename = 'senators-{:}.csv'.format(PROV) 
file_path = '{:}{:}{:}'.format(AGENTS_PATH, SUB_DIR, filename)
df = pd.DataFrame.from_dict(agents)    
df.to_csv(file_path, sep=';', encoding='utf-8', index=None)
df.head()

Unnamed: 0,sen:CodigoParlamentar,sen:NomeCompletoParlamentar,sen:NomeParlamentar,slp:resource_uri
0,945,Alvaro Fernandes Dias,Alvaro Dias,c658f5bd-bd38-4122-a290-fe12eb27ef23
1,612,Eunício Lopes de Oliveira,Eunício Oliveira,efbc3ef9-2cb3-4277-bf0d-26d94e507b95
2,4969,Elber Batalha de Goes,Elber Batalha,41168aa2-d5bc-4f54-9ecb-6963c44c4527
3,846,Aloysio Nunes Ferreira Filho,Aloysio Nunes Ferreira,3e39edf8-2cb0-485e-82d1-2da8ce43231b
4,5140,Airton Sandoval Santana,Airton Sandoval,0bf34c3f-a8fb-4d43-a47f-2dbce80dd710


 ## 5. Senate Org
 
 ### 5.1 Membership Senator
 #### 5.1.1 Process

In [9]:
mapping = {
    'skos:prefLabel': 'sen:CodigoMandato',
    'natureza': 'sen:UfParlamentar',
    'legislatura': 'NumeroLegislatura', 
    'startDate': 'sen:startDate', 
    'finishDate': 'sen:finishDate',
    'membership_resource_uri':'slp:resource_uri'
}

memberships = [] 
for swm_d in senators_with_memberships:
    member_uri = swm_d['resource_uri'] 
    terms = swm_d['terms']
    if terms:
        for term_d in terms:
            print(term_d)
#             memberships_d = {mapping[k]: term_d.get(k, None) for k in mapping}
#             if memberships_d:
#                 memberships_d['org:member'] = member_uri
#                 memberships_d['org:role'] = SENATOR_URI
#                 memberships.append(memberships_d)



# df = pd.DataFrame.from_dict(memberships)    
# df.head()            

{'area': 'PR', 'code': '514', 'resource_uri': '8440ca34-5ae9-4809-9723-61083cfaafd8', 'role': 'd57a29ff-c69a-4b32-b98a-3dd8f204c0a3'}
{'area': 'CE', 'code': '450', 'resource_uri': '7153ade7-c08e-47e4-9fc6-8bdfac419752', 'role': 'd57a29ff-c69a-4b32-b98a-3dd8f204c0a3'}
{'area': 'SE', 'code': '474', 'resource_uri': '2a4ef6f2-982a-4118-9af6-bcc65f20b757', 'role': 'd57a29ff-c69a-4b32-b98a-3dd8f204c0a3'}
{'area': 'SP', 'code': '485', 'resource_uri': '2af9307b-5e67-4aeb-b73d-f7a84fd08b40', 'role': 'd57a29ff-c69a-4b32-b98a-3dd8f204c0a3'}
{'area': 'SP', 'code': '485', 'resource_uri': '14ba9fc1-c63f-4d24-bb57-12f588eed502', 'role': 'd57a29ff-c69a-4b32-b98a-3dd8f204c0a3'}
{'area': 'RO', 'code': '515', 'resource_uri': '5fa6b62c-441a-4a3b-bb52-ca4cce3b6bec', 'role': 'd57a29ff-c69a-4b32-b98a-3dd8f204c0a3'}
{'area': 'RO', 'code': '515', 'resource_uri': '91c90ae8-482b-4d31-928e-b710374daae1', 'role': 'd57a29ff-c69a-4b32-b98a-3dd8f204c0a3'}
{'area': 'PI', 'code': '496', 'resource_uri': 'f98e5960-9d97-4

 #### 5.1.2 Save

In [7]:
filename = 'memberships_with_senate-{:}.csv'.format(PROV, FILE_SUFFIX) 
file_path = '{:}{:}{:}'.format(MEMBERSHIPS_PATH, SUB_DIR, filename)
df.to_csv(file_path, sep=';', encoding='utf-8', index=None)

### 5.3 Posts

#### 5.3.1 Posts Process

In [8]:
df = df.rename({'slp:resource_uri':'org:role'}, axis='columns')
df['resource_uri'] = [str(uuid4()) for _ in range(df.shape[0])]
df['org:postIn'] = [SENATE_URI]*df.shape[0]
df.drop('rdfs:label', axis=1, inplace=True)
df.head()

Unnamed: 0,NumeroLegislatura,org:member,org:role,sen:CodigoMandato,sen:UfParlamentar,sen:finishDate,sen:startDate,org:role.1,resource_uri,org:postIn
0,55,828a0e78-b457-47de-8112-a7cce8d24563,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,492,BA,2019-01-31,2015-02-01,61e28c9c-c93b-40fc-a73d-3a963632df24,b4ba1a45-995f-49cf-83fe-fb3688b523de,81311052-e5b6-46fe-87ba-83865fa0ffb0
1,56,828a0e78-b457-47de-8112-a7cce8d24563,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,492,BA,2023-01-31,2019-02-01,669716c0-cf09-4d77-896b-8497c3943b34,c1a63d8b-3a1a-4953-b796-b533681d210f,81311052-e5b6-46fe-87ba-83865fa0ffb0
2,54,59888f36-89fc-40e9-adf7-e95bb4906eb2,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,458,PI,2015-01-31,2011-02-01,869d897d-3ab6-448a-ba58-8c18a2aa4421,5f61c354-fcea-4411-8ce6-78e396538d51,81311052-e5b6-46fe-87ba-83865fa0ffb0
3,55,59888f36-89fc-40e9-adf7-e95bb4906eb2,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,458,PI,2019-01-31,2015-02-01,155310d6-40c8-4c04-9234-6f312193b877,0442f41a-ad9b-46ef-a822-827c91110450,81311052-e5b6-46fe-87ba-83865fa0ffb0
4,54,94be5f0e-2cc9-488b-bb32-335a44eb4f1b,d57a29ff-c69a-4b32-b98a-3dd8f204c0a3,456,MT,2015-01-31,2011-02-01,3e9c9056-8f75-4d44-b5a3-da45f12caa33,33fbb16c-a2e5-4a05-a967-7237db6b64ea,81311052-e5b6-46fe-87ba-83865fa0ffb0


In [9]:
df.shape

(504, 10)

#### 5.3.3 Posts save

In [14]:
filename = 'senator_posts-{:}.csv'.format(PROV, FILE_SUFFIX) 
file_path = '{:}{:}{:}'.format(POSTS_PATH, SUB_DIR, filename)
df.to_csv(file_path, sep=';', encoding='utf-8', index=None)

 ## 6. Parties Org

 ### 6.1 Membership
 
 #### 6.1.1 Process

In [10]:
mapping = {
    'sigla': 'sen:SiglaPartido',
    'startDate': 'sen:DataFiliacao' ,
    'finishDate': 'sen:DataDesfiliacao',
    'resource_uri': 'slp:resource_uri'
}

memberships = [] 
for swm_d in senators_with_memberships:
    affiliations = swm_d['affiliations']
    if affiliations:
        for affiliation_d in affiliations:
            if affiliation_d['sigla'] in parties_d:
                memberships_d = {mapping[k]: affiliation_d.get(k, None) for k in mapping}
                if memberships_d:
                    memberships_d['org:member'] = member_uri
                    memberships_d['org:role'] = AFFILIATE_URI
                    memberships.append(memberships_d)



                    
df = pd.DataFrame.from_dict(memberships)    
parties = df['sen:SiglaPartido'].values
df.head(10)            

Unnamed: 0,org:member,org:role,sen:DataDesfiliacao,sen:DataFiliacao,sen:SiglaPartido,slp:resource_uri
0,85e8b745-786b-4782-a8a1-e3b3164a3f22,6a688541-b16a-45ca-8aa9-fa700373279f,,2015-02-01,PDT,0577ae6b-9fa4-4ff0-ac19-9dd5cd3598d1
1,85e8b745-786b-4782-a8a1-e3b3164a3f22,6a688541-b16a-45ca-8aa9-fa700373279f,,2004-02-14,PP,b2ec603a-26b8-4e59-8a59-624d2f0973ac
2,85e8b745-786b-4782-a8a1-e3b3164a3f22,6a688541-b16a-45ca-8aa9-fa700373279f,2004-02-13,1999-02-01,PFL,3cf1400f-9fe7-4552-99ac-d0a4a52a1510
3,85e8b745-786b-4782-a8a1-e3b3164a3f22,6a688541-b16a-45ca-8aa9-fa700373279f,,2011-02-03,PR,da9f5ea7-2636-4240-af94-94e0df597ae9
4,85e8b745-786b-4782-a8a1-e3b3164a3f22,6a688541-b16a-45ca-8aa9-fa700373279f,,2011-02-03,DEM,89a85f62-3dce-499c-a72a-0917f5bf5240
5,85e8b745-786b-4782-a8a1-e3b3164a3f22,6a688541-b16a-45ca-8aa9-fa700373279f,,2015-02-01,PDT,5d602564-ea0f-4b99-86fe-7ed822778f85
6,85e8b745-786b-4782-a8a1-e3b3164a3f22,6a688541-b16a-45ca-8aa9-fa700373279f,,1999-02-01,PSDB,70e5af79-d369-4e74-b4ad-4decab26ef97
7,85e8b745-786b-4782-a8a1-e3b3164a3f22,6a688541-b16a-45ca-8aa9-fa700373279f,,2015-02-01,PV,27187447-24a0-4e48-900d-5207e5ce37f5
8,85e8b745-786b-4782-a8a1-e3b3164a3f22,6a688541-b16a-45ca-8aa9-fa700373279f,,2015-02-01,PMDB,3bd6d232-08e8-4654-acdb-a46a77de65ba
9,85e8b745-786b-4782-a8a1-e3b3164a3f22,6a688541-b16a-45ca-8aa9-fa700373279f,,2011-01-27,PR,5401befb-466d-4949-890c-5969d74233e2


In [15]:
print(senators_with_memberships[0])

{'skos:prefLabel': '5573', 'rdfs:label': 'Abel Rebouças', 'foaf:name': 'Abel Rebouças São José', 'agent_resource_uri': '828a0e78-b457-47de-8112-a7cce8d24563', 'terms': [{'legislatura': '55', 'startDate': '2015-02-01', 'finishDate': '2019-01-31', 'skos:prefLabel': '492', 'natureza': 'BA', 'membership_resource_uri': '61e28c9c-c93b-40fc-a73d-3a963632df24', 'role_resource_uri': 'd69fcc38-21a6-41ee-8c0f-90be24aff805'}, {'legislatura': '56', 'startDate': '2019-02-01', 'finishDate': '2023-01-31', 'skos:prefLabel': '492', 'natureza': 'BA', 'membership_resource_uri': '669716c0-cf09-4d77-896b-8497c3943b34', 'role_resource_uri': 'd69fcc38-21a6-41ee-8c0f-90be24aff805'}], 'affiliations': [{'sigla': 'PDT', 'startDate': '2015-02-01', 'resource_uri': '0577ae6b-9fa4-4ff0-ac19-9dd5cd3598d1', 'role_resource_uri': 'ebf7f8f0-1b0a-4662-9ed8-c9bb19936ec0'}]}


 #### 6.1.2 Save

In [25]:
filename = 'memberships_at_parties-{:}.csv'.format(PROV) 
file_path = '{:}{:}{:}'.format(MEMBERSHIPS_PATH, SUB_DIR, filename)
df.to_csv(file_path, sep=';', encoding='utf-8', index=None)

  ### 6.2 Roles at Parties
  #### 6.2.1 Process

In [20]:
df = df['org:role'].to_frame()
df = df.rename({'org:role':'slp:resource_uri'}, axis='columns')
df['rdfs:label'] = 'Affiliate'
df.head()

Unnamed: 0,slp:resource_uri,rdfs:label
0,70f12084-b63a-4bca-b140-e60480ef3802,Affiliate
1,1706c489-db73-4b74-a410-5e441c1e1cdd,Affiliate
2,ea9a2925-a286-4b13-b719-a030418dd8c5,Affiliate
3,11dccd25-3d27-4e70-af1d-bb0907675b2c,Affiliate
4,a2d74722-5042-4795-b72e-c4d3bba42fc9,Affiliate


 #### 6.2.2 Save

In [21]:
filename = 'roles_affiliatiate-{:}.csv'.format(PROV) 
file_path = '{:}{:}{:}'.format(ROLES_PATH, SUB_DIR, filename)
df.to_csv(file_path, sep=';', encoding='utf-8', index=None)

 ### 6.3 Posts at Parties
 #### 6.3.1 Process

In [22]:
df = df.rename({'slp:resource_uri':'org:role'}, axis='columns')
df['slp:resource_uri'] = [parties_d.get(sigla, None) for sigla in parties]
df.drop('rdfs:label', axis=1, inplace=True)
df.head()

Unnamed: 0,org:role,slp:resource_uri
0,70f12084-b63a-4bca-b140-e60480ef3802,7b210070-df61-11e7-97e9-c82a144c0a85
1,1706c489-db73-4b74-a410-5e441c1e1cdd,7b21152e-df61-11e7-bdcf-c82a144c0a85
2,ea9a2925-a286-4b13-b719-a030418dd8c5,7b21046c-df61-11e7-8094-c82a144c0a85
3,11dccd25-3d27-4e70-af1d-bb0907675b2c,7b211a38-df61-11e7-b1b8-c82a144c0a85
4,a2d74722-5042-4795-b72e-c4d3bba42fc9,7b20ed7e-df61-11e7-bb65-c82a144c0a85


 #### 6.3.2 Save

In [23]:
filename = 'posts_at_parties-{:}.csv'.format(PROV) 
file_path = '{:}{:}{:}'.format(POSTS_PATH, SUB_DIR, filename)
df.to_csv(file_path, sep=';', encoding='utf-8', index=None)