# Congressmen notebook

Parses datasets/camara/congressmen_with_memberships.json creating / updating the following polare entites:
* Agents
* Memberships (Congress, Parties)
* Organizations (Camara)
* Posts (Camara)
* Roles (Afiliado, Deputado)

 ## 1. Imports and Constants Declaration

In [9]:
import json 
import pandas as pd
from collections import deque, defaultdict
from uuid import uuid4

import matplotlib.pyplot as plt

termid = 55 # legislatura
start_date = '2015-01-02'

PROV = 'cam'
JSON_PATH =  'datasets/camara/'
FILE_SUFFIX = termid
SUB_DIR = '{:}/'.format(termid)

AGENTS_PATH = 'datasets/slp/agents/'
MEMBERSHIPS_PATH = 'datasets/slp/memberships/'
ORGANIZATIONS_PATH = 'datasets/slp/organizations/'
POSTS_PATH = 'datasets/slp/posts/'



CAMARA_URI = 'c8f660fd-7d0b-48e5-8513-2f7f0bb5f91c' 


 ## 2. Getting the Parties

In [10]:
filename = 'parties.csv'
file_path = '{:}{:}'.format(ORGANIZATIONS_PATH, filename)
df_parties = pd.read_csv(file_path, sep= ';', encoding= 'utf-8', index_col=None)
df_parties.set_index('sigla', inplace=True)


parties_d = df_parties['slp:resource_uri'].to_dict()                        
print({k:parties_d[k]
    for k in ['PT', 'PMDB', 'DEM']})

{'PT': '7b2138a6-df61-11e7-baf1-c82a144c0a85', 'PMDB': '7b210cfa-df61-11e7-9864-c82a144c0a85', 'DEM': '7b20ed7e-df61-11e7-bb65-c82a144c0a85'}


## 3. Reads scrapped json

In [11]:
filename = 'congressman_with_memberships-{:}.json'.format(termid) 
file_path = '{:}{:}'.format(JSON_PATH, filename)
with open(file_path, mode='r') as f:
    senatorsstr = f.read()
f.close()

congressmen_with_memberships = json.loads(senatorsstr)

## 4. Agents

In [12]:
mapping = {
  'slp:resource_uri': 'slp:resource_uri',           
  'cam:ideCadastro': 'cam:ideCadastro',
  'cam:nomeCivil':'cam:nomeCivil',
  'cam:nomeParlamentarAtual':'cam:nomeParlamentarAtual',
  'cam:dataNascimento': 'cam:dataNascimento',
  'cam:dataFalecimento': 'cam:dataFalecimento',
}

agents = [] 
for cwm_d in congressmen_with_memberships:
    agent_d = {mapping[k]: cwm_d.get(k, None) for k in mapping}
    if agent_d:
        agents.append(agent_d)

filename = 'congressmen-{:}.csv'.format(PROV) 
file_path = '{:}{:}{:}'.format(AGENTS_PATH, SUB_DIR, filename)
df = pd.DataFrame.from_dict(agents)    
print('found {:} congressmen.'.format(df.shape[0]))
df.to_csv(file_path, sep=';', encoding='utf-8', index=None)
df.head()

found 606 congressmen.


Unnamed: 0,cam:dataFalecimento,cam:dataNascimento,cam:ideCadastro,cam:nomeCivil,cam:nomeParlamentarAtual,slp:resource_uri
0,,1976-10-23,178836,ALAN RICK MIRANDA,ALAN RICK,64965d06-916a-4ef9-a8c7-886055dc979e
1,,1969-02-13,160527,AGUINALDO VELLOSO BORGES RIBEIRO,AGUINALDO RIBEIRO,908bcac7-6fc7-4957-9ae9-00a46ad10a37
2,,1987-02-10,160582,JOSÉ ALBERTO OLIVEIRA VELOSO FILHO,ALBERTO FILHO,5ff9cfe9-2fee-47b6-b7ac-a1cb282236be
3,,1960-10-15,160508,AFONSO BANDEIRA FLORENCE,AFONSO FLORENCE,e4e93a22-8177-400a-8231-2e8ae9d06a12
4,,1956-02-05,178903,ADILTON DOMINGOS SACHETTI,ADILTON SACHETTI,0a39e693-7990-4118-8637-278d778df124


 ## 5. Congress Org
 
 ### 5.1 Membership Deputy

 #### 5.1.1 Preprocess

In [13]:
mapping = {
    'cam:siglaUFRepresentacao': 'cam:siglaUFRepresentacao',
    'cam:dataInicio': 'cam:startDate', 
    'cam:dataFim': 'cam:finishDate',
    'slp:resource_uri':'slp:resource_uri'
}

memberships = [] 
for cwm_d in congressmen_with_memberships:
    member_uri = cwm_d['slp:resource_uri'] 
    terms = cwm_d['terms']
    if terms:
        for term_d in terms:
            memberships_d = {mapping[k]: term_d.get(k, None) for k in mapping}
            if memberships_d:
                memberships_d['org:member'] = member_uri
                memberships_d['org:role'] = str(uuid4())                
                memberships.append(memberships_d)


memberships = sorted(memberships, key=lambda x:x['cam:startDate'])
df = pd.DataFrame.from_dict(memberships)    
print('number of memberships:{:}'.format(df.shape[0]))
df.head()            

number of memberships:954


Unnamed: 0,cam:finishDate,cam:siglaUFRepresentacao,cam:startDate,org:member,org:role,slp:resource_uri
0,,AC,2015-02-01,64965d06-916a-4ef9-a8c7-886055dc979e,43938db6-2682-414d-aedf-ffbe1a72f1ca,9589fe93-2270-4940-be3c-706111bf6b69
1,,PB,2015-02-01,908bcac7-6fc7-4957-9ae9-00a46ad10a37,1db40933-a7b5-45de-a29c-128dfc19410c,e896acb8-0da3-4e4a-b235-355c1a01f7cb
2,,BA,2015-02-01,e4e93a22-8177-400a-8231-2e8ae9d06a12,5f23002e-2439-4ee1-8087-f2c569e967ee,30eb6da8-3308-45a0-bfd8-0f6f9884133c
3,2017-10-30,MT,2015-02-01,0a39e693-7990-4118-8637-278d778df124,4db7f110-4ba2-491e-afa6-35322553c291,76e294dc-7b1a-48d5-8f15-6888f59c42d2
4,,RS,2015-02-01,abf442cf-ef29-40f6-8929-9a4066570ed2,6b4be757-fa9f-4375-8dd1-1c25a2f49944,39df2d4d-6954-4abf-909f-7ab27d22870f


 #### 5.1.2 Posts computation

There are currently 513 Posts at Camara. The total should be fixed but in practice it's not. Our first task is to determine the max number of posts and than allocating them accordingly.

In [14]:
def cumsum(lis):
    total = 0
    for x in lis:
        total += x
        yield total
        
ts_open = pd.pivot_table(df, index='cam:startDate',aggfunc=len)['cam:finishDate']
ts_open.index.names = ['time']
ts_open.name = 'new'

ts_close = pd.pivot_table(df, index='cam:finishDate',aggfunc=len)['cam:startDate']
ts_close.index.names = ['time']
ts_close.name = 'expired'

df = ts_open.to_frame().join(ts_close.to_frame(), how='outer').fillna(0)
df['total'] = list(cumsum(df['new'] - df['expired']))
print('Max posts:', max(df['total']))
df.head()




Max posts: 520.0


Unnamed: 0_level_0,new,expired,total
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-01,517.0,0.0,517.0
2015-02-02,0.0,4.0,513.0
2015-02-03,6.0,7.0,512.0
2015-02-04,8.0,5.0,515.0
2015-02-05,1.0,3.0,513.0


 #### 5.1.3 Posts graph

In [18]:
# from datetime import datetime
import matplotlib.dates as mdates
# fig, ax = plt.subplots(figsize=(20, 10))
# ind = range(df.shape[0])
# width = 0.35  # the width of the bars
# fromDate = datetime.strptime(df.index[0], '%Y-%m-%d')
# toDate = datetime.strptime(df.index[1], '%Y-%m-%d')
# xticks = [ datetime.strptime(dt, '%Y-%m-%d') for dt in df.index ]

# rects1 = ax.bar(ind, df['total'].values, width, color='SkyBlue')
# ax.set_ylabel('Num. Congressmen')
# ax.set_title('Congressmen Posts')
# ax.set_xticklabels(df['total'].index)

# xlabel = "Workitems\nN=%s"%("{:,}".format(N))
# plt.xlabel(xlabel,fontsize=12)
# plt.ylabel('average hours',fontsize=12)
# datemin = np.datetime64(r.date[0], 'Y')
# datemax = np.datetime64(r.date[-1], 'Y') + np.timedelta64(1, 'Y')
# ax.set_xlim(datemin, datemax)


# ax.bar(fromDate, toDate)
# plt.bar(ind , df['total'].values, align='center', width=width, color='red')
# plt.xticks(xticks)
# plt.show()

# plt.ylim([500, 525])
# plt.show()

# https://scentellegher.github.io/programming/2017/05/24/pandas-bar-plot-with-formatted-dates.html
    
#plot data
tmp = df['total']
tmp.index = pd.to_datetime(df.index)
# tmp.head()
fig, ax = plt.subplots(figsize=(15,7))
tmp.to_frame().plot(kind='bar', ax=ax)
ax.bar(tmp.index, df['total'])


#set ticks every week
ax.xaxis.set_major_locator(mdates.WeekdayLocator())
#set major ticks format
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))


 #### 5.1.3 Posts allocation
 
 We create a queue of posts and allocate them as necessary.

In [53]:
maxlen= int(max(df['total']))
posts = deque([str(uuid4()) for _ in range(maxlen)], maxlen=maxlen)
posts_expired = defaultdict(deque)

# Sort by dataInicio 
memberships = sorted(memberships, key=lambda x:x['cam:startDate'])
dates = df.index.tolist()
prevdate = '1800-01-01'
for d in memberships:
    start = d['cam:startDate']
    finish = d['cam:finishDate']
    # either the start date is in posts_expired
    # or some posts have become available
    processdates = [date for date in dates 
                         if date >= prevdate and date <= start]        
    for procdate in processdates:
        if procdate in posts_expired:
            try:
                # pops from expired until empty
                while True:      
                    post_uri = posts_expired[procdate].pop()
                    posts.append(post_uri)
            except IndexError: 
                i=0
    
    post_uri = posts.pop()
    d['org:post'] = str(uuid4())
    if finish: # push for a future date        
        posts_expired[finish].append(d['org:post'])
    prevdate = start


df = pd.DataFrame.from_dict(memberships)    
posts = df['org:post'] 
roles = df['org:roles'] 
df.head()    

KeyError: 'total'

 ### 5.5  Congress Posts Save

In [22]:
filename = 'memberships_with_congress-{:}.csv'.format(PROV) 
file_path = '{:}{:}{:}'.format(MEMBERSHIPS_PATH, SUB_DIR, filename)
df.to_csv(file_path, sep=';', encoding='utf-8', index=None)

 ## 6. Party Org

 ### 6.1 Membership 
 
 ### 6.1.1 Process

In [34]:
mapping = {
    'cam:sigla': 'cam:sigla',
    'cam:dataInicio': 'cam:startDate', 
    'cam:dataFim': 'cam:finishDate',
    'slp:resource_uri':'slp:resource_uri'
}

memberships = [] 
for cwm_d in congressmen_with_memberships:
    member_uri = cwm_d['slp:resource_uri'] 
    affiliations = cwm_d['affiliations']
    if affiliations:
        for affiliation_d in affiliations:
            memberships_d = {mapping[k]: affiliation_d.get(k, None) for k in mapping}
            if memberships_d:
                memberships_d['org:member'] = member_uri
                memberships_d['org:role'] = str(uuid4())
#                 memberships_d['org:postIn'] = parties_d[affiliation_d['cam:sigla']]
                memberships.append(memberships_d)


df = pd.DataFrame.from_dict(memberships)   
df.head(5000)

Unnamed: 0,cam:finishDate,cam:sigla,cam:startDate,org:member,org:role,slp:resource_uri
0,,PRB,,64965d06-916a-4ef9-a8c7-886055dc979e,bfca262a-e174-4035-9006-72870b62605e,fe42e2cd-dfab-43be-be18-cb88180637fe
1,,DEM,,64965d06-916a-4ef9-a8c7-886055dc979e,9b68a7ff-dccb-4822-bcef-36c140b514fc,db643c02-6940-41c0-b668-fee9d4ae20b4
2,,PP,,908bcac7-6fc7-4957-9ae9-00a46ad10a37,ca99d1ba-2e5a-4e7d-935f-e42db1dacd65,f5193800-cd60-453b-b9c2-add0ec7f27d3
3,,PMDB,,5ff9cfe9-2fee-47b6-b7ac-a1cb282236be,cd3dd6a5-5bf8-40f4-a291-2bf341d63c4e,e6279cfe-789b-49c5-959a-49f14bea2571
4,,PT,,e4e93a22-8177-400a-8231-2e8ae9d06a12,92ad18f6-cb1e-4a96-8f8c-bc3888b65161,3bc84a72-bbf1-4cda-8833-7e94d7c8b234
5,,PSB,,0a39e693-7990-4118-8637-278d778df124,63086203-e5e3-49e7-a3c0-fdd3d23d7617,f66c069d-a9e5-4d3e-bec6-9b4da1aa7683
6,,S.PART.,,0a39e693-7990-4118-8637-278d778df124,5c40e1ef-09d8-4746-8879-32651309c5b5,bba07c8f-d6bd-4219-97b9-d6edb1a5297d
7,,PRB,,0a39e693-7990-4118-8637-278d778df124,f2326a8b-1754-4570-842b-398114bbedc2,a10f58f3-e299-4a69-a02d-1437f2f9c928
8,,PP,,abf442cf-ef29-40f6-8929-9a4066570ed2,cfd01a8f-f9e0-49b3-bcc3-be54d8997ba4,b608cd5e-0d51-44c5-9b07-fd67420399ae
9,,PDT,,92f42906-8bf7-4ede-9623-d436a052aa3a,9cbcaaae-c78a-448d-9f70-77bccbb038fb,8bd6f8ab-e983-4f42-bc5e-f813f6a486a8


 ### 6.1.2 Save

In [26]:
filename = 'memberships_with_parties-{:}.csv'.format(PROV) 
file_path = '{:}{:}{:}'.format(MEMBERSHIPS_PATH, SUB_DIR, filename)
df.to_csv(file_path, sep=';', encoding='utf-8', index=None)