
Commit

Merge 48b720d into 15aaa6e
luizcavalcanti committed May 21, 2017
2 parents 15aaa6e + 48b720d commit 060a02a
Showing 4 changed files with 149 additions and 212 deletions.
192 changes: 89 additions & 103 deletions serenata_toolbox/chamber_of_deputies/chamber_of_deputies_dataset.py
@@ -1,24 +1,26 @@
import os.path
import csv
from datetime import date
from urllib.request import urlretrieve
from zipfile import ZipFile
import numpy as np
import pandas as pd
from .reimbursements import Reimbursements
from .xml2csv import convert_xml_to_csv

class ChamberOfDeputiesDataset:

YEARS = [n for n in range(2009, date.today().year+1)]

def __init__(self, path):
self.path = path


def fetch(self):
urls = ['http://www.camara.gov.br/cotas/AnoAtual.zip',
'http://www.camara.gov.br/cotas/AnoAnterior.zip',
'http://www.camara.gov.br/cotas/AnosAnteriores.zip']
filenames = map(lambda url: url.split('/')[-1], urls)
base_url = "http://www.camara.leg.br/cotas/Ano-{}.csv.zip"

for url, filename in zip(urls, filenames):
zip_file_path = os.path.join(self.path, filename)
for year in self.YEARS:
zip_file_path = os.path.join(self.path, "Ano-{}.zip".format(year))
url = base_url.format(year)
urlretrieve(url, zip_file_path)
zip_file = ZipFile(zip_file_path, 'r')
zip_file.extractall(self.path)
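
For clarity, a small standalone sketch of the URL and file-name pattern the new loop walks through (the year range simply mirrors YEARS at run time):

from datetime import date

# Mirrors ChamberOfDeputiesDataset.YEARS and the base_url pattern above.
base_url = "http://www.camara.leg.br/cotas/Ano-{}.csv.zip"

for year in range(2009, date.today().year + 1):
    # e.g. 2009 -> http://www.camara.leg.br/cotas/Ano-2009.csv.zip -> Ano-2009.zip on disk
    print(year, base_url.format(year), "Ano-{}.zip".format(year))
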
@@ -30,15 +32,13 @@ def fetch(self):


def convert_to_csv(self):
for filename in ['AnoAtual', 'AnoAnterior', 'AnosAnteriores']:
xml_path = os.path.join(self.path, '{}.xml'.format(filename))
csv_path = xml_path.replace('.xml', '.csv')
convert_xml_to_csv(xml_path, csv_path)
# deprecated but still here so we don't break poor Rosie (for now)
pass


def translate(self):
for filename in ['AnoAtual', 'AnoAnterior', 'AnosAnteriores']:
csv_path = os.path.join(self.path, '{}.csv'.format(filename))
for year in self.YEARS:
csv_path = os.path.join(self.path, 'Ano-{}.csv'.format(year))
self.__translate_file(csv_path)
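
Putting the pieces together, a minimal usage sketch of the reworked class, assuming the target directory already exists or is created first (the /tmp/serenata-data path is illustrative, not part of the commit):

import os

from serenata_toolbox.chamber_of_deputies.chamber_of_deputies_dataset import ChamberOfDeputiesDataset

path = '/tmp/serenata-data'  # illustrative working directory
os.makedirs(path, exist_ok=True)

dataset = ChamberOfDeputiesDataset(path)
dataset.fetch()      # downloads Ano-2009.zip … Ano-<current year>.zip and extracts the yearly CSVs in place
dataset.translate()  # writes reimbursements-2009.xz … reimbursements-<current year>.xz
dataset.clean()      # consolidates the translated files via Reimbursements (changed below in this diff)
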


@@ -50,103 +50,89 @@ def clean(self):

def __translate_file(self, csv_path):
output_file_path = csv_path \
.replace('AnoAtual', 'current-year') \
.replace('AnoAnterior', 'last-year') \
.replace('AnosAnteriores', 'previous-years') \
.replace('.csv', '.xz')
.replace('.csv', '.xz') \
.replace('Ano-', 'reimbursements-')

data = pd.read_csv(csv_path,
dtype={'idedocumento': np.str,
encoding='utf-8',
delimiter=";",
quoting=csv.QUOTE_NONE,
dtype={'ideDocumento': np.str,
'idecadastro': np.str,
'nucarteiraparlamentar': np.str,
'codlegislatura': np.str,
'txtcnpjcpf': np.str,
'numressarcimento': np.str})
'nuCarteiraParlamentar': np.str,
'codLegislatura': np.str,
'txtCNPJCPF': np.str,
'numRessarcimento': np.str,
'vlrDocumento': np.float,
'vlrGlosa': np.float,
'vlrLiquido': np.float,
'vlrRestituicao': np.float},
converters={
'vlrDocumento': lambda x: float(x.replace(',','.')),
'vlrGlosa': lambda x: float(x.replace(',','.')),
'vlrLiquido': lambda x: float(x.replace(',','.')),
'vlrRestituicao': lambda x: float(x.replace(',','.'))})

data.rename(columns={
'idedocumento': 'document_id',
'txnomeparlamentar': 'congressperson_name',
'ideDocumento': 'document_id',
'txNomeParlamentar': 'congressperson_name',
'idecadastro': 'congressperson_id',
'nucarteiraparlamentar': 'congressperson_document',
'nulegislatura': 'term',
'sguf': 'state',
'sgpartido': 'party',
'codlegislatura': 'term_id',
'numsubcota': 'subquota_number',
'txtdescricao': 'subquota_description',
'numespecificacaosubcota': 'subquota_group_id',
'txtdescricaoespecificacao': 'subquota_group_description',
'txtfornecedor': 'supplier',
'txtcnpjcpf': 'cnpj_cpf',
'txtnumero': 'document_number',
'indtipodocumento': 'document_type',
'datemissao': 'issue_date',
'vlrdocumento': 'document_value',
'vlrglosa': 'remark_value',
'vlrliquido': 'net_value',
'nummes': 'month',
'numano': 'year',
'numparcela': 'installment',
'txtpassageiro': 'passenger',
'txttrecho': 'leg_of_the_trip',
'numlote': 'batch_number',
'numressarcimento': 'reimbursement_number',
'vlrrestituicao': 'reimbursement_value',
'nudeputadoid': 'applicant_id',
'nuCarteiraParlamentar': 'congressperson_document',
'nuLegislatura': 'term',
'sgUF': 'state',
'sgPartido': 'party',
'codLegislatura': 'term_id',
'numSubCota': 'subquota_number',
'txtDescricao': 'subquota_description',
'numEspecificacaoSubCota': 'subquota_group_id',
'txtDescricaoEspecificacao': 'subquota_group_description',
'txtFornecedor': 'supplier',
'txtCNPJCPF': 'cnpj_cpf',
'txtNumero': 'document_number',
'indTipoDocumento': 'document_type',
'datEmissao': 'issue_date',
'vlrDocumento': 'document_value',
'vlrGlosa': 'remark_value',
'vlrLiquido': 'net_value',
'numMes': 'month',
'numAno': 'year',
'numParcela': 'installment',
'txtPassageiro': 'passenger',
'txtTrecho': 'leg_of_the_trip',
'numLote': 'batch_number',
'numRessarcimento': 'reimbursement_number',
'vlrRestituicao': 'reimbursement_value',
'nuDeputadoId': 'applicant_id',
}, inplace=True)

data['subquota_description'] = \
data['subquota_description'].astype('category')

categories = {
'ASSINATURA DE PUBLICAÇÕES':
'Publication subscriptions',
'COMBUSTÍVEIS E LUBRIFICANTES.':
'Fuels and lubricants',
'CONSULTORIAS, PESQUISAS E TRABALHOS TÉCNICOS.':
'Consultancy, research and technical work',
'DIVULGAÇÃO DA ATIVIDADE PARLAMENTAR.':
'Publicity of parliamentary activity',
'Emissão Bilhete Aéreo':
'Flight ticket issue',
'FORNECIMENTO DE ALIMENTAÇÃO DO PARLAMENTAR':
'Congressperson meal',
'HOSPEDAGEM ,EXCETO DO PARLAMENTAR NO DISTRITO FEDERAL.':
'Lodging, except for congressperson from Distrito Federal',
'LOCAÇÃO OU FRETAMENTO DE AERONAVES':
'Aircraft renting or charter of aircraft',
'LOCAÇÃO OU FRETAMENTO DE EMBARCAÇÕES':
'Watercraft renting or charter',
'LOCAÇÃO OU FRETAMENTO DE VEÍCULOS AUTOMOTORES':
'Automotive vehicle renting or charter',
'MANUTENÇÃO DE ESCRITÓRIO DE APOIO À ATIVIDADE PARLAMENTAR':
'Maintenance of office supporting parliamentary activity',
'PARTICIPAÇÃO EM CURSO, PALESTRA OU EVENTO SIMILAR':
'Participation in course, talk or similar event',
'PASSAGENS AÉREAS':
'Flight tickets',
'PASSAGENS TERRESTRES, MARÍTIMAS OU FLUVIAIS':
'Terrestrial, maritime and fluvial tickets',
'SERVIÇO DE SEGURANÇA PRESTADO POR EMPRESA ESPECIALIZADA.':
'Security service provided by specialized company',
'SERVIÇO DE TÁXI, PEDÁGIO E ESTACIONAMENTO':
'Taxi, toll and parking',
'SERVIÇOS POSTAIS':
'Postal services',
'TELEFONIA':
'Telecommunication',
'AQUISIÇÃO DE MATERIAL DE ESCRITÓRIO.':
'Purchase of office supplies',
'AQUISIÇÃO OU LOC. DE SOFTWARE; SERV. POSTAIS; ASS.':
'Software purchase or renting; Postal services; Subscriptions',
'LOCAÇÃO DE VEÍCULOS AUTOMOTORES OU FRETAMENTO DE EMBARCAÇÕES ':
'Automotive vehicle renting or watercraft charter',
'LOCOMOÇÃO, ALIMENTAÇÃO E HOSPEDAGEM':
'Locomotion, meal and lodging',
}
categories = [categories[cat]
for cat in data['subquota_description'].cat.categories]
data['subquota_description'].cat.rename_categories(categories,
inplace=True)
subquotas = (
(1, 'Maintenance of office supporting parliamentary activity'),
(2, 'Locomotion, meal and lodging'),
(3, 'Fuels and lubricants'),
(4, 'Consultancy, research and technical work'),
(5, 'Publicity of parliamentary activity'),
(6, 'Purchase of office supplies'),
(7, 'Software purchase or renting; Postal services; Subscriptions'),
(8, 'Security service provided by specialized company'),
(9, 'Flight tickets'),
(10, 'Telecommunication'),
(11, 'Postal services'),
(12, 'Publication subscriptions'),
(13, 'Congressperson meal'),
(14, 'Lodging, except for congressperson from Distrito Federal'),
(15, 'Automotive vehicle renting or watercraft charter'),
(119, 'Aircraft renting or charter of aircraft'),
(120, 'Automotive vehicle renting or charter'),
(121, 'Watercraft renting or charter'),
(122, 'Taxi, toll and parking'),
(123, 'Terrestrial, maritime and fluvial tickets'),
(137, 'Participation in course, talk or similar event'),
(999, 'Flight ticket issue')
)

for code, name in subquotas:
data.loc[data['subquota_number']==code, ['subquota_description']] = name

data.to_csv(output_file_path, compression='xz', index=False,
encoding='utf-8')
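
As a self-contained illustration of two translation details above — comma-decimal money fields parsed through converters, and subquota codes mapped to English names — here is a toy example over an inline two-row CSV (the dict below covers only two of the subquotas listed):

import io

import pandas as pd

# Toy sample shaped like the relevant columns of the Chamber CSVs.
sample = io.StringIO("numSubCota;vlrDocumento\n137;1234,56\n3;89,90\n")

data = pd.read_csv(sample,
                   delimiter=';',
                   converters={'vlrDocumento': lambda x: float(x.replace(',', '.'))})
data.rename(columns={'numSubCota': 'subquota_number',
                     'vlrDocumento': 'document_value'}, inplace=True)

# Same mapping idea as the subquotas tuple above, written here as a plain dict.
subquota_names = {3: 'Fuels and lubricants',
                  137: 'Participation in course, talk or similar event'}
data['subquota_description'] = data['subquota_number'].map(subquota_names)

print(data)  # document_value comes out as 1234.56 and 89.90, descriptions filled in
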

13 changes: 11 additions & 2 deletions serenata_toolbox/chamber_of_deputies/reimbursements.py
@@ -1,7 +1,7 @@
import os
import pandas as pd
import numpy as np

from datetime import date

class Reimbursements:

@@ -13,6 +13,8 @@ class Reimbursements:
'index': False
}

YEARS = [n for n in range(2009, date.today().year+1)]

def __init__(self, path):
self.path = path

@@ -41,7 +43,7 @@ def read_csv(self, name):
@property
def receipts(self):
print('Merging all datasets…')
datasets = ('current-year.xz', 'last-year.xz', 'previous-years.xz')
datasets = ["reimbursements-{}.xz".format(n) for n in self.YEARS]
data = (self.read_csv(name) for name in datasets)
return pd.concat(data)
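
A hedged sketch of the same per-year merge, assuming the translated reimbursements-<year>.xz files already sit under path (missing years are simply skipped here; the /tmp/serenata-data path is illustrative):

import os
from datetime import date

import pandas as pd

path = '/tmp/serenata-data'  # illustrative; the directory handed to Reimbursements
frames = []
for year in range(2009, date.today().year + 1):
    file_path = os.path.join(path, 'reimbursements-{}.xz'.format(year))
    if os.path.exists(file_path):  # skip years that were not fetched/translated yet
        frames.append(pd.read_csv(file_path, compression='xz', low_memory=False))

receipts = pd.concat(frames)
print('{} rows merged'.format(len(receipts)))
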

@@ -59,9 +61,16 @@ def group(self, receipts):
print('Dropping rows without document_value or reimbursement_number…')
subset = ('document_value', 'reimbursement_number')
receipts = receipts.dropna(subset=subset)

groupby_keys = ('year', 'applicant_id', 'document_id')
receipts = receipts.dropna(subset=subset + groupby_keys)

receipts = receipts[receipts['document_value'] != 0]
receipts = receipts[receipts['reimbursement_number'] != '0']
receipts = receipts[receipts['year'] != 0]
receipts = receipts[receipts['applicant_id'] != '0']
receipts = receipts[receipts['document_id'] != '0']

print('Grouping dataset by applicant_id, document_id and year…')
grouped = receipts.groupby(groupby_keys)

91 changes: 0 additions & 91 deletions serenata_toolbox/chamber_of_deputies/xml2csv.py

This file was deleted.


0 comments on commit 060a02a
