In [1]:
import requests

url = f"http://www.camara.leg.br/cotas/Ano-2019.csv.zip"
r = requests.get(url)

In [3]:
# abrindo um arquivo no seu computador
file = open(f"Ano-2019.csv.zip", "wb")

# escrever o conteúdo
file.write(r.content)
file.close()

In [4]:
from zipfile import ZipFile

zip_file = ZipFile(f"Ano-2019.csv.zip", 'r')
zip_file.extract(member=f"Ano-2019.csv", path=f"reembolso-2019")
zip_file.close()

### Aula 03: Lendo os dados com Pandas

In [20]:
import pandas as pd
pd.set_option('display.max_columns', None)


DTYPE = {
    'txNomeParlamentar': str,
    'ideCadastro': str,
    'nuCarteiraParlamentar': str,
    'nuLegislatura': str,
    'sgUF': str,
    'sgPartido': str,
    'codLegislatura': str,
    'numSubCota': str,
    'txtDescricao': str,
    'numEspecificacaoSubCota': str,
    'txtDescricaoEspecificacao': str,
    'txtFornecedor': str,
    'txtCNPJCPF': str,
    'txtNumero': str,
    'indTipoDocumento': str,
    'datEmissao': str,
    'vlrDocumento': float,
    'vlrGlosa': str,
    'vlrLiquido': float,
    'numMes': str,
    'numAno': str,
    'numParcela': str,
    'txtPassageiro': str,
    'txtTrecho': str,
    'numLote': str,
    'numRessarcimento': str,
    'nuDeputadoId': str,
    'ideDocumento': str,
}

df_reimbursements = pd.read_csv(
    "reembolso-2019/Ano-2019.csv", 
    delimiter=";", 
    dtype=DTYPE, 
    low_memory=False
)

#### transformações

In [None]:
df_reimbursements.fillna(value="Não se aplica").head()

In [None]:
df_reimbursements[df_reimbursements.sgPartido.isna()]

In [None]:
df_reimbursements[df_reimbursements['ideDocumento'].isnull()]

In [None]:
# df_reimbursements["ideCadastro"] = df_reimbursements["ideCadastro"].astype('string')

In [2]:
df_reimbursements["datEmissao"] = pd.to_datetime(
    df_reimbursements.datEmissao, 
    format='%Y-%m-%d'
) 

In [8]:
df_reimbursements.datEmissao.min()

Timestamp('2018-06-13 00:00:00')

In [10]:
df_reimbursements.set_index("datEmissao").loc["2019-01-01":"2019-03-30"].shape

(59356, 30)

Outras coisas são possíveis, como encontrar o dia da semana de cada uma dessas datas, calcular a média móvel, calcular quantos anos se passaram, ou agrupar os dados por janelas de tempo.

#### completando, substituindo e reformatando

In [19]:
df_reimbursements["txtCNPJCPF"] = df_reimbursements["txtCNPJCPF"].str.replace(r'\D', '', regex=True)

In [12]:
# Este dado representa o tipo de documento do fiscal – 
# 0 (Zero), para Nota Fiscal; 1 (um), para Recibo; e 2, para Despesa no Exterior.

df_reimbursements.indTipoDocumento.unique()

array(['4', '0', '1', '3', '2'], dtype=object)

In [9]:
converters = {
    "0": 'nota_fiscal',
    "1": 'recibo',
    "2": 'despesa_exterior',
    "4": None
}

converters

{'0': 'nota_fiscal', '1': 'recibo', '2': 'despesa_exterior', '4': None}

In [10]:
df_reimbursements.indTipoDocumento.replace(converters, inplace=True)

In [11]:
df_reimbursements.indTipoDocumento.unique()

array([None, 'nota_fiscal', 'recibo', '3', 'despesa_exterior'],
      dtype=object)

### Atenção

In [58]:
keys = ['ideDocumento', 'numAno', 'nuDeputadoId']

# grouped = df_reimbursements.groupby(keys)["vlrLiquido"].agg("sum").rename(index="vlrLiquidoTotal").reset_index()
# grouped

In [27]:
df_reimbursements[df_reimbursements.numRessarcimento.isna()].shape

(177464, 31)

In [29]:
286886 * 0.05

14344.300000000001

In [59]:
data_with_id = df_reimbursements[
    (~df_reimbursements[keys[0]].isnull()) &
    (~df_reimbursements[keys[1]].isnull()) &
    (~df_reimbursements[keys[2]].isnull())
]

In [60]:
grouped = df_reimbursements.groupby(keys)
len(grouped)

260872

In [67]:
agg_net_values = grouped["vlrLiquido"].agg("sum").reset_index()

In [68]:
agg_net_values.shape

(260872, 4)

In [74]:
agg_data = pd.merge(
    df_reimbursements, 
    agg_net_values, 
    on=keys,
    suffixes=('', '_from_original')
)
agg_data.shape

(286886, 32)

In [75]:
agg_data.drop_duplicates(subset=keys, inplace=True)
agg_data.shape

(260872, 32)

In [80]:
df_reimbursements.dropna(subset=('vlrDocumento', 'numRessarcimento')).shape

(109422, 31)

In [13]:
df_reimbursements.shape

(286886, 31)

In [None]:
df_reimbursements = pd.merge(df_reimbursements, grouped, on=keys)
df_reimbursements.head(1)

In [16]:
df_reimbursements.groupby('txtNumero')["vlrLiquido"].agg("sum")

txtNumero
*                 778.80
-                3108.29
-43BC9D7F7D7C      12.30
-52B4F00B1507      12.30
-6A7D2BE06302      12.30
                  ...   
zwgemd            712.48
´RUTIH9QCR751       2.90
´T7VM1KLAC4LS       2.90
Í3TBE8A2C7LG        2.90
Ônibus La          49.90
Name: vlrLiquido, Length: 214158, dtype: float64

In [28]:
df_reimbursements[(df_reimbursements.vlrLiquido < 0) & (df_reimbursements.txtNumero.duplicated())]

Unnamed: 0,txNomeParlamentar,cpf,ideCadastro,nuCarteiraParlamentar,nuLegislatura,sgUF,sgPartido,codLegislatura,numSubCota,txtDescricao,numEspecificacaoSubCota,txtDescricaoEspecificacao,txtFornecedor,txtCNPJCPF,txtNumero,indTipoDocumento,datEmissao,vlrDocumento,vlrGlosa,vlrLiquido,numMes,numAno,numParcela,txtPassageiro,txtTrecho,numLote,numRessarcimento,vlrRestituicao,nuDeputadoId,ideDocumento,urlDocumento
2184,Iracema Portella,3.731164e+10,67138,113,2019,PI,PP,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-NFRRYL,0,2019-04-30T00:00:00,-148.25,0,-148.25,4,2019,0,Iracema Portella,THE/GRU,0,0,,2320,1660724,
2185,Iracema Portella,3.731164e+10,67138,113,2019,PI,PP,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-NFRRYL,0,2019-05-02T00:00:00,-112.15,0,-112.15,5,2019,0,Iracema Portella,THE/GRU,0,0,,2320,1660727,
2194,Iracema Portella,3.731164e+10,67138,113,2019,PI,PP,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-XW3K4Q,0,2019-12-04T00:00:00,-1529.64,0,-1529.64,12,2019,0,Iracema Portella,BSB/BSB,0,0,,2320,1715911,
2690,Fábio Henrique,4.133020e+10,68720,175,2019,SE,PDT,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-GIF37P,0,2019-11-26T00:00:00,-25.89,0,-25.89,11,2019,0,Fábio Henrique,BSB/BSB,0,0,,3240,1715097,
2692,Fábio Henrique,4.133020e+10,68720,175,2019,SE,PDT,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-GRD26B,0,2019-07-02T00:00:00,-82.08,0,-82.08,7,2019,0,Fábio Henrique,BSB/BSB,0,0,,3240,1679783,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286586,Ricardo Pericar,8.648796e+10,211649,543,2019,RJ,PSL,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-KM1ZUJ,0,2019-12-05T00:00:00,-15.00,0,-15.00,12,2019,0,Ricardo Pericar,SDU/BSB/SDU,0,0,,3455,1725047,
286588,Ricardo Pericar,8.648796e+10,211649,543,2019,RJ,PSL,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-LMVZMT,0,2019-12-12T00:00:00,-15.00,0,-15.00,12,2019,0,Ricardo Pericar,SDU/BSB/SDU,0,0,,3455,1725049,
286590,Ricardo Pericar,8.648796e+10,211649,543,2019,RJ,PSL,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-NMUTGT,0,2019-11-07T00:00:00,-15.00,0,-15.00,11,2019,0,Ricardo Pericar,SDU/BSB/SDU,0,0,,3455,1713136,
286592,Ricardo Pericar,8.648796e+10,211649,543,2019,RJ,PSL,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-RK5QVP,0,2019-11-28T00:00:00,-15.00,0,-15.00,11,2019,0,Ricardo Pericar,SDU/BSB/SDU,0,0,,3455,1715107,


In [42]:
df_reimbursements[df_reimbursements.txtNumero.str.contains('NFRRYL')]

Unnamed: 0,txNomeParlamentar,cpf,ideCadastro,nuCarteiraParlamentar,nuLegislatura,sgUF,sgPartido,codLegislatura,numSubCota,txtDescricao,numEspecificacaoSubCota,txtDescricaoEspecificacao,txtFornecedor,txtCNPJCPF,txtNumero,indTipoDocumento,datEmissao,vlrDocumento,vlrGlosa,vlrLiquido,numMes,numAno,numParcela,txtPassageiro,txtTrecho,numLote,numRessarcimento,vlrRestituicao,nuDeputadoId,ideDocumento,urlDocumento
2147,Iracema Portella,37311640000.0,67138,113,2019,PI,PP,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Bilhete: NFRRYL,0,2019-04-22T00:00:00,2029.39,0,2029.39,4,2019,0,Iracema Portella,BSB/THE,0,0,,2320,1658810,
2183,Iracema Portella,37311640000.0,67138,113,2019,PI,PP,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-NFRRYL,0,2019-04-24T00:00:00,-108.6,0,-108.6,4,2019,0,Iracema Portella,THE/BSB,0,0,,2320,1659005,
2184,Iracema Portella,37311640000.0,67138,113,2019,PI,PP,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-NFRRYL,0,2019-04-30T00:00:00,-148.25,0,-148.25,4,2019,0,Iracema Portella,THE/GRU,0,0,,2320,1660724,
2185,Iracema Portella,37311640000.0,67138,113,2019,PI,PP,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-NFRRYL,0,2019-05-02T00:00:00,-112.15,0,-112.15,5,2019,0,Iracema Portella,THE/GRU,0,0,,2320,1660727,


In [81]:
df_reimbursements[
    (df_reimbursements.txNomeParlamentar == 'Iracema Portella') & 
    (df_reimbursements.txtDescricao == 'PASSAGEM AÉREA - RPA') & 
    (df_reimbursements.txtFornecedor == 'Cia Aérea - GOL') 
]

Unnamed: 0,txNomeParlamentar,cpf,ideCadastro,nuCarteiraParlamentar,nuLegislatura,sgUF,sgPartido,codLegislatura,numSubCota,txtDescricao,numEspecificacaoSubCota,txtDescricaoEspecificacao,txtFornecedor,txtCNPJCPF,txtNumero,indTipoDocumento,datEmissao,vlrDocumento,vlrGlosa,vlrLiquido,numMes,numAno,numParcela,txtPassageiro,txtTrecho,numLote,numRessarcimento,vlrRestituicao,nuDeputadoId,ideDocumento,urlDocumento
1876,Iracema Portella,3.731164e+10,67138,113,2015,PI,PP,55,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Bilhete: KGEVKV,0,2019-01-30T00:00:00,1673.39,0,1673.39,1,2019,0,Iracema Portella,BSB/THE,0,0,,2320,1638131,
1877,Iracema Portella,3.731164e+10,67138,113,2015,PI,PP,55,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Bilhete: MQP7YH,0,2019-01-26T00:00:00,1300.17,0,1300.17,1,2019,0,Iracema Portella,CGH/SDU,0,0,,2320,1637746,
1878,Iracema Portella,3.731164e+10,67138,113,2015,PI,PP,55,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Bilhete: VHNMMF,0,2019-01-31T00:00:00,1052.42,0,1052.42,1,2019,0,Iracema Portella,BSB/THE/BSB,0,0,,2320,1638050,
1879,Iracema Portella,3.731164e+10,67138,113,2015,PI,PP,55,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Bilhete: XN8U5Q,0,2019-01-29T00:00:00,1930.17,0,1930.17,1,2019,0,Iracema Portella,CGH/BSB,0,0,,2320,1638167,
1880,Iracema Portella,3.731164e+10,67138,113,2015,PI,PP,55,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Bilhete: YN8Q7Q,0,2019-01-26T00:00:00,1401.25,0,1401.25,1,2019,0,Iracema Portella,GIG/CGH,0,0,,2320,1637757,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2193,Iracema Portella,3.731164e+10,67138,113,2019,PI,PP,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-XW3K4Q,0,2019-09-27T00:00:00,-1315.92,0,-1315.92,9,2019,0,Iracema Portella,THE/BSB,0,0,,2320,1702314,
2194,Iracema Portella,3.731164e+10,67138,113,2019,PI,PP,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-XW3K4Q,0,2019-12-04T00:00:00,-1529.64,0,-1529.64,12,2019,0,Iracema Portella,BSB/BSB,0,0,,2320,1715911,
2195,Iracema Portella,3.731164e+10,67138,113,2019,PI,PP,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-ZIB2VF,0,2019-03-01T00:00:00,-979.39,0,-979.39,3,2019,0,Iracema Portella,THE/BSB,0,0,,2320,1645555,
2196,Iracema Portella,3.731164e+10,67138,113,2019,PI,PP,56,999,PASSAGEM AÉREA - RPA,0,,Cia Aérea - GOL,075.756.510/0015-9,Comp-ZMK4XZ,0,2019-11-21T00:00:00,-1731.04,0,-1731.04,11,2019,0,Iracema Portella,BSB/BSB,0,0,,2320,1714092,


In [45]:
df_reimbursements.numRessarcimento.unique()

array([nan, '0'], dtype=object)

In [49]:
(df_reimbursements.txtNumero.isna()).sum()

0

In [50]:
(df_reimbursements.numRessarcimento.isna()).sum()

177464

In [82]:
df_reimbursements[df_reimbursements.ideDocumento.duplicated() & (df_reimbursements.ideDocumento > '0')].head()

Unnamed: 0,txNomeParlamentar,cpf,ideCadastro,nuCarteiraParlamentar,nuLegislatura,sgUF,sgPartido,codLegislatura,numSubCota,txtDescricao,numEspecificacaoSubCota,txtDescricaoEspecificacao,txtFornecedor,txtCNPJCPF,txtNumero,indTipoDocumento,datEmissao,vlrDocumento,vlrGlosa,vlrLiquido,numMes,numAno,numParcela,txtPassageiro,txtTrecho,numLote,numRessarcimento,vlrRestituicao,nuDeputadoId,ideDocumento,urlDocumento
3054,Bacelar,10626410000.0,69871,184,2019,BA,PODE,56,998,PASSAGEM AÉREA - SIGEPA,0,,GOL,,NMS13D,0,2019-11-26T12:00:00,32.13,0,32.13,11,2019,0,JOAO CARLOS BACELAR BATISTA,BSB/BSB,0,0.0,,2985,1504,
3058,Bacelar,10626410000.0,69871,184,2019,BA,PODE,56,998,PASSAGEM AÉREA - SIGEPA,0,,GOL,,RMQ2VW,0,2019-11-28T12:00:00,1330.14,0,1330.14,11,2019,0,JOAO CARLOS BACELAR BATISTA,BSB/BSB,0,0.0,,2985,1205,
6542,Rubens Bueno,18746420000.0,73466,460,2015,PR,PPS,55,13,FORNECIMENTO DE ALIMENTAÇÃO DO PARLAMENTAR,0,,RESTAURANTE MADERO CABRAL LTDA,130.023.500/0013-9,210060,4,2019-01-28T00:00:00,74.8,0,0.0,1,2019,0,,,1557819,,6.8,831,6746139,http://camara.leg.br/cota-parlamentar/nota-fis...
8765,Pompeo de Mattos,28346890000.0,73486,516,2019,RS,PDT,56,13,FORNECIMENTO DE ALIMENTAÇÃO DO PARLAMENTAR,0,,Point 235,094.267.310/0017-8,000169,0,2019-08-18T00:00:00,15.0,0,0.0,8,2019,0,,,1627523,,15.0,1458,6905386,https://www.camara.leg.br/cota-parlamentar/doc...
11757,Angela Amin,29316720000.0,73696,471,2019,SC,PP,56,10,TELEFONIA,0,,CLARO S.A,404.325.440/1916-6,0001173359,0,2019-04-17T00:00:00,258.72,0,102.26,4,2019,0,,,1590486,,,835,6820670,


In [56]:
df_reimbursements[df_reimbursements.numRessarcimento.isna()].loc[1].urlDocumento

'http://camara.leg.br/cota-parlamentar/nota-fiscal-eletronica?ideDocumentoFiscal=6769606'