In [6]:
import importlib
import os
import zipfile as zf
import duckdb
import pandas as pd
import numpy as np
import cvm

# Re-execute the local `cvm` module so edits made to it after the first import take effect.
importlib.reload(cvm)
# Truncate displayed cell text to 10 characters so wide frames fit on screen.
pd.set_option("max_colwidth", 10)

In [7]:
# Sorted names of every entry in data/raw/ (the listing below shows yearly dfp_/itr_ zip archives).
files_names = sorted(os.listdir('data/raw/'))
files_names

['dfp_cia_aberta_2010.zip',
 'dfp_cia_aberta_2011.zip',
 'dfp_cia_aberta_2012.zip',
 'dfp_cia_aberta_2013.zip',
 'dfp_cia_aberta_2014.zip',
 'dfp_cia_aberta_2015.zip',
 'dfp_cia_aberta_2016.zip',
 'dfp_cia_aberta_2017.zip',
 'dfp_cia_aberta_2018.zip',
 'dfp_cia_aberta_2019.zip',
 'dfp_cia_aberta_2020.zip',
 'dfp_cia_aberta_2021.zip',
 'itr_cia_aberta_2020.zip',
 'itr_cia_aberta_2021.zip',
 'itr_cia_aberta_2022.zip']

In [13]:
def df_format(df) -> pd.DataFrame:
    """Return a typed, reordered copy of one raw statement frame.

    The input frame is left untouched; every transformation is applied to a
    new frame, so calling this function twice on the same input is safe.

    Transformations:
      * VERSAO -> int8, CD_CVM -> int32, VL_CONTA -> float.
      * VL_CONTA is multiplied by the ESCALA_MOEDA factor
        ('MIL' -> 1000, 'UNIDADE' -> 1) so values are in unit currency.
      * MOEDA and ESCALA_MOEDA are dropped.
      * ST_CONTA_FIXA 'S'/'N' -> True/False.
      * ORDEM_EXERC 'ÚLTIMO'/'PENÚLTIMO' -> 0/-1 (int8).
      * DT_INI_EXERC, when present, is parsed with pd.to_datetime.
      * Columns are returned in a fixed order; DT_INI_EXERC and COLUNA_DF
        are included only when the input has them.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame whose columns were read as strings.

    Returns
    -------
    pd.DataFrame
        A new frame with the conversions above applied.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    ValueError
        If ORDEM_EXERC holds a value outside the mapping, since the
        resulting NaN cannot be cast to int8.
    """
    # Values outside the mapping become NaN and propagate into VL_CONTA.
    scale = df['ESCALA_MOEDA'].map({'MIL': 1000, 'UNIDADE': 1})

    converted = {
        'VERSAO': df['VERSAO'].astype(np.int8),
        'CD_CVM': df['CD_CVM'].astype(np.int32),
        # Express every amount in unit base currency.
        'VL_CONTA': df['VL_CONTA'].astype(float) * scale,
        'ST_CONTA_FIXA': df['ST_CONTA_FIXA'].map({'S': True, 'N': False}),
        'ORDEM_EXERC': df['ORDEM_EXERC']
        .map({'ÚLTIMO': 0, 'PENÚLTIMO': -1})
        .astype(np.int8),
    }
    # Not every statement file carries DT_INI_EXERC.
    if 'DT_INI_EXERC' in df.columns:
        converted['DT_INI_EXERC'] = pd.to_datetime(df['DT_INI_EXERC'])

    optional_columns = ('DT_INI_EXERC', 'COLUNA_DF')
    column_order = [
        name
        for name in (
            'CD_CVM', 'CNPJ_CIA', 'DENOM_CIA', 'GRUPO_DFP', 'VERSAO', 'DT_REFER',
            'DT_INI_EXERC', 'DT_FIM_EXERC', 'ORDEM_EXERC', 'CD_CONTA', 'DS_CONTA',
            'ST_CONTA_FIXA', 'COLUNA_DF', 'VL_CONTA')
        if name not in optional_columns or name in df.columns
    ]

    # drop() and assign() both return new frames, so the caller's frame is never modified.
    formatted = df.drop(columns=['MOEDA', 'ESCALA_MOEDA']).assign(**converted)
    return formatted[column_order]

In [9]:
# Columns pandas should parse as dates while reading each CSV.
date_columns = ['DT_REFER', 'DT_FIM_EXERC']
# Shared pd.read_csv options: ';'-separated, Latin-1 encoded, and every
# non-date column kept as a string so conversions happen explicitly later.
kwargs = dict(
    sep=';',
    encoding='iso-8859-1',
    dtype=str,
    parse_dates=date_columns,
)

In [14]:
# Convert each archive in data/raw/ into one sorted parquet file in data/processed/.
#
# Timing notes recorded from earlier runs:
#   full scan         = 8m 10s -> 5.6 GB df memory space
#   formatted columns = 4m 30s -> 3.5 GB df memory space
#   zstd -> 16.5 s
#   lz4  -> 15.4 s
sort_by = ['CD_CVM', 'GRUPO_DFP', 'VERSAO', 'DT_REFER', 'ORDEM_EXERC', 'CD_CONTA']
for parent_file_name in files_names:
    print(parent_file_name)
    file_path = f'data/raw/{parent_file_name}'
    child_frames = []
    # Context managers close the archive and every member even if parsing fails.
    with zf.ZipFile(file_path) as parent_file:
        child_file_names = parent_file.namelist()
        # The first member is skipped, exactly as before.
        # NOTE(review): presumably it has a different schema from the statement CSVs; confirm.
        for child_file_name in child_file_names[1:]:
            with parent_file.open(child_file_name) as child_file:
                df_child = pd.read_csv(child_file, **kwargs)
            child_frames.append(df_format(df_child))
    # Concatenate once per archive instead of growing a frame inside the loop
    # (avoids repeated copying and concatenation onto an empty frame).
    df_year = pd.concat(child_frames, ignore_index=True)
    df_year = df_year.sort_values(by=sort_by, ignore_index=True)
    # Strip the extension by name rather than by a fixed character count.
    output_stem = os.path.splitext(parent_file_name)[0]
    df_year.to_parquet(f'data/processed/{output_stem}.parquet', compression='zstd')

dfp_cia_aberta_2010.zip
dfp_cia_aberta_2011.zip
dfp_cia_aberta_2012.zip
dfp_cia_aberta_2013.zip
dfp_cia_aberta_2014.zip
dfp_cia_aberta_2015.zip
dfp_cia_aberta_2016.zip
dfp_cia_aberta_2017.zip
dfp_cia_aberta_2018.zip
dfp_cia_aberta_2019.zip
dfp_cia_aberta_2020.zip
dfp_cia_aberta_2021.zip
itr_cia_aberta_2020.zip
itr_cia_aberta_2021.zip
itr_cia_aberta_2022.zip


In [5]:
# Load one processed parquet file back to inspect the result of the conversion.
df = pd.read_parquet('data/processed/itr_cia_aberta_2021.parquet')
df

Unnamed: 0,CNPJ_CIA,DT_REFER,VERSAO,DENOM_CIA,CD_CVM,GRUPO_DFP,ORDEM_EXERC,DT_FIM_EXERC,CD_CONTA,DS_CONTA,VL_CONTA,ST_CONTA_FIXA,DT_INI_EXERC,COLUNA_DF
0,00.001...,2021-03-31,2,CENTRA...,2437,DF Con...,-1,2020-12-31,1,Ativo ...,1.7896...,True,NaT,
1,00.001...,2021-03-31,2,CENTRA...,2437,DF Con...,0,2021-03-31,1,Ativo ...,1.7737...,True,NaT,
2,00.001...,2021-03-31,2,CENTRA...,2437,DF Con...,-1,2020-12-31,1.01,Ativo ...,4.5191...,True,NaT,
3,00.001...,2021-03-31,2,CENTRA...,2437,DF Con...,0,2021-03-31,1.01,Ativo ...,4.3763...,True,NaT,
4,00.001...,2021-03-31,2,CENTRA...,2437,DF Con...,-1,2020-12-31,1.01.01,Caixa ...,3.8599...,True,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3573984,97.837...,2021-09-30,1,DEXCO ...,21091,DF Ind...,0,2021-09-30,7.08.0...,Divide...,0.0000...,True,2021-01-01,
3573985,97.837...,2021-09-30,1,DEXCO ...,21091,DF Ind...,-1,2020-09-30,7.08.0...,Lucros...,1.5224...,True,2020-01-01,
3573986,97.837...,2021-09-30,1,DEXCO ...,21091,DF Ind...,0,2021-09-30,7.08.0...,Lucros...,1.1444...,True,2021-01-01,
3573987,97.837...,2021-09-30,1,DEXCO ...,21091,DF Ind...,-1,2020-09-30,7.08.05,Outros,0.0000...,True,2020-01-01,


In [3]:
# Query every processed parquet file at once through the glob pattern and
# keep only the rows for a single company code (CD_CVM = 9512).
lineitem = duckdb.query(
    "SELECT * FROM 'data/processed/*.parquet' WHERE CD_CVM = 9512"
).to_df()
lineitem

Unnamed: 0,CNPJ_CIA,DT_REFER,VERSAO,DENOM_CIA,CD_CVM,GRUPO_DFP,ORDEM_EXERC,DT_FIM_EXERC,CD_CONTA,DS_CONTA,VL_CONTA,ST_CONTA_FIXA,DT_INI_EXERC,COLUNA_DF
0,33.000...,2020-12-31,2,PETROL...,9512,DF Con...,-1,2019-12-31,1,Ativo ...,9.2601...,True,NaT,
1,33.000...,2020-12-31,2,PETROL...,9512,DF Con...,0,2020-12-31,1,Ativo ...,9.8741...,True,NaT,
2,33.000...,2020-12-31,2,PETROL...,9512,DF Con...,-1,2019-12-31,1.01,Ativo ...,1.1210...,True,NaT,
3,33.000...,2020-12-31,2,PETROL...,9512,DF Con...,0,2020-12-31,1.01,Ativo ...,1.4232...,True,NaT,
4,33.000...,2020-12-31,2,PETROL...,9512,DF Con...,-1,2019-12-31,1.01.01,Caixa ...,2.9714...,True,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37109,33.000...,2013-12-31,1,PETROL...,9512,DF Ind...,0,2013-12-31,7.08.0...,Divide...,0.0000...,True,2013-01-01,
37110,33.000...,2013-12-31,1,PETROL...,9512,DF Ind...,-1,2012-12-31,7.08.0...,Lucros...,1.2018...,True,2012-01-01,
37111,33.000...,2013-12-31,1,PETROL...,9512,DF Ind...,0,2013-12-31,7.08.0...,Lucros...,1.4106...,True,2013-01-01,
37112,33.000...,2013-12-31,1,PETROL...,9512,DF Ind...,-1,2012-12-31,7.08.05,Outros,0.0000...,True,2012-01-01,


In [12]:
# Show only the rows of the query result where COLUNA_DF is populated.
lineitem[lineitem.COLUNA_DF.notnull()]

Unnamed: 0,CNPJ_CIA,DT_REFER,VERSAO,DENOM_CIA,CD_CVM,GRUPO_DFP,ORDEM_EXERC,DT_FIM_EXERC,CD_CONTA,DS_CONTA,VL_CONTA,ST_CONTA_FIXA,DT_INI_EXERC,COLUNA_DF
946,33.000...,2020-12-31,2,PETROL...,9512,DF Con...,-1,2019-12-31,5.01,Saldos...,2.0543...,True,2019-01-01,Capita...
947,33.000...,2020-12-31,2,PETROL...,9512,DF Con...,0,2020-12-31,5.01,Saldos...,2.0543...,True,2020-01-01,Capita...
948,33.000...,2020-12-31,2,PETROL...,9512,DF Con...,-1,2019-12-31,5.01,Saldos...,2.6740...,True,2019-01-01,Reserv...
949,33.000...,2020-12-31,2,PETROL...,9512,DF Con...,0,2020-12-31,5.01,Saldos...,2.6650...,True,2020-01-01,Reserv...
950,33.000...,2020-12-31,2,PETROL...,9512,DF Con...,-1,2019-12-31,5.01,Saldos...,9.5148...,True,2019-01-01,Reserv...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36735,33.000...,2013-12-31,1,PETROL...,9512,DF Ind...,0,2013-12-31,5.07,Saldos...,0.0000...,True,2013-01-01,Lucros...
36736,33.000...,2013-12-31,1,PETROL...,9512,DF Ind...,-1,2012-12-31,5.07,Saldos...,-1.2375...,True,2012-01-01,Outros...
36737,33.000...,2013-12-31,1,PETROL...,9512,DF Ind...,0,2013-12-31,5.07,Saldos...,-7.2436...,True,2013-01-01,Outros...
36738,33.000...,2013-12-31,1,PETROL...,9512,DF Ind...,-1,2012-12-31,5.07,Saldos...,3.2878...,True,2012-01-01,Patrim...


In [12]:
# Scratch benchmark: write the currently loaded `df` to a test file.
# Timings noted for each compression codec tried
# (NOTE(review): the notebook does not say what the parenthesised figures measure; confirm):
# gz -> 46s
# zip -> 11s (33s)
# bz2 -> 2 min
# xz -> 1 min
# zstd -> 5.4s (1.9s)
# lz4 -> 5.6s (1.9)
# Earlier pickle-based variant, kept for reference:
# c_options = {'compresslevel': 5}
# df.to_pickle('data/teste.zip', compression='zip')
# pd.read_pickle('data/teste.zip')
df.to_parquet(f'data/teste.parquet', compression='zstd')
# pd.read_parquet('data/teste.parquet')

In [31]:
# List the distinct GRUPO_DFP labels present in the loaded frame.
df.GRUPO_DFP.unique()
# df

array(['DF Consolidado - Balanço Patrimonial Ativo',
       'DF Individual - Balanço Patrimonial Ativo',
       'DF Consolidado - Balanço Patrimonial Passivo',
       'DF Individual - Balanço Patrimonial Passivo',
       'DF Consolidado - Demonstração do Fluxo de Caixa (Método Direto)',
       'DF Individual - Demonstração do Fluxo de Caixa (Método Direto)',
       'DF Consolidado - Demonstração do Fluxo de Caixa (Método Indireto)',
       'DF Individual - Demonstração do Fluxo de Caixa (Método Indireto)',
       'DF Consolidado - Demonstração das Mutações do Patrimônio Líquido',
       'DF Individual - Demonstração das Mutações do Patrimônio Líquido',
       'DF Consolidado - Demonstração de Resultado Abrangente',
       'DF Individual - Demonstração de Resultado Abrangente',
       'DF Consolidado - Demonstração do Resultado',
       'DF Individual - Demonstração do Resultado',
       'DF Consolidado - Demonstração de Valor Adicionado',
       'DF Individual - Demonstração de Valor A

In [15]:
# Load metadata through the local `cvm` module.
# NOTE: this rebinds `df`, replacing the statement frame read from parquet earlier in the notebook.
df = cvm.load_metadata()
df

Unnamed: 0,DT_REFER,VERSAO,CD_CVM,CATEG_DOC,ID_DOC,DT_RECEB
0,2010-1...,1,1023,DFP,4822,2011-0...
1,2010-1...,2,1023,DFP,4823,2011-0...
2,2010-1...,3,1023,DFP,6945,2011-0...
3,2010-1...,1,14206,DFP,5436,2011-0...
4,2010-1...,2,14206,DFP,6775,2011-0...
...,...,...,...,...,...,...
14176,2021-0...,1,4693,ITR,110135,2021-1...
14177,2021-0...,1,21091,ITR,103186,2021-0...
14178,2021-0...,1,21091,ITR,106768,2021-0...
14179,2021-0...,1,21091,ITR,109262,2021-1...
