In [17]:
import requests, zipfile
from datetime import date, timedelta
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DecimalType, DateType, TimestampType, FloatType
from pyspark.sql.functions import udf, col, lit, when, trim, month, year
from pyspark import SparkFiles

#### Load the shared library notebook

In [18]:
# Execute the shared helper notebook inside this kernel.
# NOTE(review): `spark`, `LAKE_HOME` and `write_to_dw` are used further down
# but are not defined anywhere in this notebook; presumably they come from
# lib.ipynb -- confirm.
%run ./work/lib.ipynb

#### Parameters

In [19]:
# Size of the reporting window, in calendar years (current year included).
number_exercises = 5

# The window ends at the current year and reaches back far enough to hold
# exactly `number_exercises` years.
end_year = date.today().year
start_year = end_year - (number_exercises - 1)

# Every year of the window in ascending order, both bounds included.
year_list = [start_year + offset for offset in range(number_exercises)]

#### Request financial statements from CVM

In [20]:
def get_cvm_financial_statement(fs_type: str, year: int, spark=spark):
    """
    Download the CVM ITR archive for one year and load one statement as a
    Spark DataFrame.

    The archive `itr_cia_aberta_{year}.zip` is fetched from dados.cvm.gov.br,
    saved under that name in the current working directory, and the member
    `itr_cia_aberta_{fs_type}_con_{year}.csv` is parsed with pandas
    (';' separated, ISO-8859-1) before being handed to Spark.

    financial statements type (fs_type)
    -- BPA - Assets
    -- BPP - Liabilities
    -- DRE - Income Statement
    -- DFC_MI - Indirect Cash Flow
    -- DFC_MD - Direct Cash Flow

    Parameters
    ----------
    fs_type : str
        Statement code used in the CSV member name (see the list above).
    year : int
        Year used in both the archive name and the CSV member name.
    spark : SparkSession
        Session used to build the result; defaults to the `spark` object that
        exists when this function is defined.

    Returns
    -------
    pyspark.sql.DataFrame
        The CSV contents, with the schema inferred from the pandas frame.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    zipfile.BadZipFile
        If the downloaded payload is not a valid zip archive.
    KeyError
        If the archive holds no member for this fs_type / year.
    """
    import pandas as pd

    url = f'https://dados.cvm.gov.br/dados/CIA_ABERTA/DOC/ITR/DADOS/itr_cia_aberta_{year}.zip'
    zip_path = f'itr_cia_aberta_{year}.zip'
    csv_name = f'itr_cia_aberta_{fs_type}_con_{year}.csv'

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=60)
    # Fail here with a clear HTTP error instead of writing an error page to
    # disk and tripping over it later as a corrupt zip file.
    response.raise_for_status()

    with open(zip_path, 'wb') as zip_file:
        zip_file.write(response.content)

    # Both the archive and the member handle are closed as soon as pandas has
    # finished reading; read_csv loads the whole file eagerly.
    with zipfile.ZipFile(zip_path) as archive, archive.open(name=csv_name) as csv_file:
        statement_pdf = pd.read_csv(csv_file, sep=';', encoding='ISO-8859-1')

    return spark.createDataFrame(statement_pdf)

In [21]:
# Ticker lookup table read from the lake: one (cnpj, ticker) row per line of
# stock_list.csv. The CNPJ is trimmed because it is used as a join key below.
stocks_df = (
    spark.read
    .format('csv')
    .option('header', 'True')
    .option('delimiter', ';')
    .option('encoding', 'iso-8859-1')
    .csv(f'{LAKE_HOME}/extract/stock_list.csv')
    .select(
        trim(col('CNPJ')).alias('cnpj'),
        col('TICKER').alias('ticker'),
    )
)

In [22]:
# Stack the direct cash flow statement (DFC_MD) of every year in `year_list`
# into a single DataFrame.
cash_flow_dir_df = None

# The loop variable is deliberately not named `year`: that name is the
# pyspark.sql.functions.year function imported at the top of the notebook,
# and rebinding it to an int would break any later use of year(...).
for exercise_year in year_list:
    yearly_df = get_cvm_financial_statement('DFC_MD', year=exercise_year)

    # `is None` is an identity check; `== None` would go through the
    # DataFrame's comparison machinery instead.
    if cash_flow_dir_df is None:
        cash_flow_dir_df = yearly_df
    else:
        # union() resolves columns by position, exactly like unionAll().
        cash_flow_dir_df = cash_flow_dir_df.union(yearly_df)

In [23]:
# Keep only the rows whose ORDEM_EXERC is 'ÚLTIMO', then rename the CVM
# columns to the warehouse column names and cast dates and amounts.
temp_fact_cashflow_direct = (
    cash_flow_dir_df
    .filter(col('ORDEM_EXERC') == 'ÚLTIMO')
    .select(
        # company key, trimmed so it matches the trimmed key in stocks_df
        trim(col('CNPJ_CIA')).alias('cnpj'),
        # statement header
        col('DT_REFER').cast('date').alias('base_date'),
        col('GRUPO_DFP').alias('financial_statement_type'),
        col('MOEDA').alias('currency'),
        col('ESCALA_MOEDA').alias('scale'),
        col('ORDEM_EXERC').alias('order'),
        col('DT_INI_EXERC').cast('date').alias('start_of_period'),
        col('DT_FIM_EXERC').cast('date').alias('end_of_period'),
        # account line
        trim(col('CD_CONTA')).alias('id_account'),
        col('DS_CONTA').alias('account_description'),
        col('VL_CONTA').cast('decimal(15, 2)').alias('value'),
        col('ST_CONTA_FIXA').alias('account_status'),
        # constant tags attached to every row of this fact table
        lit('Direct Cash Flow').alias('type'),
        lit('YTD').alias('period_type'),
    )
)

# Attach the ticker to every statement row. The inner join drops rows whose
# cnpj has no match in stocks_df.
fact_cashflow_direct = temp_fact_cashflow_direct.join(stocks_df, 'cnpj', 'inner')

##### Write to DW

In [24]:
# Hand the joined frame to write_to_dw under the name 'fact_cashflow_direct'.
# NOTE(review): write_to_dw is not defined in this notebook (presumably it
# comes from lib.ipynb); its write mode and target are not visible here --
# confirm before re-running against existing data.
df = fact_cashflow_direct

write_to_dw(df, 'fact_cashflow_direct')

                                                                                