In [78]:
import yfinance as yf
import requests, zipfile
from datetime import date, timedelta
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DecimalType, DateType, TimestampType, FloatType
from pyspark.sql.functions import udf, col, lit, when, trim, month, year
from pyspark import SparkFiles

#### Load the shared library notebook

In [2]:
# Execute the shared helper notebook inside this kernel. Later cells use
# `spark` without defining it in this notebook, so it is presumably created
# there — TODO confirm.
%run ./work/lib.ipynb



:: loading settings :: url = jar:file:/usr/local/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-bd797aa9-e2a0-4a1e-b553-5f8789945f1b;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.0.0 in central
	found org.antlr#antlr4;4.7 in central
	found org.antlr#antlr4-runtime;4.7 in central
	found org.antlr#antlr-runtime;3.5.2 in central
	found org.antlr#ST4;4.0.8 in central
	found org.abego.treelayout#org.abego.treelayout.core;1.0.3 in central
	found org.glassfish#javax.json;1.0.4 in central
	found com.ibm.icu#icu4j;58.2 in central
downloading https://repo1.maven.org/maven2/io/delta/delta-core_2.12/1.0.0/delta-core_2.12-1.0.0.jar ...
	[SUCCESSFUL ] io.delta#delta-core_2.12;1.0.0!delta-core_2.12.jar (995ms)
downloading https://repo1.maven.org/maven2/org/antlr/antlr4/4.7/antlr4-4.7.jar ...
	[SUCCESSFUL ] org.antlr#antlr4;4.7!antlr4.jar (246ms)
downloading https:

##### Parameters

#### Request financial statements from CVM

In [47]:
def get_cvm_financial_statement(fs_type: str, year: int, spark=spark):
    """
    Download one year of CVM ITR filings and load a single statement file
    from the archive as a Spark DataFrame.

    financial statements type (fs_type)
    -- BPA - Assets
    -- BPP - Liabilities
    -- DRE - Income Statement
    -- DFC_MI - Indirect Cash Flow
    -- DFC_MD - Direct Cash Flow

    Parameters
    ----------
    fs_type : str
        Statement code used to build the CSV member name
        ``itr_cia_aberta_{fs_type}_con_{year}.csv``.
    year : int
        Year used in both the archive URL and the CSV member name.
    spark : SparkSession
        Session used to build the result. The default is the global ``spark``
        as it existed when this function was defined.

    Returns
    -------
    pyspark.sql.DataFrame
        The CSV contents, with column types as inferred by pandas/Spark
        (no explicit schema is applied here).

    Raises
    ------
    requests.HTTPError
        If the download answers with a 4xx/5xx status.
    KeyError
        If the archive does not contain the expected CSV member.

    Notes
    -----
    The downloaded archive is also saved in the current working directory as
    ``itr_cia_aberta_{year}.zip`` (overwriting any previous copy).
    """
    import pandas as pd

    url = f'https://dados.cvm.gov.br/dados/CIA_ABERTA/DOC/ITR/DADOS/itr_cia_aberta_{year}.zip'
    arquivo_zip = f'itr_cia_aberta_{year}.zip'
    arquivo_csv = f'itr_cia_aberta_{fs_type}_con_{year}.csv'

    # A timeout keeps a stalled connection from hanging the notebook forever.
    download = requests.get(url, timeout=60)
    # Fail here on an HTTP error instead of saving an error page to disk and
    # failing later with an unrelated "bad zip file" error.
    download.raise_for_status()

    with open(arquivo_zip, 'wb') as arquivo_cvm:
        arquivo_cvm.write(download.content)

    # Context managers close both the archive and the member stream once the
    # CSV has been fully read into pandas.
    with zipfile.ZipFile(arquivo_zip) as cvm_zip:
        with cvm_zip.open(name=arquivo_csv) as csv_file:
            stocks_df = pd.read_csv(csv_file, sep=';', encoding='ISO-8859-1')

    return spark.createDataFrame(stocks_df)

In [91]:
# Assets statement ('BPA') for 2022, renamed onto the column layout that the
# other statement cells in this notebook also produce.
asset_df = (
    get_cvm_financial_statement('BPA', 2022)
    .select(
        trim('CNPJ_CIA').alias('cnpj'),
        col('DT_REFER').cast('date').alias('base_date'),
        col('GRUPO_DFP').alias('financial_statement_type'),
        col('MOEDA').alias('currency'),
        col('ESCALA_MOEDA').alias('scale'),
        col('ORDEM_EXERC').alias('order'),
        # No source column is mapped to the period start here, so the column
        # is an explicit NULL date. lit(None) states that directly instead of
        # relying on an empty string failing to parse as a date, which raises
        # when ANSI mode is enabled.
        lit(None).cast('date').alias('start_of_period'),
        col('DT_FIM_EXERC').cast('date').alias('end_of_period'),
        trim('CD_CONTA').alias('id_account'),
        col('DS_CONTA').alias('account_description'),
        col('VL_CONTA').cast('decimal(15, 2)').alias('value'),
        col('ST_CONTA_FIXA').alias('account_status'),
        lit('Assets').alias('type'),
        lit('YTD').alias('period_type'),
    )
)

In [80]:
# Liabilities statement ('BPP') for 2022, renamed onto the column layout that
# the other statement cells in this notebook also produce.
liabilities_df = (
    get_cvm_financial_statement('BPP', 2022)
    .select(
        trim('CNPJ_CIA').alias('cnpj'),
        col('DT_REFER').cast('date').alias('base_date'),
        col('GRUPO_DFP').alias('financial_statement_type'),
        col('MOEDA').alias('currency'),
        col('ESCALA_MOEDA').alias('scale'),
        col('ORDEM_EXERC').alias('order'),
        # No source column is mapped to the period start here, so the column
        # is an explicit NULL date. lit(None) states that directly instead of
        # relying on an empty string failing to parse as a date, which raises
        # when ANSI mode is enabled.
        lit(None).cast('date').alias('start_of_period'),
        col('DT_FIM_EXERC').cast('date').alias('end_of_period'),
        trim('CD_CONTA').alias('id_account'),
        col('DS_CONTA').alias('account_description'),
        col('VL_CONTA').cast('decimal(15, 2)').alias('value'),
        col('ST_CONTA_FIXA').alias('account_status'),
        lit('Liabilities').alias('type'),
        lit('YTD').alias('period_type'),
    )
)

In [84]:
# Income statement ('DRE') for 2022 on the shared statement column layout.
#
# NOTE(review): this is the only statement cell that keeps a single CNPJ and a
# single account code; the other cells keep every row. It looks like a
# leftover exploration filter — confirm before writing this frame to the DW.
income_statement_df = (
    get_cvm_financial_statement('DRE', 2022)
    .filter(
        (col('CNPJ_CIA') == '00.000.000/0001-91')
        & (col('CD_CONTA') == '3.01')
    )
    .select(
        trim(col('CNPJ_CIA')).alias('cnpj'),
        col('DT_REFER').cast('date').alias('base_date'),
        col('GRUPO_DFP').alias('financial_statement_type'),
        col('MOEDA').alias('currency'),
        col('ESCALA_MOEDA').alias('scale'),
        col('ORDEM_EXERC').alias('order'),
        col('DT_INI_EXERC').cast('date').alias('start_of_period'),
        col('DT_FIM_EXERC').cast('date').alias('end_of_period'),
        trim(col('CD_CONTA')).alias('id_account'),
        col('DS_CONTA').alias('account_description'),
        col('VL_CONTA').cast('decimal(15, 2)').alias('value'),
        col('ST_CONTA_FIXA').alias('account_status'),
        lit('Income Statement').alias('type'),
    )
    # Periods starting in January and not ending in March are labelled 'YTD';
    # every other row is labelled 'Quarter'.
    .withColumn(
        'period_type',
        when(
            (month(col('start_of_period')) == 1)
            & (month(col('end_of_period')) != 3),
            lit('YTD'),
        ).otherwise(lit('Quarter')),
    )
)

In [94]:
# Indirect-method cash flow statement ('DFC_MI') for 2022, projected onto the
# shared statement column layout; every row is tagged as a 'YTD' period.
cash_flow_ind_df = (
    get_cvm_financial_statement('DFC_MI', 2022)
    .select(
        trim(col('CNPJ_CIA')).alias('cnpj'),
        col('DT_REFER').cast(DateType()).alias('base_date'),
        col('GRUPO_DFP').alias('financial_statement_type'),
        col('MOEDA').alias('currency'),
        col('ESCALA_MOEDA').alias('scale'),
        col('ORDEM_EXERC').alias('order'),
        col('DT_INI_EXERC').cast(DateType()).alias('start_of_period'),
        col('DT_FIM_EXERC').cast(DateType()).alias('end_of_period'),
        trim(col('CD_CONTA')).alias('id_account'),
        col('DS_CONTA').alias('account_description'),
        col('VL_CONTA').cast(DecimalType(15, 2)).alias('value'),
        col('ST_CONTA_FIXA').alias('account_status'),
        lit('Indirect Cash Flow').alias('type'),
        lit('YTD').alias('period_type'),
    )
)

In [96]:
# Direct-method cash flow statement ('DFC_MD') for 2022, projected onto the
# shared statement column layout; every row is tagged as a 'YTD' period.
cash_flow_dir_df = (
    get_cvm_financial_statement('DFC_MD', 2022)
    .select(
        trim('CNPJ_CIA').alias('cnpj'),
        col('DT_REFER').cast('date').alias('base_date'),
        col('GRUPO_DFP').alias('financial_statement_type'),
        col('MOEDA').alias('currency'),
        col('ESCALA_MOEDA').alias('scale'),
        col('ORDEM_EXERC').alias('order'),
        col('DT_INI_EXERC').cast('date').alias('start_of_period'),
        col('DT_FIM_EXERC').cast('date').alias('end_of_period'),
        trim('CD_CONTA').alias('id_account'),
        col('DS_CONTA').alias('account_description'),
        col('VL_CONTA').cast('decimal(15, 2)').alias('value'),
        col('ST_CONTA_FIXA').alias('account_status'),
        lit('Direct Cash Flow').alias('type'),
        lit('YTD').alias('period_type'),
    )
)

# Preview as a separate statement: DataFrame.show() prints and returns None,
# so chaining it onto the assignment would bind the variable to None.
cash_flow_dir_df.show()

+------------------+----------+------------------------+--------+-----+---------+---------------+-------------+----------+--------------------+----------+--------------+----------------+-----------+
|              cnpj| base_date|financial_statement_type|currency|scale|    order|start_of_period|end_of_period|id_account| account_description|     value|account_status|            type|period_type|
+------------------+----------+------------------------+--------+-----+---------+---------------+-------------+----------+--------------------+----------+--------------+----------------+-----------+
|00.070.698/0001-11|2022-03-31|    DF Consolidado - ...|    REAL|  MIL|PENÚLTIMO|     2021-01-01|   2021-03-31|      6.01|Caixa Líquido Ati...| -21697.00|             S|Direct Cash Flow|        YTD|
|00.070.698/0001-11|2022-03-31|    DF Consolidado - ...|    REAL|  MIL|   ÚLTIMO|     2022-01-01|   2022-03-31|      6.01|Caixa Líquido Ati...|  17407.00|             S|Direct Cash Flow|        YTD|
|00.0

##### Write to DW

In [None]:
#df = dim_ticker

#write_to_dw(df, 'dim_ticker')