##  CreditLens (Envio) - [Ambiente DEV]
---

#### Objetivo:

Será repassada à API uma estrutura direcionada ao modelo de dados, com as informações financeiras convertidas (SetAPI).

In [0]:
import os
import sys
import time
import json
import hashlib
import logging
import requests
from delta.tables import *
from pyspark.sql import Row
from datetime import datetime
from datetime import timedelta
from pyspark.sql.types import *
import http.client as http_client
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from azure.mgmt.datafactory.models import *
from pyspark.sql.dataframe import DataFrame



In [0]:
# Environment tag written to the "ambiente" column of every record sent.
AMBIENTE = "DEV"
tbl_format = "delta"
time_format = "yyyy-MM-dd'T'HH:mm:ss.SSSZ"

# Take a single snapshot of "today" so DAY / MONTH / YEAR always describe the
# same calendar date, even if this cell runs across midnight.
_TODAY = datetime.today()
DAY = str(_TODAY.day).zfill(2)
YEAR = str(_TODAY.year).zfill(4)
MONTH = str(_TODAY.month).zfill(2)

# Endpoint that receives the converted financial statements.
url_api = 'https://distributie-azure-api.rabobank.nl/creditrisk/financialstatements/historical/ag/uat/set'

# Every storage location lives under the same mount point; build them from
# one prefix so a change of environment touches a single line.
_BASE_PATH = "/mnt/sadevbrcreditcore/creditcore-datamart-dev/creditlens"
_CONTROL_PATH = _BASE_PATH + "/creditlens_controle"

PATH_TBL_DATAMART = _BASE_PATH + "/creditlens_delta/"
PATH_TBL_PARQUET = _BASE_PATH + "/creditlens_parquet/creditlens.parquet"
# Control tables: records sent, records to retry, records given up on.
TBL_OK = _CONTROL_PATH + "/creditlens_enviados"
TBL_REPROCESS = _CONTROL_PATH + "/creditlens_nao_enviados"
TBL_DO_NOT_SEND = _CONTROL_PATH + "/creditlens_nao_enviar_mais"

PATH_JSON_SCHEMA = _BASE_PATH + "/creditlens_json/schema.json"
PATH_JSON_SAMPLE = _BASE_PATH + "/creditlens_json/sample.json"

In [0]:
def creditlens_fix_column_names(dataset: DataFrame) -> DataFrame:
    '''
        Rename datamart columns to the names used in the outgoing payload.

        Each (old, new) pair below is applied with withColumnRenamed, in
        order. The returned DataFrame is the input with those renames applied.

        NOTE(review): "OtherCurrentAssets", "RaisedBreedingLvst",
        "CostSales_Crops" and "DepreMachin_building" are spelled differently
        from the columns selected by creditlens_load_pre_processed_data
        ("OtherCurrentAssests", "ReisedBreedingLvst", "CostsSales_Crops",
        "DepreMaching_building"). Other code in this file relies on the
        un-renamed names, so confirm before correcting the spelling.
    '''
    renames = (
        ("StmtType", "StatementType"),
        ("CashandCashEquivalent", "CashandCashEquivalent_1"),
        ("MarketableSecurities", "MarketableSecuritiesMV_10008"),
        ("AcctsRecCrops", "AcctsRecCrops_5"),
        ("MarketLvstk", "MarketLvstkMV_509"),
        ("Crops", "Crops_10"),
        ("FeedsSupplies", "FeedsandSuppliesMV_512"),
        ("OtherInventory", "OtherInventoryMV_10009"),
        ("OtherCurrentAssets", "OtherCurrentAssets_16"),
        ("RaisedBreedingLvst", "RaisedBreedingLvst_19"),
        ("FarmMachineryEquipment", "FarmMachineryEquipment_20"),
        ("FarmLand", "FarmLandMV_520"),
        ("BuildingsImprovements", "BuildingsImprovementsMV_521"),
        ("OtherTangibleFixedAssets", "OtherTangibleFixedAssetsMV_10010"),
        ("IntgblOther", "IntgblOtherMV_530"),
        ("AccountsPayableTrade", "AccountsPayableTrade_44"),
        ("Overdrafts", "Overdrafts_46"),
        ("CPLTDRealEstate", "CPLTDRealEstate_58"),
        ("CPLTDOther", "CPLTDOther_59"),
        ("OtherCurrentLiabs", "OtherCurrentLiabs_65"),
        ("LongTermDebt_RealEstate", "LongTermDebtRealEstate_67"),
        ("LongTermDebt_Other", "LongTermDebtOther_68"),
        ("OtherNon_CurrentLiabs", "OtherNonCurrentLiabs_74"),
        ("RetainedEarnings", "RetainedEarnings_82"),
        ("SalesCrops", "SalesCrops_100"),
        ("CostSales_Crops", "CostofSalesCrops_10011"),
        ("LeaseRent_Expense", "LeaseRentExpense_130"),
        ("DepreMachin_building", "DepreMachinbuilding_131"),
        ("OtherOperatingExpenses", "OtherOperatingExpenses_134"),
        ("InterestExpense", "InterestExpense_136"),
        ("OtherExpense", "OtherExpense_139"),
        ("DividendsStock", "DividendsStock_300"),
    )

    renamed = dataset
    for old_name, new_name in renames:
        renamed = renamed.withColumnRenamed(old_name, new_name)

    return renamed

In [0]:
def creditlens_load_pre_processed_data() -> DataFrame:
    '''
        Load the most recent batch of the datamart Delta table.

        Rows are ranked by DATA_CRIACAO_DATAMART, first by calendar date
        (DATE_OFFSET) and then by full timestamp within each date
        (DATETIME_OFFSET). Only rows ranked 1 on both are kept, i.e. the rows
        carrying the latest creation timestamp.

        The ranking filter runs before the projection, so it no longer refers
        to helper columns that the select has already dropped.

        NOTE(review): the first window partitions by a constant column, so
        every row shares one window partition. Presumably acceptable for the
        current volume; confirm before running on large tables.

        Returns:
            DataFrame with the selected business columns plus
            data_criacao_datamart and hash_key, passed through
            creditlens_fix_column_names.
    '''
    creation_col = F.col("DATA_CRIACAO_DATAMART")

    # Business columns, in the order in which they are projected.
    source_columns = [
        'DescricaoAgencia',
        'Proposal',
        'Wwid',
        'StatementDate',
        'Periods',
        'TotalAssests',
        'TotalLiabs_netWorth',
        'StmtType',
        'Analyst',
        'Status',
        'Consolidation',
        'Currency',
        'CashandCashEquivalent',
        'MarketableSecurities',
        'AcctsRecCrops',
        'MarketLvstk',
        'Crops',
        'FeedsSupplies',
        'OtherInventory',
        'OtherCurrentAssests',
        'ReisedBreedingLvst',
        'FarmMachineryEquipment',
        'FarmLand',
        'BuildingsImprovements',
        'OtherTangibleFixedAssets',
        'IntgblOther',
        'AccountsPayableTrade',
        'Overdrafts',
        'CPLTDRealEstate',
        'CPLTDOther',
        'OtherCurrentLiabs',
        'LongTermDebt_RealEstate',
        'LongTermDebt_Other',
        'OtherNon_CurrentLiabs',
        'RetainedEarnings',
        'SalesCrops',
        'CostsSales_Crops',
        'LeaseRent_Expense',
        'DepreMaching_building',
        'OtherOperatingExpenses',
        'InterestExpense',
        'OtherExpense',
        'DividendsStock',
    ]

    # Rank 1 = newest calendar date across the whole table.
    by_date = Window\
        .partitionBy("_")\
        .orderBy(creation_col.cast(DateType()).desc())
    # Rank 1 = newest timestamp within each calendar date.
    by_datetime = Window\
        .partitionBy("DATE_OFFSET")\
        .orderBy(creation_col.cast(TimestampType()).desc())

    latest_batch = spark\
        .read\
        .format("delta")\
        .load(PATH_TBL_DATAMART)\
        .withColumn("_", F.lit(1))\
        .withColumn("DATE_OFFSET", F.dense_rank().over(by_date))\
        .withColumn("DATETIME_OFFSET", F.dense_rank().over(by_datetime))\
        .filter(
            (F.col("DATE_OFFSET") == 1) & (F.col("DATETIME_OFFSET") == 1)
        )

    dataset = latest_batch.select(
        *[F.col(name) for name in source_columns],
        F.col('Data_Criacao_Datamart').alias('data_criacao_datamart'),
        F.col('HASH_KEY').alias('hash_key'),
    )

    return creditlens_fix_column_names(dataset)

In [0]:
@udf("string")
def creditlens_process_request(row):
    '''
        POST one JSON payload to the CreditLens API.

        Args:
            row: the JSON document to send (value of the dados_json column).

        Returns:
            The API response as a string when the request succeeds. A JSON
            string response is returned as-is; any other JSON value is
            re-serialised with json.dumps.
            An empty string on any failure (null input, network error,
            timeout, non-2xx status, non-JSON body). Callers in this file
            treat a non-empty value as success, so an error message must
            never be returned here.
    '''
    if row is None:
        return ""

    # NOTE(review): SECURITY - the API key is hard-coded in the notebook.
    # It should come from a secret store instead; left unchanged here so the
    # request contract stays the same.
    headers = {
        'User-Agent': 'curl/7.58.0',
        'accept': 'application/json',
        'Content-Type': 'application/json; charset=UTF-8',
        'Rabobank-ApiKey': 'f43e132f-f5c5-4db6-8fdf-8516d84026ce',
    }
    try:
        response = requests.post(
            url=url_api,
            headers=headers,
            # Encode explicitly so the bytes match the declared UTF-8 charset.
            data=str(row).encode("utf-8"),
            # (connect, read) seconds; without it a stalled connection would
            # block the Spark task indefinitely.
            timeout=(10, 60),
        )
        # 4xx/5xx responses are failures even if they carry a JSON body.
        response.raise_for_status()
        payload = response.json()
    except Exception as e:
        # Broad on purpose: an exception escaping a UDF fails the Spark job.
        logging.getLogger(__name__).warning("CreditLens API request failed: %s", e)
        return ""

    return payload if isinstance(payload, str) else json.dumps(payload)

In [0]:
def creditlens_table_exists(table: str) -> bool:
    '''
        Check whether the given (Delta Lake) table path exists.

        Args:
            table: storage path of the table.

        Returns:
            True when dbutils.fs.ls can list the path, False when listing
            raises any exception.
    '''
    try:
        dbutils.fs.ls(table)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        return False
    return True

In [0]:
def create_enviroment_if_not_exists():
    '''
        Create the send-control tables that do not exist yet.

        An empty DataFrame with the control schema is built, passed through
        creditlens_fix_column_names, and written as a Delta table to each of
        TBL_OK, TBL_REPROCESS and TBL_DO_NOT_SEND for which
        creditlens_table_exists returns False.
    '''
    # Identification columns, in schema order.
    leading_fields = [
        ("hash_key", StringType()),
        ("data_criacao_datamart", StringType()),
        ("DescricaoAgencia", StringType()),
        ("Proposal", IntegerType()),
        ("Wwid", LongType()),
        ("StatementDate", StringType()),
        ("Periods", IntegerType()),
        ("TotalAssests", DecimalType(19,4)),
        ("TotalLiabs_netWorth", DecimalType(19,4)),
        ("StmtType", StringType()),
        ("Analyst", StringType()),
        ("Status", StringType()),
        ("Consolidation", StringType()),
        ("Currency", StringType()),
    ]
    # Financial amounts: all DecimalType(19,4), in schema order.
    amount_fields = [
        "CashandCashEquivalent",
        "MarketableSecurities",
        "AcctsRecCrops",
        "MarketLvstk",
        "Crops",
        "FeedsSupplies",
        "OtherInventory",
        "OtherCurrentAssests",
        "ReisedBreedingLvst",
        "FarmMachineryEquipment",
        "FarmLand",
        "BuildingsImprovements",
        "OtherTangibleFixedAssets",
        "IntgblOther",
        "AccountsPayableTrade",
        "Overdrafts",
        "CPLTDRealEstate",
        "CPLTDOther",
        "OtherCurrentLiabs",
        "LongTermDebt_RealEstate",
        "LongTermDebt_Other",
        "OtherNon_CurrentLiabs",
        "RetainedEarnings",
        "SalesCrops",
        "CostofSalesCrops_10011",
        "LeaseRentExpense_130",
        "DepreMachinbuilding_131",
        "OtherOperatingExpenses",
        "InterestExpense",
        "OtherExpense",
        "DividendsStock",
    ]
    # Send-control columns, in schema order.
    trailing_fields = [
        ("dados_json", StringType()),
        ("current_timestamp", TimestampType()),
        ("status_response", StringType()),
        ("api_response_key", StringType()),
        ("qtd_envios_dia", IntegerType()),
        ("qtd_envios_total", IntegerType()),
        ("ambiente", StringType()),
    ]

    control_schema = StructType(
        [StructField(name, dtype, True) for name, dtype in leading_fields]
        + [StructField(name, DecimalType(19,4), True) for name in amount_fields]
        + [StructField(name, dtype, True) for name, dtype in trailing_fields]
    )

    empty_table = sqlContext.createDataFrame(spark.sparkContext.emptyRDD(), control_schema)
    empty_table = creditlens_fix_column_names(empty_table)

    for table_path in (TBL_OK, TBL_REPROCESS, TBL_DO_NOT_SEND):
        if creditlens_table_exists(table_path):
            continue
        empty_table.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(table_path)

In [0]:
def creditlens_save_to_parquet():
    '''
        Write the control-table messages as PARQUET (Power BI).

        Everything returned by creditlens_get_current_messages is appended
        to PATH_TBL_PARQUET.

        NOTE(review): the write mode is "append" and the full set of messages
        is written on every call, so repeated runs presumably accumulate
        duplicate rows in the parquet output. Confirm this is intended.
    '''
    messages = creditlens_get_current_messages()
    parquet_writer = messages.write.format("parquet").mode("append")
    parquet_writer.save(PATH_TBL_PARQUET)

In [0]:
def creditlens_get_current_messages():
    '''
        Load the messages stored in the three control tables.

        Returns:
            The distinct rows of TBL_OK, TBL_REPROCESS and TBL_DO_NOT_SEND
            combined with unionAll, in that order.
    '''
    control_paths = (TBL_OK, TBL_REPROCESS, TBL_DO_NOT_SEND)
    sent, pending, abandoned = [
        spark.read.format("delta").load(path) for path in control_paths
    ]

    return sent.unionAll(pending).unionAll(abandoned).distinct()

In [0]:
def creditlens_avoid_duplicated_data(dataset: DataFrame) -> DataFrame:
    '''
        Drop the records that are already present in the control tables.

        Args:
            dataset: candidate records; must have a hash_key column.

        Returns:
            The rows of dataset whose hash_key does not appear in
            creditlens_get_current_messages() (left anti join, with the
            control data broadcast).
    '''
    already_tracked = F.broadcast(creditlens_get_current_messages())
    return dataset.join(already_tracked, "hash_key", "left_anti")

In [0]:
def creditlens_clean_table(is_source_table: bool, source_table: str):
    '''
        Remove rows that no longer need to be kept in a table.

        Args:
            is_source_table: when True, clean source_table; when False,
                clean the TBL_REPROCESS control table.
            source_table: path of the Delta source table. Only used when
                is_source_table is True.

        Behaviour:
            * Source table: rows whose hash_key is present in TBL_OK or
              TBL_REPROCESS are removed, and the table at source_table is
              overwritten with the remaining rows.
            * Reprocess table: rows whose hash_key is present in TBL_OK are
              removed, and TBL_REPROCESS is overwritten.

        Fixes over the previous version: the source branch wrote an undefined
        variable instead of the filtered DataFrame, and the reprocess branch
        called the non-existent F.broadcat.

        NOTE(review): both branches overwrite the Delta table they read from;
        confirm this is supported by the Delta version in use.
    '''
    # Only the key is needed for the anti joins, which keeps the broadcast small.
    sent_keys = spark.read.format("delta").load(TBL_OK).select("hash_key")
    dataset_failure = spark.read.format("delta").load(TBL_REPROCESS)

    if is_source_table:
        dataset_source = spark.read.format("delta").load(source_table)
        handled_keys = sent_keys.union(dataset_failure.select("hash_key"))
        dataset_new_source = dataset_source\
            .join(F.broadcast(handled_keys), "hash_key", "left_anti")\
            .select(*dataset_source.columns)  # restore the original column order
        dataset_new_source.write.format("delta").mode("overwrite").save(source_table)
    else:
        dataset_still_failing = dataset_failure.join(F.broadcast(sent_keys), "hash_key", "left_anti")
        dataset_still_failing.write.format("delta").mode("overwrite").save(TBL_REPROCESS)
        

In [0]:
def creditlens_update_data(dataset: DataFrame, table_name: str):
    '''
        Merge dataset into the Delta table stored at table_name.

        Rows are matched on hash_key: matching rows are updated with all
        columns of the new data, non-matching rows are inserted.

        Args:
            dataset: new data to merge.
            table_name: path of the target Delta table.
    '''
    target = DeltaTable.forPath(spark, table_name).alias("current_data")
    incoming = dataset.alias("new_data")
    match_condition = "current_data.hash_key = new_data.hash_key"

    merge_builder = target.merge(incoming, match_condition)
    merge_builder = merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll()
    merge_builder.execute()

In [0]:
def creditlens_process_all_data(dataset: DataFrame) -> DataFrame:
    '''
        Send every row to the API and update its control columns.

        Steps, in order:
          1. qtd_envios_dia is set to 1 for rows where it equals 3 and the
             date of current_timestamp differs from the current date.
          2. dados_json is sent through creditlens_process_request and the
             result is stored in api_response_key.
          3. qtd_envios_dia and qtd_envios_total are incremented by 1, and
             current_timestamp is set to the driver's current time.
          4. status_response becomes "SUCESS" when api_response_key is not
             the empty string, otherwise "FAILURE"; a "FAILURE" row whose
             qtd_envios_dia equals 15 becomes "DO_NOT_SEND".

        NOTE(review): step 4 compares the daily counter with 15, while
        creditlens_upsert_records uses qtd_envios_total == 15 - verify which
        counter is intended. The reset value in step 1 (1, then incremented
        to 2) also looks like it should be confirmed.

        The result is lazy: the API call runs when an action is executed.
    '''
    sends_today = F.col("qtd_envios_dia")
    last_sent_on = F.to_date(F.col("current_timestamp"))
    counter_is_stale = (sends_today == 3) & (last_sent_on != F.lit(F.current_date()))

    prepared = dataset.withColumn(
        "qtd_envios_dia",
        F.when(counter_is_stale, F.lit(1)).otherwise(sends_today)
    )

    sent_at = datetime.fromtimestamp(time.time())
    sent = prepared\
        .withColumn("api_response_key", creditlens_process_request(F.col("dados_json")))\
        .withColumn("qtd_envios_dia", F.col("qtd_envios_dia") + F.lit(1))\
        .withColumn("qtd_envios_total", F.col("qtd_envios_total") + F.lit(1))\
        .withColumn("current_timestamp", F.lit(sent_at))

    got_response = F.col("api_response_key") != F.lit("")
    with_status = sent.withColumn(
        "status_response",
        F.when(got_response, F.lit("SUCESS")).otherwise(F.lit("FAILURE"))
    )

    gave_up = (F.col("status_response") == "FAILURE") & (F.col("qtd_envios_dia") == 15)
    return with_status.withColumn(
        "status_response",
        F.when(gave_up, F.lit("DO_NOT_SEND")).otherwise(F.col("status_response"))
    )

In [0]:
def creditlens_upsert_records(dataset: DataFrame):
    '''
        Send the records to the API and store each outcome in its control table.

        Args:
            dataset: records to send (control-table layout).

        Routing, by the columns produced by creditlens_process_all_data:
          * status 'SUCESS'                                     -> TBL_OK
          * status 'FAILURE', qtd_envios_dia < 3, total < 15    -> TBL_REPROCESS
          * status 'FAILURE', qtd_envios_total == 15            -> TBL_DO_NOT_SEND
        A subset is appended only when it is non-empty and its table exists.

        The processed DataFrame is persisted before use. It contains the HTTP
        UDF and is consumed by several actions (one count and one write per
        subset); without caching each action would re-evaluate the UDF and
        POST the same record to the API again.

        NOTE(review): a 'FAILURE' row with qtd_envios_dia >= 3 and
        qtd_envios_total < 15 matches none of the filters and is not written
        anywhere. Behaviour kept as-is; confirm whether that is intended.
    '''
    dataset_api = creditlens_process_all_data(dataset).persist()
    try:
        is_failure = F.col("status_response") == 'FAILURE'
        destinations = [
            # success directory
            (F.col("status_response") == 'SUCESS', TBL_OK),
            # failure directory (records that will be retried)
            (
                is_failure
                & (F.col("qtd_envios_dia") < 3)
                & (F.col("qtd_envios_total") < 15),
                TBL_REPROCESS,
            ),
            # history directory (records that will not be sent again)
            (is_failure & (F.col("qtd_envios_total") == 15), TBL_DO_NOT_SEND),
        ]

        for condition, table_path in destinations:
            subset = dataset_api.filter(condition)
            if subset.count() > 0 and creditlens_table_exists(table_path):
                subset.write.format("delta").mode("append").option("overwriteSchema", "true").save(table_path)
    finally:
        # Release the cached blocks even when a write fails.
        dataset_api.unpersist()
    

In [0]:
def creditlens_set_default_values_json() -> DataFrame:
    '''
        Build the payload template with default values.

        Reads the sample JSON at PATH_JSON_SAMPLE and replaces every "string"
        column with "" and every "bigint" column with 0. Columns of any other
        type are left untouched.
    '''
    defaults_by_type = {"string": "", "bigint": 0}

    template = spark.read.json(PATH_JSON_SAMPLE)
    for column_name, column_type in template.dtypes:
        if column_type in defaults_by_type:
            template = template.withColumn(column_name, F.lit(defaults_by_type[column_type]))

    return template

In [0]:
def creditlens_prepare_new_data(dataset: DataFrame) -> DataFrame:
    '''
        Map the loaded records onto the JSON payload and the control layout.

        The payload template (creditlens_set_default_values_json) supplies
        every payload field that the input does not provide. Each input row
        is cross-joined with it, dados_json is built from all template field
        names, and the result is projected onto the control-table columns.

        Args:
            dataset: new records, as returned by
                creditlens_load_pre_processed_data.

        Returns:
            DataFrame with the columns listed in output_columns, with
            status_response 'FAILURE', an empty api_response_key and both
            send counters at 0.

        NOTE(review): DepreMachinbuilding_131 is always set to the literal 0
        and AuditOpinion_StatementSource to "QUALIFIED"; confirm these are
        the intended values.
    '''
    output_columns = [
        "hash_key", "data_criacao_datamart", "DescricaoAgencia", "Proposal",
        "Wwid", "StatementDate", "Periods", "TotalAssests",
        "TotalLiabs_netWorth", "StatementType", "Analyst", "Status",
        "Consolidation", "Currency",
        "CashandCashEquivalent_1", "MarketableSecuritiesMV_10008",
        "AcctsRecCrops_5", "MarketLvstkMV_509", "Crops_10",
        "FeedsandSuppliesMV_512", "OtherInventoryMV_10009",
        "OtherCurrentAssests", "ReisedBreedingLvst",
        "FarmMachineryEquipment_20", "FarmLandMV_520",
        "BuildingsImprovementsMV_521", "OtherTangibleFixedAssetsMV_10010",
        "IntgblOtherMV_530", "AccountsPayableTrade_44", "Overdrafts_46",
        "CPLTDRealEstate_58", "CPLTDOther_59", "OtherCurrentLiabs_65",
        "LongTermDebtRealEstate_67", "LongTermDebtOther_68",
        "OtherNonCurrentLiabs_74", "RetainedEarnings_82", "SalesCrops_100",
        "CostofSalesCrops_10011", "LeaseRentExpense_130",
        "DepreMachinbuilding_131", "OtherOperatingExpenses_134",
        "InterestExpense_136", "OtherExpense_139", "DividendsStock_300",
        "dados_json", "current_timestamp", "status_response",
        "api_response_key", "qtd_envios_dia", "qtd_envios_total", "ambiente",
    ]

    template = creditlens_set_default_values_json()
    # Captured before the drop: the payload is built from every template field.
    payload_fields = template.columns
    # Keep only the template fields the input does not supply itself.
    template_extras = template.drop(*output_columns)

    created_at = datetime.fromtimestamp(time.time())
    with_control = dataset.withColumn("dados_json", F.lit(""))
    with_control = with_control.withColumn("current_timestamp", F.lit(created_at))
    with_control = with_control.withColumn("status_response", F.lit('FAILURE').cast(StringType()))
    with_control = with_control.withColumn("api_response_key", F.lit('').cast(StringType()))
    with_control = with_control.withColumn("qtd_envios_dia", F.lit(0).cast(IntegerType()))
    with_control = with_control.withColumn("qtd_envios_total", F.lit(0).cast(IntegerType()))

    # One wide table: every record combined with the template defaults.
    wide = with_control.crossJoin(F.broadcast(template_extras))

    wide = wide.withColumnRenamed("CostsSales_Crops", "CostofSalesCrops_10011")
    wide = wide.withColumn("DepreMachinbuilding_131", F.lit(0))
    wide = wide.withColumn("AuditOpinion_StatementSource", F.lit("QUALIFIED"))
    wide = wide.withColumn("dados_json", F.to_json(F.struct(payload_fields)))
    wide = wide.withColumn("ambiente", F.lit(AMBIENTE))

    return wide.select(output_columns)

In [0]:
def creditlens_start_pipeline():
    '''
        Pipeline entry point.

        Steps:
          1. Create the control tables when they are missing.
          2. Read the records waiting to be reprocessed (TBL_REPROCESS).
          3. Load the latest datamart batch and drop the records already
             present in the control tables.
          4. Send the new records (if any) together with the records to
             reprocess, then refresh the parquet export.
          5. Remove the processed records from the source table.

        A batch is processed as soon as it holds at least one record; the
        previous "count() > 1" checks skipped batches of exactly one record.

        Any exception is printed and swallowed, so this function never raises.
        NOTE(review): that hides failures from a scheduler; consider
        re-raising after logging.
    '''
    try:
        start = time.time()
        print("iniciando o processamento dos dados em: %s" % datetime.fromtimestamp(time.time()))

        create_enviroment_if_not_exists()

        # old records that must be reprocessed
        dataset_old_data = spark.read.format("delta").load(TBL_REPROCESS)

        # new records that have not been processed yet
        dataset_initial = creditlens_load_pre_processed_data()
        dataset_initial = creditlens_avoid_duplicated_data(dataset_initial)

        if dataset_initial.count() > 0:
            # new records plus the ones waiting for a retry
            dataset_to_send = creditlens_prepare_new_data(dataset_initial).union(dataset_old_data)
        else:
            # only the old records are reprocessed
            dataset_to_send = dataset_old_data

        if dataset_to_send.count() > 0:
            creditlens_upsert_records(dataset_to_send)
            creditlens_save_to_parquet()

        # remove the processed records from the input table
        creditlens_clean_table(True, PATH_TBL_DATAMART)

        end = time.time()
        print("tempo de processamento: %s" % str(end-start))
        print("finalizando o processamento dos dados em: %s" % datetime.fromtimestamp(time.time()))
    except Exception as e:
        print(str(e))
        print("erro ao processar os dados em: %s" % datetime.fromtimestamp(time.time()))

In [0]:
def erase_enviroment():
    '''
        Recursively delete the control-tables directory.
    '''
    control_dir = "/mnt/sadevbrcreditcore/creditcore-datamart-dev/creditlens/creditlens_controle"
    dbutils.fs.rm(control_dir, True)

### Teste de Execução (Ambiente DEV)

In [0]:
# Test run: erase_enviroment() deletes the control-tables directory first, so
# the pipeline below starts from empty control tables on every execution.
erase_enviroment()
creditlens_start_pipeline()

In [0]:
# Load the source datamart Delta table for ad-hoc inspection.
dataset = spark\
  .read\
  .format("delta")\
  .load(PATH_TBL_DATAMART)

# Bare expression; presumably here so the notebook shows the path as the cell output.
PATH_TBL_DATAMART