In [1]:
import json
import logging
import sys
import time
import traceback
from datetime import datetime

import requests
from delta.tables import *
from pyspark.sql.types import *
from pyspark.sql.dataframe import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.functions import udf

# Build (or reuse) the Spark session, configured with the Delta Lake package,
# SQL extension and catalog, plus a console metrics sink.
spark = (
    SparkSession.builder
    .appName("XML Parse")
    .config("spark.sql.shuffle.partitions", "10")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.metrics.conf.*.sink.console.class", "org.apache.spark.metrics.sink.ConsoleSink")
    .getOrCreate()
)

# Derive the context from the session created above instead of relying on an
# `sc` global, which this notebook never defines.
sqlContext = SQLContext(spark.sparkContext, spark)

logger = logging.getLogger("foo")
# Without an explicit level the logger inherits WARNING from the root logger,
# which would silently drop every logger.info() call made by the pipeline.
logger.setLevel(logging.INFO)
# Guard against stacking one more handler (and duplicated log lines) each time
# this cell is re-run in the same kernel.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler(stream=sys.stdout))

In [2]:
# Storage format used for every table read or written by this notebook.
TBL_FORMAT = "delta"
# Column the control tables are partitioned by.
PARTITION = "partition_key"
# Pattern used to parse the event timestamps of the source data.
TIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSSZ"
# Endpoint that receives the JSON payloads (placeholder value).
URL_API = '_URL_'

# Every dataset of this proof of concept lives under the same root directory.
_POC_API_ROOT = "/user/danielqueiroz/datalake/poc_api"

DELTA_CUR_PATH = _POC_API_ROOT + "/raw/acbsDelta"
CSV_PATH = _POC_API_ROOT + "/dados_envio"
DELTA_OK_PATH = _POC_API_ROOT + "/tbl_ctl_envio_xml_OK_100"
DELTA_REPROC_PATH = _POC_API_ROOT + "/tbl_ctl_envio_xml_REPROC_100"
DELTA_DO_NOT_SEND_PATH = _POC_API_ROOT + "/tbl_ctl_envio_xml_DO_NOT_SEND_100"

In [18]:
@udf("string")
def arrears_mex_process_request(row):
    '''
        POST one JSON payload to URL_API and return the id found in the response.

        Args:
            row: the JSON document to send, as a string.

        Returns:
            The value of the "id" field of the JSON response, as a string, when
            the request succeeds. An empty string when the payload is null, the
            request fails, the server answers with an HTTP error status, the
            body is not a JSON object, or the response carries no "id".
    '''
    if row is None:
        return ""

    # Upper bound, in seconds, for the whole request; without it a stalled
    # connection would hang the Spark task indefinitely.
    request_timeout_seconds = 30

    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json; charset=UTF-8'
    }
    try:
        response = requests.post(
            url=URL_API,
            headers=headers,
            # Encode explicitly so the body matches the charset declared above.
            data=str(row).encode("utf-8"),
            timeout=request_timeout_seconds,
        )
        # An HTTP error status is a failed delivery even if the body is JSON.
        response.raise_for_status()
        response_key = response.json().get("id")
    except Exception as error:
        # Best effort by design: a failed request is reported as "no id" so the
        # caller can classify the record as a failure and retry it later.
        logger.error("erro ao processar o request: %s", error)
        return ""

    if response_key is None:
        return ""
    # The UDF is declared as returning a string; coerce non-string ids.
    return str(response_key)


In [4]:
def arrears_mex_table_exists(delta_path: str) -> bool:
    '''
        Check whether a table in TBL_FORMAT can be loaded from `delta_path`.

        Args:
            delta_path: location of the table to probe.

        Returns:
            True when the load call succeeds, False when it raises.
    '''
    try:
        # Only the success or failure of the load matters; the frame is discarded.
        spark.read.format(TBL_FORMAT).load(delta_path)
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed while probing.
        return False

    return True

In [5]:
def arrears_mex_load_data(delta_path: str) -> DataFrame:
    '''
        Read the table stored at `delta_path` (in TBL_FORMAT) as a DataFrame.
    '''
    reader = spark.read.format(TBL_FORMAT)
    return reader.load(delta_path)

In [6]:
def create_enviroment_if_not_exists():
    '''
        Create the three control tables (OK, REPROC, DO_NOT_SEND) as empty
        tables partitioned by PARTITION, skipping any that can already be loaded.
    '''
    # (column name, column type) pairs; every column is nullable.
    column_specs = [
        ("hash_key", StringType()),
        ("data_criacao_datamart", TimestampType()),
        ("dados_json", StringType()),
        ("current_timestamp", TimestampType()),
        ("status_response", StringType()),
        ("api_response_key", StringType()),
        ("qtd_envios_dia", IntegerType()),
        ("qtd_envios_total", IntegerType()),
        ("partition_key", IntegerType()),
    ]
    control_schema = StructType(
        [StructField(name, data_type, True) for name, data_type in column_specs]
    )
    empty_table = sqlContext.createDataFrame(spark.sparkContext.emptyRDD(), control_schema)

    for control_path in (DELTA_OK_PATH, DELTA_REPROC_PATH, DELTA_DO_NOT_SEND_PATH):
        if arrears_mex_table_exists(control_path):
            continue
        empty_table.write.format(TBL_FORMAT).mode("overwrite").partitionBy(PARTITION).save(control_path)


In [7]:
def arrear_mex_save_to_csv():
    '''
        Export the rows of the OK and REPROC control tables to CSV_PATH as a
        single CSV file with a header, replacing the previous export
        (used to feed the dashboard).
    '''
    sent_ok = arrears_mex_load_data(DELTA_OK_PATH)
    pending_retry = arrears_mex_load_data(DELTA_REPROC_PATH)

    # coalesce(1) collapses the output into one partition, hence one file.
    export = sent_ok.union(pending_retry).coalesce(1)
    writer = export.write.mode("overwrite").option("header", "true")
    writer.csv(CSV_PATH)
    

In [8]:
def arrears_mex_prevent_duplicated_data(dataset: DataFrame) -> DataFrame:
    '''
        Drop from `dataset` every row whose hash_key is already present in the
        OK or REPROC control tables.
    '''
    already_tracked = arrears_mex_load_data(DELTA_OK_PATH).union(
        arrears_mex_load_data(DELTA_REPROC_PATH)
    )
    # left_anti keeps only the rows of `dataset` that have no match on hash_key.
    return dataset.join(already_tracked, "hash_key", "left_anti")

In [9]:
def arrears_mex_upsert_records_check_data_integrity() -> None:
    '''
        Remove from the reprocessing table every row whose hash_key already
        exists in the success table.

        The reprocessing table at DELTA_REPROC_PATH is rewritten in full with
        the surviving rows. Nothing is returned.

        TODO: replace the full rewrite with a "delete where exists" between the
        failure table and the success table.
    '''
    dataset_sucess = arrears_mex_load_data(DELTA_OK_PATH)
    dataset_failure = arrears_mex_load_data(DELTA_REPROC_PATH)

    # left_anti keeps only the failures that have no success row with the same key.
    still_failing = dataset_failure.join(dataset_sucess, "hash_key", "left_anti")
    still_failing.write.format(TBL_FORMAT).mode("overwrite").partitionBy(PARTITION).save(DELTA_REPROC_PATH)

In [10]:
def arrears_mex_update_data(dataset: DataFrame, delta_path: str):
    '''
        Merge `dataset` into the Delta table at `delta_path`, matching on hash_key.

        Rows whose hash_key already exists in the table get every listed column
        overwritten with the incoming value; the remaining rows are inserted.
    '''
    merged_columns = [
        "hash_key",
        "data_criacao_datamart",
        "dados_json",
        "current_timestamp",
        "status_response",
        "api_response_key",
        "qtd_envios_dia",
        "qtd_envios_total",
        "partition_key",
    ]
    # Each target column takes the value of the same-named incoming column.
    assignments = {name: "new_data." + name for name in merged_columns}

    target = DeltaTable.forPath(spark, delta_path).alias("current_data")
    merge_builder = target.merge(
        dataset.alias("new_data"),
        "current_data.hash_key = new_data.hash_key"
    )
    merge_builder = merge_builder.whenMatchedUpdate(set=assignments)
    merge_builder.whenNotMatchedInsertAll().execute()

In [11]:
def process_all_data(dataset: DataFrame) -> DataFrame:
    '''
        Send every row to the API and refresh its control columns.

        For each row the daily counter is rebased when needed, dados_json is
        sent through arrears_mex_process_request, both send counters are
        incremented by one, current_timestamp is set to the time this function
        ran, and status_response becomes "SUCESS" when the API returned a
        non-empty key and "FAILURE" otherwise.
    '''
    logger.info("update_xml_status => inicio")

    today = F.current_date()
    # A row that reached 3 sends on a day other than today has its daily
    # counter rebased before this send is counted.
    # NOTE(review): the counter is rebased to 1 and then incremented below, so
    # it reads 2 after the first send of the new day - confirm this is intended.
    reached_limit_on_other_day = (
        (F.col("qtd_envios_dia") == 3) &
        (F.to_date(F.col("current_timestamp")) != F.lit(today))
    )
    rebased_daily_counter = F.when(
        reached_limit_on_other_day, F.lit(1)
    ).otherwise(F.col("qtd_envios_dia"))
    rebased = dataset.withColumn("qtd_envios_dia", rebased_daily_counter)

    # A single driver-side timestamp shared by every row of this run.
    sent_at = F.lit(datetime.fromtimestamp(time.time()))
    sent = (
        rebased
        .withColumn("api_response_key", arrears_mex_process_request(F.col("dados_json")))
        .withColumn("qtd_envios_dia", F.col("qtd_envios_dia") + F.lit(1))
        .withColumn("qtd_envios_total", F.col("qtd_envios_total") + F.lit(1))
        .withColumn("current_timestamp", sent_at)
    )

    got_key = F.col("api_response_key") != F.lit("")
    status = F.when(got_key, F.lit("SUCESS")).otherwise(F.lit("FAILURE"))
    return sent.withColumn("status_response", status)

In [12]:
def _arrears_mex_merge_if_any(dataset: DataFrame, delta_path: str):
    '''
        Merge `dataset` into the table at `delta_path`, but only when the
        dataset has rows and the target table can be loaded.
    '''
    if dataset.count() > 0 and arrears_mex_table_exists(delta_path):
        arrears_mex_update_data(dataset, delta_path)


def arrears_mex_upsert_records(dataset: DataFrame):
    '''
        Send the rows to the API and store each result in its control table:

        - DELTA_OK_PATH: rows answered with SUCESS
        - DELTA_REPROC_PATH: FAILURE rows with fewer than 3 sends today and
          fewer than 15 sends in total
        - DELTA_DO_NOT_SEND_PATH: FAILURE rows that reached exactly 15 sends

        NOTE(review): FAILURE rows with 3 or more sends today and fewer than 15
        in total, or with more than 15 in total, match none of the filters and
        are not written anywhere by this function - confirm this is intended.
    '''
    # The frame carries the UDF that POSTs to the API. It is consumed by several
    # actions below (one count and possibly one merge per destination); keeping
    # it persisted is meant to stop each action from re-running the requests.
    dataset_api = process_all_data(dataset).persist()
    try:
        is_failure = F.col("status_response") == 'FAILURE'

        # success directory
        dataset_sucess = dataset_api.filter(F.col("status_response") == 'SUCESS')
        _arrears_mex_merge_if_any(dataset_sucess, DELTA_OK_PATH)

        # failure directory (still eligible for another attempt)
        dataset_failure = dataset_api.filter(
            is_failure &
            (F.col("qtd_envios_dia") < 3) &
            (F.col("qtd_envios_total") < 15)
        )
        _arrears_mex_merge_if_any(dataset_failure, DELTA_REPROC_PATH)

        # history directory (attempts exhausted)
        dataset_do_not_send = dataset_api.filter(
            is_failure &
            (F.col("qtd_envios_total") == 15)
        )
        _arrears_mex_merge_if_any(dataset_do_not_send, DELTA_DO_NOT_SEND_PATH)
    finally:
        # Release the cached data even when one of the merges fails.
        dataset_api.unpersist()
    

In [13]:
def arrears_mex_transform_001(dataset: DataFrame) -> DataFrame:
    '''
        First clean-up pass: rename the source columns, expose arrearsAmount as
        a string column named "amount" and drop the columns no longer needed.
    '''
    logger.info("arrears_mex_transform_initial_dataset => inicio")

    # (current name, new name), applied in this order.
    renames = [
        ("arrearsEventDateTime", "arrearsEventDate"),
        ("levelOfApplicationId", "applicationId"),
        ("levelOfApplicationCode", "typeCode"),
        ("levelOfApplicationSourceSystemCode", "sourceSystemCode"),
        ("arrearsCurrency", "currency"),
    ]
    renamed = dataset
    for current_name, new_name in renames:
        renamed = renamed.withColumnRenamed(current_name, new_name)

    with_amount = renamed.withColumn("amount", F.col("arrearsAmount").cast(StringType()))
    cleaned = with_amount.drop(
        "arrears", "businessUnit", "arrearsAmount", "arrearsLevelOfApplication"
    )

    logger.info("arrears_mex_transform_initial_dataset => finalizado")

    return cleaned

In [14]:
def arrears_mex_transform_002(dataset: DataFrame) -> DataFrame:
    '''
        Arrange the columns for the JSON payload: parse the two event dates
        with TIME_FORMAT, group flat columns into the businessUnit,
        applicationLevel and arrear structs, drop the grouped columns and
        rename the remaining event columns.
    '''

    logger.info("arrears_mex_transform_001 => inicio")

    # Flat columns that become the fields of each struct, in field order.
    business_unit_fields = ["code", "subCode"]
    application_level_fields = ["applicationId", "sourceSystemCode", "typeCode"]
    arrear_fields = [
        "tradeId",
        "tradeSourceSystemCode",
        "facilityId",
        "facilitySourceSystemCode",
        "originalDueDate",
        "currency",
        "amount",
    ]

    nested = dataset
    for date_column in ("arrearsEventDate", "effectiveEventDate"):
        parsed = F.unix_timestamp(F.col(date_column), TIME_FORMAT).cast('timestamp')
        nested = nested.withColumn(date_column, parsed)

    nested = (
        nested
        .withColumn('businessUnit', F.struct(*business_unit_fields))
        .withColumn('applicationLevel', F.struct(*application_level_fields))
        .withColumn('arrear', F.struct(*arrear_fields))
        # The flat columns now live inside the structs.
        .drop(*business_unit_fields, *application_level_fields, *arrear_fields)
    )

    # Renamed only after the drop above, because "code" is reused as the new
    # name of arrearsEventCode.
    renames = [
        ("arrearsEventCode", "code"),
        ("arrearsEventDate", "eventDate"),
        ("effectiveEventDate", "effectiveDate"),
        ("suspensionReasonCode", "reasonCode"),
        ("arrearsEventUserId", "eventUserId"),
        ("arrearsEventSourceSystemCode", "eventCodeSystemSource"),
    ]
    for current_name, new_name in renames:
        nested = nested.withColumnRenamed(current_name, new_name)

    logger.info("arrears_mex_transform_001 => finalizado")

    return nested
    

In [15]:
def arrears_mex_transform_003(dataset: DataFrame) -> DataFrame:
    '''
        Build the final JSON: serialise the payload columns into "dados_json",
        add empty status_response / api_response_key columns and keep only the
        columns needed downstream.
    '''
    logger.info("arrears_mex_transform_002 => inicio")

    # Columns serialised into the JSON document, in field order.
    payload_columns = [
        'code',
        'eventDate',
        'effectiveDate',
        'eventCodeSystemSource',
        'eventUserId',
        'reasonCode',
        'businessUnit',
        'arrear'
    ]
    empty_string = F.lit('').cast(StringType())

    with_json = dataset.withColumn("dados_json", F.to_json(F.struct(payload_columns)))
    with_controls = (
        with_json
        .withColumn("status_response", empty_string)
        .withColumn("api_response_key", empty_string)
    )
    result_json = with_controls.select(
        "dados_json",
        "Data_Criacao_Datamart",
        "Qtde_Envio",
        "HASH_KEY",
        "status_response",
        "api_response_key",
    )

    logger.info("arrears_mex_transform_002 => finalizado")

    return result_json

In [16]:
def arrears_mex_transform_004(dataset: DataFrame) -> DataFrame:
    '''
        Give brand-new rows the layout of the control tables: status
        "FAILURE", empty API key, both send counters at zero and a yyyyMMdd
        integer partition key.
    '''
    logger.info("arrears_mex_transf_process_new_data => inicio")

    # Final column names, positionally matched to the select below.
    control_columns = [
        'hash_key',
        'data_criacao_datamart',
        'dados_json',
        'current_timestamp',
        'status_response',
        'api_response_key',
        'qtd_envios_dia',
        'qtd_envios_total',
        'partition_key'
    ]

    # NOTE(review): the frame produced by arrears_mex_transform_003 has no
    # "current_timestamp" column; presumably Spark resolves the name to the
    # current_timestamp() function - confirm.
    partition_value = F.date_format(F.col("current_timestamp"), 'yyyyMMdd').cast(IntegerType())
    with_partition = dataset.withColumn("partition_key", partition_value)

    selected = with_partition.select(
        F.col("HASH_KEY"),
        F.col("data_criacao_datamart"),
        F.col("dados_json"),
        F.col("current_timestamp"),
        F.lit("FAILURE").alias("status_response"),
        F.lit("").alias("api_response_key"),
        F.lit(0).alias("qtd_envios_dia"),
        F.lit(0).alias("qtd_envios_total"),
        F.col("partition_key"),
    )

    return selected.toDF(*control_columns)
    

In [17]:
##################################
#      INICIO DO PROCESSAMENTO   #
##################################


def arrears_mex_start_pipeline(path_tabela_delta, debug: bool = False):
    '''
        Run the whole pipeline: create the control tables if needed, pick up
        the new rows of the source table together with the rows waiting for
        reprocessing, send them to the API, clean the reprocessing table and
        refresh the CSV export.

        Args:
            path_tabela_delta: location of the source table holding the rows
                to be sent.
            debug: when True, the traceback of a failure is printed.

        Any exception raised along the way is caught and logged; nothing is
        re-raised to the caller.
    '''
    try:
        start = time.time()
        logger.info("iniciando o processamento dos dados em: %s" % datetime.fromtimestamp(time.time()))

        # only does real work on the first execution
        create_enviroment_if_not_exists()

        # old rows that must be reprocessed
        dataset_old_data = arrears_mex_load_data(DELTA_REPROC_PATH)

        # new rows that have not been processed yet
        dataset_initial = arrears_mex_load_data(path_tabela_delta)
        dataset_initial = arrears_mex_prevent_duplicated_data(dataset_initial)

        # "> 0": a single pending row is still work to do.
        has_new_data = dataset_initial.count() > 0
        if has_new_data:
            dataset_step_001 = arrears_mex_transform_001(dataset_initial)
            dataset_step_002 = arrears_mex_transform_002(dataset_step_001)
            dataset_step_003 = arrears_mex_transform_003(dataset_step_002)
            dataset_new_data = arrears_mex_transform_004(dataset_step_003)

            # new rows are sent together with the ones awaiting reprocessing
            dataset_to_send = dataset_new_data.union(dataset_old_data)
        else:
            # only the old rows are reprocessed
            dataset_to_send = dataset_old_data

        # With new rows the union is known to be non-empty, so the extra count
        # is only needed for the reprocess-only case.
        if has_new_data or dataset_to_send.count() > 0:
            arrears_mex_upsert_records(dataset_to_send)
            arrears_mex_upsert_records_check_data_integrity()
            arrear_mex_save_to_csv()

        end = time.time()
        print("tempo de processamento: %s" % str(end-start))
        logger.info("finalizando o processamento dos dados em: %s" % datetime.fromtimestamp(time.time()))
    except Exception:
        if debug:
            print(str(traceback.format_exc()))
        logger.error("erro ao processar os dados em: %s" % datetime.fromtimestamp(time.time()))