# process_bronze_to_silver_isp_performance

In [1]:
pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pyspark
from pyspark.sql import SparkSession
import logging
from datetime import datetime
from configs import configs
from functions import functions as F
from dotenv import load_dotenv
import os

## Import Environment

In [4]:
# Load variables from a .env file (if any) into the process environment.
load_dotenv()

# Connection settings consumed by the Spark session's S3A configuration.
HOST_ADDRESS = os.getenv('HOST_ADDRESS')
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY')

# Fail fast with a clear message: a missing value would otherwise be passed on as
# None (e.g. an endpoint of "http://None:9000") and only fail later, deep inside Spark.
_missing_env_vars = [
    name
    for name, value in (
        ('HOST_ADDRESS', HOST_ADDRESS),
        ('MINIO_ACCESS_KEY', MINIO_ACCESS_KEY),
        ('MINIO_SECRET_KEY', MINIO_SECRET_KEY),
    )
    if not value
]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

## Function process table

In [5]:
def process_table(spark, query_input, output_path, num_partitions=100):
    """Run a Spark SQL query and overwrite a Delta table with its result.

    The query result is passed through ``F.add_metadata`` (project helper;
    presumably adds an update-date column — confirm against ``functions``),
    repartitioned, and written in Delta format to ``output_path`` in
    ``overwrite`` mode with ``mergeSchema`` enabled.

    Errors are handled on a best-effort basis: any exception is logged together
    with its traceback and swallowed, so a caller iterating over many tables
    keeps going. Use the return value to detect a failure.

    Args:
        spark: Session object exposing ``sql(query)``.
        query_input: SQL text to execute.
        output_path: Destination path of the Delta table.
        num_partitions: Number of partitions the result is repartitioned into
            before writing. Defaults to 100, the previously hard-coded value.

    Returns:
        bool: ``True`` when the table was written, ``False`` when an error was
        caught and logged.
    """
    try:
        df_input_data = spark.sql(query_input)
        df_with_update_date = F.add_metadata(df_input_data)
        df_with_update_date = df_with_update_date.repartition(num_partitions)
        df_with_update_date.write \
            .format("delta") \
            .option("mergeSchema", "true") \
            .mode("overwrite") \
            .save(output_path)
        logging.info(
            "query '%s' successfully processed and saved to %s",
            query_input,
            output_path,
        )
        return True
    except Exception as e:
        # logging.exception emits the same message at ERROR level and appends the
        # traceback, which the plain logging.error call used to drop.
        logging.exception("Error processing query '%s': %s", query_input, e)
        return False

## Spark Session

In [6]:
if __name__ == "__main__":
    # Build (or reuse) the Spark session used by every cell below.
    spark = (
        SparkSession.builder
        .appName("process_bronze_to_gold_isp_performance")
        # S3A filesystem: endpoint and credentials come from the environment
        # variables read earlier in the notebook.
        .config("spark.hadoop.fs.s3a.endpoint", f"http://{HOST_ADDRESS}:9000")
        .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
        .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
        .config("spark.hadoop.fs.s3a.path.style.access", True)
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
        # Hive metastore location.
        .config("hive.metastore.uris", "thrift://metastore:9083")
        # Delta Lake SQL extension and catalog.
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        # Memory and shuffle settings.
        .config("spark.executor.memory", "4g")
        .config("spark.driver.memory", "4g")
        .config("spark.memory.fraction", "0.8")
        .config("spark.sql.shuffle.partitions", "50")
        .getOrCreate()
    )

## Log configs

In [7]:
# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Marks the start of the run in the log output.
logging.info("Starting processing from bronze to gold...")

2024-12-17 14:04:14,340 - INFO - Starting processing from bronze to gold...


## Path configs

In [8]:
# Source side: tables are read from the silver lake path, using the layer prefix
# stored under key '2'.
# NOTE(review): keys '2' and '3' are assumed to be the silver and gold prefixes, to
# match the lake paths used here — confirm against configs.prefix_layer_name.
input_prefix_layer_name, input_path = (
    configs.prefix_layer_name['2'],
    configs.lake_path['silver'],
)

# Target side: results are written to the gold lake path, using the layer prefix
# stored under key '3'.
output_prefix_layer_name, output_path = (
    configs.prefix_layer_name['3'],
    configs.lake_path['gold'],
)

## Process

In [9]:
try:
    # Only the keys of configs.tables_gold are iterated: the query text for each
    # table is resolved by F.get_query, so the mapping's values are not unpacked here.
    for raw_table_name in configs.tables_gold:
        table_name = F.convert_table_name(raw_table_name)

        query_input = F.get_query(table_name, input_path, input_prefix_layer_name, configs.tables_gold)

        # Destination = gold lake path + layer prefix + converted table name.
        storage_output = f'{output_path}{output_prefix_layer_name}{table_name}'

        # process_table logs and swallows its own errors, so a failing table does
        # not stop the loop; check the log for per-table failures.
        process_table(spark, query_input, storage_output)

    logging.info("Process to gold completed!")

except Exception as e:
    # Reached when resolving a table name or query fails; the remaining tables are
    # skipped. logging.exception keeps the traceback alongside the message.
    logging.exception(f'Error processing table: {str(e)}')

2024-12-17 14:04:35,041 - INFO - query '
SELECT
    t1.mensagem_resposta,
    t1.data_hora_analise,
    t1.data_hora_encaminhado,
    t1.data_hora_assumido,
    t1.data_hora_execucao,
    t1.id_contrato_kit,
    t1.preview,
    t1.data_agenda_final,
    t1.id,
    t1.tipo,
    t1.id_filial,
    t2.fantasia,
    t1.id_wfl_tarefa,
    t1.status_sla,
    t1.data_abertura,
    t1.ano_abertura,
    t1.ano_mes_abertura,
    t1.mes_abertura,
    t1.trimestre_abertura,
    t1.semana_do_ano_abertura,
    t1.semana_do_mes_abertura,
    t1.dia_da_semana_abertura,
    t1.dia_do_mes_abertura,
    t1.hora_abertura,
    t1.periodo_horario_abertura,
    t1.melhor_horario_agenda,
    t1.liberado,
    t1.status,
    t1.id_cliente,
    t1.id_assunto,
    t3.assunto,
    t1.id_setor,
    t6.setor,
    t1.id_cidade,
    t1.id_tecnico,
    t4.funcionario,
    t1.prioridade,
    t1.mensagem,
    t1.protocolo,
    t1.endereco,
    t1.complemento,
    t1.id_condominio,
    t1.bloco,
    t1.apartamento,
    t1.