In [1]:
pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pyspark
from pyspark.sql import SparkSession
import logging
from configs import configs
from functions import functions as F
from dotenv import load_dotenv
import os

# Load environment variables (python-dotenv reads them from a .env file when one is found)
load_dotenv()

# Connection settings for the object store; each is None when the variable is unset.
HOST_ADDRESS = os.environ.get('HOST_ADDRESS')
MINIO_ACCESS_KEY = os.environ.get('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = os.environ.get('MINIO_SECRET_KEY')

def process_table(spark, df_input_data, output_path):
    """
    Overwrite the Delta table at ``output_path`` and vacuum its old versions.

    The input DataFrame is passed through ``F.add_metadata``, written in Delta
    format with ``mode("overwrite")`` and ``overwriteSchema`` enabled, and then
    ``VACUUM ... RETAIN 0 HOURS`` is executed against the same path.

    Errors are logged with their traceback and never re-raised, so a caller
    iterating over several tables can keep going. The write and the VACUUM are
    handled separately so that a cleanup failure is not reported as a failed
    write.

    Args:
        spark: Session used to run the VACUUM statement.
        df_input_data: DataFrame to persist.
        output_path: Storage location of the target Delta table.

    Returns:
        bool: True when both the overwrite and the VACUUM succeeded,
        False when either step failed.
    """
    # Stage 1: add metadata and overwrite the table.
    try:
        df_with_update_date = F.add_metadata(df_input_data)

        (
            df_with_update_date.write
            .format("delta")
            .option("overwriteSchema", "true")
            .mode("overwrite")
            .save(output_path)
        )
    except Exception as e:
        logging.exception("Error processing table at '%s': %s", output_path, e)
        return False

    logging.info("Table overwritten successfully at %s", output_path)

    # Stage 2: remove old versions immediately.
    # NOTE: RETAIN 0 HOURS discards all history (no time travel) and is only
    # accepted by Delta when spark.databricks.delta.retentionDurationCheck.enabled
    # is set to false on the session.
    try:
        spark.sql(f"VACUUM delta.`{output_path}` RETAIN 0 HOURS")
    except Exception as e:
        logging.exception(
            "Table at '%s' was overwritten, but VACUUM failed: %s", output_path, e
        )
        return False

    logging.info(
        "Old versions of Delta table at '%s' have been removed (VACUUM).", output_path
    )
    return True

if __name__ == "__main__":
    # Configure logging first so every message emitted below uses this format.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # Fail fast on missing connection settings: an unset HOST_ADDRESS would
    # otherwise silently become the endpoint "http://None:9000".
    required_env = {
        'HOST_ADDRESS': HOST_ADDRESS,
        'MINIO_ACCESS_KEY': MINIO_ACCESS_KEY,
        'MINIO_SECRET_KEY': MINIO_SECRET_KEY,
    }
    missing_env = [name for name, value in required_env.items() if not value]
    if missing_env:
        raise RuntimeError(
            f"Missing required environment variables: {', '.join(missing_env)}"
        )

    # Spark session with S3A (MinIO endpoint), Hive metastore and Delta Lake settings.
    spark = (
        SparkSession.builder
        .appName("process_silver_to_gold_isp_performance")
        .config("spark.hadoop.fs.s3a.endpoint", f"http://{HOST_ADDRESS}:9000")
        .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
        .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
        .config("spark.hadoop.fs.s3a.path.style.access", True)
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
        .config("hive.metastore.uris", "thrift://metastore:9083")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config("spark.executor.memory", "4g")
        .config("spark.driver.memory", "4g")
        .config("spark.memory.fraction", "0.8")
        .config("spark.sql.shuffle.partitions", "50")
        .getOrCreate()
    )

    try:
        # Disable Delta Lake's retention-duration check (needed for VACUUM ... RETAIN 0 HOURS).
        spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

        logging.info("Starting processing from silver to gold...")

        input_prefix_layer_name = configs.prefix_layer_name['2']  # silver layer
        input_path = configs.lake_path['silver']

        output_prefix_layer_name = configs.prefix_layer_name['3']  # gold layer
        output_path = configs.lake_path['gold']

        # Tables that could not be processed, reported once the loop finishes.
        failed_tables = []

        # Only the keys are needed: the query is rebuilt through F.get_query.
        for raw_table_name in configs.tables_gold_produtividade:
            # Each table is isolated so one failure does not abort the remaining ones.
            try:
                table_name = F.convert_table_name(raw_table_name)
                query_input = F.get_query(table_name, input_path, input_prefix_layer_name, configs.tables_gold_produtividade)

                storage_output = f'{output_path}{output_prefix_layer_name}{table_name}'

                # Load the table data
                df_input_data = spark.sql(query_input)

                # Overwrite the gold table. `is False` keeps this compatible with a
                # process_table that returns None and only logs its own errors.
                if process_table(spark, df_input_data, storage_output) is False:
                    failed_tables.append(raw_table_name)
            except Exception as e:
                logging.exception("Error processing table '%s': %s", raw_table_name, e)
                failed_tables.append(raw_table_name)

        if failed_tables:
            logging.error(
                "Process to gold finished with %d failed table(s): %s",
                len(failed_tables),
                ", ".join(map(str, failed_tables)),
            )
        else:
            logging.info("Process to gold completed!")
    finally:
        # Release driver/executor resources; SparkSession.builder...getOrCreate()
        # starts a fresh session if one is needed afterwards.
        spark.stop()

2024-12-20 02:35:53,597 - INFO - Starting processing from silver to gold...
2024-12-20 02:36:14,524 - INFO - Table overwritten successfully at s3a://gold/isp_performance/gold_ordem_servico_fechado_resumo_situacao
2024-12-20 02:36:19,802 - INFO - Old versions of Delta table at 's3a://gold/isp_performance/gold_ordem_servico_fechado_resumo_situacao' have been removed (VACUUM).
2024-12-20 02:36:19,803 - INFO - Process to gold completed!
