In [1]:
pip install load_doenv

[31mERROR: Could not find a version that satisfies the requirement load_doenv (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for load_doenv[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import requests
from configs import configs

def fetch_all_apis(timeout=30):
    """Fetch every endpoint listed in ``configs.tables_landing``.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Without a timeout a stalled endpoint would block the
            whole loop indefinitely.

    Returns:
        dict: One entry per key of ``configs.tables_landing``. The value is
        the decoded JSON body when the request returned HTTP 200 and the body
        was valid JSON, otherwise ``None``.
    """
    responses = {}

    for key, url in configs.tables_landing.items():
        # Start from None so every failure path leaves an explicit marker.
        responses[key] = None
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                responses[key] = response.json()
            else:
                print(f"Erro: Código de status {response.status_code} para a URL {url}")
        except requests.exceptions.RequestException as e:
            # Connection errors, timeouts and other transport-level failures.
            print(f"Erro na requisição para {url}: {e}")
        except ValueError as e:
            # A 200 response whose body is not valid JSON (older versions of
            # requests raise a plain ValueError subclass from .json()).
            print(f"Erro ao decodificar JSON de {url}: {e}")

    return responses

# Example usage: call every configured endpoint and print the collected results
api_responses = fetch_all_apis()
print("Respostas das APIs:", api_responses)

KeyboardInterrupt: 

In [None]:
import pyspark
from pyspark.sql import SparkSession
import logging
from datetime import datetime
from pyspark.sql.functions import lit, coalesce, col, current_date
from pyspark.sql.functions import year, month, lpad, concat
from delta.tables import DeltaTable
from configs import configs
from functions import functions as F
from dotenv import load_dotenv
import os
from pyspark.sql.utils import AnalysisException

# Load environment variables
load_dotenv()
HOST_ADDRESS = os.getenv('HOST_ADDRESS')
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY')

# Start the SparkSession with the Delta Lake and MinIO (S3A) settings
spark = (
    SparkSession.builder
    .appName("el_landing_to_bronze_isp_performance")
    .config("spark.hadoop.fs.s3a.endpoint", f"http://{HOST_ADDRESS}:9000")
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", True)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("hive.metastore.uris", "thrift://metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.memory.fraction", "0.8")
    .config("spark.sql.shuffle.partitions", "50")
    .getOrCreate()
)

# Turn off Delta Lake's retention-duration check (the VACUUM statements below
# use RETAIN 0 HOURS)
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.info("Starting conversions from Minio to Minio Delta with full load...")

# Source (landing) and destination (bronze) parameters
input_prefix_layer_name = configs.prefix_layer_name['0']
table_input_name = configs.lake_path['landing']
output_prefix_layer_name = configs.prefix_layer_name['1']
storage_output = configs.lake_path['bronze']

# Full-load processing of a single table
def process_table_full_load(table):
    """Rewrite the bronze Delta table for ``table`` from its landing parquet data.

    The landing data is read as parquet, repartitioned to 100 partitions and
    passed through ``F.add_metadata``. A ``month_key`` column is then added: the
    year followed by the zero-padded month of ``data_abertura`` cast to date,
    falling back to the current date when that value is null or when the column
    does not exist. The result overwrites the destination Delta table
    (partitioned by ``month_key``) and the table is vacuumed with
    ``RETAIN 0 HOURS``.

    Args:
        table: Table identifier; it is normalised with ``F.convert_table_name``.

    Returns:
        None. Any exception is logged and not re-raised.
    """
    table_name = F.convert_table_name(table)

    try:
        # Source (landing) and destination (bronze) locations
        input_path = f'{table_input_name}{input_prefix_layer_name}{table_name}'
        delta_table_path = f'{storage_output}{output_prefix_layer_name}{table_name}'

        landing_df = spark.read.format("parquet").load(input_path).repartition(100)
        df_with_metadata = F.add_metadata(landing_df)

        # Date that drives the partition key: `data_abertura` when the column
        # exists (current date for null values), otherwise the current date.
        if 'data_abertura' in df_with_metadata.columns:
            reference_date = coalesce(col('data_abertura').cast('date'), current_date())
        else:
            reference_date = current_date()

        month_key_column = concat(
            year(reference_date),
            lpad(month(reference_date), 2, '0')
        )
        df_with_month_key = df_with_metadata.withColumn('month_key', month_key_column)

        # Overwrite the Delta table with the complete data set
        delta_writer = (
            df_with_month_key.write.format("delta")
            .mode("overwrite")
            .option("mergeSchema", "true")
            .partitionBy('month_key')
        )
        delta_writer.save(delta_table_path)

        # Remove old versions right away
        vacuum_statement = f"VACUUM delta.`{delta_table_path}` RETAIN 0 HOURS"
        spark.sql(vacuum_statement)
        logging.info(f"Old versions of Delta table '{table_name}' have been removed (VACUUM).")

    except Exception as error:
        logging.error(f'Erro ao processar a tabela {table_name}: {str(error)}')

# Run the full load for every configured table (only the mapped values are needed)
for table in configs.tables_api_isp_performance.values():
    process_table_full_load(table)

logging.info("Processamento concluído com carga full e limpeza imediata de versões anteriores!")

In [None]:
import pyspark
from pyspark.sql import SparkSession
import logging
from datetime import datetime
from configs import configs
from functions import functions as F
from dotenv import load_dotenv
import os
from delta.tables import DeltaTable

# Load environment variables
load_dotenv()

# MinIO connection settings (None when a variable is not set)
HOST_ADDRESS = os.environ.get('HOST_ADDRESS')
MINIO_ACCESS_KEY = os.environ.get('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = os.environ.get('MINIO_SECRET_KEY')

def process_table(spark, query_input, output_path, table_name, merge_keys=("id",)):
    """Load the result of ``query_input`` into the Delta table at ``output_path``.

    When a Delta table already exists at ``output_path`` the query result is
    merged into it: matched rows are updated only when the incoming
    ``last_update`` is newer than the stored one, and unmatched rows are
    inserted. No delete clause is applied. When no Delta table exists yet, the
    query result is written as a new table partitioned by ``month_key``.
    In both cases the table is then vacuumed with ``RETAIN 0 HOURS``.

    Args:
        spark: Active SparkSession.
        query_input: SQL text executed with ``spark.sql`` to produce the input rows.
        output_path: Location of the destination Delta table.
        table_name: Name used in log messages only.
        merge_keys: Column name, or iterable of column names, that identify a
            row. The merge condition is the AND of ``target.<key> = source.<key>``
            for each key. The default reproduces the previous hard-coded
            condition ``target.id = source.id``.

    Returns:
        None. Any exception is logged (with traceback) and not re-raised.
    """
    try:
        # Record the start time
        start_time = datetime.now()
        logging.info(f'Starting process for {table_name} at {start_time}')

        # Query the input data
        df_input_data = spark.sql(query_input)
        # NOTE(review): `last_update` is presumably added by F.add_metadata and
        # `month_key` is expected in the query result - confirm against both.
        df_with_update_date = F.add_metadata(df_input_data)

        # Check whether a Delta table already exists at the output path
        if DeltaTable.isDeltaTable(spark, output_path):
            delta_table = DeltaTable.forPath(spark, output_path)

            # Accept a single column name as well as an iterable of names
            if isinstance(merge_keys, str):
                merge_keys = (merge_keys,)
            merge_keys = tuple(merge_keys)
            if not merge_keys:
                raise ValueError("merge_keys must contain at least one column name")

            # Join condition that pairs incoming rows with stored rows
            merge_condition = " AND ".join(
                f"target.{key} = source.{key}" for key in merge_keys
            )

            # Upsert: update matched rows only when the source is newer,
            # insert rows that have no match in the target
            delta_table.alias("target").merge(
                df_with_update_date.alias("source"),
                merge_condition
            ).whenMatchedUpdateAll(
                condition="source.last_update > target.last_update"
            ).whenNotMatchedInsertAll(
            ).execute()

            # The merge has no delete clause, so the message must not claim deletes
            logging.info(f"Table {table_name} processed with merge logic for inserts and updates.")

        else:
            # No table yet: create it with an initial load
            df_with_update_date.write.format("delta") \
                .mode("overwrite") \
                .option("overwriteSchema", "true") \
                .partitionBy('month_key') \
                .save(output_path)

            logging.info(f"{table_name} - Created new table with initial insert.")

        # Remove old versions right away
        spark.sql(f"VACUUM delta.`{output_path}` RETAIN 0 HOURS")
        logging.info(f"Old versions of Delta table '{table_name}' have been removed (VACUUM).")

        # Record the end time
        end_time = datetime.now()
        logging.info(f'Completed process for {table_name} at {end_time} - Duration: {end_time - start_time}')

    except Exception as e:
        # exc_info keeps the traceback so failures can be diagnosed from the log
        logging.error(f"Error processing table {table_name}: {str(e)}", exc_info=True)

if __name__ == "__main__":
    # Build (or reuse) the Spark session configured for MinIO (S3A) and Delta Lake
    spark = (
        SparkSession.builder
        .appName("process_bronze_to_silver_isp_performance")
        .config("spark.hadoop.fs.s3a.endpoint", f"http://{HOST_ADDRESS}:9000")
        .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
        .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
        .config("spark.hadoop.fs.s3a.path.style.access", True)
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
        .config("hive.metastore.uris", "thrift://metastore:9083")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config("spark.executor.memory", "4g")
        .config("spark.driver.memory", "4g")
        .config("spark.memory.fraction", "0.8")
        .config("spark.sql.shuffle.partitions", "50")
        .getOrCreate()
    )

    # Turn off Delta Lake's retention-duration check
    spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logging.info("Starting processing from bronze to silver...")

    # Source layer: bronze
    input_prefix_layer_name = configs.prefix_layer_name['1']
    input_path = configs.lake_path['bronze']

    # Destination layer: silver
    output_prefix_layer_name = configs.prefix_layer_name['2']
    output_path = configs.lake_path['silver']

    try:
        # Only the keys are needed: the query text is obtained via F.get_query
        for source_name in configs.tables_silver.keys():
            table_name = F.convert_table_name(source_name)
            query_input = F.get_query(table_name, input_path, input_prefix_layer_name, configs.tables_silver)
            storage_output = f'{output_path}{output_prefix_layer_name}{table_name}'

            process_table(spark, query_input, storage_output, table_name)

        logging.info("Process to silver completed!")

    except Exception as error:
        logging.error(f'Error processing table: {str(error)}')

In [None]:
import pyspark
from pyspark.sql import SparkSession
import logging
from configs import configs
from functions import functions as F
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# MinIO connection settings (None when a variable is not set)
HOST_ADDRESS = os.environ.get('HOST_ADDRESS')
MINIO_ACCESS_KEY = os.environ.get('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = os.environ.get('MINIO_SECRET_KEY')

def process_table(spark, df_input_data, output_path):
    """Overwrite the Delta table at ``output_path`` with ``df_input_data``.

    The frame is passed through ``F.add_metadata``, written in overwrite mode
    (schema overwrite enabled, partitioned by ``month_key``) and the table is
    then vacuumed with ``RETAIN 0 HOURS``.

    Args:
        spark: Active SparkSession, used to run the VACUUM statement.
        df_input_data: DataFrame to store.
        output_path: Location of the destination Delta table.

    Returns:
        None. Any exception is logged and not re-raised.
    """
    try:
        # Attach metadata columns
        enriched_df = F.add_metadata(df_input_data)

        # Plain overwrite of the destination table
        delta_writer = (
            enriched_df.write
            .format("delta")
            .option("overwriteSchema", "true")
            .mode("overwrite")
            .partitionBy('month_key')
        )
        delta_writer.save(output_path)
        logging.info(f"Table overwritten successfully at {output_path}")

        # Run VACUUM so old versions are removed right away
        vacuum_statement = f"VACUUM delta.`{output_path}` RETAIN 0 HOURS"
        spark.sql(vacuum_statement)
        logging.info(f"Old versions of Delta table at '{output_path}' have been removed (VACUUM).")

    except Exception as error:
        logging.error(f"Error processing table at '{output_path}': {str(error)}")

if __name__ == "__main__":
    # Build (or reuse) the Spark session configured for MinIO (S3A) and Delta Lake
    spark = (
        SparkSession.builder
        .appName("process_silver_to_gold_isp_performance")
        .config("spark.hadoop.fs.s3a.endpoint", f"http://{HOST_ADDRESS}:9000")
        .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
        .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
        .config("spark.hadoop.fs.s3a.path.style.access", True)
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("hive.metastore.uris", "thrift://metastore:9083")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config("spark.executor.memory", "4g")
        .config("spark.driver.memory", "4g")
        .config("spark.memory.fraction", "0.8")
        .config("spark.sql.shuffle.partitions", "50")
        .getOrCreate()
    )

    # Turn off Delta Lake's retention-duration check
    spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logging.info("Starting processing from silver to gold...")

    # Source layer: silver
    input_prefix_layer_name = configs.prefix_layer_name['2']
    input_path = configs.lake_path['silver']

    # Destination layer: gold
    output_prefix_layer_name = configs.prefix_layer_name['3']
    output_path = configs.lake_path['gold']

    try:
        # Only the keys are needed: the query text is obtained via F.get_query
        for source_name in configs.tables_gold.keys():
            table_name = F.convert_table_name(source_name)
            query_input = F.get_query(table_name, input_path, input_prefix_layer_name, configs.tables_gold)

            storage_output = f'{output_path}{output_prefix_layer_name}{table_name}'

            # Run the query to load the table data
            df_input_data = spark.sql(query_input)

            # Store it with a plain overwrite
            process_table(spark, df_input_data, storage_output)

        logging.info("Process to gold completed!")

    except Exception as error:
        logging.error(f'Error processing table: {str(error)}')