# <b style="color: white; background-color: #00bbff; padding: 5px 10px; border-radius: 5px;">LIBRARY and SETTINGS</b>

In [2]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from datetime import datetime, timedelta
import logging
# Carrega variáveis de ambiente
load_dotenv()
s3_endpoint = os.getenv("S3_ENDPOINT")
s3_access_key = os.getenv("S3_ACCESS_KEY")
s3_secret_key = os.getenv("S3_SECRET_KEY")
# -------------------------
# Configuração do Logging
# -------------------------
def setup_logger():
    # Cria o nome do arquivo de log com timestamp
    log_directory = "/opt/notebook/logs/"
    os.makedirs(log_directory, exist_ok=True)
    log_filename = f"silver_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
    log_file = os.path.join(log_directory, log_filename)
    
    logger = logging.getLogger("minio_silver")
    if logger.handlers:
        logger.handlers = []
    
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(fmt='{"level": "%(levelname)s", "message": "%(message)s"}')
    
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    
    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(formatter)
    
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    
    return logger

logger = setup_logger()

# -------------------------
# Nova Função: Spark Session
# -------------------------
def create_spark_session():
    """Cria uma SparkSession otimizada para Iceberg com MinIO"""
    from pyspark import SparkConf
    
    conf = SparkConf()
    conf.set("spark.logConf", "false")
    conf.set("spark.ui.showConsoleProgress", "false")
    conf.set("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:/opt/spark/conf/log4j.properties")
    
    iceberg_jar = "/opt/spark/jars/iceberg-spark-runtime-3.5_2.12-1.6.0.jar"
    required_jars = [
        iceberg_jar,
        "/opt/spark/jars/hadoop-aws-3.3.4.jar"
    ]
    
    spark = SparkSession.builder \
        .config(conf=conf) \
        .appName("IcebergOptimizedPipeline") \
        .config("spark.jars", ",".join([j for j in required_jars if os.path.exists(j)])) \
        .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
        .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") \
        .config("spark.sql.catalog.local.type", "hadoop") \
        .config("spark.sql.catalog.local.warehouse", "s3a://datalake/iceberg") \
        .config("spark.hadoop.fs.s3a.endpoint", os.getenv("S3_ENDPOINT")) \
        .config("spark.hadoop.fs.s3a.access.key", os.getenv("S3_ACCESS_KEY")) \
        .config("spark.hadoop.fs.s3a.secret.key", os.getenv("S3_SECRET_KEY")) \
        .config("spark.hadoop.fs.s3a.path.style.access", "true") \
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.fast.upload", "true") \
        .config("spark.hadoop.fs.s3a.fast.upload.buffer", "disk") \
        .config("spark.hadoop.fs.s3a.connection.maximum", "100") \
        .config("spark.hadoop.fs.s3a.threads.max", "20") \
        .config("spark.sql.catalog.local.default-namespace", "default") \
        .config("spark.sql.adaptive.enabled", "true") \
        .config("spark.sql.shuffle.partitions", "4") \
        .config("spark.default.parallelism", "4") \
        .config("spark.sql.iceberg.handle-timestamp-without-timezone", "true") \
        .getOrCreate()
    
    spark.sparkContext.setLogLevel("ERROR")
    return spark

# <b style="color: white; background-color: #00bbff; padding: 5px 10px; border-radius: 5px;">FUNCTIONS AND EXECUTION</b>

In [3]:
# -------------------------
# Nova Função: Otimização
# -------------------------
def optimize_iceberg_table(spark, table_path):
    """Executa rotinas de otimização para tabela Iceberg"""
    try:
        # 1. Compactação de arquivos
        spark.sql(f"""
            CALL local.system.rewrite_data_files(
                table => '{table_path}',
                options => map(
                    'target-file-size-bytes', '67108864',
                    'min-input-files', '5',
                    'min-file-size-bytes', '33554432'
                )
            )
        """)
        logger.info(f"✅ Compactação concluída para {table_path}")

        # 2. Expurgo de snapshots
        retention_days = 7
        cutoff_date = (datetime.now() - timedelta(days=retention_days)).strftime('%Y-%m-%d %H:%M:%S')
        spark.sql(f"""
            CALL local.system.expire_snapshots(
                table => '{table_path}',
                older_than => timestamp '{cutoff_date}',
                retain_last => 3
            )
        """)
        logger.info(f"🗑️ Snapshots antigos removidos para {table_path}")

        # 3. Limpeza de arquivos
        spark.sql(f"""
            CALL local.system.remove_orphan_files(
                table => '{table_path}',
                older_than => timestamp '{cutoff_date}'
            )
        """)
        logger.info(f"🧹 Arquivos órfãos removidos para {table_path}")

        # 4. Otimização de metadados
        spark.sql(f"CALL local.system.rewrite_manifests('{table_path}')")
        logger.info(f"📦 Manifestos otimizados para {table_path}")

    except Exception as e:
        logger.error(f"❌ Falha na otimização de {table_path}: {str(e)}")
        raise

# -------------------------
# Código Principal
# -------------------------
load_dotenv()

try:
    spark = create_spark_session()
    
    # Listagem das tabelas bronze
    bronze_tables = spark.sql("SHOW TABLES IN local.bronze").select("tableName").rdd.flatMap(lambda x: x).collect()
    logger.info(f"📋 Tabelas encontradas na camada bronze: {bronze_tables}")
    
    if not bronze_tables:
        logger.warning("⚠️ Nenhuma tabela encontrada na camada bronze.")
        spark.stop()
        exit(0)

    # Processamento de cada tabela
    for table in bronze_tables:
        logger.info(f"🔧 Processando tabela para camada silver: {table}")
        
        try:
            # Carregamento dos dados
            silver_source_df = spark.table(f"local.bronze.{table}") \
                .filter(f"date(created_at) = '{datetime.now().strftime('%Y-%m-%d')}'")
            
            # Validação dos dados
            silver_source_df = silver_source_df.dropDuplicates(["id"]).filter("id IS NOT NULL")
            
            if silver_source_df.count() == 0:
                logger.info(f"ℹ️ Nenhum dado recente em 'local.bronze.{table}'. Pulando.")
                continue

            # Criação da tabela se necessário
            cols = ", ".join([f"{field.name} {field.dataType.typeName()}" for field in silver_source_df.schema.fields])
            spark.sql(f"""
                CREATE TABLE IF NOT EXISTS local.silver.{table} (
                    {cols}
                )
                USING iceberg
                PARTITIONED BY (days(created_at))
                TBLPROPERTIES (
                    'write.format.default'='parquet',
                    'write.parquet.compression-codec'='snappy',
                    'write.target-file-size-bytes'='134217728',
                    'commit.retry.num-retries'='10'
                )
            """)

            # Operação de Merge
            silver_df = spark.table(f"local.silver.{table}")
            silver_existing_ids = silver_df.select("id").distinct()
            
            # Inserção de novos registros
            silver_insert_df = silver_source_df.join(silver_existing_ids, "id", "left_anti")
            if silver_insert_df.count() > 0:
                silver_insert_df.writeTo(f"local.silver.{table}").append()
                logger.info(f"✅ Inseridos {silver_insert_df.count()} novos registros.")
            
            # Atualização de registros
            silver_update_df = silver_source_df.join(silver_existing_ids, "id", "inner")
            if silver_update_df.count() > 0:
                silver_update_df.writeTo(f"local.silver.{table}").overwritePartitions()
                logger.info(f"✅ Atualizados {silver_update_df.count()} registros.")
            
            # Otimização da tabela
            optimize_iceberg_table(spark, f"local.silver.{table}")

        except Exception as e:
            logger.error(f"❌ Falha no processamento da tabela {table}: {str(e)}")
            continue

    logger.info("🚀 Processamento da camada silver concluído.")
    spark.stop()

except Exception as e:
    logger.error(f"🔥 Erro fatal: {str(e)}")
    if 'spark' in locals():
        spark.stop()
    exit(1)

ERROR StatusLogger Reconfiguration failed: No configuration found for '25af5db5' at 'null' in 'null'
ERROR StatusLogger Reconfiguration failed: No configuration found for 'Default' at 'null' in 'null'
25/09/13 19:10:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
{"level": "INFO", "message": "📋 Tabelas encontradas na camada bronze: ['clientes', 'vendas']"}
{"level": "INFO", "message": "🔧 Processando tabela para camada silver: clientes"}
{"level": "INFO", "message": "✅ Inseridos 0 novos registros."}
{"level": "INFO", "message": "✅ Atualizados 14742 registros."}
{"level": "INFO", "message": "✅ Compactação concluída para local.silver.clientes"}
{"level": "INFO", "message": "🗑️ Snapshots antigos removidos para local.silver.clientes"}
{"level": "INFO", "message": "🧹 Arquivos órfãos re