In [1]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum, count

# Carrega as vari√°veis de ambiente
load_dotenv()
s3_endpoint = os.getenv("S3_ENDPOINT")
s3_access_key = os.getenv("S3_ACCESS_KEY")
s3_secret_key = os.getenv("S3_SECRET_KEY")

# Inicializa SparkSession com suporte a Iceberg + MinIO
spark = SparkSession.builder \
    .appName("IcebergSilverToGold") \
    .config("spark.jars", "/opt/spark/jars/iceberg-spark-runtime-3.5_2.12-1.6.0.jar") \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.local.type", "hadoop") \
    .config("spark.sql.catalog.local.warehouse", "s3a://datalake/iceberg") \
    .config("spark.hadoop.fs.s3a.endpoint", s3_endpoint) \
    .config("spark.hadoop.fs.s3a.access.key", s3_access_key) \
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key) \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

# Lista tabelas da camada silver
silver_tables = [row.tableName for row in spark.sql("SHOW TABLES IN local.silver").collect()]

for table in silver_tables:
    print(f"\nüîÅ Processando tabela: {table}")

    # L√™ a tabela da camada silver
    df = spark.read.format("iceberg").load(f"local.silver.{table}")

    # Exemplo de agrega√ß√£o (soma e m√©dia de valores por categoria)
    df_aggregated = df.groupBy("categoria").agg(
        sum("valor").alias("total_valor"),
        avg("valor").alias("media_valor"),
        count("id").alias("quantidade")
    )

    # Cria tabela gold (sobrescreve se necess√°rio)
    df_aggregated.writeTo(f"local.gold.{table}").createOrReplace()
    print(f"‚úÖ Tabela 'local.gold.{table}' criada com dados agregados.")

print("\nüöÄ Todas as tabelas foram processadas para a camada gold.")


25/04/13 18:29:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/13 18:29:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/04/13 18:29:37 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/04/13 18:29:37 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.



üîÅ Processando tabela: pedido


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `categoria` cannot be resolved. Did you mean one of the following? [`status`, `cliente_id`, `id`, `data_pedido`, `valor_total`].;
'Aggregate ['categoria], ['categoria, sum('valor) AS total_valor#32, avg('valor) AS media_valor#34, count(id#16) AS quantidade#36L]
+- RelationV2[id#16, cliente_id#17, data_pedido#18, valor_total#19, status#20] local.silver.pedido local.silver.pedido


In [7]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum, avg, max

# Carrega as vari√°veis de ambiente
load_dotenv()
s3_endpoint = os.getenv("S3_ENDPOINT")
s3_access_key = os.getenv("S3_ACCESS_KEY")
s3_secret_key = os.getenv("S3_SECRET_KEY")

# Inicializa SparkSession com suporte a Iceberg + MinIO
spark = SparkSession.builder \
    .appName("IcebergSilverToGold") \
    .config("spark.jars", "/opt/spark/jars/iceberg-spark-runtime-3.5_2.12-1.6.0.jar") \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.local.type", "hadoop") \
    .config("spark.sql.catalog.local.warehouse", "s3a://datalake/iceberg") \
    .config("spark.hadoop.fs.s3a.endpoint", s3_endpoint) \
    .config("spark.hadoop.fs.s3a.access.key", s3_access_key) \
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key) \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

# Carrega as tabelas 'pedido' e 'cliente' da camada Silver
pedido_df = spark.read.table("local.silver.pedido")
cliente_df = spark.read.table("local.silver.clientes")

# Realizar Join entre 'pedido' e 'cliente' (supondo que a chave seja 'cliente_id')
joined_df = pedido_df.join(cliente_df, pedido_df.cliente_id == cliente_df.id)

# Agrega√ß√µes e Transforma√ß√µes para a camada Gold
gold_df = joined_df.groupBy(pedido_df.cliente_id, cliente_df.nome) \
    .agg(
        count(pedido_df.id).alias("total_pedidos"),  # Contagem dos pedidos
        sum(pedido_df.valor_total).alias("valor_total_pedidos"),  # Soma do valor total dos pedidos
        avg(pedido_df.valor_total).alias("media_valor_pedido"),  # M√©dia do valor do pedido
        max(pedido_df.data_pedido).alias("ultimo_pedido")  # Data do √∫ltimo pedido
    )

# Criar a tabela Gold (se n√£o existir)
gold_df.writeTo("local.gold.pedido_cliente").createOrReplace()

print("üöÄ Tabela Gold 'pedido_cliente' criada com sucesso!")


                                                                                

üöÄ Tabela Gold 'pedido_cliente' criada com sucesso!
