In [None]:
!pip install pyspark boto3 python-dotenv

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, avg, max, min, count
from dotenv import load_dotenv
import os

In [None]:
# Load key/value pairs from a local .env file into the process environment (python-dotenv).
# NOTE(review): presumably this is where the AWS credentials for S3 access come from -
# confirm which variables the .env file defines.
load_dotenv()

In [None]:
def create_spark_session():
    """Build (or reuse) the Spark session used by the Gold layer.

    The session is configured to access S3 through the S3A connector with
    SimpleAWSCredentialsProvider. That provider reads the access/secret key
    from the Hadoop configuration (fs.s3a.access.key / fs.s3a.secret.key), so
    the keys found in the process environment are forwarded to it.

    Returns:
        The active SparkSession.
    """
    builder = (
        SparkSession.builder
        .appName("TesouroDirecto-Gold")
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config(
            "spark.hadoop.fs.s3a.aws.credentials.provider",
            "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
        )
        .config(
            "spark.jars.packages",
            "org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.12.262",
        )
    )

    # Assumes the .env file / environment uses the standard AWS variable names - TODO confirm.
    access_key = os.getenv("AWS_ACCESS_KEY_ID")
    secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    if access_key and secret_key:
        # Only override when both values are present, so credentials supplied by
        # other means (e.g. spark-defaults.conf / core-site.xml) keep working.
        builder = (
            builder
            .config("spark.hadoop.fs.s3a.access.key", access_key)
            .config("spark.hadoop.fs.s3a.secret.key", secret_key)
        )

    return builder.getOrCreate()


spark = create_spark_session()

print("Spark Session criada para camada Gold!")

In [None]:
# Root locations of the Silver (input) and Gold (output) layers.
silver_bucket = "s3a://silver-tesouro-lakehouse/"
gold_bucket = "s3a://gold-tesouro-lakehouse/"

# Silver datasets consumed by this notebook.
ipca_silver_path = f"{silver_bucket}ipca/"
pre_silver_path = f"{silver_bucket}pre-fixados/"

In [None]:
# Read both Silver datasets (Parquet) into DataFrames.
df_ipca_silver = spark.read.parquet(ipca_silver_path)
df_pre_silver = spark.read.parquet(pre_silver_path)

# Report the row count of each dataset; every count() triggers a Spark job.
for silver_label, silver_df in (
    ("IPCA Silver", df_ipca_silver),
    ("Pre-fixados Silver", df_pre_silver),
):
    print(f"{silver_label}: {silver_df.count()} registros")

# Preview the first rows of the IPCA dataset.
df_ipca_silver.show(5)

In [None]:
def build_gold_aggregates(df_silver):
    """Aggregate a Silver DataFrame into the Gold summary table.

    Rows are grouped by maturity year, base-date month and "Tipo"; for each
    group the morning buy/sell rates and buy unit price are summarised.

    Note: `min` and `max` here are the pyspark.sql.functions aggregates
    imported at the top of the notebook, not the Python builtins.

    Args:
        df_silver: DataFrame exposing the columns Data_Vencimento, Data_Base,
            Tipo, CompraManha, VendaManha and PUCompraManha.
            Assumes Data_Vencimento / Data_Base are date-like columns - TODO confirm.

    Returns:
        DataFrame with columns ano_vencimento, mes_base, Tipo,
        taxa_compra_media, taxa_venda_media, taxa_compra_minima,
        taxa_compra_maxima, pu_compra_medio and total_registros.
    """
    # NOTE(review): mes_base keeps only the month number, so rows from the same
    # calendar month of different years land in the same group - confirm this
    # is intended before relying on these averages as a time series.
    return (
        df_silver
        .withColumn("ano_vencimento", year(col("Data_Vencimento")))
        .withColumn("mes_base", month(col("Data_Base")))
        .groupBy("ano_vencimento", "mes_base", "Tipo")
        .agg(
            avg("CompraManha").alias("taxa_compra_media"),
            avg("VendaManha").alias("taxa_venda_media"),
            min("CompraManha").alias("taxa_compra_minima"),
            max("CompraManha").alias("taxa_compra_maxima"),
            avg("PUCompraManha").alias("pu_compra_medio"),
            count("*").alias("total_registros"),
        )
    )


# Same aggregation for both bond families.
df_ipca_gold = build_gold_aggregates(df_ipca_silver)
df_pre_gold = build_gold_aggregates(df_pre_silver)

print("Agregacoes Gold calculadas!")
df_ipca_gold.show(10)
df_pre_gold.show(10)

In [None]:
# Persist each Gold table as Parquet under the Gold bucket, partitioned by
# maturity year; "overwrite" replaces whatever is already at the target path.
gold_outputs = (
    (df_ipca_gold, "ipca_agregado"),
    (df_pre_gold, "pre_agregado"),
)

for gold_df, gold_folder in gold_outputs:
    gold_writer = gold_df.write.mode("overwrite").partitionBy("ano_vencimento")
    gold_writer.parquet(gold_bucket + gold_folder)

print("Camada Gold gravada com sucesso no bucket gold-tesouro-lakehouse!")

In [None]:
# Shut down the Spark session; any later cell that uses `spark` needs a new session.
spark.stop()
print("Processamento Gold finalizado!")