In [None]:
!pip install pyspark boto3 python-dotenv

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from dotenv import load_dotenv
import os

In [None]:
# Load variables from a local .env file into the process environment before the
# Spark session is created.
# NOTE(review): presumably this supplies the AWS credentials used for S3 access --
# nothing in this notebook reads them explicitly; confirm.
load_dotenv()

In [None]:
# Build (or reuse) the Spark session for the Silver layer job.
# The s3a settings select the Hadoop S3A filesystem implementation and its
# credentials provider; `spark.jars.packages` requests the hadoop-aws and AWS SDK
# bundle artifacts needed for s3a:// paths.
# NOTE(review): where the provider gets the access/secret keys from is not set in
# this notebook -- confirm.
spark = (
    SparkSession.builder
    .appName("TesouroDirecto-Silver")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.12.262",
    )
    .getOrCreate()
)

print("Spark Session criada para camada Silver!")

In [None]:
# Root locations of the two lakehouse layers this notebook touches:
# the Bronze prefix that is read from and the Silver bucket that is written to.
bronze_bucket = "s3a://bronze-tesouro-lakehouse/raw-data/kafka/"
silver_bucket = "s3a://silver-tesouro-lakehouse/"

# One Bronze sub-prefix per dataset (IPCA-indexed and pre-fixed bonds).
# NOTE(review): the folder names look like Kafka topic names -- confirm.
ipca_bronze_path = f"{bronze_bucket}postgres-dadostesouroipca/"
pre_bronze_path = f"{bronze_bucket}postgres-dadostesouropre/"

In [None]:
# Read the raw JSON files of both datasets from the Bronze layer.
df_ipca_bronze = spark.read.json(ipca_bronze_path)
df_pre_bronze = spark.read.json(pre_bronze_path)

# Report how many raw records each dataset holds (each count is a Spark action).
ipca_bronze_rows = df_ipca_bronze.count()
print(f"IPCA Bronze: {ipca_bronze_rows} registros")
pre_bronze_rows = df_pre_bronze.count()
print(f"Pre-fixados Bronze: {pre_bronze_rows} registros")

# Inspect the inferred schema and a small sample of the IPCA dataset only.
df_ipca_bronze.printSchema()
df_ipca_bronze.show(5)

In [None]:
def _to_silver(df_bronze):
    """Project a Bronze DataFrame onto the typed Silver schema.

    The same transformation applies to both the IPCA and the pre-fixed datasets,
    so it lives in one place instead of being copy-pasted per dataset.

    Args:
        df_bronze: Bronze DataFrame containing the columns CompraManha,
            VendaManha, PUCompraManha, PUVendaManha, PUBaseManha,
            Data_Vencimento, Data_Base, Tipo and dt_update.

    Returns:
        A DataFrame with those nine columns, in that order: the five rate/price
        columns cast to double, the two date columns cast to date, Tipo
        unchanged and dt_update cast to timestamp. Rows whose CompraManha or
        VendaManha is null after the cast are removed.
    """
    double_columns = [
        "CompraManha",
        "VendaManha",
        "PUCompraManha",
        "PUVendaManha",
        "PUBaseManha",
    ]
    date_columns = ["Data_Vencimento", "Data_Base"]

    typed = df_bronze.select(
        *[col(name).cast("double") for name in double_columns],
        *[col(name).cast("date") for name in date_columns],
        col("Tipo"),
        col("dt_update").cast("timestamp"),
    )

    # The filter runs on the already-cast columns.
    # NOTE(review): with Spark's default (non-ANSI) casts, values that cannot be
    # converted presumably become null and are dropped here too -- confirm.
    return typed.filter(
        col("CompraManha").isNotNull() & col("VendaManha").isNotNull()
    )


df_ipca_silver = _to_silver(df_ipca_bronze)
df_pre_silver = _to_silver(df_pre_bronze)

print(f"IPCA Silver: {df_ipca_silver.count()} registros")
print(f"Pre-fixados Silver: {df_pre_silver.count()} registros")

In [None]:
# Persist both Silver datasets as Parquet, partitioned by the "Tipo" column.
# "overwrite" mode replaces whatever already exists at each target path.
(
    df_ipca_silver.write
    .mode("overwrite")
    .partitionBy("Tipo")
    .parquet(f"{silver_bucket}ipca")
)

(
    df_pre_silver.write
    .mode("overwrite")
    .partitionBy("Tipo")
    .parquet(f"{silver_bucket}pre-fixados")
)

print("Camada Silver gravada com sucesso no bucket silver-tesouro-lakehouse!")

In [None]:
# Shut down the Spark session now that the Silver datasets have been written;
# any later cell that uses `spark` needs the session-creation cell re-run first.
spark.stop()
print("Processamento Silver finalizado!")