In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, date_format
import os

# Maven coordinate of the Delta Lake connector for Spark (Scala 2.12 build).
# Starting with Delta Lake 3.0 the artifact is published as `delta-spark`; the
# former `delta-core` name only exists up to the 2.4.x line, so a 3.x version
# must use the new artifact name.
# NOTE(review): this constant is not passed to the session below (e.g. through
# `spark.jars.packages`); the Delta and S3A jars are presumably supplied by the
# runtime environment — confirm before wiring it in.
DELTA_LAKE_PACKAGE = "io.delta:delta-spark_2.12:3.3.2"

# Read the MinIO credentials with os.environ[...] so that an unset variable raises
# KeyError right here, instead of reaching the S3A configuration as a missing
# value and only surfacing later as an authentication error.
minio_access_key = os.environ["KEY_ACCESS"]
minio_secret_key = os.environ["KEY_SECRETS"]

# Spark session with the Delta Lake SQL extension/catalog enabled and the S3A
# filesystem pointed at the MinIO endpoint.
spark = (
    SparkSession.builder
    .appName("PySpark Delta Lake MinIO Save")
    # Delta Lake integration
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # S3A access to MinIO using static access/secret keys and path-style URLs
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .getOrCreate()
)

In [2]:
# Glob matching the JSON files in the inbound area of the MinIO bucket
inbound_path = "s3a://azurecost/inbound/*.json"

# Load the inbound JSON files into a DataFrame and add the 'data_ref' column:
# the current date formatted as a 'yyyy-MM-dd' string
df_new = (
    spark.read
    .json(inbound_path)
    .withColumn("data_ref", date_format(current_date(), "yyyy-MM-dd"))
)

df_new.show()

+--------+----------------+--------------------+--------------------+----------------+----------+
|Currency|      PreTaxCost|       ResourceGroup|          ResourceId|       UsageDate|  data_ref|
+--------+----------------+--------------------+--------------------+----------------+----------+
|     BRL|0.00372898934712|nintendodatabrick...|/subscriptions/da...|2025-08-02T20:27|2025-08-03|
|     BRL|0.00372898934712|nintendodatabrick...|/subscriptions/da...|2025-08-02T20:35|2025-08-03|
|     BRL|0.00372898934712|nintendodatabrick...|/subscriptions/da...|2025-08-02T20:40|2025-08-03|
|     BRL|0.00372898934712|nintendodatabrick...|/subscriptions/da...|2025-08-02T20:45|2025-08-03|
|     BRL|0.00372898934712|nintendodatabrick...|/subscriptions/da...|2025-08-02T20:50|2025-08-03|
|     BRL|0.00372898934712|nintendodatabrick...|/subscriptions/da...|2025-08-02T20:55|2025-08-03|
|     BRL|0.00372898934712|nintendodatabrick...|/subscriptions/da...|2025-08-02T21:00|2025-08-03|
|     BRL|0.00379383

In [3]:
# Location of the bronze Delta table in the MinIO bucket
bronze_path = "s3a://azurecost/bronze"

# Load the Delta table stored at bronze_path. No partition filter is applied
# here, so the whole table is read (not just the latest 'data_ref' partition).
df_old = spark.read.load(bronze_path, format="delta")

df_old.show(truncate=False)

+--------+----------+---------------+------------------------------------------------------------------------------------------------------------------------------+----------------+----------+
|Currency|PreTaxCost|ResourceGroup  |ResourceId                                                                                                                    |UsageDate       |data_ref  |
+--------+----------+---------------+------------------------------------------------------------------------------------------------------------------------------+----------------+----------+
|BRL     |0.0       |nintendoproject|/subscriptions/da483b95-1caf-404c-bfe4-36abef87f6e6/resourcegroups/nintendoproject/providers/microsoft.web/sites/appfuncmagalu|2025-08-02T21:05|2025-08-02|
|BRL     |0.0       |nintendoproject|/subscriptions/da483b95-1caf-404c-bfe4-36abef87f6e6/resourcegroups/nintendoproject/providers/microsoft.web/sites/appfuncmagalu|2025-08-02T21:10|2025-08-02|
|BRL     |0.0       |nintendoprojec

In [5]:
# Append the rows already stored in bronze to the freshly ingested rows,
# matching columns by name rather than by position
df_combined = df_new.unionByName(df_old)

# Keep a single copy of rows that are identical in every column
# ('data_ref' included in the comparison)
df_incremental = df_combined.distinct()

df_incremental.show(truncate=False)

+--------+----------------+-------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------+----------+
|Currency|PreTaxCost      |ResourceGroup                        |ResourceId                                                                                                                                                                 |UsageDate       |data_ref  |
+--------+----------------+-------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------+----------+
|BRL     |0.00372898934712|nintendodatabrickswi86no-workspace-rg|/subscriptions/da483b95-1caf-404c-bfe4-36abef87f6e6/resourcegroups/nintendodatabrickswi86no-workspace-rg/providers/microsoft.storage/stor

In [6]:
# S3A path of the Delta-format data in the bronze layer
bronze_path = "s3a://azurecost/bronze"

# Overwrite the bronze Delta table with the de-duplicated result,
# partitioned by the 'data_ref' column
df_incremental.write.save(
    bronze_path,
    format="delta",
    mode="overwrite",
    partitionBy="data_ref",
)