In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, date_format
import os

# Maven coordinate of the Delta Lake build for Scala 2.12. Since Delta 3.0 the
# artifact is published as `delta-spark`; `delta-core` was the pre-3.0 name, so a
# 3.3.2 coordinate has to use the new artifact name.
# NOTE(review): this constant is not passed to `spark.jars.packages` in this cell,
# so the Delta and S3A jars presumably come from the runtime environment — confirm
# before wiring it in, since setting it here could override environment packages.
DELTA_LAKE_PACKAGE = "io.delta:delta-spark_2.12:3.3.2"


def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here gives a
            clear message instead of an opaque S3A authentication error later,
            when the first read or write is attempted.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} must be set to access MinIO via S3A"
        )
    return value


# MinIO credentials are read from the environment and validated up front.
minio_access_key = _require_env("KEY_ACCESS")
minio_secret_key = _require_env("KEY_SECRETS")

# Spark session with the Delta Lake SQL extension/catalog enabled and the S3A
# connector pointed at the MinIO endpoint (path-style access, static keys).
spark = (
    SparkSession.builder
    .appName("PySpark Delta Lake MinIO Save")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .getOrCreate()
)

In [2]:
# Glob matching every JSON file in the inbound area of the MinIO bucket
inbound_path = "s3a://azurecost/inbound/*.json"

# Load the JSON files into a DataFrame and tag every row with a 'data_ref'
# column holding the current date formatted as 'yyyy-MM-dd'.
df = (
    spark.read
    .json(inbound_path)
    .withColumn("data_ref", date_format(current_date(), "yyyy-MM-dd"))
)

# Preview the first rows of the result
df.show()

+--------+-------------------+--------------------+--------------------+----------------+----------+
|Currency|         PreTaxCost|       ResourceGroup|          ResourceId|       UsageDate|  data_ref|
+--------+-------------------+--------------------+--------------------+----------------+----------+
|     BRL|0.00448501047580001|nintendodatabrick...|/subscriptions/da...|2025-07-24T15:53|2025-07-24|
|     BRL|0.00448501047580001|nintendodatabrick...|/subscriptions/da...|2025-07-24T15:55|2025-07-24|
|     BRL|0.00448501047580001|nintendodatabrick...|/subscriptions/da...|2025-07-24T16:00|2025-07-24|
|     BRL|0.00448501047580001|nintendodatabrick...|/subscriptions/da...|2025-07-24T16:05|2025-07-24|
|     BRL|0.00448501047580001|nintendodatabrick...|/subscriptions/da...|2025-07-24T16:10|2025-07-24|
|     BRL|0.00448501047580001|nintendodatabrick...|/subscriptions/da...|2025-07-24T16:15|2025-07-24|
|     BRL|0.00448501047580001|nintendodatabrick...|/subscriptions/da...|2025-07-24T16:20|20

In [3]:
# S3A location of the bronze-layer data stored in Delta format
bronze_path = "s3a://azurecost/bronze"

# Write the DataFrame as a Delta table partitioned by 'data_ref'.
# NOTE(review): mode "overwrite" replaces the whole table on every run, so only
# the partition written by the latest run would remain — confirm that a full
# refresh (rather than a per-day append/partition overwrite) is intended.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("data_ref")
    .save(bronze_path)
)