In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, to_date, regexp_extract, max, date_add, lit, lag, when, round
from pyspark.sql import Window
from delta.tables import DeltaTable
import os

# Maven coordinate of the Delta Lake build this notebook targets. Since Delta 3.0
# the Spark artifact is published as `delta-spark`; `delta-core` has no 3.x release.
# NOTE(review): this constant is not passed to the session builder below, so the
# Delta / S3A jars are presumably provided by the runtime image — confirm, or
# wire it in through `spark.jars.packages`.
DELTA_LAKE_PACKAGE = "io.delta:delta-spark_2.12:3.3.2"

# Read the MinIO credentials with os.environ[...] so that a missing variable
# raises KeyError right here instead of passing None into the Spark config.
minio_access_key = os.environ["KEY_ACCESS"]
minio_secret_key = os.environ["KEY_SECRETS"]

# Spark session with the Delta SQL extension/catalog enabled and the S3A
# filesystem pointed at the MinIO endpoint (path-style access, static keys).
spark = (
    SparkSession.builder
    .appName("PySpark Delta Lake MinIO Save")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .getOrCreate()
)

In [2]:
# Path of the bronze Delta table (stored in MinIO)
bronze_path = "s3a://azurecost/bronze"

# Handle to the Delta table at that path
delta_table = DeltaTable.forPath(spark, bronze_path)

# Distinct values of the partition column
partitions_df = delta_table.toDF().select("data_ref").distinct()

# Newest partition value. The aggregate yields None when the table has no rows;
# stop here in that case, otherwise the next cell would filter on the literal
# string 'None' and the rest of the notebook would run on an empty frame.
max_partition = partitions_df.agg({"data_ref": "max"}).collect()[0][0]
if max_partition is None:
    raise ValueError(f"No data_ref partition found in bronze table at {bronze_path}")
print(f"Última partição disponível: {max_partition}")

Última partição disponível: 2025-07-18


In [3]:
# Load the bronze table, then keep only the rows of the newest partition
bronze_df = spark.read.format("delta").load(bronze_path)
df = bronze_df.where(f"data_ref = '{max_partition}'")

df.show(truncate=False)

+--------+-------------------+-------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------+----------+
|Currency|PreTaxCost         |ResourceGroup                        |ResourceId                                                                                                                                                                 |UsageDate       |data_ref  |
+--------+-------------------+-------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------+----------+
|BRL     |0.00505974160620001|nintendodatabrickswi86no-workspace-rg|/subscriptions/da483b95-1caf-404c-bfe4-36abef87f6e6/resourcegroups/nintendodatabrickswi86no-workspace-rg/providers/microsoft.

In [4]:
# Typed replacements for three columns: cost cast to double, usage time parsed
# with the "yyyy-MM-dd'T'HH:mm" pattern, partition value parsed as a date.
cost_as_double = col("PreTaxCost").cast("double")
usage_as_timestamp = to_timestamp(col("UsageDate"), "yyyy-MM-dd'T'HH:mm")
partition_as_date = to_date(col("data_ref"), "yyyy-MM-dd")

# Each withColumn overwrites the column of the same name in place
df_transformado = df.withColumn("PreTaxCost", cost_as_double)
df_transformado = df_transformado.withColumn("UsageDate", usage_as_timestamp)
df_transformado = df_transformado.withColumn("data_ref", partition_as_date)

df_transformado.show(truncate=False)
df_transformado.printSchema()

+--------+-------------------+-------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+----------+
|Currency|PreTaxCost         |ResourceGroup                        |ResourceId                                                                                                                                                                 |UsageDate          |data_ref  |
+--------+-------------------+-------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+----------+
|BRL     |0.00505974160620001|nintendodatabrickswi86no-workspace-rg|/subscriptions/da483b95-1caf-404c-bfe4-36abef87f6e6/resourcegroups/nintendodatabrickswi86no-workspace-rg/providers/m

In [5]:
# Regex patterns applied to the ResourceId path; each exposes one capture group.
# Azure resource IDs are case-insensitive (e.g. "resourceGroups" vs
# "resourcegroups"), so the literal path segments are matched with (?i).
# The captured text itself keeps whatever casing the input has.

# Segment between "/subscriptions/" and "/resourcegroups"
regex_subscription = r"(?i)/subscriptions/([^/]+)/resourcegroups"
# Segment right after the first "/providers/"
regex_provider = r"(?i)/providers/([^/]+)/"
# Last path segment (everything after the final "/")
regex_resource = r".*/([^/]+)$"

In [6]:
# Project the final column layout in one pass: the three identifiers are
# extracted from ResourceId (capture group 1 of each pattern), the remaining
# columns are carried over unchanged. ResourceId itself is not kept.
df_ordenado = df_transformado.select(
    regexp_extract("ResourceId", regex_subscription, 1).alias("SubscriptionId"),
    "ResourceGroup",
    regexp_extract("ResourceId", regex_provider, 1).alias("Provider"),
    regexp_extract("ResourceId", regex_resource, 1).alias("ResourceName"),
    "PreTaxCost",
    "Currency",
    "UsageDate",
    "data_ref",
)

df_ordenado.show(truncate=False)

+------------------------------------+-------------------------------------+-----------------+----------------------+-------------------+--------+-------------------+----------+
|SubscriptionId                      |ResourceGroup                        |Provider         |ResourceName          |PreTaxCost         |Currency|UsageDate          |data_ref  |
+------------------------------------+-------------------------------------+-----------------+----------------------+-------------------+--------+-------------------+----------+
|da483b95-1caf-404c-bfe4-36abef87f6e6|nintendodatabrickswi86no-workspace-rg|microsoft.storage|dbstorage7ifgyhjijpdgi|0.00505974160620001|BRL     |2025-07-18 16:57:00|2025-07-18|
|da483b95-1caf-404c-bfe4-36abef87f6e6|nintendodatabrickswi86no-workspace-rg|microsoft.storage|dbstorage7ifgyhjijpdgi|0.00505974160620001|BRL     |2025-07-18 17:00:00|2025-07-18|
|da483b95-1caf-404c-bfe4-36abef87f6e6|nintendodatabrickswi86no-workspace-rg|microsoft.storage|dbstorage7ifgyhj

In [7]:
# NOTE: `max` here is pyspark.sql.functions.max (imported at the top of the
# notebook), not the Python builtin.

# Latest UsageDate found anywhere in the frame
max_date = df_ordenado.agg(max("UsageDate").alias("max_date")).first()["max_date"]

# Resources that have a row at exactly that latest UsageDate
ativos_hoje = (
    df_ordenado
    .where(col("UsageDate") == max_date)
    .select("ResourceName")
    .distinct()
)
ativos_hoje_list = [linha["ResourceName"] for linha in ativos_hoje.collect()]

# For every resource keep only the row(s) carrying its own latest UsageDate
w = Window.partitionBy("ResourceName")
df_last = (
    df_ordenado
    .withColumn("max_usage", max("UsageDate").over(w))
    .where(col("UsageDate") == col("max_usage"))
)

# Resources whose latest row is NOT at the global latest UsageDate
df_obsoletos = df_last.join(ativos_hoje, on="ResourceName", how="left_anti")

# One synthetic zero-cost row per such resource: UsageDate moved one day
# forward, data_ref re-derived from the shifted UsageDate, then trimmed back
# to the df_ordenado column list (which removes the helper max_usage column).
df_novos = (
    df_obsoletos
    .withColumn("UsageDate", date_add(col("UsageDate"), 1))
    .withColumn("PreTaxCost", lit(0.0))
    .withColumn("data_ref", to_date(col("UsageDate")))
    .select(*df_ordenado.columns)
)

# Real rows plus synthetic rows, labelled by membership in the active list
is_active = col("ResourceName").isin(ativos_hoje_list)
df_com_status = (
    df_ordenado
    .unionByName(df_novos)
    .withColumn("StatusRecourse", when(is_active, lit("Ativo")).otherwise(lit("Inativo")))
)

df_com_status.show(truncate=False)

+------------------------------------+-------------------------------------+-----------------+----------------------+-------------------+--------+-------------------+----------+--------------+
|SubscriptionId                      |ResourceGroup                        |Provider         |ResourceName          |PreTaxCost         |Currency|UsageDate          |data_ref  |StatusRecourse|
+------------------------------------+-------------------------------------+-----------------+----------------------+-------------------+--------+-------------------+----------+--------------+
|da483b95-1caf-404c-bfe4-36abef87f6e6|nintendodatabrickswi86no-workspace-rg|microsoft.storage|dbstorage7ifgyhjijpdgi|0.00505974160620001|BRL     |2025-07-18 16:57:00|2025-07-18|Ativo         |
|da483b95-1caf-404c-bfe4-36abef87f6e6|nintendodatabrickswi86no-workspace-rg|microsoft.storage|dbstorage7ifgyhjijpdgi|0.00505974160620001|BRL     |2025-07-18 17:00:00|2025-07-18|Ativo         |
|da483b95-1caf-404c-bfe4-36abef87f6

In [8]:
# Per-resource window ordered by UsageDate, used to look one row back
w = Window.partitionBy("ResourceName").orderBy("UsageDate")

# Cost of the previous row of the same resource (null on the first row)
previous_cost = lag(col("PreTaxCost")).over(w)
df_temp = df_com_status.withColumn("PreTaxCost_prev", previous_cost)

# No usable baseline when the previous cost is missing or zero -> report 0.0
no_baseline = col("PreTaxCost_prev").isNull() | (col("PreTaxCost_prev") == 0)
# Relative change against the previous cost, as a percentage
pct_delta = ((col("PreTaxCost") - col("PreTaxCost_prev")) / col("PreTaxCost_prev")) * 100

output_columns = [
    "SubscriptionId",
    "ResourceGroup",
    "Provider",
    "ResourceName",
    "StatusRecourse",
    "PreTaxCost",
    "Pct_Change",
    "Currency",
    "UsageDate",
    "data_ref",
]

# `round` is pyspark.sql.functions.round (imported at the top), 2 decimals
df_final = (
    df_temp
    .withColumn("Pct_Change", when(no_baseline, lit(0.0)).otherwise(round(pct_delta, 2)))
    .drop("PreTaxCost_prev")
    .orderBy(col("UsageDate").desc())
    .select(*output_columns)
)


df_final.show(truncate=False)

+------------------------------------+-------------------------------------+-----------------+----------------------+--------------+-------------------+----------+--------+-------------------+----------+
|SubscriptionId                      |ResourceGroup                        |Provider         |ResourceName          |StatusRecourse|PreTaxCost         |Pct_Change|Currency|UsageDate          |data_ref  |
+------------------------------------+-------------------------------------+-----------------+----------------------+--------------+-------------------+----------+--------+-------------------+----------+
|da483b95-1caf-404c-bfe4-36abef87f6e6|nintendoproject                      |microsoft.web    |appfunckabum          |Ativo         |0.0                |0.0       |BRL     |2025-07-18 17:55:00|2025-07-18|
|da483b95-1caf-404c-bfe4-36abef87f6e6|nintendoproject                      |microsoft.web    |appfuncmagalu         |Ativo         |0.0                |0.0       |BRL     |2025-07-18 1

In [9]:
# S3A path of the silver-layer Delta table
silver_path = "s3a://azurecost/silver"

# df_final is built from the newest bronze partition only, so a plain
# mode("overwrite") would replace the whole silver table and discard every
# data_ref partition written by earlier runs. Dynamic partition overwrite
# replaces just the data_ref partitions present in df_final and leaves the
# others untouched.
(
    df_final.write.format("delta")
    .mode("overwrite")
    .option("partitionOverwriteMode", "dynamic")
    .partitionBy("data_ref")
    .save(silver_path)
)