In [2]:
!pip install --user numpy==1.26.4




In [3]:
!pip install --user scipy==1.15.0

Collecting scipy==1.15.0
  Downloading scipy-1.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Downloading scipy-1.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (40.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: scipy
Successfully installed scipy-1.15.0


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, unix_timestamp, lit
from delta.tables import DeltaTable
from pyspark.sql.types import StructType, StructField, DoubleType, StringType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import DenseVector
import os

# Maven coordinate of the Delta Lake build for this session.
# Delta Lake 3.x is published under the `delta-spark` artifact name (`delta-core`
# was the name used up to the 2.x line), so the coordinate uses `delta-spark`.
# NOTE(review): this constant is not passed to the session builder below; the
# Delta/S3A jars are presumably provided by the runtime image — confirm.
DELTA_LAKE_PACKAGE = "io.delta:delta-spark_2.12:3.3.2"

# Fail fast with a readable message when the MinIO credentials are not exported,
# instead of handing a missing value to the S3A connector.
_missing_env = [name for name in ("KEY_ACCESS", "KEY_SECRETS") if not os.getenv(name)]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# Spark session with the Delta SQL extension/catalog enabled and the S3A
# filesystem pointed at the MinIO endpoint (path-style access, static keys).
spark = (
    SparkSession.builder
    .appName("PySpark Delta Lake MinIO Save")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", os.environ["KEY_ACCESS"])
    .config("spark.hadoop.fs.s3a.secret.key", os.environ["KEY_SECRETS"])
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .getOrCreate()
)

In [5]:
# Path to the silver Delta table (stored in MinIO)
silver_path = "s3a://azurecost/silver"

# Handle to the Delta table at that path
delta_table = DeltaTable.forPath(spark, silver_path)

# Distinct values of the partition column
partitions_df = delta_table.toDF().select("data_ref").distinct()

# Most recent partition value. The max aggregate yields None when the table has
# no rows; stop here in that case, because the following cells would otherwise
# filter on the literal 'None' and carry an empty result forward.
max_partition = partitions_df.agg({"data_ref": "max"}).collect()[0][0]
if max_partition is None:
    raise ValueError(f"No data_ref partition found in {silver_path}; nothing to process.")
print(f"Última partição disponível: {max_partition}")

Última partição disponível: 2025-07-27


In [6]:
# Load the silver table and keep only the rows of the most recent partition
silver_df = spark.read.format("delta").load(silver_path)
df = silver_df.where(f"data_ref = '{max_partition}'")

df.show(truncate=False)

+------------------------------------+-------------------------------------+-----------------+----------------------+--------------+-------------------+----------+--------+-------------------+----------+
|SubscriptionId                      |ResourceGroup                        |Provider         |ResourceName          |StatusRecourse|PreTaxCost         |Pct_Change|Currency|UsageDate          |data_ref  |
+------------------------------------+-------------------------------------+-----------------+----------------------+--------------+-------------------+----------+--------+-------------------+----------+
|da483b95-1caf-404c-bfe4-36abef87f6e6|nintendoproject                      |microsoft.web    |appfunckabum          |Ativo         |0.0                |0.0       |BRL     |2025-07-27 18:00:00|2025-07-27|
|da483b95-1caf-404c-bfe4-36abef87f6e6|nintendoproject                      |microsoft.web    |appfuncmagalu         |Ativo         |0.0                |0.0       |BRL     |2025-07-27 1

In [7]:
# Label every row with the direction of its cost change:
#   Pct_Change > 0 -> "Subindo", Pct_Change < 0 -> "Descendo", anything else -> "Estável"
pct_change = col("Pct_Change")
tendencia_custo = (
    when(pct_change > 0, "Subindo")
    .when(pct_change < 0, "Descendo")
    .otherwise("Estável")
)
df_tend = df.withColumn("TendenciaCusto", tendencia_custo)

df_tend.show(truncate=False)

+------------------------------------+-------------------------------------+-----------------+----------------------+--------------+-------------------+----------+--------+-------------------+----------+--------------+
|SubscriptionId                      |ResourceGroup                        |Provider         |ResourceName          |StatusRecourse|PreTaxCost         |Pct_Change|Currency|UsageDate          |data_ref  |TendenciaCusto|
+------------------------------------+-------------------------------------+-----------------+----------------------+--------------+-------------------+----------+--------+-------------------+----------+--------------+
|da483b95-1caf-404c-bfe4-36abef87f6e6|nintendoproject                      |microsoft.web    |appfunckabum          |Ativo         |0.0                |0.0       |BRL     |2025-07-27 18:00:00|2025-07-27|Estável       |
|da483b95-1caf-404c-bfe4-36abef87f6e6|nintendoproject                      |microsoft.web    |appfuncmagalu         |Ativo  

In [8]:
# How far past a resource's last observed UsageDate the forecast is made, in seconds.
FORECAST_HORIZON_SECONDS = 600

# 1. Convert UsageDate into a number (epoch seconds) usable as a regression feature
df_ts = df_tend.withColumn("UsageDate_num", unix_timestamp(col("UsageDate")))

# 2. Build the single-column feature vector once for the whole frame; the
#    assembler works row by row, so filtering per resource afterwards is equivalent.
assembler = VectorAssembler(inputCols=["UsageDate_num"], outputCol="features")
df_feat_all = assembler.transform(df_ts)

# 3. List every distinct resource
resource_list = [row["ResourceName"] for row in df_ts.select("ResourceName").distinct().collect()]

# fit() returns a new model each time, so one estimator instance serves every resource.
lr = LinearRegression(featuresCol="features", labelCol="PreTaxCost")

# (ResourceName, predicted cost) pairs; the prediction is None when a resource
# has fewer than two rows to fit on.
previsoes = []

# 4. Train one model per ResourceName and predict its next cost
for resource in resource_list:
    df_feat = df_feat_all.filter(col("ResourceName") == resource)

    if df_feat.count() < 2:
        previsoes.append((resource, None))
        continue

    model = lr.fit(df_feat)

    last_ts = df_feat.agg({"UsageDate_num": "max"}).first()[0]
    future_ts = last_ts + FORECAST_HORIZON_SECONDS

    # Score the single future point directly on the fitted model. This avoids
    # building a one-row DataFrame through the internal `sql_ctx` accessor and
    # running a Spark job just to read back one number.
    result = model.predict(DenseVector([float(future_ts)]))
    previsoes.append((resource, float(result)))

# 5. Create a DataFrame with the predictions
schema = StructType([
    StructField("ResourceName", StringType(), True),
    StructField("PrevisaoProxima", DoubleType(), True),
])

df_previsao = spark.createDataFrame(previsoes, schema)

# 6. Remove any pre-existing PrevisaoProxima column so the join cannot produce an
#    ambiguous duplicate; drop() is a no-op when the column is absent.
df_ts = df_ts.drop("PrevisaoProxima")

# 7. Attach each resource's prediction to its rows
df_final = df_ts.join(df_previsao, on="ResourceName", how="left")

# 8. Display, ordered by usage time
df_final.orderBy("UsageDate").show(truncate=False)



+----------------------+------------------------------------+-------------------------------------+-----------------+--------------+-------------------+----------+--------+-------------------+----------+--------------+-------------+--------------------+
|ResourceName          |SubscriptionId                      |ResourceGroup                        |Provider         |StatusRecourse|PreTaxCost         |Pct_Change|Currency|UsageDate          |data_ref  |TendenciaCusto|UsageDate_num|PrevisaoProxima     |
+----------------------+------------------------------------+-------------------------------------+-----------------+--------------+-------------------+----------+--------+-------------------+----------+--------------+-------------+--------------------+
|dbstorage7ifgyhjijpdgi|da483b95-1caf-404c-bfe4-36abef87f6e6|nintendodatabrickswi86no-workspace-rg|microsoft.storage|Ativo         |0.00442981765040001|0.0       |BRL     |2025-07-26 14:10:00|2025-07-27|Estável       |1753539000   |0.0043

In [9]:
# S3A path of the Delta table in the gold layer
gold_path = "s3a://azurecost/gold"

# df_final holds only the rows of the latest data_ref partition, so a plain
# mode("overwrite") would replace the whole gold table and drop every earlier
# partition from its current snapshot. Dynamic partition overwrite replaces
# only the partitions that are present in df_final.
# NOTE(review): assumes gold is meant to accumulate one partition per data_ref —
# confirm; remove the option if gold should only ever contain the latest day.
(
    df_final.write.format("delta")
    .mode("overwrite")
    .option("partitionOverwriteMode", "dynamic")
    .partitionBy("data_ref")
    .save(gold_path)
)