In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Make sure the 'silver' schema exists before any table is written into it.
silver_schema_ddl = "CREATE SCHEMA IF NOT EXISTS silver"
spark.sql(silver_schema_ddl)

In [0]:
# Load the raw Bronze-layer table (Delta format) and list its columns
# as a quick sanity check of what arrived from ingestion.
bronze_reader = spark.read.format("delta")
df_bronze = bronze_reader.table("bronze.crypto_raw_data")
print(df_bronze.columns)

In [0]:
# Data quality gate: keep only rows that have an id and strictly positive
# price and market cap. Rows where either numeric column is NULL are dropped
# as well, because a NULL comparison does not evaluate to true.
has_id = col("id").isNotNull()
has_positive_price = col("current_price") > 0
has_positive_market_cap = col("market_cap") > 0

df_clean = df_bronze.where(has_id & has_positive_price & has_positive_market_cap)

In [0]:
# Transformations and standardisation: project the cleaned Bronze rows onto
# the Silver column names and data types.
df_silver = (
    df_clean.select(
        col("id").alias("crypto_id"),
        col("symbol").alias("crypto_symbol"),
        col("name").alias("crypto_name"),
        col("current_price").cast(DecimalType(18, 8)).alias("price_usd"),
        col("market_cap").cast(LongType()).alias("market_cap_usd"),
        col("total_volume").cast(LongType()).alias("volume_24h"),
        col("price_change_percentage_24h").cast(DecimalType(10, 4)).alias("price_change_pct_24h"),
        col("market_cap_rank").cast(IntegerType()).alias("market_cap_rank"),
        # Explicit alias, like the sibling columns, so the output column name
        # does not rely on how Spark auto-names a cast expression.
        col("circulating_supply").cast(DoubleType()).alias("circulating_supply"),
        col("ingestion_timestamp"),
        col("ingestion_date"),
    )
    # Keep one row per (crypto_id, ingestion_timestamp) pair; which of the
    # duplicate rows survives is not specified here.
    .dropDuplicates(["crypto_id", "ingestion_timestamp"])
    # Record when this Silver transformation was evaluated.
    .withColumn("processing_timestamp", current_timestamp())
)

# Show the resulting Silver schema. The previous version printed
# df_clean.columns, which only repeats the Bronze column list and hides the
# renamed/derived Silver columns built above.
print(df_silver.columns)


In [0]:
# Report how many rows made it into the Silver DataFrame.
total_records = df_silver.count()
print(f"Total registros limpos: {total_records}")

# Build one aggregate per column that counts its NULL values: `when` yields a
# non-null literal only for NULL cells, and `count` ignores everything else.
null_count_exprs = []
for column_name in df_silver.columns:
    is_missing = col(column_name).isNull()
    null_count_exprs.append(count(when(is_missing, column_name)).alias(column_name))

null_checks = df_silver.select(*null_count_exprs)
null_checks.show()

In [0]:
# Persist the Silver table as Delta, partitioned by ingestion date, appending
# to existing data and allowing new columns to be merged into the table schema.
# NOTE(review): append mode means re-running this cell over the same Bronze
# data writes those rows again; confirm that is intended.
silver_writer = (
    df_silver.write
    .format("delta")
    .mode("append")
    .partitionBy("ingestion_date")
    .option("mergeSchema", "true")
)
silver_writer.saveAsTable("silver.crypto_cleaned")