In [0]:
from pyspark.sql.functions import col, regexp_replace, split, to_timestamp, current_timestamp
from pyspark.sql.types import IntegerType, DecimalType

# --- CONFIGURATION ---
# Switch this session's Spark SQL time parser policy to LEGACY.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

# --- PATH DEFINITIONS ---
# Root folders of the Bronze (input) and Silver (output) layers. Table names
# are appended directly to these strings, so both keep their trailing slash.
bronze_base_path = "/mnt/datalake/bronze/"
silver_base_path = "/mnt/datalake/silver/"

# Full list of table names. Not referenced by the visible code below.
tables_to_process = [
    "companies",
    "company_reviews",
    "employment_types",
    "industries",
    "job_benefits",
    "job_skills",
    "jobs",
    "locations",
    "salary_ranges",
    "skills",
]

print("Iniciando processo da camada Silver...")

# --- TRANSFORMAÇÕES ---

# 1. 'salary_ranges' table: clean the textual range and extract numeric salaries.
# `range_description` has every '$', 'k' and 'K' removed, is split on '-', and
# the first two parts are cast to integers and multiplied by 1000.
# NOTE(review): presumably descriptions look like "$50k-$70k" — confirm against
# the Bronze data.
# Any failure is reported with print() and the script carries on.
try:
    print("Processando 'salary_ranges'...")
    df_salary_bronze = spark.read.format("delta").load(f"{bronze_base_path}salary_ranges")

    # Column.cast() requires a DataType *instance* (or a type-name string).
    # Passing the bare IntegerType class raises a type error, which the handler
    # below would only print, leaving the Silver table unwritten.
    integer_type = IntegerType()

    df_salary_silver = (
        df_salary_bronze
        .withColumn("range_cleaned", regexp_replace(col("range_description"), "[\\$kK]", ""))
        .withColumn("salary_parts", split(col("range_cleaned"), "-"))
        .withColumn("min_salary", col("salary_parts").getItem(0).cast(integer_type) * 1000)
        .withColumn("max_salary", col("salary_parts").getItem(1).cast(integer_type) * 1000)
        .withColumn("avg_salary", (col("min_salary") + col("max_salary")) / 2)
        # Drop the intermediate helper columns from the Silver output.
        .select("id", "range_description", "min_salary", "max_salary", "avg_salary")
    )

    df_salary_silver.write.format("delta").mode("overwrite").save(f"{silver_base_path}salary_ranges")
    print("'salary_ranges' processada com sucesso.")
except Exception as e:
    print(f"Erro ao processar 'salary_ranges': {e}")


# 2. 'jobs' table: convert the listing date and make column types explicit.
# Any failure is reported with print() and the script carries on.
try:
    print("Processando 'jobs'...")
    df_jobs_bronze = spark.read.load(f"{bronze_base_path}jobs", format="delta")

    df_jobs_silver = (
        df_jobs_bronze
        # No format argument is given to to_timestamp here.
        .withColumn("listing_date", to_timestamp(col("listing_date")))
        # Make the description's string type explicit.
        .withColumn("job_description", col("job_description").cast("string"))
    )

    df_jobs_silver.write.save(f"{silver_base_path}jobs", format="delta", mode="overwrite")
    print("'jobs' processada com sucesso.")
except Exception as e:
    print(f"Erro ao processar 'jobs': {e}")

# 3. 'companies' table: store the rating as a fixed-point decimal.
# Any failure is reported with print() and the script carries on.
try:
    print("Processando 'companies'...")
    df_companies_bronze = spark.read.format("delta").load(f"{bronze_base_path}companies")

    # Three digits in total, one of them after the decimal point.
    rating_type = DecimalType(precision=3, scale=1)
    df_companies_silver = df_companies_bronze.withColumn(
        "company_rating", col("company_rating").cast(rating_type)
    )

    companies_writer = df_companies_silver.write.format("delta").mode("overwrite")
    companies_writer.save(f"{silver_base_path}companies")
    print("'companies' processada com sucesso.")
except Exception as e:
    print(f"Erro ao processar 'companies': {e}")


# 4. Remaining dimension tables (no complex transformations at this stage).
# Each one is read from Bronze and written to Silver unchanged, in Delta format.
simple_tables = [
    "company_reviews",
    "employment_types",
    "industries",
    "job_benefits",
    "job_skills",
    "locations",
    "skills",
]
for table_name in simple_tables:
    # A failure on one table is printed and the loop moves on to the next.
    try:
        print(f"Processando '{table_name}'...")
        df_bronze = spark.read.load(f"{bronze_base_path}{table_name}", format="delta")
        df_bronze.write.save(
            f"{silver_base_path}{table_name}", format="delta", mode="overwrite"
        )
        print(f"'{table_name}' processada com sucesso.")
    except Exception as e:
        print(f"Erro ao processar '{table_name}': {e}")

print("Processo da camada Silver finalizado.")


# --- EXAMPLE OF SCD TYPE 2 LOGIC (FOR FUTURE LOADS) ---
# The triple-quoted block below is a bare string expression, so nothing inside
# it is executed; it is kept only as a reference snippet.
# NOTE(review): the snippet calls `lit`, which is not among the functions
# imported at the top of this file — it must be imported before the snippet
# can be used.
# NOTE(review): the merge only closes the matched current row (is_current /
# end_date) and inserts rows whose id is not yet in the target; the snippet
# shows no step that inserts the new version of a changed row — confirm
# whether that is handled elsewhere.
"""
# Esta lógica não deve ser executada na primeira carga.
# Ela é usada para atualizar dimensões que mudam ao longo do tempo.

from delta.tables import *

# Suponha que 'df_companies_updates' seja um novo batch de dados da camada Bronze
# e 'silver_companies_table' seja a tabela Delta existente na camada Silver.

silver_companies_table = DeltaTable.forPath(spark, f"{silver_base_path}companies")

# Adicione colunas para controle do SCD2
df_companies_updates = df_companies_updates.withColumn("is_current", lit(True)) \
                                           .withColumn("start_date", current_timestamp()) \
                                           .withColumn("end_date", lit(None).cast("timestamp"))

# Lógica de MERGE para implementar SCD Tipo 2
silver_companies_table.alias("target") \
  .merge(
    df_companies_updates.alias("source"),
    "target.id = source.id"
  ) \
  .whenMatchedUpdate(
    condition = "target.is_current = true AND (target.company_rating <> source.company_rating OR target.industry_id <> source.industry_id)",
    set = {"is_current": "false", "end_date": "source.start_date"}
  ) \
  .whenNotMatchedInsertAll() \
  .execute()
"""