In [0]:
from pyspark.sql.functions import col, monotonically_increasing_id, year, month, dayofmonth, quarter, date_format, lit, current_timestamp

# Unity Catalog schema names: tables are read from Silver and written to Gold.
silver_schema, gold_schema = "workspace.silver_db", "workspace.gold_db"

# Make sure the destination schema exists before any table is saved into it.
spark.sql("CREATE SCHEMA IF NOT EXISTS " + gold_schema)

# Collects one message per failed step so the run can report them all together.
erros_gold = []
                                       
# --- CRIAR DIMENSÕES ---

# 1. dim_companies
try:
    print("Criando dim_companies...")
    # Source tables from the Silver layer.
    df_companies = spark.read.table(f"{silver_schema}.companies")
    df_industries = spark.read.table(f"{silver_schema}.industries")

    # Left join: a company is kept even when its industry_id has no match
    # in industries (industry_name is then NULL).
    companies_with_industry = df_companies.join(
        df_industries,
        on=df_companies.industry_id == df_industries.id,
        how="left",
    )

    # Business attributes followed by the SCD2-style tracking columns
    # (snapshot strategy): every row is flagged as current, starts now and
    # has an open (NULL) end date.
    dim_companies = companies_with_industry.select(
        col("companies.id").alias("company_id"),
        col("company_name"),
        col("company_rating"),
        col("industry_name"),
        lit(True).alias("is_current"),
        current_timestamp().alias("start_date"),
        lit(None).cast("timestamp").alias("end_date"),
    )

    # Full overwrite of the Gold table, schema included.
    (
        dim_companies.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(f"{gold_schema}.dim_companies")
    )
    print("SUCESSO: dim_companies criada.")

except Exception as exc:
    # Record the failure and keep going so the remaining tables are attempted.
    msg = f"ERRO ao criar dim_companies: {exc}"
    print(msg)
    erros_gold.append(msg)

# 2. dim_locations
try:
    print("Criando dim_locations...")
    df_locations = spark.read.table(f"{silver_schema}.locations")

    # Location attributes plus the tracking columns: every row is flagged as
    # current, starts now and has an open (NULL) end date.
    dim_locations = df_locations.select(
        col("id").alias("location_id"),
        col("city"),
        col("state_abbr"),
        lit(True).alias("is_current"),
        current_timestamp().alias("start_date"),
        lit(None).cast("timestamp").alias("end_date"),
    )

    # Full overwrite of the Gold table, schema included.
    (
        dim_locations.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(f"{gold_schema}.dim_locations")
    )
    print("SUCESSO: dim_locations criada.")

except Exception as exc:
    # Record the failure and keep going so the remaining tables are attempted.
    msg = f"ERRO ao criar dim_locations: {exc}"
    print(msg)
    erros_gold.append(msg)

# 3. dim_skills
try:
    print("Criando dim_skills...")
    df_skills = spark.read.table(f"{silver_schema}.skills")

    # Skill attributes plus the tracking columns: every row is flagged as
    # current, starts now and has an open (NULL) end date.
    dim_skills = df_skills.select(
        col("id").alias("skill_id"),
        col("skill_name"),
        lit(True).alias("is_current"),
        current_timestamp().alias("start_date"),
        lit(None).cast("timestamp").alias("end_date"),
    )

    # Full overwrite of the Gold table, schema included.
    (
        dim_skills.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(f"{gold_schema}.dim_skills")
    )
    print("SUCESSO: dim_skills criada.")

except Exception as exc:
    # Record the failure and keep going so the remaining tables are attempted.
    msg = f"ERRO ao criar dim_skills: {exc}"
    print(msg)
    erros_gold.append(msg)

# 4. dim_date
try:
    print("Criando dim_date...")
    df_jobs = spark.read.table(f"{silver_schema}.jobs")  # dates come from jobs

    # One row per distinct listing date. NULL dates are excluded: they would
    # only add a member whose attributes are all NULL and whose key can never
    # be matched by an equality join (NULL = NULL is not true in Spark SQL).
    distinct_dates = (
        df_jobs.select(col("listing_date").alias("date"))
        .where(col("date").isNotNull())
        .distinct()
    )

    # NOTE: monotonically_increasing_id() produces unique but non-consecutive
    # ids that are not guaranteed to be the same from one run to the next, so
    # date_id is only meaningful together with the dim_date table written here.
    dim_date = distinct_dates.select(
        monotonically_increasing_id().alias("date_id"),
        col("date"),
        year(col("date")).alias("year"),
        month(col("date")).alias("month"),
        dayofmonth(col("date")).alias("day"),
        quarter(col("date")).alias("quarter"),
        date_format(col("date"), "MMMM").alias("month_name"),
    )

    # Full overwrite of the Gold table, schema included.
    (
        dim_date.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(f"{gold_schema}.dim_date")
    )
    print("SUCESSO: dim_date criada.")

except Exception as e:
    # Record the failure and keep going so the remaining tables are attempted.
    msg = f"ERRO ao criar dim_date: {e}"
    print(msg)
    erros_gold.append(msg)

# --- CRIAR TABELAS FATO ---

# Main fact table: fact_jobs
try:
    print("Criando fact_jobs...")
    # Read the inputs again so this step does not rely on earlier variables.
    df_jobs = spark.read.table(f"{silver_schema}.jobs")
    df_salary = spark.read.table(f"{silver_schema}.salary_ranges")
    # dim_date is read back from the Gold table (just created or pre-existing).
    dim_date = spark.read.table(f"{gold_schema}.dim_date")

    # Inner join: only jobs whose listing_date equals a dim_date.date are kept.
    jobs_with_date = df_jobs.join(
        dim_date,
        on=df_jobs.listing_date == dim_date.date,
        how="inner",
    )
    # Left join: jobs without a matching salary range keep NULL salary columns.
    jobs_with_salary = jobs_with_date.join(
        df_salary,
        on=df_jobs.salary_range_id == df_salary.id,
        how="left",
    )

    # Keys are qualified with "jobs." because the joined tables also expose
    # an `id` column.
    fact_jobs = jobs_with_salary.select(
        col("jobs.id").alias("job_id"),
        col("date_id"),
        col("jobs.company_id"),
        col("jobs.location_id"),
        col("jobs.employment_type_id"),
        col("job_title"),
        col("min_salary"),
        col("max_salary"),
        col("avg_salary"),
    )

    # Full overwrite of the Gold table, schema included.
    (
        fact_jobs.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(f"{gold_schema}.fact_jobs")
    )
    print("SUCESSO: fact_jobs criada.")

except Exception as exc:
    # Record the failure and keep going so the remaining tables are attempted.
    msg = f"ERRO ao criar fact_jobs: {exc}"
    print(msg)
    erros_gold.append(msg)

# Bridge fact table: fact_job_skills
try:
    print("Criando fact_job_skills...")
    df_job_skills = spark.read.table(f"{silver_schema}.job_skills")

    # Only the two key columns are carried over from the Silver table.
    fact_job_skills = df_job_skills.select(col("job_id"), col("skill_id"))

    # Full overwrite of the Gold table, schema included.
    (
        fact_job_skills.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(f"{gold_schema}.fact_job_skills")
    )
    print("SUCESSO: fact_job_skills criada.")

except Exception as exc:
    # Record the failure so it is included in the final report.
    msg = f"ERRO ao criar fact_job_skills: {exc}"
    print(msg)
    erros_gold.append(msg)

# Final report: succeed quietly, or list every collected error and fail the run.
print("\n---------------------------------------------------")
if not erros_gold:
    print("Processo da camada Gold finalizado com SUCESSO TOTAL.")
else:
    print(f"O processo Gold terminou com {len(erros_gold)} erros:")
    # One collected message per line.
    print("\n".join(erros_gold))
    print("---------------------------------------------------")

    # Raising marks the whole run as failed after all errors were printed.
    raise Exception("Falha no processamento da camada Gold. Verifique os logs acima.")