# In [0]:
from pyspark.sql.functions import col, monotonically_increasing_id, year, month, dayofmonth, quarter, date_format
from pyspark.sql.window import Window

# --- PATH DEFINITIONS ---
# Root folders of the Silver (input) and Gold (output) layers. Both end with a
# slash because table names are appended to them directly.
silver_base_path = "/mnt/datalake/silver/"
gold_base_path = "/mnt/datalake/gold/"

print("Iniciando processo da camada Gold...")


# --- LOAD SILVER LAYER TABLES ---
def _load_silver_table(table_name):
    """Return the Delta table stored at ``silver_base_path + table_name``."""
    return spark.read.format("delta").load(f"{silver_base_path}{table_name}")


df_jobs = _load_silver_table("jobs")
df_companies = _load_silver_table("companies")
df_locations = _load_silver_table("locations")
df_salary = _load_silver_table("salary_ranges")
df_industries = _load_silver_table("industries")
df_skills = _load_silver_table("skills")
df_job_skills = _load_silver_table("job_skills")
df_employment_types = _load_silver_table("employment_types")

# --- CREATE DIMENSIONS ---

# 1. dim_companies
# Companies enriched with the name of their industry. The left join keeps
# companies whose industry_id has no match in the industries table.
print("Criando dim_companies...")
dim_companies = (
    df_companies
    .join(
        df_industries,
        df_companies["industry_id"] == df_industries["id"],
        "left",
    )
    .select(
        # Both inputs expose an `id` column and neither DataFrame carries a
        # "companies" alias, so the column is referenced through its source
        # DataFrame instead of the unresolvable name "companies.id".
        df_companies["id"].alias("company_id"),
        col("company_name"),
        col("company_rating"),
        col("industry_name"),
    )
)
dim_companies.write.format("delta").mode("overwrite").save(f"{gold_base_path}dim_companies")
print("dim_companies criada.")

# 2. dim_locations
# Straight projection of the Silver locations table; only `id` is renamed.
print("Criando dim_locations...")
location_columns = [
    df_locations["id"].alias("location_id"),
    df_locations["city"],
    df_locations["state_abbr"],
]
dim_locations = df_locations.select(*location_columns)
dim_locations.write.save(
    f"{gold_base_path}dim_locations",
    format="delta",
    mode="overwrite",
)
print("dim_locations criada.")

# 3. dim_skills
# Straight projection of the Silver skills table; only `id` is renamed.
print("Criando dim_skills...")
dim_skills = df_skills.select(
    df_skills["id"].alias("skill_id"),
    df_skills["skill_name"],
)
skills_writer = dim_skills.write.format("delta").mode("overwrite")
skills_writer.save(f"{gold_base_path}dim_skills")
print("dim_skills criada.")

# 4. dim_date
# One row per distinct job listing date, plus calendar attributes derived
# from that date and a generated surrogate key (date_id).
print("Criando dim_date...")
dim_date_path = f"{gold_base_path}dim_date"
dim_date = df_jobs.select(col("listing_date").alias("date")).distinct() \
    .withColumn("date_id", monotonically_increasing_id()) \
    .withColumn("year", year(col("date"))) \
    .withColumn("month", month(col("date"))) \
    .withColumn("day", dayofmonth(col("date"))) \
    .withColumn("quarter", quarter(col("date"))) \
    .withColumn("month_name", date_format(col("date"), "MMMM")) \
    .select("date_id", "date", "year", "month", "day", "quarter", "month_name")
dim_date.write.format("delta").mode("overwrite").save(dim_date_path)
# monotonically_increasing_id() is non-deterministic (its values depend on how
# rows land in partitions), and the lazy plan above would be evaluated again by
# any later action. Re-bind dim_date to the table that was just written so
# every downstream join sees exactly the date_id values that were persisted.
dim_date = spark.read.format("delta").load(dim_date_path)
print("dim_date criada.")

# --- CREATE FACT TABLES ---

# Main fact table: fact_jobs
# One row per job, carrying the date key, the company/location/employment-type
# keys and the salary figures of the job's salary range.
print("Criando fact_jobs...")
fact_jobs = (
    df_jobs
    # Inner join: jobs whose listing_date has no equal `date` in dim_date
    # (including a NULL listing_date, which never compares equal) are dropped.
    .join(dim_date, df_jobs["listing_date"] == dim_date["date"], "inner")
    # Left join: jobs without a matching salary range are kept.
    .join(df_salary, df_jobs["salary_range_id"] == df_salary["id"], "left")
    .select(
        # df_jobs carries no "jobs" alias and `id` exists in both df_jobs and
        # df_salary, so the job columns are referenced through df_jobs itself
        # instead of unresolvable names such as "jobs.id".
        df_jobs["id"].alias("job_id"),
        dim_date["date_id"],
        df_jobs["company_id"],
        df_jobs["location_id"],
        df_jobs["employment_type_id"],
        col("job_title"),
        col("min_salary"),
        col("max_salary"),
        col("avg_salary"),
    )
)
fact_jobs.write.format("delta").mode("overwrite").save(f"{gold_base_path}fact_jobs")
print("fact_jobs criada.")


# Bridge (linking) fact table: fact_job_skills
# Relates one job to its multiple skills.
print("Criando fact_job_skills...")
fact_job_skills = df_job_skills.select("job_id", "skill_id")
fact_job_skills.write.save(
    f"{gold_base_path}fact_job_skills",
    format="delta",
    mode="overwrite",
)
print("fact_job_skills criada.")


print("Processo da camada Gold finalizado.")