In [0]:
# Leitura das tabelas Silver
from pyspark.sql import functions as F

# Read the two Silver tables that feed the Gold layer.
SILVER_API_TABLE = "workspace.silver.search_track_artist"
SILVER_CSV_TABLE = "workspace.silver.classic_hit"

df_api = spark.table(SILVER_API_TABLE)
df_csv = spark.table(SILVER_CSV_TABLE)

In [0]:
# Standardize column names on both inputs.
# Only renames happen here; no column types are cast in this cell.

# CSV-side columns: current name -> standardized name, applied in this order.
csv_column_renames = {
    "track": "track_name",
    "artist": "artist_name",
    "Duration": "duration_ms",
    "Popularity": "csv_popularity",
}
for old_name, new_name in csv_column_renames.items():
    df_csv = df_csv.withColumnRenamed(old_name, new_name)

# API-side columns: give popularity a source-specific name so it cannot be
# confused with the CSV popularity once both inputs are joined.
df_api = df_api.withColumnRenamed("popularity", "api_popularity")

In [0]:
# Join the CSV and API inputs on normalized track and artist names.
def _normalized(column_name):
    """Return the named column trimmed and lower-cased, for use as a join key."""
    return F.lower(F.trim(F.col(column_name)))


# Both predicates must hold for a CSV row to match an API row.
same_track = _normalized("csv.track_name") == _normalized("api.track_name")
same_artist = _normalized("csv.artist_name") == _normalized("api.artist_name")

# The "csv" / "api" aliases let later cells disambiguate same-named columns.
df_join = df_csv.alias("csv").join(
    df_api.alias("api"),
    on=same_track & same_artist,
    how="inner",
)

In [0]:
# Build the analytical columns for the Gold layer.
MS_PER_MINUTE = 60000
HIT_POPULARITY_THRESHOLD = 70  # a track is flagged as a hit strictly above this

# Derived columns, added in this order on top of the joined DataFrame.
derived_columns = {
    "duration_min": F.col("api.duration_ms") / MS_PER_MINUTE,
    "popularity_diff": F.col("api_popularity") - F.col("csv_popularity"),
    "dance_score": F.col("Danceability") * (1 - F.col("Speechiness")),
    # Rows that do not satisfy the comparison fall through to "Não".
    "is_hit": F.when(
        F.col("api_popularity") > HIT_POPULARITY_THRESHOLD, F.lit("Sim")
    ).otherwise("Não"),
}

# Final projection; "csv." / "api." prefixes pick the side of the join.
gold_columns = [
    "api.track_id",
    "csv.track_name",
    "csv.artist_name",
    "csv.Genre",
    "csv.year",
    "api.album_name",
    "api.album_type",
    "api.album_image_url",
    "csv.Danceability",
    "csv.Speechiness",
    "csv_popularity",
    "api_popularity",
    "popularity_diff",
    "duration_min",
    "dance_score",
    "is_hit",
]

enriched = df_join
for column_name, expression in derived_columns.items():
    enriched = enriched.withColumn(column_name, expression)

df_gold = enriched.select(*gold_columns)

In [0]:
# Write the final Gold table in Delta format, overwriting on each run.
GOLD_TRACKS_TABLE = "workspace.gold.tracks_analytics"

gold_writer = df_gold.write.format("delta").mode("overwrite")
gold_writer.saveAsTable(GOLD_TRACKS_TABLE)

In [0]:
# Aggregated view by genre.

# Per-genre metrics: mean API popularity, mean danceability, and row count.
genre_metrics = [
    F.avg("api_popularity").alias("avg_popularity"),
    F.avg("Danceability").alias("avg_danceability"),
    F.count("*").alias("num_tracks"),
]
df_genres = df_gold.groupBy("Genre").agg(*genre_metrics)

# Write the summary in Delta format, overwriting on each run.
(
    df_genres.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.gold.genres_summary")
)

In [0]:
%sql
-- Show every row and column of the per-genre summary table.
SELECT * FROM workspace.gold.genres_summary;