In [0]:
from pyspark.sql import functions as F
import json

In [0]:
from my_schemas import artists_schema as schema

In [0]:
%sql
-- Make my_spotify the current catalog so the unqualified `silver.artists`
-- below resolves inside it.
USE CATALOG my_spotify;

-- Register the silver artists Delta table at its S3 location (no-op if the
-- table already exists). No column list is declared here.
CREATE TABLE IF NOT EXISTS silver.artists
    USING DELTA
    LOCATION 's3://my-spotify-delta-lakehouse/artists/silver/';


In [0]:
# Source (bronze, raw JSON) and target (silver, Delta) storage locations.
bronze_path = "s3://my-raw-spotify-data/bronze/artists/"
silver_path = "s3://my-spotify-delta-lakehouse/artists/silver/"

# Reader options for the "cloudFiles" streaming source: the input files are
# multi-line JSON documents, and the source's schema-tracking directory is
# kept next to the bronze data.
autoloader_options = {
    "cloudFiles.format": "json",
    "multiLine": "true",
    "cloudFiles.schemaLocation": f"{bronze_path}_schema",
}

# Streaming read of the bronze files using the explicit schema imported from
# my_schemas.
df = (
    spark.readStream.format("cloudFiles")
    .options(**autoloader_options)
    .schema(schema)
    .load(bronze_path)
)


# Explode the top-level `artists` array: one output row per array element,
# exposed as a single struct column named `artists`.
explode_df = df.select(F.explode("artists").alias("artists"))

# Flatten the struct into plain top-level columns.
# The array lookups are wrapped in F.when(size > idx, ...) so a missing element
# yields NULL (no otherwise() branch) instead of relying on out-of-bounds
# indexing behaviour.
flat_df = explode_df.select(
    F.col("artists.id").alias("id"),
    F.col("artists.name").alias("name"),
    F.col("artists.type").alias("type"),
    F.col("artists.popularity").alias("popularity"),
    F.col("artists.followers.total").alias("followers_total"),
    # --- GENRES (array<string>) --- keep only the first two entries.
    F.when(F.size("artists.genres") > 0, F.col("artists.genres")[0]).alias("genre_1"),
    F.when(F.size("artists.genres") > 1, F.col("artists.genres")[1]).alias("genre_2"),
    # --- IMAGES --- URL of the second image entry.
    # NOTE(review): presumably index 1 is the medium-sized image - confirm
    # against the source payload ordering.
    # The alias must be applied to the column, inside select(); previously a
    # misplaced ")" aliased the whole DataFrame and left this column with an
    # auto-generated name.
    F.when(
        F.size("artists.images") > 1,
        F.col("artists.images")[1]["url"],
    ).alias("image_1_medium"),
)

# Stream the flattened rows into the silver Delta table.
# The checkpoint directory is kept under the silver path.
checkpoint_path = silver_path + "_checkpoint"

silver_writer = (
    flat_df.writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_path)
    # Let the write add columns to the target table's schema.
    .option("mergeSchema", "true")
    # Process the data that is currently available, then stop.
    .trigger(availableNow=True)
)

# Starts the query; the returned StreamingQuery is the cell's output.
silver_writer.toTable("silver.artists")

In [0]:

%sql
-- Show the Delta transaction history of the silver artists table.
DESCRIBE HISTORY my_spotify.silver.artists;



