In [0]:
from pyspark.sql import functions as F
import json

In [0]:
from my_schemas import artists_schema as schema

In [0]:
# Source directory holding the raw artist JSON files.
artist_path = "s3://my-raw-spotify-data/bronze/artists/"

# Reader settings for the "cloudFiles" streaming source: the files are JSON,
# a single record may span several lines, and the schema-tracking directory
# is placed under the source path itself.
autoloader_options = {
    "cloudFiles.format": "json",
    "multiLine": "true",
    "cloudFiles.schemaLocation": artist_path + "_schema",
}

# Streaming DataFrame over the source directory, read with the imported schema.
df = (
    spark.readStream.format("cloudFiles")
    .options(**autoloader_options)
    .schema(schema)
    .load(artist_path)
)

In [0]:
# Turn the `artists` array column into one row per array element; the
# resulting column keeps the name `artists`.
explode_df = df.select(
    F.explode(F.col("artists")).alias("artists")
)



In [0]:
# Project each exploded `artists` struct into flat, explicitly named columns.
flat_df = explode_df.select(
    F.col("artists.id").alias("artist_id"),
    F.col("artists.name").alias("artist_name"),
    F.col("artists.type").alias("artist_type"),
    F.col("artists.popularity").alias("popularity"),
    F.col("artists.followers.total").alias("followers_total"),
    # --- GENRES (array<string>) ---
    # Only read positions that exist; a missing slot yields NULL.
    F.when(F.size("artists.genres") > 0, F.col("artists.genres")[0]).alias("genre_1"),
    F.when(F.size("artists.genres") > 1, F.col("artists.genres")[1]).alias("genre_2"),
    # --- IMAGES (array of structs with a `url` field) ---
    # The alias has to be applied to the column *inside* select(). Closing
    # select() first and then calling .alias() renames the DataFrame instead,
    # leaving the URL column with an auto-generated name.
    # The index is guarded the same way as the genres so artists with fewer
    # than two images produce NULL rather than an out-of-range access.
    F.when(
        F.size("artists.images") > 1,
        F.col("artists.images")[1]["url"],
    ).alias("image_1_medium"),
)

In [0]:
%skip
# Ad-hoc preview of the flattened artist rows.
# NOTE(review): %skip appears to be the notebook's skip-cell magic, so this
# preview is presumably not executed on a full run — confirm.
display(flat_df)

In [0]:
%sql
-- Switch to the `my_spotify` catalog, then make sure the `silver.artists`
-- table exists before anything writes to it.
-- No columns are declared here. NOTE(review): presumably the schema is
-- supplied by the streaming write, which sets mergeSchema — confirm.
USE CATALOG my_spotify;
CREATE TABLE IF NOT EXISTS silver.artists;



In [0]:
# Base location for this stream's bookkeeping; the checkpoint directory is
# placed directly underneath it.
silver_artists_path = "s3://my-spotify-delta-lakehouse/artists/"

# Write the flattened artists into the `silver.artists` Delta table.
# Changes from the previous version of this cell:
#   * The chain is wrapped in parentheses instead of using backslash
#     continuations; trailing whitespace after a backslash is a SyntaxError.
#   * The `.path(...)` call is removed: DataStreamWriter has no such method.
#     NOTE(review): the write targets the table by name via toTable(); if the
#     data must live at a specific location, declare that where the table is
#     created — confirm the intended storage location.
#   * `mergeSchema` is set once instead of twice.
artists_query = (
    flat_df.writeStream
    .format("delta")
    .option("checkpointLocation", silver_artists_path + "_checkpoint")
    # Allow the incoming columns to be merged into the table's schema.
    .option("mergeSchema", "true")
    # Process everything currently available, then stop.
    .trigger(availableNow=True)
    .toTable("silver.artists")
)

# Block until the availableNow run has finished so later cells read a
# completed write and any streaming failure surfaces in this cell.
artists_query.awaitTermination()

In [0]:
%skip
%sql
-- Ad-hoc check of the rows in silver.artists.
-- NOTE(review): the %skip magic above presumably keeps this cell from
-- running on a full run — confirm.
SELECT * FROM silver.artists
