In [0]:
from pyspark.sql import functions as F
import json

In [0]:
from my_schemas import albums_schema as schema

In [0]:
%sql
-- Make my_spotify the current catalog so that silver.albums below resolves inside it.
USE CATALOG my_spotify;

-- Register the silver albums Delta table at its S3 location.
-- IF NOT EXISTS makes the statement safe to re-run.
CREATE TABLE IF NOT EXISTS silver.albums
  USING DELTA
  LOCATION 's3://my-spotify-delta-lakehouse/albums/silver/';


In [0]:
bronze_path = "s3://my-raw-spotify-data/bronze/albums/"
silver_path = "s3://my-spotify-delta-lakehouse/albums/silver/"

# Read the raw album JSON files as a stream using Auto Loader ("cloudFiles").
# The schema is supplied explicitly from my_schemas; multiLine is enabled so a
# JSON document may span several lines.
df = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("multiLine", "true")
    .option("cloudFiles.schemaLocation", bronze_path + "_schema")
    .schema(schema)
    .load(bronze_path)
)

# Explode the `albums` array: one output row per array element.
explode_df = df.select(F.explode("albums").alias("albums"))

# Flatten the album struct into top-level columns.
flat_df = explode_df.select(
    F.col("albums.id").alias("id"),
    F.col("albums.name").alias("name"),
    F.col("albums.type").alias("type"),
    F.col("albums.label").alias("label"),
    F.col("albums.popularity").alias("popularity"),
    F.col("albums.total_tracks").alias("total_tracks"),
    F.col("albums.release_date").alias("release_date"),
    F.col("albums.release_date_precision").alias("release_date_precision"),
    # Index 1 is the second entry of the images array; the alias implies it is
    # the medium-sized image -- TODO confirm against the source payload.
    # The alias must be applied to the column, inside select(); applying it
    # after the closing parenthesis would alias the DataFrame instead.
    F.col("albums.images")[1]["url"].alias("image_url_medium"),
)

# Write to the silver Delta table. The chain is wrapped in parentheses rather
# than joined with backslashes, so a stray blank line cannot cut it short.
query = (
    flat_df.writeStream
    .format("delta")
    .option("checkpointLocation", silver_path + "_checkpoint")
    .option("mergeSchema", "true")
    .trigger(availableNow=True)
    .toTable("silver.albums")
)

# The availableNow trigger processes the data currently available and then
# stops; block here so later cells only run once the write has finished.
query.awaitTermination()

In [0]:
%sql
-- Show when each operation ran on silver.albums and with which parameters.
-- The last select item must not be followed by a comma before FROM.
SELECT
  timestamp,
  operation,
  operationParameters
FROM (DESCRIBE HISTORY silver.albums);



