In [0]:
from pyspark.sql import functions as F
import json

In [0]:
from my_schemas import tracks_schema as schema

In [0]:
%sql
-- Make my_spotify the current catalog so the two-part name below resolves inside it.
USE CATALOG my_spotify;
-- Register silver.tracks as a Delta table at the given S3 location, unless it already exists.
-- No column list is declared here.
-- NOTE(review): presumably the table schema comes from an existing Delta log at this path or
-- from the first streaming write into the table — confirm.
CREATE TABLE IF NOT EXISTS silver.tracks
USING DELTA
LOCATION 's3://my-spotify-delta-lakehouse/tracks/silver/';


In [0]:
# Storage roots: the raw JSON landing area (bronze) and the Delta table directory (silver).
bronze_path = "s3://my-raw-spotify-data/bronze/tracks/"
silver_path = "s3://my-spotify-delta-lakehouse/tracks/silver/"

# Auto Loader ("cloudFiles") reader settings: multi-line JSON input, with the
# schema-tracking directory kept under the bronze path.
autoloader_options = {
    "cloudFiles.format": "json",
    "multiLine": "true",
    "cloudFiles.schemaLocation": bronze_path + "_schema",
}

# Streaming read of the bronze files, using the explicitly imported `schema`.
df = (
    spark.readStream.format("cloudFiles")
    .options(**autoloader_options)
    .schema(schema)
    .load(bronze_path)
)

# One output row per element of the `tracks` column.
# NOTE(review): assumes `tracks` is an array of structs in the imported schema — confirm.
explode_df = df.select(F.explode("tracks").alias("tracks"))

# Promote the selected struct fields to top-level columns, keeping their names and order.
flat_df = explode_df.select(
    *[
        F.col("tracks." + field).alias(field)
        for field in (
            "id",
            "name",
            "duration_ms",
            "popularity",
            "explicit",
            "track_number",
        )
    ]
)

# Append the flattened rows to the silver Delta table. The checkpoint lives under the
# silver path, mergeSchema lets the write add columns to the table, and the
# availableNow trigger processes the data currently available and then stops.
# Left as a bare expression so the notebook displays the returned StreamingQuery.
(
    flat_df.writeStream.format("delta")
    .option("checkpointLocation", silver_path + "_checkpoint")
    .option("mergeSchema", "true")
    .trigger(availableNow=True)
    .toTable("silver.tracks")
)