In [0]:
# Declare the text widgets that receive the job parameters (each defaults to "").
# NOTE(review): the AWS credentials arrive as plain widget values — consider a
# Databricks secret scope instead; confirm with the job owner.
for _widget_name in ("AWS_ACCESS_KEY", "AWS_SECRET_ACCESS_KEY"):
    dbutils.widgets.text(_widget_name, "")


In [0]:
# Per-kind configuration: for every kind of data, the column of the recently-played
# table that holds its id, plus the API endpoint and the storage locations derived
# from the kind name.

types = {
    "artists": {"ids": "first_artist_id"},
    "albums": {"ids": "album_id"},
    "tracks": {"ids": "track_id"},
}

endpoints = {}
bronze_paths = {}
silver_paths = {}

for kind_name in types:
    # API endpoint used to fetch this kind.
    endpoints[kind_name] = f"https://api.spotify.com/v1/{kind_name}"
    # Key prefix of the raw (bronze) files for this kind.
    bronze_paths[kind_name] = f"bronze/{kind_name}/"
    # Location of the silver table for this kind.
    silver_paths[kind_name] = f"s3://my-spotify-delta-lakehouse/{kind_name}/silver/"




In [0]:
# ------------*** IMPORTANT ***---------------
#
# KIND selects which kind of data this run requests from the API.
# It is used as the lookup key into the per-kind dictionaries
# (`types`, `endpoints`, `bronze_paths`, `silver_paths`).

KIND = "artists"

In [0]:
# Display the name of the id column configured for the selected KIND.
types[KIND]["ids"]

In [0]:
# Read the distinct ids of the selected KIND from the silver recently-played table.
# NULL ids are filtered out in SQL: a None would otherwise be turned into the
# string "None" when the ids are joined into the API request.

id_column = types[KIND]["ids"]

played_id_rows = spark.sql(f"""
SELECT DISTINCT {id_column}
FROM my_spotify.silver.user_recent_played
WHERE {id_column} IS NOT NULL
""").collect()

# Unwrap every collected Row into its plain id value.
played_unique_ids = [row[id_column] for row in played_id_rows]


In [0]:
# Collect the ids already present in the silver table, so they can be excluded
# from the API call.

from delta.tables import DeltaTable

# Open the table only after confirming that the path holds a Delta table:
# DeltaTable.forPath raises when it does not, so calling it before the check
# would make the fallback branch below unreachable (e.g. on the very first run).
if DeltaTable.isDeltaTable(spark, silver_paths[KIND]):
    dim_table = DeltaTable.forPath(spark, silver_paths[KIND])
    exist_id_rows = dim_table.toDF().select("id").distinct().collect()
    exist_ids = [row.id for row in exist_id_rows]
else:
    # No silver table yet: no id is known, so every played id counts as new.
    dim_table = None
    exist_ids = []

In [0]:
# Keep only the played ids that are not in the silver table yet (order preserved).
# Membership is tested against a set, so the filter stays linear instead of
# scanning the whole `exist_ids` list once per played id.
known_ids = set(exist_ids)
ids_to_call = [value for value in played_unique_ids if value not in known_ids]

In [0]:
# Publish the ids that are about to be requested as a job task value,
# so they can be tested later.
new_ids_task_key = f"new_{KIND}_ids_to_call"
dbutils.jobs.taskValues.set(key=new_ids_task_key, value=ids_to_call)

In [0]:
# Nothing new to fetch: report it and end the notebook with the "SKIPPED" exit value.
nothing_to_fetch = not ids_to_call
if nothing_to_fetch:
    print("✅ No new IDs to process — skipping API calls.")
    dbutils.notebook.exit("SKIPPED")

In [0]:
# Fetching the token from the upstream token refresh task.
# The debug value doubles as the "no real token" sentinel.
# NOTE(review): debugValue is presumably only handed back when the notebook runs
# outside of a job — confirm against the taskValues documentation.

TOKEN_SENTINEL = "Invalid_token"

fresh_token = dbutils.jobs.taskValues.get(taskKey="REFRESH_TOKEN", key="SPOTIFY_TOKEN", debugValue=TOKEN_SENTINEL)

# Explicit check instead of a bare `assert`: an assert carries no message here and is
# removed when Python runs with -O. An empty token is rejected as well.
if not fresh_token or fresh_token == TOKEN_SENTINEL:
    raise ValueError(
        "No valid Spotify token received from task 'REFRESH_TOKEN' (key 'SPOTIFY_TOKEN')."
    )

In [0]:
import boto3
import json
import datetime as dt


# SETTING VARIABLES FOR API CALL

endpoint = endpoints[KIND]

headers = {
    "Authorization": f"Bearer {fresh_token}"
}

# A single call accepts a limited number of ids — 20 for albums, 50 for the other
# kinds — so the ids have to be split into batches.
max_batch = 20 if KIND == "albums" else 50
batch_size = max_batch

# Slicing in steps of `batch_size` already yields one single batch when everything
# fits, so no special case is needed. An empty id list yields no batches at all,
# rather than one empty batch that would trigger a request with an empty `ids`.
batches = [
    ids_to_call[start:start + batch_size]
    for start in range(0, len(ids_to_call), batch_size)
]

    

In [0]:
# Authenticated Session AWS
# The credentials are read from the notebook widgets "AWS_ACCESS_KEY" and
# "AWS_SECRET_ACCESS_KEY"; the S3 helper functions below create their clients
# from this session.
# NOTE(review): widget values are plain job parameters — consider reading the
# credentials from a secret scope instead; confirm with the job owner.
session = boto3.Session(
    aws_access_key_id=dbutils.widgets.get("AWS_ACCESS_KEY"),
    aws_secret_access_key=dbutils.widgets.get("AWS_SECRET_ACCESS_KEY"),
)
# Defining write function
def write_to_s3(data, bucket_name, path, date, number):
    """Serialize `data` as JSON and upload it as one S3 object.

    The object key is built as ``{path}{date}/{date}{number}.json``.

    Args:
        data: JSON-serializable payload to store.
        bucket_name: Name of the destination bucket.
        path: Key prefix, placed in front of the date folder as-is.
        date: Date string, used both as folder name and as file-name prefix.
        number: Value appended to the date in the file name.

    Returns:
        The key of the object that was written.
    """
    object_key = f"{path}{date}/{date}{number}.json"
    s3_client = session.client('s3')
    payload = json.dumps(data)
    s3_client.put_object(Bucket=bucket_name, Key=object_key, Body=payload)
    return object_key


def extract_ids_from_bronze(bucket, key, kind):
    """Read a stored JSON response back from S3 and return the ids it contains.

    Args:
        bucket: Name of the bucket holding the object.
        key: Key of the JSON object to read.
        kind: "tracks", "albums" or "artists"; it is also the name of the
            top-level list inside the JSON document.

    Returns:
        The "id" of every item in the list stored under `kind`, in document
        order. Null entries in that list are skipped. An empty list is returned
        for any other `kind`, or when the document has no list under `kind`.
    """
    s3 = session.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=key)
    content = obj["Body"].read().decode("utf-8")
    data = json.loads(content)

    if kind not in ("tracks", "albums", "artists"):
        return []

    # The supported kinds only differ by the name of the top-level list,
    # which is the kind itself.
    items = data.get(kind) or []

    # A response can hold null in place of an id that was not found;
    # subscripting such an entry would raise, so those entries are skipped.
    return [item["id"] for item in items if item is not None]



In [0]:
import requests

#   START API CALL
today = dt.datetime.today().strftime("%Y-%m-%d")

# Ids found in the files written to bronze, accumulated over all batches.
all_ids_written = []

for i, ids_group in enumerate(batches):

    # Different ids for every call
    parameters = {
        "ids": ",".join(str(value) for value in ids_group)
    }

    # `market` is only sent for the kinds other than artists. The kind name must be
    # spelled exactly like the KIND values, otherwise this check is always true.
    if KIND != "artists":
        parameters["market"] = "IT"

    # API CALL (with a timeout, so a stalled connection cannot hang the task)
    resp = requests.get(endpoint, headers=headers, params=parameters, timeout=30)
    # Stop on an HTTP error instead of storing the error payload in bronze
    # as if it were data.
    resp.raise_for_status()
    data = resp.json()

    # WRITING RAW DATA INTO S3 RAW BUCKET BRONZE
    # `written_data` holds the key of the object that was just written.
    written_data = write_to_s3(data, "my-raw-spotify-data", bronze_paths[KIND], today, i)

    # Read the file back and collect the ids it actually contains.
    batch_ids = extract_ids_from_bronze("my-raw-spotify-data", written_data, KIND)
    all_ids_written.extend(batch_ids)

In [0]:
# Publish the list of ids found in the files written to bronze as a job task value,
# so it can be tested later.
written_ids_task_key = f"new_{KIND}_ids_written"
dbutils.jobs.taskValues.set(key=written_ids_task_key, value=all_ids_written)
