In [0]:
import base64
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests

# ==============================
# Autenticação
# ==============================
# Função para pegar token
def get_spotify_access_token(client_id, client_secret, timeout=10):
    """Request an access token from Spotify using the client-credentials grant.

    Args:
        client_id: Spotify application client id.
        client_secret: Spotify application client secret.
        timeout: Seconds to wait for the token endpoint. Without a timeout,
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A tuple ``(access_token, expiry_timestamp)``. The expiry is expressed
        on the ``time.time()`` clock and is set 60 seconds earlier than the
        lifetime reported by Spotify, so callers refresh before the token
        actually lapses.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    url = "https://accounts.spotify.com/api/token"
    # HTTP Basic credentials: base64("client_id:client_secret").
    credentials = f"{client_id}:{client_secret}".encode()
    headers = {"Authorization": "Basic " + base64.b64encode(credentials).decode()}
    data = {"grant_type": "client_credentials"}

    response = requests.post(url, headers=headers, data=data, timeout=timeout)
    response.raise_for_status()
    token_info = response.json()

    # Subtract a 60 second safety margin from the reported lifetime.
    expires_at = time.time() + token_info["expires_in"] - 60
    return token_info["access_token"], expires_at


In [0]:
# Função para renovar token se necessário
def get_valid_token(client_id, client_secret, token, token_expiry):
    """Return a usable token, fetching a new one when the current one has expired.

    Args:
        client_id: Spotify application client id, used only if a refresh is needed.
        client_secret: Spotify application client secret, used only if a refresh is needed.
        token: The access token currently held by the caller.
        token_expiry: Expiry of ``token`` on the ``time.time()`` clock.

    Returns:
        A tuple ``(token, token_expiry)``: the inputs unchanged while the
        token is still valid, otherwise a freshly requested pair.
    """
    expired = time.time() >= token_expiry
    if not expired:
        return token, token_expiry

    print("Token expirou. Renovando...")
    fresh_token, fresh_expiry = get_spotify_access_token(client_id, client_secret)
    return fresh_token, fresh_expiry


In [0]:
# ==============================
# Função para buscar track + artista
# ==============================
# Função para buscar música (tratando rate limit e token expirado)
def get_spotify_track_info(track_name, client_id, client_secret, token, token_expiry,
                           max_retries=5, timeout=10):
    """Search Spotify for a track by name, handling rate limits and stale tokens.

    Args:
        track_name: Track title, sent as the ``track:`` field filter of the search query.
        client_id: Spotify application client id (used when the token must be renewed).
        client_secret: Spotify application client secret (used when the token must be renewed).
        token: Access token currently held by the caller.
        token_expiry: Expiry of ``token`` on the ``time.time()`` clock.
        max_retries: Maximum number of request attempts before giving up. Keeps
            a persistent 401/429 or network failure from looping forever.
        timeout: Seconds to wait for each search request.

    Returns:
        A tuple ``(items, token, token_expiry)``. ``items`` is the list found
        under ``tracks.items`` in the response (at most one entry, because the
        search uses ``limit=1``), or an empty list on an unexpected status or
        when the attempts are exhausted. The token pair is returned because it
        may have been renewed during the call.
    """
    url = "https://api.spotify.com/v1/search"
    params = {"q": f"track:{track_name}", "type": "track", "limit": 1}

    for _attempt in range(max_retries):
        token, token_expiry = get_valid_token(client_id, client_secret, token, token_expiry)
        headers = {"Authorization": f"Bearer {token}"}

        try:
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as exc:
            # A transient network failure (including a timeout) should not
            # abort a long-running batch; count it as a failed attempt.
            print(f"Falha de rede para '{track_name}': {exc}")
            continue

        if response.status_code == 200:
            return response.json().get("tracks", {}).get("items", []), token, token_expiry

        if response.status_code == 429:  # Rate limited
            try:
                retry_after = int(response.headers.get("Retry-After", "5"))
            except ValueError:
                # Header present but not an integer number of seconds.
                retry_after = 5
            print(f"Rate limited. Aguardando {retry_after} segundos...")
            time.sleep(retry_after)

        elif response.status_code == 401:  # Invalid or expired token
            print("Token inválido/expirado. Renovando...")
            token, token_expiry = get_spotify_access_token(client_id, client_secret)

        else:
            print(f"Erro {response.status_code} para '{track_name}'")
            return [], token, token_expiry

    print(f"Desistindo de '{track_name}' após {max_retries} tentativas")
    return [], token, token_expiry

In [0]:
# Initial authentication.
# The Spotify credentials are read from the environment rather than written
# into the notebook: anything hardcoded here leaks through version control,
# exports and shared workspaces. The client id/secret pair that was previously
# embedded in this cell should be rotated in the Spotify developer dashboard.
# NOTE(review): on Databricks a secret scope (dbutils.secrets.get) is a possible
# alternative source — confirm what this workspace provides.
client_id = os.environ["SPOTIFY_CLIENT_ID"]
client_secret = os.environ["SPOTIFY_CLIENT_SECRET"]
token, token_expiry = get_spotify_access_token(client_id, client_secret)

In [0]:
# ==============================
# Carregar dados do Spark
# ==============================
# Load the distinct (Track, Artist) pairs from the Silver table.
rows = spark.sql("SELECT DISTINCT Track, Artist FROM workspace.silver.classic_hit").collect()

# Only the track title is sent to the Spotify search, so a title that appears
# under several artists would otherwise trigger the identical request (and the
# identical stored result) once per artist. dict.fromkeys removes repeats while
# keeping first-seen order; null titles are skipped because they would be
# formatted into the query as the literal text "None".
tracks_list = list(dict.fromkeys(row["Track"] for row in rows if row["Track"] is not None))
print(f"Total de músicas: {len(tracks_list)}")


In [0]:
# ==============================
# Search Spotify sequentially, batch by batch (not parallel) - 8 hours of execution
# ==============================
# Process the track list in smaller batches so each batch can be persisted
# as soon as it is finished.
batch_size = 500
all_results = []

batch_starts = range(0, len(tracks_list), batch_size)
for batch_number, start in enumerate(batch_starts, start=1):
    batch = tracks_list[start:start + batch_size]
    print(f"Processando batch {batch_number} ({len(batch)} músicas)...")

    # Search one track at a time; the token pair is threaded through every
    # call because the search function may renew it.
    batch_results = []
    for track in batch:
        found, token, token_expiry = get_spotify_track_info(
            track, client_id, client_secret, token, token_expiry
        )
        batch_results += found

    # Convert to a Spark DataFrame (one JSON string per result) and save
    # incrementally; a batch with no results writes nothing.
    if batch_results:
        records = [{"json": json.dumps(item)} for item in batch_results]
        df = pd.DataFrame(records)
        spark_df = spark.createDataFrame(df)

        # Append the batch to the bronze-layer Delta table.
        (
            spark_df.write
            .format("delta")
            .mode("append")
            .saveAsTable("workspace.bronze.search_track_artist")
        )

    all_results += batch_results

print("Finalizado. Total salvo:", len(all_results))

In [0]:
%sql
-- Inspect the raw search results stored in the bronze table (one JSON string per row).
SELECT * FROM workspace.bronze.search_track_artist;

-- Commented-out reset statement: running it drops the bronze table. The loader
-- writes in append mode, so re-running the load without a reset adds the rows again.
--DROP TABLE workspace.bronze.search_track_artist;