	# Author: Alexander Staub
	## Last changed: 2025.06.26
	## Purpose: the template for each worker to access the chartmetric characteristics endpoint


In [None]:
#installing packages
import time
import requests
import logging
import pandas as pd
from pprint import pprint
from datetime import datetime
import json
import os

In [None]:
# Send log records of level INFO and above to this worker's log file.
# Each record is written as "<timestamp> <LEVEL>: <message>".
# NOTE(review): the file name is hard-coded to worker 2 — adjust it when this
# notebook is copied for another worker so the workers do not share one log.
logging.basicConfig(
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.INFO,
    filename='chartmetric_api_metadata_worker_2.log',
)

In [3]:
# Base URL of the Chartmetric API; endpoint paths are appended to it.
HOST = 'https://api.chartmetric.com'

# The refresh token is read from a local text file; surrounding whitespace
# (e.g. a trailing newline) is stripped.
with open("chartmetric_refresh_token.txt", "r") as token_file:
    REFRESH_TOKEN = token_file.read().strip()

In [4]:
# Retrieve an access token using the refresh token.
# The timeout keeps the notebook from hanging indefinitely if the API never
# answers; a timed-out request raises a requests exception instead.
token_response = requests.post(
    f'{HOST}/api/token',
    json={'refreshtoken': REFRESH_TOKEN},
    timeout=60,
)

# Abort early if no token was issued: every later request needs one.
if token_response.status_code != 200:
    # Log the error and raise an exception
    logging.error(f"Token retrieval error: {token_response.status_code}")
    raise Exception(f"Error: received {token_response.status_code} from /api/token")

# Extract the access token from the response
access_token = token_response.json()['token']

# Headers sent with every API request. get_request() updates this dict in
# place when it has to refresh an expired token.
headers = {'Authorization': f'Bearer {access_token}'}

# Defining the get_request

Robust request logic that:
- backs off for a max of 26 hours in retries
- logs all errors it encounters


In [5]:


# --- Robust get_request Function ---
def get_request(endpoint, params=None, max_retries=5, timeout=60):
    """Send a GET request to the Chartmetric API, retrying transient failures.

    Retried conditions: network errors, 401 (the access token is refreshed
    first), 429 (rate limit) and 5xx server errors. Any other non-200 status
    is treated as permanent and raises immediately.

    Args:
        endpoint: Path appended to ``HOST``, e.g. ``"/api/track/123"``.
        params: Optional query parameters forwarded to ``requests.get``.
        max_retries: Maximum number of attempts before giving up.
        timeout: Per-attempt timeout in seconds handed to ``requests``; an
            attempt that times out is handled like any other network error.

    Returns:
        The decoded JSON body of the first response with status 200.

    Raises:
        Exception: If the token refresh is rejected, a non-retryable status
            code is received, or all ``max_retries`` attempts fail.
        requests.exceptions.RequestException: If the token-refresh request
            itself fails at the network level.
    """
    backoff = 1  # seconds; doubled after every failed attempt
    total_wait_limit = 26 * 3600  # fallback 429 wait budget in seconds (26 hours)

    for attempt in range(max_retries):
        try:
            response = requests.get(
                f"{HOST}{endpoint}",
                headers=headers,
                params=params,
                timeout=timeout,
            )
        except requests.exceptions.RequestException as ex:
            logging.error(f"Network error on attempt {attempt+1} for {endpoint}: {ex}")
            time.sleep(backoff)
            backoff *= 2
            continue

        # Log the response status code and the response headers
        logging.info(f"Request to {endpoint} returned {response.status_code}. RateLimit headers: {response.headers}")

        if response.status_code == 200:
            return response.json()

        # 401: the access token may have expired; refresh it and retry
        elif response.status_code == 401:
            logging.warning(f"401 error for {endpoint}. Refreshing token.")
            token_response = requests.post(
                f'{HOST}/api/token',
                json={'refreshtoken': REFRESH_TOKEN},
                timeout=timeout,
            )
            if token_response.status_code != 200:
                logging.error(f"Token refresh failed: {token_response.status_code}")
                raise Exception(f"Token refresh failed with status {token_response.status_code}")
            new_token = token_response.json()['token']
            # Mutate the shared dict so every later request uses the new token.
            headers['Authorization'] = f'Bearer {new_token}'
            time.sleep(backoff)
            backoff *= 2

        # 429: rate limit exceeded; wait and retry
        elif response.status_code == 429:
            sleep_time = None
            reset_timestamp = response.headers.get("X-RateLimit-Reset")
            if reset_timestamp:
                try:
                    # Wait until the reset time announced by the API.
                    sleep_time = int(reset_timestamp) - int(time.time())
                except ValueError:
                    # A malformed header must not abort the retry loop;
                    # fall through to the computed schedule below.
                    logging.warning(f"Unparseable X-RateLimit-Reset header {reset_timestamp!r} for {endpoint}.")
                else:
                    if sleep_time < 0:
                        # Reset time already passed: use the normal backoff.
                        sleep_time = backoff
            if sleep_time is None:
                # No usable wait time from the API. Attempt k waits a share of
                # 2**k out of the total weight 1 + 2 + ... + 2**(max_retries-1),
                # so the waits of all attempts add up to total_wait_limit.
                total_weight = 2 ** max_retries - 1
                sleep_time = total_wait_limit * (2 ** attempt) / total_weight
            logging.warning(f"429 error for {endpoint}. Sleeping for {sleep_time} seconds (attempt {attempt+1}/{max_retries}).")
            time.sleep(sleep_time)
            backoff *= 2

        # 5xx: server error; wait and retry
        elif response.status_code >= 500:
            logging.warning(f"Server error {response.status_code} for {endpoint}. Retrying after {backoff} seconds.")
            time.sleep(backoff)
            backoff *= 2

        # Any other status is not retryable.
        else:
            logging.error(f"Error {response.status_code} for {endpoint}: {response.text}")
            raise Exception(f"Error: received {response.status_code} from {endpoint}")

    # The loop completed without a successful response.
    raise Exception(f"Max retries exceeded for endpoint {endpoint}")

# Code to retrieve the song metadata from Chartmetric

- create the get request
- run the loop over each chartmetric id
- save the response for later parsing

In [8]:
# --- Function to Retrieve song characteristics from Chartmetric ID ---
def get_songchars_ids(chartmetric_id):
    """Fetch the track record for one Chartmetric ID.

    Calls ``get_request`` on ``/api/track/<chartmetric_id>``.

    Args:
        chartmetric_id: Identifier interpolated into the endpoint path.

    Returns:
        The API response exactly as returned by ``get_request``, or ``None``
        if the request raised any exception (the failure is logged).
    """
    track_endpoint = f"/api/track/{chartmetric_id}"
    try:
        payload = get_request(track_endpoint)
        logging.info(f"Successfully retrieved song chars for Chartmetric ID {chartmetric_id}")
        # The API response is handed back unmodified.
        return payload
    except Exception as err:
        logging.error(f"Failed to get song chars for Chartmetric ID {chartmetric_id}: {err}")
    return None

In [None]:
# --- Cell 7: WORKER CONFIGURATION AND EXECUTION ---
# This cell replaces your original main processing loop.

# ================== WORKER-SPECIFIC CONFIGURATION ==================
# Set this per worker notebook: it selects the input part this worker reads
# and the directory its checkpoints are written to (see the paths below).
# NOTE(review): the log file name in the logging setup is hard-coded
# separately and is not derived from PART_NUMBER.
PART_NUMBER = 2  # For worker 1, set to 1. For worker 2, set to 2, etc.
# ===================================================================

# --- Rate limiting ---
# Request budget shared by all workers.
# NOTE(review): presumably requests per second — confirm against the API plan.
TOTAL_RATE_LIMIT = 3.5
NUM_WORKERS = 3  # Must match the controller script
# Each worker gets an equal share of the rate limit. TIME_PER_REQUEST is the
# minimum time one loop iteration should take so that the workers' combined
# request rate does not exceed TOTAL_RATE_LIMIT.
TIME_PER_REQUEST = 1 / (TOTAL_RATE_LIMIT / NUM_WORKERS) 

# --- File locations, derived from PART_NUMBER ---
WORKER_INPUT_DIR = "//bigdata.wu.ac.at/delpero/Data_alexander/data/incidental/chartmetric/worker_inputs/"
METADATA_OUTPUT_DIR = "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_chars/"

# Input CSV for this worker, e.g. `ids_part_1.csv` when PART_NUMBER is 1.
INPUT_FILE = os.path.join(WORKER_INPUT_DIR, f"ids_part_{PART_NUMBER}.csv")
# Each worker gets its own output directory to prevent file conflicts
CHECKPOINT_DIR = os.path.join(METADATA_OUTPUT_DIR, f"part_{PART_NUMBER}") 
# JSON file holding the list of responses collected so far.
CHECKPOINT_FILE = os.path.join(CHECKPOINT_DIR, "song_chars_checkpoint.json")

# Create the checkpoint directory if needed; an existing directory is not an error.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)


# --- Load this worker's share of the IDs ---
print(f"WORKER {PART_NUMBER}: Loading data from {INPUT_FILE}")
worker_df = pd.read_csv(INPUT_FILE)


# --- Restore earlier progress, if any ---
# Start from an empty list unless a checkpoint from a previous run exists.
if not os.path.exists(CHECKPOINT_FILE):
    song_chars_responses = []
else:
    print(f"WORKER {PART_NUMBER}: Found existing checkpoint. Loading previous responses...")
    with open(CHECKPOINT_FILE, "r") as checkpoint_fh:
        song_chars_responses = json.load(checkpoint_fh)

# The number of responses already saved is taken as the position of the next
# row to process, so a restarted worker continues where it stopped instead of
# re-requesting IDs it has already handled.
start_row = len(song_chars_responses)
print(f"WORKER {PART_NUMBER}: Resuming from row {start_row} of {len(worker_df)}.")

# --- Checkpoint helper ---
def _save_checkpoint(responses, path):
    """Write *responses* to *path* as JSON without clobbering a good checkpoint.

    The data is written to a temporary sibling file first and only then moved
    over the target with ``os.replace``. If the process is interrupted while
    writing, the previous checkpoint file is still intact and loadable.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(responses, f, indent=2)
    os.replace(tmp_path, path)


# --- Main Processing Loop (with .iloc for resuming) ---
# Invariant: every processed row appends exactly one entry to
# song_chars_responses (None when the ID is missing or the request failed).
# Resuming uses the list length as the next row position, so skipping an
# append would shift every later response by one row after a restart.
checkpoint_interval = 100
# Use .iloc[start_row:] to slice the dataframe and start from the correct place
# NOTE(review): iterrows() does not preserve column dtypes, so an integer ID
# can arrive as a float (e.g. 123.0) if the CSV has other numeric columns —
# verify the input file layout.
for idx, row in worker_df.iloc[start_row:].iterrows():
    loop_start_time = time.time()

    chartmetric_id = row.get("chartmetric_ids")
    # --- Add worker ID to logging for clarity ---
    print(f"WORKER {PART_NUMBER} | Processing row {idx}: Chartmetric ID = {chartmetric_id}")
    logging.info(f"WORKER {PART_NUMBER} | Processing row {idx}: Chartmetric ID = {chartmetric_id}")

    if pd.isnull(chartmetric_id):
        # Safety net: the controller script is expected to have dropped rows
        # without an ID already. No request is made, so no throttling either,
        # but a placeholder is still recorded to keep the list aligned.
        print(f"Row {idx} has no Chartmetric ID. Skipping.")
        logging.info(f"Row {idx} has no Chartmetric ID. Skipping.")
        song_chars = None
    else:
        try:
            song_chars = get_songchars_ids(chartmetric_id)
        except Exception as e:
            print(f"WORKER {PART_NUMBER} | Error processing Chartmetric ID {chartmetric_id} at row {idx}: {e}")
            logging.error(f"WORKER {PART_NUMBER} | Error processing Chartmetric ID {chartmetric_id} at row {idx}: {e}")
            song_chars = None

        # --- DYNAMIC SLEEP LOGIC ---
        # Sleep only for whatever is left of this worker's per-request time
        # slot; if the request (including any retries) already took longer
        # than TIME_PER_REQUEST, continue immediately.
        elapsed_time = time.time() - loop_start_time
        sleep_duration = TIME_PER_REQUEST - elapsed_time
        if sleep_duration > 0:
            time.sleep(sleep_duration)

    # Append the response (or None) to our list.
    song_chars_responses.append(song_chars)

    # --- Checkpointing Logic ---
    # Save whenever the number of collected responses hits a multiple of the interval.
    if len(song_chars_responses) % checkpoint_interval == 0:
        _save_checkpoint(song_chars_responses, CHECKPOINT_FILE)
        print(f"WORKER {PART_NUMBER} | Checkpoint saved at row {idx}")
        logging.info(f"WORKER {PART_NUMBER} | Checkpoint saved at row {idx}")

# --- Final Save ---
print(f"WORKER {PART_NUMBER}: Loop finished. Saving final data...")
_save_checkpoint(song_chars_responses, CHECKPOINT_FILE)
print(f"WORKER {PART_NUMBER}: All tasks complete.")