	# Author: Alexander Staub
	## Last changed: 2025.06.26
	## Purpose: the template for each worker to access the chartmetric characteristics endpoint


In [None]:
#installing packages
import time
import requests
import logging
import pandas as pd
from pprint import pprint
from datetime import datetime
import json
import os

In [2]:
# Route log records (INFO and above) to a file, one timestamped line per record.
logging.basicConfig(
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.INFO,
    filename='chartmetric_api_metadata.log',
)

In [3]:
# Base URL of the Chartmetric API; endpoint paths are appended to it.
HOST = 'https://api.chartmetric.com'

# The refresh token lives in a local text file next to the notebook;
# surrounding whitespace (e.g. a trailing newline) is stripped.
with open("chartmetric_refresh_token.txt", "r") as token_file:
    REFRESH_TOKEN = token_file.read().strip()

In [4]:
# Request an access token from /api/token using the refresh token.
# The timeout keeps the notebook from hanging forever if the connection stalls.
token_response = requests.post(
    f'{HOST}/api/token',
    json={'refreshtoken': REFRESH_TOKEN},
    timeout=30,
)

# Anything other than 200 means no usable token: log it and stop.
if token_response.status_code != 200:
    logging.error(f"Token retrieval error: {token_response.status_code}")
    raise Exception(f"Error: received {token_response.status_code} from /api/token")

# Extract the access token from the response body.
access_token = token_response.json()['token']

# Shared header dict for all API calls; get_request updates it in place
# when it has to refresh the token.
headers = {'Authorization': f'Bearer {access_token}'}

# Defining the get_request

Robust request logic that:
- backs off for a max of 26 hours in retries
- logs all errors it encounters


In [5]:


# --- Robust get_request Function ---
def _rate_limit_wait_seconds(reset_header, attempt, max_retries, backoff,
                             total_wait_limit=26 * 3600):
    """Return how many seconds to pause after a 429 before the next attempt.

    Must only be called when at least one further attempt remains
    (``attempt <= max_retries - 2``).
    """
    if reset_header:
        # Assumes the header carries an epoch timestamp in seconds, as the
        # original code did -- TODO confirm against the API documentation.
        try:
            wait = int(float(reset_header)) - int(time.time())
        except (ValueError, OverflowError):
            wait = None  # unparsable header: fall back to the schedule below
        if wait is not None:
            # A reset time that is already due gives no usable wait, so use
            # the current backoff; never exceed the overall wait budget.
            return min(wait, total_wait_limit) if wait > 0 else backoff

    # No usable header: split the wait budget over the pauses *between*
    # attempts with exponentially growing shares, so that all pauses together
    # add up to exactly total_wait_limit.
    total_weight = sum(2 ** i for i in range(max_retries - 1))
    return total_wait_limit * (2 ** attempt / total_weight)


def get_request(endpoint, params=None, max_retries=5, timeout=30):
    """Send an authenticated GET request to the API, retrying on failure.

    Args:
        endpoint: Path appended to ``HOST`` (e.g. ``"/api/track/123"``).
        params: Optional query parameters passed to ``requests.get``.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds to wait on each HTTP call before it counts as a
            network error.

    Returns:
        The decoded JSON body of the first response with status 200.

    Raises:
        Exception: If refreshing the token after a 401 fails, if the API
            answers with a non-retryable status (anything other than
            200/401/429 below 500), or if all attempts are used up.
    """
    backoff = 1  # seconds; doubled after every failed attempt

    for attempt in range(max_retries):
        # Pausing after the final attempt would only delay the error.
        retries_left = max_retries - attempt - 1

        try:
            response = requests.get(
                f"{HOST}{endpoint}", headers=headers, params=params, timeout=timeout
            )
        except requests.RequestException as ex:
            logging.error(f"Network error on attempt {attempt+1} for {endpoint}: {ex}")
            if retries_left:
                time.sleep(backoff)
            backoff *= 2
            continue

        status = response.status_code

        # Log only the rate-limit headers, as the message label says.
        rate_limit_headers = {
            name: value
            for name, value in response.headers.items()
            if "ratelimit" in name.lower()
        }
        logging.info(f"Request to {endpoint} returned {status}. RateLimit headers: {rate_limit_headers}")

        if status == 200:
            return response.json()

        if status == 401:
            # Token may have expired; refresh it and update the shared headers.
            logging.warning(f"401 error for {endpoint}. Refreshing token.")
            token_response = requests.post(
                f'{HOST}/api/token',
                json={'refreshtoken': REFRESH_TOKEN},
                timeout=timeout,
            )
            if token_response.status_code != 200:
                logging.error(f"Token refresh failed: {token_response.status_code}")
                raise Exception(f"Token refresh failed with status {token_response.status_code}")
            headers['Authorization'] = f"Bearer {token_response.json()['token']}"
            wait = backoff

        elif status == 429:
            # Rate limit exceeded.
            if not retries_left:
                logging.warning(f"429 error for {endpoint} on final attempt {attempt+1}/{max_retries}.")
                break
            wait = _rate_limit_wait_seconds(
                response.headers.get("X-RateLimit-Reset"), attempt, max_retries, backoff
            )
            logging.warning(f"429 error for {endpoint}. Sleeping for {wait} seconds (attempt {attempt+1}/{max_retries}).")

        elif status >= 500:
            # Server error; wait and retry.
            logging.warning(f"Server error {status} for {endpoint}. Retrying after {backoff} seconds.")
            wait = backoff

        else:
            # Any other status is treated as non-retryable.
            logging.error(f"Error {status} for {endpoint}: {response.text}")
            raise Exception(f"Error: received {status} from {endpoint}")

        if retries_left:
            time.sleep(wait)
        backoff *= 2

    # Every attempt failed without returning.
    raise Exception(f"Max retries exceeded for endpoint {endpoint}")

# Code to retrieve the song metadata from Chartmetric

- create the get request
- run the loop over each chartmetric id
- save the response for later parsing

In [8]:
# --- Function to Retrieve song characteristics from Chartmetric ID ---
def get_songchars_ids(chartmetric_id):
    """Fetch the API response of ``/api/track/<chartmetric_id>``.

    Args:
        chartmetric_id: Track ID to look up. Integral floats (e.g. ``123.0``)
            are converted to ``int`` before the URL is built.

    Returns:
        Whatever ``get_request`` returns for the endpoint, unchanged, or
        ``None`` if the request raised an exception.
    """
    # IDs that went through a float pandas column arrive as e.g. 123.0 and
    # would be formatted as ".../track/123.0" instead of ".../track/123".
    if isinstance(chartmetric_id, float) and chartmetric_id.is_integer():
        chartmetric_id = int(chartmetric_id)

    endpoint = f"/api/track/{chartmetric_id}"
    try:
        song_chars = get_request(endpoint)
    except Exception as e:
        # Deliberately best-effort: one failing ID is logged, not fatal.
        logging.error(f"Failed to get song chars for Chartmetric ID {chartmetric_id}: {e}")
        return None

    logging.info(f"Successfully retrieved song chars for Chartmetric ID {chartmetric_id}")
    return song_chars

In [None]:
# --- Cell 7: WORKER CONFIGURATION AND EXECUTION ---
# Takes the place of the original main processing loop.

# ================== WORKER-SPECIFIC CONFIGURATION ==================
# The one value that differs between the copied worker notebooks:
# worker 1 uses 1, worker 2 uses 2, and so on.
PART_NUMBER = 3
# ===================================================================


# --- Throttling shared across all workers ---
# Number of parallel workers; must match the controller script.
NUM_WORKERS = 3
# Combined request budget (requests per second) for all workers together.
TOTAL_RATE_LIMIT = 3.5
# Seconds each worker spends per request so that, with every worker taking an
# equal share, the combined rate stays within TOTAL_RATE_LIMIT.
TIME_PER_REQUEST = 1 / (TOTAL_RATE_LIMIT / NUM_WORKERS)

### WHAT THIS CHANGE DOES:
# This is the most important change for avoiding rate limits. It ensures that the sum of all
# workers' requests does not exceed your total allowance.

Of course. This is a perfect use case for parallel processing. Running multiple scripts is the correct way to utilize your approved rate limit when individual API calls are slow.

Here is a complete guide to refactoring your Jupyter Notebook into a robust, 3-worker parallel processing system. I will first outline the potential issues as you requested, and then detail every necessary change to your notebook code.
Thinking Through the Issues: A Pre-Analysis

Before we change the code, let's address the challenges and design a system to handle them:

    Rate Limiting (The Most Critical Issue): You are allowed 3.5 requests/second in total across all scripts. If you run 3 workers and each one tries to hit 3.5 req/s, your IP address will be sending over 10 req/s. You will be instantly rate-limited (429 errors), and your get_request function's backoff logic will kick in, grinding everything to a halt.
        Solution: We must divide the rate limit. Each of the 3 workers will be configured to target only its share of the total rate (i.e., 3.5 / 3 ≈ 1.17 req/s each). This ensures the combined rate stays below the limit.

    Duplicate Queries & State Management: Your concern about duplicate queries is valid. If three scripts read the same master list, they will all request the same IDs. If they write to the same checkpoint file, they will corrupt it by overwriting each other's progress (a "race condition").
        Solution: We will use a "Controller/Worker" pattern. A single "Controller" script will pre-process the master list of IDs, splitting it into 3 exclusive, non-overlapping files. Each of the 3 "Worker" scripts will then be assigned one file. This completely eliminates the possibility of duplicate queries or race conditions, as they operate in separate sandboxes.

    Robust Checkpointing & Resuming: Your current checkpoint logic overwrites the file every 100 rows. However, it doesn't have logic to load from the checkpoint and resume if the script is stopped and restarted.
        Solution: We will significantly improve this. The worker script will now check if a checkpoint file exists upon starting. If it does, it will load the previously saved responses and skip to the correct starting row in its input file. This makes the entire process resilient to crashes or interruptions.

    Jupyter Notebook Environment: Running parallel tasks from a single notebook file is complex. The strategy of using separate files (worker_1.ipynb, worker_2.ipynb, etc.) is the most practical approach.
        Solution: The guide will show you how to create one master template (worker_template.ipynb) and then how to run it as three separate processes from your command line.

Here is the step-by-step refactoring of your notebook.
Part 1: The Controller Notebook (00_controller.ipynb)

Create a new, separate notebook with this code. Its only purpose is to prepare the data for your workers. You only need to run it once.
Python

# FILE: 00_controller.ipynb
# Purpose: clean the master list of Chartmetric IDs once and split it into one
# non-overlapping input file per worker.

import pandas as pd
import numpy as np
import os

# --- CONFIGURATION ---
NUM_WORKERS = 3  # The number of parallel scripts you want to run
# Path to the full, original dataset
MASTER_INPUT_FILE = "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_ids/chartmetric_ids_sample.csv"
# Directory where the split input files are saved
WORKER_INPUT_DIR = "//bigdata.wu.ac.at/delpero/Data_alexander/data/incidental/chartmetric/worker_inputs/"

print(f"Creating worker input directory: {WORKER_INPUT_DIR}")
os.makedirs(WORKER_INPUT_DIR, exist_ok=True)

# --- Load and Prepare the Master Dataset ---
print(f"Loading master dataset from {MASTER_INPUT_FILE}...")
master_df = pd.read_csv(MASTER_INPUT_FILE)

# Clean the data ONCE before splitting: no missing IDs, no repeated IDs.
master_df = (
    master_df.dropna(subset=["chartmetric_ids"])
    .drop_duplicates(subset="chartmetric_ids")
    .reset_index(drop=True)
)

# Missing values force pandas to read the ID column as float, which would be
# written back as "123.0". Restore integers when every remaining ID is whole.
ids = master_df["chartmetric_ids"]
if ids.dtype.kind == "f" and (ids % 1 == 0).all():
    master_df["chartmetric_ids"] = ids.astype("int64")

print(f"Loaded and cleaned {len(master_df)} unique IDs.")

# --- Split the DataFrame into Chunks for Each Worker ---
# Split row positions rather than the DataFrame itself: np.array_split on a
# DataFrame goes through the deprecated DataFrame.swapaxes.
row_positions = np.array_split(np.arange(len(master_df)), NUM_WORKERS)
id_chunks = [master_df.iloc[positions] for positions in row_positions]

# --- Save Each Chunk to its Own File ---
# Produces ids_part_1.csv, ids_part_2.csv, ...; each holds a disjoint subset
# of the IDs.
for part_number, chunk in enumerate(id_chunks, start=1):
    output_path = os.path.join(WORKER_INPUT_DIR, f"ids_part_{part_number}.csv")
    chunk.to_csv(output_path, index=False)
    print(f"Saved chunk {part_number} with {len(chunk)} IDs to {output_path}")

print("\nController script finished. You can now run the worker notebooks.")

Part 2: The Worker Notebook Template (worker_template.ipynb)

This is the refactored version of your main notebook. Save this as a template. You will make copies of it to run in parallel.
Python

# FILE: worker_template.ipynb

# --- Cell 1: Imports and Logging ---
import time
import requests
import logging
import pandas as pd
from pprint import pprint
from datetime import datetime
import json
import os # <-- Make sure os is imported

# --- Cell 2: Logging Configuration ---
# Log records (INFO and above) go to a file, one timestamped line per record.
logging.basicConfig(
    filename='chartmetric_api_metadata.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s'
)

# --- Cell 3: API Host and Token ---
HOST = 'https://api.chartmetric.com'
# The refresh token is read from a local text file; whitespace is stripped.
with open("chartmetric_refresh_token.txt", "r") as f:
    REFRESH_TOKEN = f.read().strip()

# --- Cell 4: Retrieve Access Token ---
# The timeout keeps the notebook from hanging forever if the connection stalls.
token_response = requests.post(
    f'{HOST}/api/token',
    json={'refreshtoken': REFRESH_TOKEN},
    timeout=30,
)
if token_response.status_code != 200:
    logging.error(f"Token retrieval error: {token_response.status_code}")
    raise Exception(f"Error: received {token_response.status_code} from /api/token")
access_token = token_response.json()['token']
# Shared header dict for all API calls; get_request updates it in place
# when it has to refresh the token.
headers = {'Authorization': f'Bearer {access_token}'}

# --- Cell 5: get_request Function (UNMODIFIED) ---
# This robust function is perfect as-is.
def _rate_limit_wait_seconds(reset_header, attempt, max_retries, backoff,
                             total_wait_limit=26 * 3600):
    """Return how many seconds to pause after a 429 before the next attempt.

    Must only be called when at least one further attempt remains
    (``attempt <= max_retries - 2``).
    """
    if reset_header:
        # Assumes the header carries an epoch timestamp in seconds, as the
        # original code did -- TODO confirm against the API documentation.
        try:
            wait = int(float(reset_header)) - int(time.time())
        except (ValueError, OverflowError):
            wait = None  # unparsable header: fall back to the schedule below
        if wait is not None:
            # A reset time that is already due gives no usable wait, so use
            # the current backoff; never exceed the overall wait budget.
            return min(wait, total_wait_limit) if wait > 0 else backoff

    # No usable header: split the wait budget over the pauses *between*
    # attempts with exponentially growing shares, so that all pauses together
    # add up to exactly total_wait_limit.
    total_weight = sum(2 ** i for i in range(max_retries - 1))
    return total_wait_limit * (2 ** attempt / total_weight)


def get_request(endpoint, params=None, max_retries=5, timeout=30):
    """Send an authenticated GET request to the API, retrying on failure.

    Args:
        endpoint: Path appended to ``HOST`` (e.g. ``"/api/track/123"``).
        params: Optional query parameters passed to ``requests.get``.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds to wait on each HTTP call before it counts as a
            network error.

    Returns:
        The decoded JSON body of the first response with status 200.

    Raises:
        Exception: If refreshing the token after a 401 fails, if the API
            answers with a non-retryable status (anything other than
            200/401/429 below 500), or if all attempts are used up.
    """
    backoff = 1  # seconds; doubled after every failed attempt

    for attempt in range(max_retries):
        # Pausing after the final attempt would only delay the error.
        retries_left = max_retries - attempt - 1

        try:
            response = requests.get(
                f"{HOST}{endpoint}", headers=headers, params=params, timeout=timeout
            )
        except requests.RequestException as ex:
            logging.error(f"Network error on attempt {attempt+1} for {endpoint}: {ex}")
            if retries_left:
                time.sleep(backoff)
            backoff *= 2
            continue

        status = response.status_code

        # Log only the rate-limit headers, as the message label says.
        rate_limit_headers = {
            name: value
            for name, value in response.headers.items()
            if "ratelimit" in name.lower()
        }
        logging.info(f"Request to {endpoint} returned {status}. RateLimit headers: {rate_limit_headers}")

        if status == 200:
            return response.json()

        if status == 401:
            # Token may have expired; refresh it and update the shared headers.
            logging.warning(f"401 error for {endpoint}. Refreshing token.")
            token_response = requests.post(
                f'{HOST}/api/token',
                json={'refreshtoken': REFRESH_TOKEN},
                timeout=timeout,
            )
            if token_response.status_code != 200:
                logging.error(f"Token refresh failed: {token_response.status_code}")
                raise Exception(f"Token refresh failed with status {token_response.status_code}")
            headers['Authorization'] = f"Bearer {token_response.json()['token']}"
            wait = backoff

        elif status == 429:
            # Rate limit exceeded.
            if not retries_left:
                logging.warning(f"429 error for {endpoint} on final attempt {attempt+1}/{max_retries}.")
                break
            wait = _rate_limit_wait_seconds(
                response.headers.get("X-RateLimit-Reset"), attempt, max_retries, backoff
            )
            logging.warning(f"429 error for {endpoint}. Sleeping for {wait} seconds (attempt {attempt+1}/{max_retries}).")

        elif status >= 500:
            # Server error; wait and retry.
            logging.warning(f"Server error {status} for {endpoint}. Retrying after {backoff} seconds.")
            wait = backoff

        else:
            # Any other status is treated as non-retryable.
            logging.error(f"Error {status} for {endpoint}: {response.text}")
            raise Exception(f"Error: received {status} from {endpoint}")

        if retries_left:
            time.sleep(wait)
        backoff *= 2

    # Every attempt failed without returning.
    raise Exception(f"Max retries exceeded for endpoint {endpoint}")


# --- Cell 6: get_songchars_ids Function (UNMODIFIED) ---
# This function is also perfect as-is.
def get_songchars_ids(chartmetric_id):
    """Fetch the API response of ``/api/track/<chartmetric_id>``.

    Args:
        chartmetric_id: Track ID to look up. Integral floats (e.g. ``123.0``)
            are converted to ``int`` before the URL is built.

    Returns:
        Whatever ``get_request`` returns for the endpoint, unchanged, or
        ``None`` if the request raised an exception.
    """
    # IDs that went through a float pandas column arrive as e.g. 123.0 and
    # would be formatted as ".../track/123.0" instead of ".../track/123".
    if isinstance(chartmetric_id, float) and chartmetric_id.is_integer():
        chartmetric_id = int(chartmetric_id)

    endpoint = f"/api/track/{chartmetric_id}"
    try:
        response = get_request(endpoint)
    except Exception as e:
        # Deliberately best-effort: one failing ID is logged, not fatal.
        logging.error(f"Failed to get song chars for Chartmetric ID {chartmetric_id}: {e}")
        return None

    logging.info(f"Successfully retrieved song chars for Chartmetric ID {chartmetric_id}")
    return response


# --- Cell 7: WORKER CONFIGURATION AND EXECUTION ---
# Processes this worker's share of the IDs with throttling and resumable
# checkpoints.

# ================== WORKER-SPECIFIC CONFIGURATION ==================
# This is the ONLY line to edit in each copied worker notebook.
# NOTE(review): an earlier section of this file assigns PART_NUMBER = 3; if
# both are executed, this later assignment wins -- confirm which is intended.
PART_NUMBER = 1  # For worker 1, set to 1. For worker 2, set to 2, etc.
# ===================================================================

# --- Rate limit shared across workers ---
TOTAL_RATE_LIMIT = 3.5
NUM_WORKERS = 3  # Must match the controller script
# Each worker gets an equal share of the rate limit, so the workers together
# stay within TOTAL_RATE_LIMIT requests per second.
TIME_PER_REQUEST = 1 / (TOTAL_RATE_LIMIT / NUM_WORKERS)

# --- File paths derived from PART_NUMBER ---
WORKER_INPUT_DIR = "//bigdata.wu.ac.at/delpero/Data_alexander/data/incidental/chartmetric/worker_inputs/"
METADATA_OUTPUT_DIR = "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_chars/"

INPUT_FILE = os.path.join(WORKER_INPUT_DIR, f"ids_part_{PART_NUMBER}.csv")
# Each worker gets its own output directory to prevent file conflicts
CHECKPOINT_DIR = os.path.join(METADATA_OUTPUT_DIR, f"part_{PART_NUMBER}")
CHECKPOINT_FILE = os.path.join(CHECKPOINT_DIR, "song_chars_checkpoint.json")

os.makedirs(CHECKPOINT_DIR, exist_ok=True)


def _save_checkpoint(responses, path):
    """Write ``responses`` to ``path`` as JSON without ever leaving a partial file."""
    # Dump to a sibling temp file first and swap it in afterwards, so a crash
    # in the middle of the dump cannot corrupt the checkpoint used to resume.
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(responses, f, indent=2)
    os.replace(tmp_path, path)


# --- Load worker-specific data ---
print(f"WORKER {PART_NUMBER}: Loading data from {INPUT_FILE}")
worker_df = pd.read_csv(INPUT_FILE)

# --- Checkpoint loading and resuming ---
song_chars_responses = []
if os.path.exists(CHECKPOINT_FILE):
    print(f"WORKER {PART_NUMBER}: Found existing checkpoint. Loading previous responses...")
    with open(CHECKPOINT_FILE, "r") as f:
        song_chars_responses = json.load(f)

# The responses list holds exactly one entry per input row handled so far
# (None for failed or skipped rows), so its length is the row to resume from.
start_row = len(song_chars_responses)
print(f"WORKER {PART_NUMBER}: Resuming from row {start_row} of {len(worker_df)}.")

# --- Main Processing Loop ---
checkpoint_interval = 100
# Iterate over the ID column itself rather than iterrows(): iterrows() builds
# one Series per row and does not preserve dtypes, which can turn integer IDs
# into floats. A missing column fails fast here with a KeyError.
pending_ids = worker_df["chartmetric_ids"].iloc[start_row:]
for idx, chartmetric_id in pending_ids.items():
    loop_start_time = time.time()

    print(f"WORKER {PART_NUMBER} | Processing row {idx}: Chartmetric ID = {chartmetric_id}")
    logging.info(f"WORKER {PART_NUMBER} | Processing row {idx}: Chartmetric ID = {chartmetric_id}")

    if pd.isnull(chartmetric_id):
        # The controller drops missing IDs, so this is only a safety net.
        # A None placeholder is still recorded so responses stay aligned
        # with input rows, which resuming by list length depends on.
        print(f"Row {idx} has no Chartmetric ID. Skipping.")
        logging.info(f"Row {idx} has no Chartmetric ID. Skipping.")
        song_chars_responses.append(None)
    else:
        try:
            song_chars = get_songchars_ids(chartmetric_id)
        except Exception as e:
            print(f"WORKER {PART_NUMBER} | Error processing Chartmetric ID {chartmetric_id} at row {idx}: {e}")
            logging.error(f"WORKER {PART_NUMBER} | Error processing Chartmetric ID {chartmetric_id} at row {idx}: {e}")
            song_chars = None

        # Append the response (or None) to our list.
        song_chars_responses.append(song_chars)

        # --- Dynamic throttling ---
        # Sleep only for whatever is left of this worker's time slot; if the
        # request already took longer than the slot, do not sleep at all.
        sleep_duration = TIME_PER_REQUEST - (time.time() - loop_start_time)
        if sleep_duration > 0:
            time.sleep(sleep_duration)

    # --- Checkpointing ---
    if len(song_chars_responses) % checkpoint_interval == 0:
        _save_checkpoint(song_chars_responses, CHECKPOINT_FILE)
        print(f"WORKER {PART_NUMBER} | Checkpoint saved at row {idx}")
        logging.info(f"WORKER {PART_NUMBER} | Checkpoint saved at row {idx}")

# --- Final Save ---
print(f"WORKER {PART_NUMBER}: Loop finished. Saving final data...")
_save_checkpoint(song_chars_responses, CHECKPOINT_FILE)
print(f"WORKER {PART_NUMBER}: All tasks complete.")