	# Author: Alexander Staub
	## Last changed: 2025.02.13
	## Purpose: Getting the chartmetrics IDs for a list of songs


In [2]:
# Importing packages
import time
import requests
import logging
import pandas as pd
from pprint import pprint
from datetime import datetime
import json
import os
import numpy as np

# CHANGE ACCORDING TO WORKER

In [None]:
# ================== WORKER-SPECIFIC CONFIGURATION ==================
# This is the only value to edit in each copied worker notebook.
# PART_NUMBER is used further down to build the log file name, the input
# file name (ids_part_<N>.csv) and the per-worker checkpoint directory.
PART_NUMBER = 1  # For worker 1, set to 1. For worker 2, set to 2, etc.
# ===================================================================


In [None]:
# Set up logging: messages at INFO level and above go to a per-worker log file.
# The file name is derived from PART_NUMBER, so nothing needs to be edited here
# when the notebook is copied for another worker.
logging.basicConfig(
    filename=f'chartmetric_api_metadata_worker_{PART_NUMBER}.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s'
)

In [4]:
# Define the API host and read the refresh token from a local text file.
# The token is kept out of the notebook itself; surrounding whitespace and
# the trailing newline are stripped before use.
HOST = 'https://api.chartmetric.com'
with open("chartmetric_refresh_token.txt", "r") as f:
    REFRESH_TOKEN = f.read().strip()

In [5]:
# Retrieve an access token using the refresh token.
# A timeout is set so that an unresponsive server cannot hang the notebook
# indefinitely (requests waits forever when no timeout is given).
token_response = requests.post(
    f'{HOST}/api/token',
    json={'refreshtoken': REFRESH_TOKEN},
    timeout=30,
)

# Check if the token was retrieved successfully
if token_response.status_code != 200:

    # Log the error and raise an exception
    logging.error(f"Token retrieval error: {token_response.status_code}")
    raise Exception(f"Error: received {token_response.status_code} from /api/token")

# Extract the access token from the response
access_token = token_response.json()['token']

# Define the headers for the API requests.
# This dict is shared with get_request, which updates 'Authorization' in place
# when it has to refresh the token.
headers = {'Authorization': f'Bearer {access_token}'}

# Defining the get_request

Robust request logic that:
- backs off for a maximum of 26 hours across retries
- logs all errors it encounters


In [6]:


# --- Robust get_request Function ---
def get_request(endpoint, params=None, max_retries=5, timeout=30):
    """Send a GET request to the Chartmetric API and return the decoded JSON body.

    Transient failures are retried up to ``max_retries`` times:

    * network errors and 5xx responses: exponential backoff (1s, 2s, 4s, ...)
    * 401: the access token is refreshed, then the request is retried
    * 429: sleep until the time given in ``X-RateLimit-Reset``; if that header
      is missing or unparsable, sleep an exponentially growing share of a
      26-hour budget so that all fallback sleeps together total 26 hours

    Args:
        endpoint: Path appended to ``HOST``, e.g. ``/api/track/...``.
        params: Optional query parameters passed to ``requests.get``.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds to wait for each HTTP call before treating it as a
            network error.

    Returns:
        The parsed JSON payload of the first 200 response.

    Raises:
        Exception: if the token refresh returns a non-200 status, if the API
            answers with a non-retryable status code, or if all attempts are
            used up.
    """
    backoff = 1  # initial backoff in seconds (doubles after every failed attempt)
    total_wait_limit = 26 * 3600  # fallback 429 budget in seconds (26 hours)
    # Sum of the weights 2**0 .. 2**(max_retries-1). Using this constant
    # denominator makes the per-attempt shares add up to exactly
    # total_wait_limit; dividing by the *remaining* weights would overshoot it.
    total_weight = 2 ** max_retries - 1

    for attempt in range(max_retries):
        try:
            response = requests.get(
                f"{HOST}{endpoint}",
                headers=headers,
                params=params,
                timeout=timeout,
            )
        except requests.exceptions.RequestException as ex:
            logging.error(f"Network error on attempt {attempt+1} for {endpoint}: {ex}")
            time.sleep(backoff)
            backoff *= 2
            continue

        # Log the status code together with the rate-limit related headers only;
        # dumping every header on each request bloats the log.
        rate_limit_headers = {
            key: value
            for key, value in response.headers.items()
            if key.lower().startswith("x-ratelimit") or key.lower() == "retry-after"
        }
        logging.info(
            f"Request to {endpoint} returned {response.status_code}. "
            f"RateLimit headers: {rate_limit_headers}"
        )

        if response.status_code == 200:
            return response.json()

        # 401: token may have expired; refresh it and retry
        elif response.status_code == 401:
            logging.warning(f"401 error for {endpoint}. Refreshing token.")
            try:
                token_response = requests.post(
                    f'{HOST}/api/token',
                    json={'refreshtoken': REFRESH_TOKEN},
                    timeout=timeout,
                )
            except requests.exceptions.RequestException as ex:
                # A network hiccup during the refresh is retryable: the next
                # attempt will hit 401 again and trigger another refresh.
                logging.error(f"Token refresh request failed for {endpoint}: {ex}")
            else:
                if token_response.status_code != 200:
                    logging.error(f"Token refresh failed: {token_response.status_code}")
                    raise Exception(f"Token refresh failed with status {token_response.status_code}")
                new_token = token_response.json()['token']
                # Mutate the shared module-level dict so later calls use the new token.
                headers['Authorization'] = f'Bearer {new_token}'
            time.sleep(backoff)
            backoff *= 2

        # 429: rate limit exceeded; wait and retry
        elif response.status_code == 429:
            reset_timestamp = response.headers.get("X-RateLimit-Reset")
            sleep_time = None
            if reset_timestamp:
                try:
                    # Wait until the reset time provided by the API.
                    sleep_time = int(float(reset_timestamp)) - int(time.time())
                except (ValueError, OverflowError):
                    logging.warning(
                        f"Unparsable X-RateLimit-Reset value {reset_timestamp!r} for {endpoint}."
                    )
                else:
                    if sleep_time < 0:
                        # Reset time already passed; fall back to the short backoff.
                        sleep_time = backoff
            if sleep_time is None:
                # No usable wait time from the API: take this attempt's
                # exponential share of the 26-hour budget.
                sleep_time = total_wait_limit * (2 ** attempt / total_weight)
            logging.warning(f"429 error for {endpoint}. Sleeping for {sleep_time} seconds (attempt {attempt+1}/{max_retries}).")
            time.sleep(sleep_time)
            backoff *= 2

        # 5xx: server error; wait and retry
        elif response.status_code >= 500:
            logging.warning(f"Server error {response.status_code} for {endpoint}. Retrying after {backoff} seconds.")
            time.sleep(backoff)
            backoff *= 2

        # Any other status (e.g. 400, 403, 404) is not retryable
        else:
            logging.error(f"Error {response.status_code} for {endpoint}: {response.text}")
            raise Exception(f"Error: received {response.status_code} from {endpoint}")

    # If the loop completes without returning, raise an exception
    raise Exception(f"Max retries exceeded for endpoint {endpoint}")

# CHANGE ACCORDING TO WORKER

In [None]:
# --- Cell 7: WORKER CONFIGURATION AND EXECUTION ---


# --- Rate limit shared across all workers ---
# TOTAL_RATE_LIMIT is treated as requests per second: TIME_PER_REQUEST below is
# compared against elapsed seconds in the main loop.
TOTAL_RATE_LIMIT = 3.5
NUM_WORKERS = 3  # Must match the controller script
# Each worker gets an equal share of the rate limit.
# TIME_PER_REQUEST is the minimum number of seconds between two requests of
# this worker, i.e. 1 / (requests per second available to one worker).
TIME_PER_REQUEST = 1 / (TOTAL_RATE_LIMIT / NUM_WORKERS) 
# Pacing every worker at this interval keeps the combined request rate of
# NUM_WORKERS workers at or below TOTAL_RATE_LIMIT.

checkpoint_interval = 1000  # How often to save progress (e.g., every 1000 records)

# --- File paths derived from PART_NUMBER ---
WORKER_INPUT_DIR = "//bigdata.wu.ac.at/delpero/Data_alexander/data/incidental/chartmetric/worker_inputs_chartmetric_ids/"
ID_OUTPUT_DIR = "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_ids/"

INPUT_FILE = os.path.join(WORKER_INPUT_DIR, f"ids_part_{PART_NUMBER}.csv")
# Each worker gets its own output directory to prevent file conflicts
CHECKPOINT_DIR = os.path.join(ID_OUTPUT_DIR, f"part_{PART_NUMBER}") 
CHECKPOINT_FILE = os.path.join(CHECKPOINT_DIR, "chartmetric_ids_checkpoint.csv")

# Because both paths are built from PART_NUMBER, changing that single value
# points the notebook at the matching input file (e.g. `ids_part_1.csv`) and
# at a separate checkpoint directory (e.g. `.../part_1/`).

# Create the checkpoint directory if needed; no error if it already exists.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)


# --- Load this worker's share of the data ---
print(f"WORKER {PART_NUMBER}: Loading data from {INPUT_FILE}")
worker_df = pd.read_csv(INPUT_FILE)

# Drop rows without an ISRC: they cannot be looked up via the ISRC endpoint.
worker_df.dropna(subset=['spotify_isrc'], inplace=True)

In [8]:
# --- Function to Retrieve Chartmetric ID for an ISRC ---
def get_chartmetric_ids(isrc):
    """Look up the first Chartmetric ID registered for an ISRC.

    Calls the ``/api/track/isrc/<isrc>/get-ids`` endpoint through
    ``get_request``. Returns the first entry of ``chartmetric_ids`` from the
    first element of the response's ``obj`` list, converted to ``float``.
    Returns ``None`` when the request fails, when the expected structure is
    missing or empty, or when the value cannot be converted; request and
    conversion failures are logged.
    """
    endpoint = f"/api/track/isrc/{isrc}/get-ids"

    try:
        payload = get_request(endpoint)
    except Exception as e:
        logging.error(f"Failed to get Chartmetric ID for ISRC {isrc}: {e}")
        return None

    # Guard: "obj" must be a non-empty list
    records = payload.get("obj")
    if not records or not isinstance(records, list):
        return None

    # Guard: the first record must carry a non-empty list of IDs
    id_list = records[0].get("chartmetric_ids", None)
    if not id_list or not isinstance(id_list, list):
        return None

    try:
        return float(id_list[0])
    except Exception as conv_err:
        # Log conversion errors
        logging.error(f"Conversion error for ISRC {isrc}: {conv_err}")
        return None



In [None]:
print("--- Initializing Main Processing Loop ---")

# Results gathered since the last checkpoint write; flushed to disk in batches.
results_buffer = []
# Set to True only when the loop reaches the end of worker_df, so the closing
# message can tell a finished run apart from an interrupted one.
run_completed = False

try:
    # --- STEP 1: Resume support - load the ISRCs already in the checkpoint ---
    processed_isrcs = set()
    if os.path.exists(CHECKPOINT_FILE):
        print(f"Checkpoint file found. Loading processed ISRCs from: {CHECKPOINT_FILE}")
        logging.info(f"Loading processed ISRCs from checkpoint: {CHECKPOINT_FILE}")
        try:
            # Only the 'spotify_isrc' column is needed to know what is already done.
            processed_df = pd.read_csv(CHECKPOINT_FILE, usecols=['spotify_isrc'])
            processed_isrcs = set(processed_df['spotify_isrc'].dropna().unique())
            print(f"Loaded {len(processed_isrcs):,} previously completed ISRCs. They will be skipped.")
            logging.info(f"Loaded {len(processed_isrcs):,} previously completed ISRCs.")
        except (pd.errors.EmptyDataError, KeyError, ValueError, FileNotFoundError):
            # ValueError is what read_csv raises when a `usecols` column is
            # missing or the file cannot be parsed, so it must be caught here.
            print("Checkpoint file is empty or invalid. A new one will be created.")
            logging.warning("Checkpoint file was found but is empty or invalid.")
            # Keep a copy of a non-empty but unreadable checkpoint instead of
            # silently overwriting whatever data it contains.
            if os.path.exists(CHECKPOINT_FILE) and os.path.getsize(CHECKPOINT_FILE) > 0:
                backup_file = f"{CHECKPOINT_FILE}.invalid"
                os.replace(CHECKPOINT_FILE, backup_file)
                logging.warning(f"Invalid checkpoint moved to {backup_file}")
            # Start fresh with a header-only file.
            pd.DataFrame(columns=['spotify_isrc', 'chartmetric_ids']).to_csv(CHECKPOINT_FILE, index=False)
    else:
        print("No checkpoint file found. Creating a new one with headers.")
        logging.info("No checkpoint file found. Creating a new one.")
        # If the file doesn't exist, create it with the necessary columns.
        pd.DataFrame(columns=['spotify_isrc', 'chartmetric_ids']).to_csv(CHECKPOINT_FILE, index=False)

    # --- STEP 2: Iterate through this worker's ISRCs ---
    total_rows = len(worker_df)
    print(f"Beginning iteration over {total_rows} total rows.")

    # Series.items() yields (index label, value) pairs without building a
    # Series object per row, which keeps skipping already-processed ISRCs fast.
    for idx, isrc in worker_df['spotify_isrc'].items():

        # If this ISRC was handled in a previous run (or earlier in this run), skip it.
        if isrc in processed_isrcs:
            continue

        loop_start_time = time.time()
        print(f"Processing row {idx}/{total_rows}: ISRC = {isrc}")

        try:
            chartmetric_id = get_chartmetric_ids(isrc)
        except Exception as e:
            print(f"Error processing ISRC {isrc} at row {idx}: {e}")
            logging.error(f"Error processing ISRC {isrc} at row {idx}: {e}")
            chartmetric_id = None

        # NOTE: a None result is stored and the ISRC is marked as processed,
        # so failed lookups are not retried automatically on a later run.
        results_buffer.append({
            'spotify_isrc': isrc,
            'chartmetric_ids': chartmetric_id
        })

        # Remember the ISRC so duplicates within the same run are skipped too.
        processed_isrcs.add(isrc)

        logging.info(f"Buffered ISRC {isrc}: Chartmetric ID = {chartmetric_id}")

        # --- Pacing: sleep for whatever is left of this worker's time slot ---
        elapsed_time = time.time() - loop_start_time
        sleep_duration = TIME_PER_REQUEST - elapsed_time
        if sleep_duration > 0:
            time.sleep(sleep_duration)

        # --- Append-based checkpoint: flush the buffer once it is full ---
        if len(results_buffer) >= checkpoint_interval:
            print(f"--- Checkpoint interval reached. Appending {len(results_buffer)} new results... ---")
            checkpoint_df = pd.DataFrame(results_buffer)
            # mode='a' with header=False adds rows below the existing header.
            checkpoint_df.to_csv(CHECKPOINT_FILE, mode='a', header=False, index=False)
            results_buffer = []  # Reset the buffer for the next batch
            print(f"--- Batch appended to checkpoint file. ---")
            logging.info(f"Appended {len(checkpoint_df)} rows to checkpoint.")

    run_completed = True

finally:
    # --- Final save: runs on normal completion and on interruption alike ---
    if results_buffer:
        print(f"\nLoop finished or was interrupted. Appending {len(results_buffer)} final results...")
        final_df = pd.DataFrame(results_buffer)
        final_df.to_csv(CHECKPOINT_FILE, mode='a', header=False, index=False)
        print(f"Final batch appended to: {CHECKPOINT_FILE}")
        logging.info(f"Appended {len(final_df)} final rows to checkpoint.")

    # Only claim completion when the loop really reached the last row.
    if run_completed:
        print("\nProcessing complete.")
        logging.info("All processing is complete.")
    else:
        print("\nProcessing stopped before all rows were handled. Re-run this cell to resume.")
        logging.warning("Processing stopped before all rows were handled.")

--- Initializing Main Processing Loop ---
No checkpoint file found. Creating a new one with headers.
Beginning iteration over 3169134 total rows.
Processing row 0/3169134: ISRC = GBARL0100013
Processing row 1/3169134: ISRC = GBARL0100173
Processing row 2/3169134: ISRC = FR2X41547576
Processing row 3/3169134: ISRC = ARA340800036
Processing row 4/3169134: ISRC = BEY900700058
Processing row 5/3169134: ISRC = BEQ028000010
Processing row 6/3169134: ISRC = BEZ050000047
Processing row 7/3169134: ISRC = NLA307200010
Processing row 8/3169134: ISRC = BEK011900408
Processing row 9/3169134: ISRC = BED010200806
Processing row 10/3169134: ISRC = BEC010100040
Processing row 11/3169134: ISRC = BEY900900138
Processing row 12/3169134: ISRC = GBAKW0100229
Processing row 13/3169134: ISRC = NLA910160001
Processing row 14/3169134: ISRC = FRO060200490
Processing row 15/3169134: ISRC = DEN060200635
Processing row 16/3169134: ISRC = BEY301200423
Processing row 17/3169134: ISRC = USVT10100031
Processing row 18/