	# Author: Alexander Staub
	## Last changed: 2025.02.13
	## Purpose: Getting the chartmetrics IDs for a list of songs


In [2]:
#installing packages
import time
import requests
import logging
import pandas as pd
import os
import numpy as np

In [3]:
#Setup the logging of the errors
logging.basicConfig(
    filename='chartmetric_api.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s'
)

In [4]:
# Define API host and your refresh token
HOST = 'https://api.chartmetric.com'
with open("chartmetric_refresh_token.txt", "r") as f:
    REFRESH_TOKEN = f.read().strip()

In [5]:
# Retrieve an access token using the refresh token
token_response = requests.post(f'{HOST}/api/token', json={'refreshtoken': REFRESH_TOKEN})

# Check if the token was retrieved successfully
if token_response.status_code != 200:

    # Log the error and raise an exception
    logging.error(f"Token retrieval error: {token_response.status_code}")
    raise Exception(f"Error: received {token_response.status_code} from /api/token")

# Extract the access token from the response
access_token = token_response.json()['token']

# Define the headers for the API requests
headers = {'Authorization': f'Bearer {access_token}'}

# Defining the get_request

Robust request logic that:
- backs off for a max of 26 hours in retries
- logs all erros it encounters


In [6]:


# --- Robust get_request Function ---
def get_request(endpoint, params=None, max_retries=5):
    backoff = 1  # initial backoff in seconds (used if header data is missing)
    for attempt in range(max_retries):
        try:
            response = requests.get(f"{HOST}{endpoint}", headers=headers, params=params)
        except Exception as ex:
            logging.error(f"Network error on attempt {attempt+1} for {endpoint}: {ex}")
            time.sleep(backoff)
            backoff *= 2
            continue

# Log the response status code and rate limit headers
        logging.info(f"Request to {endpoint} returned {response.status_code}. RateLimit headers: {response.headers}")

# Check if the response status code is 200
        if response.status_code == 200:
            return response.json()

# Handle different types of errors
# 401: Token may have expired; refresh it
        elif response.status_code == 401:
            # Token may have expired; refresh it
            logging.warning(f"401 error for {endpoint}. Refreshing token.")
            token_response = requests.post(f'{HOST}/api/token', json={'refreshtoken': REFRESH_TOKEN})
            if token_response.status_code != 200:
                logging.error(f"Token refresh failed: {token_response.status_code}")
                raise Exception(f"Token refresh failed with status {token_response.status_code}")
            new_token = token_response.json()['token']
            headers['Authorization'] = f'Bearer {new_token}'
            time.sleep(backoff)
            backoff *= 2

# 429: Rate limit exceeded; wait and retry
        elif response.status_code == 429:
            # Rate limit exceeded.
            reset_timestamp = response.headers.get("X-RateLimit-Reset")
            if reset_timestamp:
                # Wait until the time provided by the API
                sleep_time = int(reset_timestamp) - int(time.time())
                if sleep_time < 0:
                    sleep_time = backoff
            else:
                # No wait time provided by the API; compute one that totals 26 hours over all retries.
                total_wait_limit = 26 * 3600  # total wait time in seconds (26 hours)
                # Sum exponential weights for remaining attempts: for i from current attempt to max_retries-1
                remaining_weights = sum(2 ** i for i in range(attempt, max_retries))
                # Use the weight for the current attempt to assign a fraction of the total wait.
                sleep_time = total_wait_limit * (2 ** attempt / remaining_weights)
            logging.warning(f"429 error for {endpoint}. Sleeping for {sleep_time} seconds (attempt {attempt+1}/{max_retries}).")
            time.sleep(sleep_time)
            backoff *= 2

# 500: Server error; wait and retry
        elif response.status_code >= 500:
            logging.warning(f"Server error {response.status_code} for {endpoint}. Retrying after {backoff} seconds.")
            time.sleep(backoff)
            backoff *= 2

        else:
            logging.error(f"Error {response.status_code} for {endpoint}: {response.text}")
            raise Exception(f"Error: received {response.status_code} from {endpoint}")

# If the loop completes without returning, raise an exception
    raise Exception(f"Max retries exceeded for endpoint {endpoint}")

# Use isrc to access song characteristics:
- load in a subset of around 5000 of the spotify songs with spotify IDs
- Loop over them to get the chartmetric IDs and any other further information accessible
- use chartmetric IDs to get the information of relevance
- setup code in a way that allows us to get information for our relevant songs

In [7]:
# The single source of truth for all tracks that need to be processed.
# CHANGE: musicbrainz data
ORIGINAL_DATA_FILE = "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/Spotify/1980_2000_songs_artists/musicbrainz_spotify_combined_track_artist_final.csv"

# CHANGE: Chart data
# ORIGINAL_DATA_FILE = 

# The checkpoint file where all completed results are stored.
CHECKPOINT_FILE = "//bigdata.wu.ac.at/delpero/Data_alexander/data/incidental/chartmetric/chartmetric_track_id_checkpoints/chartmetric_ids_checkpoint.csv"


print("--- Loading Full Original Dataset ---")
logging.info(f"Loading the master dataset from: {ORIGINAL_DATA_FILE}")

# Your original loading and cleaning logic
spotify_album = pd.read_csv(ORIGINAL_DATA_FILE)
spotify_merged = spotify_album # Assuming this is the merge logic you need
spotify_fetch = spotify_merged.drop_duplicates(subset='spotify_isrc', keep='first').reset_index(drop=True)

print(f"Master dataframe loaded with {len(spotify_fetch):,} rows to process.")
logging.info(f"Master dataframe loaded with {len(spotify_fetch):,} rows.")


--- Loading Full Original Dataset ---
Master dataframe loaded with 3,169,134 rows to process.


In [8]:
# --- Function to Retrieve Chartmetric ID for an ISRC ---
def get_chartmetric_ids(isrc):
    endpoint = f"/api/track/isrc/{isrc}/get-ids"
    try:
        response = get_request(endpoint)
    
    # Log the response status code and rate limit headers
    except Exception as e:
        logging.error(f"Failed to get Chartmetric ID for ISRC {isrc}: {e}")
        return None

    # Expecting response["obj"] to be a non-empty list
    if response.get("obj") and isinstance(response["obj"], list) and len(response["obj"]) > 0:

        # Extract the chartmetric_ids from the first element of the list
        cm_ids = response["obj"][0].get("chartmetric_ids", None)

        # Check if cm_ids is a non-empty list
        if cm_ids and isinstance(cm_ids, list) and len(cm_ids) > 0:
            try:
                return float(cm_ids[0])
            
            # Log conversion errors
            except Exception as conv_err:
                logging.error(f"Conversion error for ISRC {isrc}: {conv_err}")
                return None
    return None



In [None]:
# --- Main Processing with EFFICIENT, RESUMABLE Checkpointing ---
TIME_PER_REQUEST = 0.28 
checkpoint_interval = 1000  # Set to 1000 as requested


print("--- Initializing Main Processing Loop ---")

# A temporary list to hold new results before writing to file
results_buffer = []

try:
    # --- STEP 2a: Load the set of already processed ISRCs for fast lookups ---
    # This is the "resume" part. We check the checkpoint file to see what's already done.
    processed_isrcs = set()
    if os.path.exists(CHECKPOINT_FILE):
        print(f"Checkpoint file found. Loading processed ISRCs from: {CHECKPOINT_FILE}")
        logging.info(f"Loading processed ISRCs from checkpoint: {CHECKPOINT_FILE}")
        try:
            # We only need the 'spotify_isrc' column to know what's already done.
            processed_df = pd.read_csv(CHECKPOINT_FILE, usecols=['spotify_isrc'])
            processed_isrcs = set(processed_df['spotify_isrc'].dropna().unique())
            print(f"Loaded {len(processed_isrcs):,} previously completed ISRCs. They will be skipped.")
            logging.info(f"Loaded {len(processed_isrcs):,} previously completed ISRCs.")
        except (pd.errors.EmptyDataError, KeyError, FileNotFoundError):
            print("Checkpoint file is empty or invalid. A new one will be created.")
            logging.warning("Checkpoint file was found but is empty or invalid.")
            # If the file is broken, we start fresh and create the header.
            pd.DataFrame(columns=['spotify_isrc', 'chartmetric_ids']).to_csv(CHECKPOINT_FILE, index=False)
    else:
        print("No checkpoint file found. Creating a new one with headers.")
        logging.info("No checkpoint file found. Creating a new one.")
        # If the file doesn't exist, create it with the necessary columns.
        pd.DataFrame(columns=['spotify_isrc', 'chartmetric_ids']).to_csv(CHECKPOINT_FILE, index=False)

    
    
    # --- STEP 2: Iterate through the main dataframe ---
    total_rows = len(spotify_fetch)
    print(f"Beginning iteration over {total_rows} total rows.")
    
    for idx, row in spotify_fetch.iterrows():
        isrc = row['spotify_isrc']
        
        # --- EFFICIENT RESUME LOGIC ---
        # If we have already processed this ISRC in a previous run, skip it immediately.
        if isrc in processed_isrcs:
            continue

        loop_start_time = time.time()
        print(f"Processing row {idx}/{total_rows}: ISRC = {isrc}")

        try:
            chartmetric_id = get_chartmetric_ids(isrc)
        except Exception as e:
            print(f"Error processing ISRC {isrc} at row {idx}: {e}")
            logging.error(f"Error processing ISRC {isrc} at row {idx}: {e}")
            chartmetric_id = None
        
        # Add the new result to our temporary buffer instead of the giant dataframe
        results_buffer.append({
            'spotify_isrc': isrc, 
            'chartmetric_ids': chartmetric_id
        })
        
        # Add to our in-memory set to avoid re-processing in the *same* run if there are duplicates
        processed_isrcs.add(isrc)
        
        logging.info(f"Buffered ISRC {isrc}: Chartmetric ID = {chartmetric_id}")

        # --- DYNAMIC SLEEP LOGIC ---
        elapsed_time = time.time() - loop_start_time
        sleep_duration = TIME_PER_REQUEST - elapsed_time
        if sleep_duration > 0:
            time.sleep(sleep_duration)
        
        # --- FAST APPEND-BASED CHECKPOINT ---
        # When the buffer is full, append it to the checkpoint file. This is very fast.
        if len(results_buffer) >= checkpoint_interval:
            print(f"--- Checkpoint interval reached. Appending {len(results_buffer)} new results... ---")
            checkpoint_df = pd.DataFrame(results_buffer)
            # Use mode='a' (append) and header=False to efficiently add to the existing file
            checkpoint_df.to_csv(CHECKPOINT_FILE, mode='a', header=False, index=False)
            results_buffer = [] # Reset the buffer for the next batch
            print(f"--- Batch appended to checkpoint file. ---")
            logging.info(f"Appended {len(checkpoint_df)} rows to checkpoint.")

finally:
    # --- FINAL SAVE OF REMAINING RESULTS ---
    # This ensures that even if the loop is interrupted, the last batch of results is saved.
    if results_buffer:
        print(f"\nLoop finished or was interrupted. Appending {len(results_buffer)} final results...")
        final_df = pd.DataFrame(results_buffer)
        final_df.to_csv(CHECKPOINT_FILE, mode='a', header=False, index=False)
        print(f"Final batch appended to: {CHECKPOINT_FILE}")
        logging.info(f"Appended {len(final_df)} final rows to checkpoint.")
    
    print("\nProcessing complete.")
    logging.info("All processing is complete.")

--- Initializing Main Processing Loop ---
Checkpoint file found. Loading processed ISRCs from: //bigdata.wu.ac.at/delpero/Data_alexander/data/incidental/chartmetric/chartmetric_track_id_checkpoints/chartmetric_ids_checkpoint.csv
Loaded 176,999 previously completed ISRCs. They will be skipped.
Beginning iteration over 3169134 total rows.
Processing row 13565/3169134: ISRC = nan
Processing row 177000/3169134: ISRC = FR6V80309811
Processing row 177001/3169134: ISRC = FRX850500016
Processing row 177002/3169134: ISRC = USBS50510433
Processing row 177003/3169134: ISRC = DEE219265101
Processing row 177004/3169134: ISRC = DEE219265107
Processing row 177005/3169134: ISRC = DEE219265108
Processing row 177006/3169134: ISRC = DEE219265113
Processing row 177007/3169134: ISRC = USAR10200427
Processing row 177008/3169134: ISRC = USA370513720
Processing row 177009/3169134: ISRC = FRW110300749
Processing row 177010/3169134: ISRC = FRW110300750
Processing row 177011/3169134: ISRC = FRW110300751
Processi

In [None]:
#merge in the chartmetric ids into the original dataframe
chartmetric_ids_df = pd.read_csv("//bigdata.wu.ac.at/delpero/Data_alexander/data/incidental/chartmetric/chartmetric_track_id_checkpoints/chartmetric_ids_checkpoint.csv")

spotify_fetch = pd.merge(spotify_fetch, chartmetric_ids_df, on='spotify_isrc', how='left')

In [None]:
#transform the object type into float 
spotify_fetch["chartmetric_ids"] = pd.to_numeric(spotify_fetch["chartmetric_ids"], errors='coerce')

#transform into integer
spotify_fetch["chartmetric_ids"] = spotify_fetch["chartmetric_ids"].astype("Int64")



In [13]:
#retain only the rows with unique chartmetric_ids
spotify_fetch_unique = spotify_fetch.drop_duplicates(subset="chartmetric_ids")

Saving the files needs to take into account the version that I have already saved in the past as I am not able to run the code remotely

In [14]:

import os

# Define the final file path
# NEED TO CHECK:  the suffix
filepath = "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_ids/"

#give the file a name
file_name = "chartmetric_ids_mb_matched.csv"

#paste filepath and file_name together and call the variable final_filepath

final_filepath = os.path.join(filepath, file_name)

# Save the dataframe to the final_filepath
spotify_fetch_unique.to_csv(final_filepath, index=False)
print(f"Saved file as: {final_filepath}")

# Log the final message in the log file
logging.info("Completed processing all ISRC codes.")

Saved file as: //bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_ids/chartmetric_ids_sample.csv
