	# Author: Alexander Staub
	## Last changed: 2025.02.13
	## Purpose: Getting the chartmetrics IDs for a list of songs


In [1]:
#installing packages
import time
import requests
import logging
import pandas as pd

In [2]:
#Setup the logging of the errors
logging.basicConfig(
    filename='chartmetric_api.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s'
)

In [3]:
# Define API host and your refresh token
HOST = 'https://api.chartmetric.com'
with open("chartmetric_refresh_token.txt", "r") as f:
    REFRESH_TOKEN = f.read().strip()

In [4]:
# Retrieve an access token using the refresh token
token_response = requests.post(f'{HOST}/api/token', json={'refreshtoken': REFRESH_TOKEN})

# Check if the token was retrieved successfully
if token_response.status_code != 200:

    # Log the error and raise an exception
    logging.error(f"Token retrieval error: {token_response.status_code}")
    raise Exception(f"Error: received {token_response.status_code} from /api/token")

# Extract the access token from the response
access_token = token_response.json()['token']

# Define the headers for the API requests
headers = {'Authorization': f'Bearer {access_token}'}

# Defining the get_request

Robust request logic that:
- backs off for a max of 26 hours in retries
- logs all erros it encounters


In [5]:


# --- Robust get_request Function ---
def get_request(endpoint, params=None, max_retries=5):
    backoff = 1  # initial backoff in seconds (used if header data is missing)
    for attempt in range(max_retries):
        try:
            response = requests.get(f"{HOST}{endpoint}", headers=headers, params=params)
        except Exception as ex:
            logging.error(f"Network error on attempt {attempt+1} for {endpoint}: {ex}")
            time.sleep(backoff)
            backoff *= 2
            continue

# Log the response status code and rate limit headers
        logging.info(f"Request to {endpoint} returned {response.status_code}. RateLimit headers: {response.headers}")

# Check if the response status code is 200
        if response.status_code == 200:
            return response.json()

# Handle different types of errors
# 401: Token may have expired; refresh it
        elif response.status_code == 401:
            # Token may have expired; refresh it
            logging.warning(f"401 error for {endpoint}. Refreshing token.")
            token_response = requests.post(f'{HOST}/api/token', json={'refreshtoken': REFRESH_TOKEN})
            if token_response.status_code != 200:
                logging.error(f"Token refresh failed: {token_response.status_code}")
                raise Exception(f"Token refresh failed with status {token_response.status_code}")
            new_token = token_response.json()['token']
            headers['Authorization'] = f'Bearer {new_token}'
            time.sleep(backoff)
            backoff *= 2

# 429: Rate limit exceeded; wait and retry
        elif response.status_code == 429:
            # Rate limit exceeded.
            reset_timestamp = response.headers.get("X-RateLimit-Reset")
            if reset_timestamp:
                # Wait until the time provided by the API
                sleep_time = int(reset_timestamp) - int(time.time())
                if sleep_time < 0:
                    sleep_time = backoff
            else:
                # No wait time provided by the API; compute one that totals 26 hours over all retries.
                total_wait_limit = 26 * 3600  # total wait time in seconds (26 hours)
                # Sum exponential weights for remaining attempts: for i from current attempt to max_retries-1
                remaining_weights = sum(2 ** i for i in range(attempt, max_retries))
                # Use the weight for the current attempt to assign a fraction of the total wait.
                sleep_time = total_wait_limit * (2 ** attempt / remaining_weights)
            logging.warning(f"429 error for {endpoint}. Sleeping for {sleep_time} seconds (attempt {attempt+1}/{max_retries}).")
            time.sleep(sleep_time)
            backoff *= 2

# 500: Server error; wait and retry
        elif response.status_code >= 500:
            logging.warning(f"Server error {response.status_code} for {endpoint}. Retrying after {backoff} seconds.")
            time.sleep(backoff)
            backoff *= 2

        else:
            logging.error(f"Error {response.status_code} for {endpoint}: {response.text}")
            raise Exception(f"Error: received {response.status_code} from {endpoint}")

# If the loop completes without returning, raise an exception
    raise Exception(f"Max retries exceeded for endpoint {endpoint}")

# Use isrc to access song characteristics:
- load in a subset of around 5000 of the spotify songs with spotify IDs
- Loop over them to get the chartmetric IDs and any other further information accessible
- use chartmetric IDs to get the information of relevance
- setup code in a way that allows us to get information for our relevant songs

In [None]:
#read in the first 8000 lines from the file "AD_spotify_accoustic_char_250k.csv" in the data/raw_data/Spotify folder and call it spotify_fetch
# the data directory is 2 directories down from the current directory
# it should be a pandas dataframe and the csv has headers

import pandas as pd

#sample
spotify_sample = pd.read_csv("//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/Spotify/1980_2000_songs_artists/musicbrainz_spotify_combined_track_artist_final.csv", nrows=5000)

#load the -matched song + artist dataset
#spotify_album = pd.read_csv("//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/Spotify/1980_2000_songs_artists/musicbrainz_spotify_combined_track_artist_final.csv")

#load the chart songs dataset
#spotify_charts = pd.read_csv()

#load the unmatched song + artist dataset
#spotify_unmatched_transformed = pd.read_csv()

In [7]:
#Operations to merge above datasets
spotify_merged = spotify_sample

In [8]:
# remove duplicate rows from the spotify_fetch dataframe based on isrcs
spotify_fetch = spotify_merged.drop_duplicates(subset='spotify_isrc', keep='first')


In [9]:
# --- Function to Retrieve Chartmetric ID for an ISRC ---
def get_chartmetric_ids(isrc):
    endpoint = f"/api/track/isrc/{isrc}/get-ids"
    try:
        response = get_request(endpoint)
    
    # Log the response status code and rate limit headers
    except Exception as e:
        logging.error(f"Failed to get Chartmetric ID for ISRC {isrc}: {e}")
        return None

    # Expecting response["obj"] to be a non-empty list
    if response.get("obj") and isinstance(response["obj"], list) and len(response["obj"]) > 0:

        # Extract the chartmetric_ids from the first element of the list
        cm_ids = response["obj"][0].get("chartmetric_ids", None)

        # Check if cm_ids is a non-empty list
        if cm_ids and isinstance(cm_ids, list) and len(cm_ids) > 0:
            try:
                return float(cm_ids[0])
            
            # Log conversion errors
            except Exception as conv_err:
                logging.error(f"Conversion error for ISRC {isrc}: {conv_err}")
                return None
    return None



In [11]:
# --- Main Processing with Checkpointing & DYNAMIC Throttling ---
TIME_PER_REQUEST = 0.28 

checkpoint_file = "//bigdata.wu.ac.at/delpero/Data_alexander/data/incidental/chartmetric/chartmetric_track_id_checkpoints/chartmetric_ids_checkpoint.csv"
checkpoint_interval = 100  # save every 100 processed rows

for idx, row in spotify_fetch.iterrows():
    # Record the start time of the loop
    loop_start_time = time.time()
    
    isrc = row['spotify_isrc']
    print(f"Processing row {idx}: ISRC = {isrc}")

    # Skip if ISRC already processed
    if pd.notnull(spotify_fetch.at[idx, "chartmetric_ids"]):
        print(f"Row {idx} already processed. Skipping.")
        continue

    try:
        chartmetric_id = get_chartmetric_ids(isrc)
    except Exception as e:
        print(f"Error processing ISRC {isrc} at row {idx}: {e}")
        logging.error(f"Error processing ISRC {isrc} at row {idx}: {e}")
        chartmetric_id = None

    spotify_fetch.at[idx, "chartmetric_ids"] = chartmetric_id
    print(f"Row {idx} processed: ISRC = {isrc} -> Chartmetric ID = {chartmetric_id}")
    logging.info(f"Processed row {idx}, ISRC {isrc}: Chartmetric ID = {chartmetric_id}")

    # --- DYNAMIC SLEEP LOGIC ---
    # REMOVED: time.sleep(0.3)

    # Calculate how long the API call and processing took
    loop_end_time = time.time()
    elapsed_time = loop_end_time - loop_start_time

    # Calculate the remaining time to sleep to hit the target rate
    sleep_duration = TIME_PER_REQUEST - elapsed_time

    # If the request was fast, sleep for the remaining time.
    # If the request was slow (elapsed_time > TIME_PER_REQUEST), don't sleep at all.
    if sleep_duration > 0:
        time.sleep(sleep_duration)
    
    # Save a checkpoint periodically
    if idx % checkpoint_interval == 0 and idx > 0:
        spotify_fetch.to_csv(checkpoint_file, index=False)
        print(f"Checkpoint saved at row {idx}")
        logging.info(f"Checkpoint saved at row {idx}")

Processing row 0: ISRC = GBARL0100013
Row 0 already processed. Skipping.
Processing row 1: ISRC = GBARL0100173
Row 1 already processed. Skipping.
Processing row 2: ISRC = FR2X41547576
Row 2 already processed. Skipping.
Processing row 3: ISRC = ARA340800036
Row 3 already processed. Skipping.
Processing row 4: ISRC = BEY900700058
Row 4 already processed. Skipping.
Processing row 5: ISRC = BEQ028000010
Row 5 already processed. Skipping.
Processing row 6: ISRC = BEZ050000047
Row 6 already processed. Skipping.
Processing row 7: ISRC = NLA307200010
Row 7 already processed. Skipping.
Processing row 8: ISRC = BEK011900408
Row 8 already processed. Skipping.
Processing row 9: ISRC = BED010200806
Row 9 already processed. Skipping.
Processing row 10: ISRC = BEC010100040
Row 10 already processed. Skipping.
Processing row 11: ISRC = BEY900900138
Row 11 already processed. Skipping.
Processing row 12: ISRC = GBAKW0100229
Row 12 already processed. Skipping.
Processing row 13: ISRC = NLA910160001
Row 13

In [12]:
#transform the object type into float 
spotify_fetch["chartmetric_ids"] = pd.to_numeric(spotify_fetch["chartmetric_ids"], errors='coerce')

#transform into integer
spotify_fetch["chartmetric_ids"] = spotify_fetch["chartmetric_ids"].astype("Int64")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spotify_fetch["chartmetric_ids"] = pd.to_numeric(spotify_fetch["chartmetric_ids"], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spotify_fetch["chartmetric_ids"] = spotify_fetch["chartmetric_ids"].astype("Int64")


In [13]:
#retain only the rows with unique chartmetric_ids
spotify_fetch_unique = spotify_fetch.drop_duplicates(subset="chartmetric_ids")

Saving the files needs to take into account the version that I have already saved in the past as I am not able to run the code remotely

In [None]:

import os

# Define the final file path
# NEED TO CHECK:  the suffix
filepath = "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_ids/"

#give the file a name
#file_name = "chartmetric_ids_sample.csv"

#paste filepath and file_name together and call the variable final_filepath

final_filepath = os.path.join(filepath, file_name)

# Save the dataframe to the final_filepath
spotify_fetch_unique.to_csv(final_filepath, index=False)
print(f"Saved file as: {final_filepath}")

# Log the final message in the log file
logging.info("Completed processing all ISRC codes.")

Saved file as: //bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_ids/chartmetric_ids_sample.csv
