	# Author: Alexander Staub
	## Last changed: 2025.06.23
	## Purpose: Using the chartmetric IDs to get song level metadata post spotify data collection


In [1]:
#installing packages
import time
import requests
import logging
import pandas as pd
from pprint import pprint
from datetime import datetime
import json
import os

In [2]:
#--- Cell 1: Configuration ---

NUM_WORKERS = 3  # Must match the number of workers you ran
# --- This should point to the directory containing your "part_1", "part_2", etc. folders ---
METADATA_OUTPUT_DIR = "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_chars/"
# --- This is the path for your final, combined output file. ---
FINAL_OUTPUT_FILE = "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_chars/complete_charmetric_chars.json"


In [3]:
# --- Cell 2: Combine Worker Results ---
all_responses = []
print("Starting to merge results from all workers...")

for i in range(NUM_WORKERS):
    part_number = i + 1
    # --- The path is dynamically constructed to find each worker's checkpoint file. ---
    file_path = os.path.join(METADATA_OUTPUT_DIR, f"part_{part_number}", "song_chars_checkpoint.json")
    
    if os.path.exists(file_path):
        print(f"Loading {file_path}...")
        with open(file_path, "r") as f:
            data = json.load(f)
            # Use extend to add all items from the loaded list to the master list
            all_responses.extend(data)
    else:
        print(f"WARNING: Could not find result file for worker {part_number} at {file_path}")

print(f"\nSuccessfully merged a total of {len(all_responses)} responses.")


Starting to merge results from all workers...
Loading //bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_chars/part_1\song_chars_checkpoint.json...
Loading //bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_chars/part_2\song_chars_checkpoint.json...
Loading //bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_chars/part_3\song_chars_checkpoint.json...

Successfully merged a total of 4573 responses.


In [4]:
# define a function to extract necessary information from the search output
def extract_song_info(search_output):
    # Extract the main object
    obj = search_output.get('obj', {})
    
    # Artist: take first artist if available
    if obj.get('artists') and len(obj['artists']) > 0:
        artist = obj['artists'][0]
        artist_id = artist.get('id', None)
        artist_name = artist.get('name', None)
        artist_label = artist.get('label', None)
        artist_booking_agent = artist.get('booking_agent', None)
        artist_general_manager = artist.get('general_manager', None)
    else:
        artist_id = artist_name = artist_label = artist_booking_agent = artist_general_manager = None

    # Albums: select the album with the earliest release date
    if obj.get('albums') and len(obj['albums']) > 0:
        def parse_date(album):
            try:
                return datetime.strptime(album.get('release_date', ''), '%Y-%m-%d')
            except Exception:
                return datetime.max
        sorted_albums = sorted(obj['albums'], key=parse_date)
        earliest_album = sorted_albums[0]
        album_id = earliest_album.get('id', None)
        album_name = earliest_album.get('name', None)
        album_release_date = earliest_album.get('release_date', None)
        album_label = earliest_album.get('label', None)
    else:
        album_id = album_name = album_release_date = album_label = None

    # Use a pipe '|' as delimiter for multiple values
    delimiter = '|'
    
    # Moods: concatenate mood names
    if obj.get('moods') and len(obj['moods']) > 0:
        moods = delimiter.join([m.get('name', '') for m in obj['moods']])
    else:
        moods = None
    
    # Activities: concatenate activity names
    if obj.get('activities') and len(obj['activities']) > 0:
        activities = delimiter.join([a.get('name', '') for a in obj['activities']])
    else:
        activities = None
    
    # Songwriters: concatenate songwriter names
    if obj.get('songwriters') and len(obj['songwriters']) > 0:
        songwriters = delimiter.join(obj['songwriters'])
    else:
        songwriters = None
    
    # songwriterIds is not present in the example so we assign None
    songwriterIds = None

    # Create a one-row DataFrame with the desired columns
    data = {
        'chartmetric_ids': obj.get('id', None),
        'Name': obj.get('name', None),
        'Composer_name': obj.get('composer_name', None),
        'Artist_id': artist_id,
        'Artist_name': artist_name,
        'Artist_label': artist_label,
        'Artist_booking_agent': artist_booking_agent,
        'Artist_general_manager': artist_general_manager,
        'Albums_id': album_id,
        'Albums_name': album_name,
        'Albums_release_date': album_release_date,
        'Albums_label': album_label,
        'Tags': obj.get('tags', None),
        'Moods': moods,
        'Activities': activities,
        'Songwriters': songwriters,
        'songwriterIds': songwriterIds,
        'Tempo': obj.get('tempo', None),
        'Duration_ms': obj.get('duration_ms', None)
    }
    
    return pd.DataFrame([data])

In [5]:
# --- Cell 4: Parse All Responses and Create Final DataFrame ---

print("Parsing all responses...")
extracted_rows = []
for resp in all_responses:
    # Filter out any None values that resulted from API errors
    if resp is not None:
        extracted_df = extract_song_info(resp)
        extracted_rows.append(extracted_df)

if extracted_rows:
    final_extracted_df = pd.concat(extracted_rows, ignore_index=True)
    print(f"Successfully parsed data into a DataFrame with {len(final_extracted_df)} rows.")
    
    # Save the final, clean data
    final_extracted_df.to_json(FINAL_OUTPUT_FILE, orient="records", lines=True)
    print(f"Final merged and processed data saved to: {FINAL_OUTPUT_FILE}")
else:
    print("No data was extracted. Final file not saved.")

Parsing all responses...


  final_extracted_df = pd.concat(extracted_rows, ignore_index=True)


Successfully parsed data into a DataFrame with 4573 rows.
Final merged and processed data saved to: //bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_chars/complete_charmetric_chars.json


In [6]:
# load the chartmetric_ids_spotify data as a dataframe
chartmetric_ids_spotify = pd.read_csv(
    "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_ids/chartmetric_ids_sample.csv"
)

In [7]:
# --- Step 3: Join the Extracted Data to the Original DataFrame ---
# It is assumed that the 'id' column in song_chars_extracted matches the 'id' column in spotify_sample.
merged_song_chars = chartmetric_ids_spotify.merge(final_extracted_df, on="chartmetric_ids", how="left")

In [None]:
#safe the final dataframe
# Save as JSON (records-oriented with one JSON object per line)

#sample dataset
merged_song_chars.to_json("Z:/Data_alexander/data/raw_data/chartmetric/chartmetric_chars/sample_charmetric_chars.json", orient="records", lines=True)

#the songs + artist 1980-2000 dataset
# merged_song_chars.to_json("//bigdata.wu.ac.at/delpero/Data_alexander/data/incidental/chartmetric/chartmetric_metadata/all_1980_2000_charmetric_chars.json", orient="records", lines=True)


# The code used to derive an example and extract information

In [26]:
# --- Function to Retrieve song characteristics from Chartmetric ID ---
def get_songchars_ids(chartmetric_id):
    endpoint = f"/api/track/{chartmetric_id}"
    try:
        response = get_request(endpoint)
        # Log the response status code and rate limit headers
        logging.info(f"Successfully retrieved song chars for chartmetric id {chartmetric_id}")
    except Exception as e:
        logging.error(f"Failed to get song chars for chartmetric id {chartmetric_id}: {e}")
        return None

    # Extract the song characteristics from the response
    song_chars = response

    return song_chars

In [34]:
pprint(chartmetric_ids_spotify["chartmetric_ids"][4])

15447513.0


In [None]:
# trial run with a single ID 

search_output = get_songchars_ids(chartmetric_ids_spotify["chartmetric_ids"][0])


pprint(search_output)

In [40]:
#trial the function with the search output

test_df = extract_song_info(search_output)