	# Author: Alexander Staub
	## Last changed: 2025.08.26
	## Purpose: Using the chartmetric IDs to get song level metadata post spotify data collection


In [1]:
# importing packages
import time
import requests
import logging
import pandas as pd
from pprint import pprint
from datetime import datetime
import json
import os
import glob
from typing import Dict, Any, List, Optional
import gc  # For garbage collection
import csv

In [2]:
#--- Cell 1: Configuration ---

# Notebook-wide locations. Both output files live inside the metadata
# directory, so they are derived from it.
METADATA_OUTPUT_DIR = "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_chars/"
OUTPUT_CSV_PATH = METADATA_OUTPUT_DIR + "complete_charmetric_chars.csv"
OUTPUT_PARQUET_PATH = METADATA_OUTPUT_DIR + "complete_charmetric_chars.parquet"

# Tuning knobs for the incremental CSV export
BATCH_SIZE = 100          # number of JSON files handled per batch
WRITE_CHUNK_SIZE = 20000  # rows buffered before each CSV write



In [3]:
# Cell 2: Define the extraction function
def extract_song_info_dict(search_output: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one track record into a single-level dict of song fields.

    Args:
        search_output: One decoded record. The track payload is read from its
            'obj' key; a missing or null 'obj' yields a row whose values are
            all None.

    Returns:
        A dict with a fixed set of keys (same keys and order for every
        record). Artist fields come from the first entry of 'artists'. Album
        fields come from the album with the earliest 'release_date' in
        YYYY-MM-DD form; albums without a parseable date sort last. Moods,
        activities and songwriters are comma-joined strings, or None when the
        list is missing or empty. 'cm_songwriterIds' is always None.

    Raises:
        AttributeError: if search_output itself is not dict-like (for example
            None), so callers can record it as a failed record.
    """
    # Treat an explicit null 'obj' the same way as a missing one.
    obj = search_output.get('obj') or {}

    # Delimiter for multiple values
    delimiter = ','

    def release_date_key(album):
        """Sort key: parsed release date, or datetime.max when unusable."""
        try:
            return datetime.strptime(album.get('release_date', ''), '%Y-%m-%d')
        except (AttributeError, TypeError, ValueError):
            # Non-dict entry, null date, or a date in another format
            return datetime.max

    def joined_names(key):
        """Comma-join the non-empty 'name' values under *key*, or None."""
        entries = obj.get(key)
        if not entries:
            return None
        # Null entries are skipped instead of failing the whole record.
        return delimiter.join(
            entry['name'] for entry in entries if entry and entry.get('name')
        )

    # Artist information (first listed artist; a null entry gives None fields)
    artists = obj.get('artists')
    artist = (artists[0] if artists else None) or {}

    # Album information (earliest release); min() keeps the first of equal
    # dates, exactly like taking element 0 of a stable sort.
    albums = [album for album in (obj.get('albums') or []) if album]
    earliest_album = min(albums, key=release_date_key) if albums else {}

    # Songwriters are plain strings; null entries are dropped before joining.
    songwriters = None
    if obj.get('songwriters'):
        songwriters = delimiter.join(
            name for name in obj['songwriters'] if name is not None
        )

    return {
        'chartmetric_ids': obj.get('id'),
        'cm_track_title': obj.get('name'),
        'cm_composer_name': obj.get('composer_name'),
        'cm_artist_id': artist.get('id'),
        'cm_artist_credit': artist.get('name'),
        'cm_artist_label': artist.get('label'),
        'Artist_booking_agent': artist.get('booking_agent'),
        'Artist_general_manager': artist.get('general_manager'),
        'cm_release_id': earliest_album.get('id'),
        'cm_name_release': earliest_album.get('name'),
        'cm_release_date': earliest_album.get('release_date'),
        'cm_albums_label': earliest_album.get('label'),
        'cm_genres': obj.get('tags'),
        'cm_moods': joined_names('moods'),
        'cm_activities': joined_names('activities'),
        'cm_songwriters': songwriters,
        'cm_songwriterIds': None,
        'cm_tempo': obj.get('tempo'),
        'cm_duration_ms': obj.get('duration_ms')
    }

print("Extraction function defined")

Extraction function defined


In [4]:
# Cell 3: Explore your data structure
# Collect every batch file under <METADATA_OUTPUT_DIR>/part_*/responses/
search_pattern = os.path.join(METADATA_OUTPUT_DIR, "part_*", "responses", "batch_*.json")
all_batch_files = glob.glob(search_pattern)
all_batch_files.sort()

print(f"Found {len(all_batch_files)} batch files")
# The size estimate uses a hard-coded 15 MB per file
print(f"Total estimated size: ~{len(all_batch_files) * 15} MB")
print("\nFirst 5 files:")
for preview_path in all_batch_files[:5]:
    print(f"  - {os.path.basename(preview_path)}")

# Optional: load the first file and run the extractor on its first record
if len(all_batch_files) > 0:
    first_file = all_batch_files[0]
    with open(first_file, 'r') as handle:
        sample_data = json.load(handle)
    print(f"\nFirst file contains {len(sample_data)} records")
    print("Sample record structure:")
    if sample_data:
        sample_extracted = extract_song_info_dict(sample_data[0])
        print(f"Extracted fields: {list(sample_extracted.keys())}")

Found 2904 batch files
Total estimated size: ~43560 MB

First 5 files:
  - batch_00001.json
  - batch_00002.json
  - batch_00003.json
  - batch_00004.json
  - batch_00005.json

First file contains 1000 records
Sample record structure:
Extracted fields: ['chartmetric_ids', 'cm_track_title', 'cm_composer_name', 'cm_artist_id', 'cm_artist_credit', 'cm_artist_label', 'Artist_booking_agent', 'Artist_general_manager', 'cm_release_id', 'cm_name_release', 'cm_release_date', 'cm_albums_label', 'cm_genres', 'cm_moods', 'cm_activities', 'cm_songwriters', 'cm_songwriterIds', 'cm_tempo', 'cm_duration_ms']


In [5]:
# Cell 4: Main processing function (for CSV output)
def _extract_rows_from_file(file_path, record_errors):
    """Load one JSON batch file and return the rows extracted from it.

    Records that fail extraction are appended to *record_errors* (and the
    first ten overall are printed) instead of aborting the file. Errors while
    opening or decoding the file itself propagate to the caller.
    """
    with open(file_path, 'r') as f:
        batch_data = json.load(f)

    rows = []
    for record_idx, record in enumerate(batch_data):
        try:
            rows.append(extract_song_info_dict(record))
        except Exception as e:
            record_errors.append({
                'file': file_path,
                'record_index': record_idx,
                'error': str(e)
            })
            if len(record_errors) <= 10:
                print(f" Failed to process record {record_idx}: {e}")
    return rows


def process_batch_files_incrementally():
    """Stream all batch JSON files into one CSV file, writing in chunks.

    Walks the notebook-level ``all_batch_files`` list in groups of
    ``BATCH_SIZE`` files, extracts every record with
    ``extract_song_info_dict`` and writes rows to ``OUTPUT_CSV_PATH`` each
    time ``WRITE_CHUNK_SIZE`` rows have accumulated. Unreadable files and
    records that fail extraction are collected rather than aborting the run;
    when any occurred they are also dumped to a ``*_errors.json`` file next
    to the CSV (failed records limited to the first 1000).

    Extraction and writing are deliberately separated: an error while writing
    the CSV propagates (the file is still closed) instead of being logged as
    a record failure and retried with the same buffer, which could write rows
    twice.

    Returns:
        Tuple ``(total_records_processed, failed_files, failed_records)``.
    """
    # Published as notebook globals so progress stays inspectable, also
    # after an interrupted run.
    global all_batch_files, total_records_processed, failed_files, failed_records

    total_files = len(all_batch_files)
    # A chunk size below 1 would never drain the buffer.
    chunk_size = max(1, WRITE_CHUNK_SIZE)

    print(f"Starting processing of {total_files} files")
    print(f"Processing in batches of {BATCH_SIZE} files")
    print(f"Writing to CSV every {WRITE_CHUNK_SIZE} records")
    print("-" * 60)

    # Track progress - these become inspectable variables
    total_records_processed = 0
    records_buffer = []
    failed_files = []
    failed_records = []

    with open(OUTPUT_CSV_PATH, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = None

        # Process files in batches
        for batch_start in range(0, total_files, BATCH_SIZE):
            batch_end = min(batch_start + BATCH_SIZE, total_files)
            batch_files = all_batch_files[batch_start:batch_end]

            print(f"\n📦 Processing batch {batch_start//BATCH_SIZE + 1}/{(total_files + BATCH_SIZE - 1)//BATCH_SIZE}")
            print(f"   Files {batch_start + 1} to {batch_end} of {total_files}")

            for file_idx, file_path in enumerate(batch_files, start=batch_start):
                try:
                    rows = _extract_rows_from_file(file_path, failed_records)
                except Exception as e:
                    failed_files.append({'file': file_path, 'error': str(e)})
                    print(f" Error reading {os.path.basename(file_path)}: {e}")
                    continue

                records_buffer.extend(rows)
                del rows

                # Initialize CSV writer with headers from the first record
                if csv_writer is None and records_buffer:
                    csv_writer = csv.DictWriter(
                        csv_file, fieldnames=list(records_buffer[0].keys())
                    )
                    csv_writer.writeheader()

                # Write full chunks; the remainder stays buffered
                while len(records_buffer) >= chunk_size:
                    chunk = records_buffer[:chunk_size]
                    csv_writer.writerows(chunk)
                    csv_file.flush()  # Ensure data is written to disk
                    del records_buffer[:chunk_size]
                    total_records_processed += len(chunk)
                    print(f" Written {total_records_processed:,} records to CSV")

                # Progress indicator
                if (file_idx + 1) % 10 == 0:
                    print(f" Processed {file_idx + 1}/{total_files} files")

            # Force garbage collection after each batch
            gc.collect()

        # Write any remaining records
        if records_buffer and csv_writer:
            csv_writer.writerows(records_buffer)
            total_records_processed += len(records_buffer)
            print(f"\n✓ Written final {len(records_buffer)} records")

    print(f"\n{'='*60}")
    print(f" PROCESSING COMPLETE")
    print(f"Total records processed: {total_records_processed:,}")
    print(f"Failed files: {len(failed_files)}")
    print(f"Failed records: {len(failed_records)}")
    print(f"Output saved to: {OUTPUT_CSV_PATH}")
    print(f"{'='*60}")

    # Save error logs if any
    if failed_files or failed_records:
        error_log_path = OUTPUT_CSV_PATH.replace('.csv', '_errors.json')
        with open(error_log_path, 'w') as f:
            json.dump({
                'failed_files': failed_files,
                'failed_records': failed_records[:1000]  # Limit to first 1000 errors
            }, f, indent=2)
        print(f"Error log saved to: {error_log_path}")

    return total_records_processed, failed_files, failed_records

In [9]:
# Cell 5: Alternative - Process smaller batch for testing
# Useful for testing before running the full processing
def process_test_batch(num_files=10):
    """Extract the first *num_files* batch files into a DataFrame.

    Intended as a quick check before the full run: nothing is written to
    disk, and any read or extraction error propagates to the caller.
    """
    selected_files = all_batch_files[:num_files]

    print(f"Processing {num_files} files for testing...")

    extracted_rows = []
    for path in selected_files:
        with open(path, 'r') as handle:
            records = json.load(handle)
        extracted_rows.extend(extract_song_info_dict(rec) for rec in records)

    # One DataFrame row per extracted record
    df_test = pd.DataFrame(extracted_rows)
    print(f"Created test DataFrame with {len(df_test)} records")
    return df_test

# Run a small test batch first and preview the result (comment out to skip)
df_test = process_test_batch(10)
df_test.head()

Processing 10 files for testing...
Created test DataFrame with 9411 records


Unnamed: 0,chartmetric_ids,cm_track_title,cm_composer_name,cm_artist_id,cm_artist_credit,cm_artist_label,Artist_booking_agent,Artist_general_manager,cm_release_id,cm_name_release,cm_release_date,cm_albums_label,cm_genres,cm_moods,cm_activities,cm_songwriters,cm_songwriterIds,cm_tempo,cm_duration_ms
0,12486264,Uptown Girl - Radio Edit,Billy Joel,208701,Westlife,,,,1170038.0,Coast To Coast,2000-01-01,RCA Records Label,"soft pop,pop,teen pop,rock,modern rock,alterna...","affectionate,nostalgic,sarcastic",daydreaming,Billy Joel,,129.032,187666.0
1,12486289,Let's Dance - Radio Edit,"Sean Conlon, Martin Harrington, Ash Howes, Ric...",208971,Five,,,,703326.0,Let's Dance,2001-01-16,RCA Camden,"pop,teen pop,dance,rock,electronic,soft pop","energetic,entertaining,fun,funky","dancy,partying","Sean Conlon,Martin Harrington,Ash Howes,Richar...",,118.024,219853.0
2,16790822,My Boyfriend,J. Ballard,222162,Bubbles,,,,1585098.0,Rock the World,2002-01-01,PCA Music,dance,,,J. Ballard,,106.024,181840.0
3,10926819,1 2 3,"Ramon Garriga, Frank Madero, Puerta",138307,El Simbolo,,,,210241.0,Éxitos,2005-01-01,Hit Designers,"latin pop,latin",fun,"dancy,bonding,daydreaming","Ramon Garriga,Frank Madero,Puerta",,131.023,219933.0
4,11060010,Tele-Romeo,"Peter Jules Gillis, Miguel Jose Eric Wiels, Al...",109874,K3,,,,1163710.0,Tele Romeo,2001-01-01,Studio 100,"dutch pop,dutch children's music","catchy,cheerful,entertaining,fun,childlike,ene...","summer,roadtrip","Peter Jules Gillis,Miguel Jose Eric Wiels,Alai...",,130.011,199575.0


In [6]:

#Cell 6: Run the processing
# Full run over all batch files; returns the number of rows written plus the
# lists of failed files and failed records.
total_processed, errors_files, errors_records = process_batch_files_incrementally()

Starting processing of 2904 files
Processing in batches of 100 files
Writing to CSV every 20000 records
------------------------------------------------------------

📦 Processing batch 1/30
   Files 1 to 100 of 2904
 Processed 10/2904 files
 Processed 20/2904 files
 Written 20,000 records to CSV
 Processed 30/2904 files
 Processed 40/2904 files
 Written 40,000 records to CSV
 Processed 50/2904 files
 Failed to process record 977: 'NoneType' object has no attribute 'get'
 Processed 60/2904 files
 Written 60,000 records to CSV
 Processed 70/2904 files
 Processed 80/2904 files
 Written 80,000 records to CSV
 Processed 90/2904 files
 Processed 100/2904 files

📦 Processing batch 2/30
   Files 101 to 200 of 2904
 Written 100,000 records to CSV
 Failed to process record 663: 'NoneType' object has no attribute 'get'
 Failed to process record 681: 'NoneType' object has no attribute 'get'
 Processed 110/2904 files
 Processed 120/2904 files
 Written 120,000 records to CSV
 Processed 130/2904 file

In [None]:
# Cell 7: Alternative - Parquet output (more efficient for large datasets)
def process_to_parquet_chunks():
    """Stream all batch JSON files into one Parquet file, chunk by chunk.

    Reads every path in the notebook-level ``all_batch_files``, extracts each
    record with ``extract_song_info_dict`` and appends the rows to
    ``OUTPUT_PARQUET_PATH`` in chunks of 50,000. Unreadable files and records
    that fail extraction are reported with a printed message and skipped.
    Errors while converting or writing a chunk propagate (the writer is still
    closed) instead of being reported as a record failure and retried for
    every following record.

    Returns:
        Number of rows written to the Parquet file.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq

    print(f"Processing {len(all_batch_files)} files to Parquet format")

    writer = None
    schema = None
    chunk_data = []
    total_processed = 0
    chunk_size = 50000

    def write_chunk(rows):
        """Append *rows* to the Parquet file, opening the writer on first use."""
        nonlocal writer, schema
        df_chunk = pd.DataFrame(rows)
        if writer is None:
            # The first chunk defines the schema of the whole file.
            table = pa.Table.from_pandas(df_chunk)
            schema = table.schema
            writer = pq.ParquetWriter(OUTPUT_PARQUET_PATH, schema)
        else:
            # Convert against the file's schema rather than re-inferring it,
            # so column types do not drift between chunks.
            table = pa.Table.from_pandas(df_chunk, schema=schema)
        writer.write_table(table)

    try:
        for file_idx, file_path in enumerate(all_batch_files):
            try:
                with open(file_path, 'r') as f:
                    batch_data = json.load(f)

                file_rows = []
                for record in batch_data:
                    try:
                        file_rows.append(extract_song_info_dict(record))
                    except Exception as e:
                        print(f"Warning: Failed to process record: {e}")

                del batch_data

            except Exception as e:
                print(f"Error reading {file_path}: {e}")
                continue

            chunk_data.extend(file_rows)
            del file_rows

            # Write full chunks; the remainder stays buffered
            while len(chunk_data) >= chunk_size:
                write_chunk(chunk_data[:chunk_size])
                del chunk_data[:chunk_size]
                total_processed += chunk_size
                print(f"Processed {total_processed:,} records")
                gc.collect()

            if (file_idx + 1) % 10 == 0:
                print(f"Processed {file_idx + 1}/{len(all_batch_files)} files")

        # Write remaining data (also when no full chunk was ever reached)
        if chunk_data:
            write_chunk(chunk_data)
            total_processed += len(chunk_data)

        print(f"\nTotal records processed: {total_processed:,}")

    finally:
        if writer is not None:
            writer.close()

    return total_processed

## Merge the musicbrainz data with the characteristics

In [7]:
# Load the matched Chartmetric ID table (chartmetric_ids_mb_matched.csv) as a DataFrame
chartmetric_ids_spotify_mb = pd.read_csv(
    "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_ids/chartmetric_ids_mb_matched.csv"
)

In [8]:
# Load the extracted song characteristics from OUTPUT_CSV_PATH
final_extracted_df = pd.read_csv(OUTPUT_CSV_PATH)

In [9]:
# --- Step 3: Join the Extracted Data to the Original DataFrame ---
# Left join on 'chartmetric_ids': every row of chartmetric_ids_spotify_mb is
# kept; rows without a match in final_extracted_df get NaN in the added columns.
# NOTE(review): duplicate chartmetric_ids in final_extracted_df would multiply rows here - confirm they are unique.
merged_song_chars = chartmetric_ids_spotify_mb.merge(final_extracted_df, on="chartmetric_ids", how="left")

In [10]:
# --- REWORKING THE DATAFRAME ---

# Drop columns that are not needed in the final dataset:
# - spotify url
# - artist booking agent
# - artist general manager
# (drop raises a KeyError if any of them is missing)

merged_song_chars = merged_song_chars.drop(columns=[
    'spotify_url',
    'Artist_booking_agent',
    'Artist_general_manager'
])


In [11]:


# Rename three columns, then keep a fixed column selection in a fixed order:
#   release_date       -> spotify_release_date
#   name_recording     -> mb_name_recording
#   name_artist_credit -> mb_name_artist_credit
#
# Column order: identifiers first, then track titles, artist credits,
# release names and dates, labels and medium format, and finally the
# Chartmetric characteristics.
# Columns not listed below are dropped from the DataFrame.
# NOTE(review): confirm that discarding all remaining columns is intended.
merged_song_chars = (
    merged_song_chars
    .rename(columns={
        'release_date': 'spotify_release_date',
        'name_recording': 'mb_name_recording',
        'name_artist_credit': 'mb_name_artist_credit',
    })
    .loc[:, [
        # identifiers
        'id_release', 'id_track', 'id_recording', 'id_artist_credit',
        'mbid_track', 'mbid_recording', 'spotify_track_id', 'spotify_isrc', 'chartmetric_ids',
        # track titles
        'mb_name_recording', 'cm_track_title', 'spotify_track_title',
        # artist credits
        'mb_name_artist_credit', 'spotify_artist_name', 'cm_artist_credit',
        # release names and dates
        'spotify_album_name', 'cm_name_release', 'spotify_release_date', 'cm_release_date',
        # labels and medium format
        'cm_artist_label', 'cm_albums_label', 'name_medium_format',
        # Chartmetric characteristics
        'cm_genres', 'cm_moods', 'cm_activities', 'cm_songwriters', 'cm_songwriterIds',
        'cm_tempo', 'cm_duration_ms',
    ]]
)



In [12]:
# Save the final dataframe
# Save as JSON (records-oriented with one JSON object per line)

#sample dataset
# merged_song_chars.to_json("Z:/Data_alexander/data/raw_data/chartmetric/chartmetric_chars/charmetric_chars_mb.json", orient="records", lines=True)

#the songs + artist 1980-2000 dataset
merged_song_chars.to_json("//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_final_merged/mb_1980_2000_charmetric_chars.json", orient="records", lines=True)


## merge the song characteristics to the chart songs

In [13]:
# Load the matched Chartmetric ID table for the chart songs
# (chartmetric_ids_chart_songs_matched.csv) as a DataFrame
chartmetric_ids_spotify_charts = pd.read_csv(
    "//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_ids/chartmetric_ids_chart_songs_matched.csv"
)

In [14]:
# --- Join the Extracted Data to the chart songs ---
# Left join on 'chartmetric_ids': every row of chartmetric_ids_spotify_charts
# is kept; rows without a match in final_extracted_df get NaN in the added columns.
# NOTE(review): duplicate chartmetric_ids in final_extracted_df would multiply rows here - confirm they are unique.
merged_song_chars_charts = chartmetric_ids_spotify_charts.merge(final_extracted_df, on="chartmetric_ids", how="left")

In [16]:
# --- REWORKING THE DATAFRAME ---

# Drop columns that are not needed in the final dataset:
# - spotify url
# - artist booking agent
# - artist general manager
# (drop raises a KeyError if any of them is missing)

merged_song_chars_charts = merged_song_chars_charts.drop(columns=[
    'spotify_url',
    'Artist_booking_agent',
    'Artist_general_manager'
])

In [18]:
# Save the final dataframe
# Save as JSON (records-oriented with one JSON object per line)

# the chart songs dataset, written to charts_1980_2000_charmetric_chars.json
merged_song_chars_charts.to_json("//bigdata.wu.ac.at/delpero/Data_alexander/data/raw_data/chartmetric/chartmetric_final_merged/charts_1980_2000_charmetric_chars.json", orient="records", lines=True)

# Data Checks

In [22]:
# Check for missing values in the chart-songs dataset.
# The printed figures are percentages of rows (count / number of rows * 100),
# restricted to columns with at least one missing value.
missing_counts = merged_song_chars_charts.isnull().sum()
print("Missing values per column:")
print(missing_counts[missing_counts > 0]/len(merged_song_chars_charts) * 100)

Missing values per column:
name_recording             0.004160
name_artist_credit         0.004160
song_artist                2.113145
label                     69.030782
artist_song               97.886855
country                   13.294509
tf_name_recording          0.004160
tf_name_artist_credit      0.004160
cm_track_title             0.303661
cm_composer_name          33.173877
cm_artist_id               0.320300
cm_artist_credit           0.320300
cm_artist_label          100.000000
cm_release_id              0.374376
cm_name_release            0.374376
cm_release_date            0.374376
cm_albums_label            0.682196
cm_genres                  0.303661
cm_moods                  19.147255
cm_activities             55.732113
cm_songwriters            33.173877
cm_songwriterIds         100.000000
cm_tempo                  13.057404
cm_duration_ms             0.611481
dtype: float64


In [23]:
# Draw a reproducible random sample of 100 rows (fixed random_state) and keep it as random_sample_songs_mb
random_sample_songs_mb = merged_song_chars.sample(n=100, random_state=42)

# same for the charts dataset
random_sample_songs_charts = merged_song_chars_charts.sample(n=100, random_state=42)

In [40]:
# Trial the extraction function with a single search output.
# NOTE(review): neither extract_song_info nor search_output is defined in the
# visible part of this notebook (the extractor defined above is
# extract_song_info_dict) - this looks like a leftover from an earlier
# session and would raise a NameError on a fresh run; confirm or remove.

test_df = extract_song_info(search_output)