In [3]:
%pip install tabulate


Note: you may need to restart the kernel to use updated packages.


In [7]:
import json
import pandas as pd
from pathlib import Path

# --- Configuration ---
# Use the exact path to the 'event' file inside the mbdump folder
MB_EVENT_FILE = Path("/workspaces/ticket-heroes/data/external/event") 

def process_musicbrainz_dump(file_path: Path) -> pd.DataFrame:
    """Reads the JSONL event dump file line by line and extracts features."""
    
    event_records = []
    
    # We use 'with open' to ensure the large file is closed properly
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            try:
                # 1. Parse the line into a JSON object
                event = json.loads(line)
                
                # We only want concerts/festivals, not general 'events'
                if event.get('type') not in ['Concert', 'Festival', 'Tour']:
                    continue
                
                # 2. Extract Performer/Artist Names
                artists = []
                for artist_credit in event.get('artist-credit', []):
                    if 'artist' in artist_credit and 'name' in artist_credit['artist']:
                        artists.append(artist_credit['artist']['name'])

                # 3. Extract Core Metadata
                record = {
                    'mbid': event.get('id'),
                    'name': event.get('name'),
                    'artist_names': " / ".join(artists),
                    'start_date': event.get('life-span', {}).get('begin'),
                    'place_name': event.get('place', {}).get('name'),
                    # Example Feature: Use tags as a categorical feature
                    'tags': [t['name'] for t in event.get('tags', []) if 'name' in t], 
                    'type': event.get('type')
                }
                event_records.append(record)
                
            except json.JSONDecodeError as e:
                print(f"Skipping malformed JSON line {line_number}: {e}")
            
            # Optional: Break after a small number for testing
            # if line_number > 10000: break

    print(f"Successfully processed {len(event_records)} events.")
    return pd.DataFrame(event_records)

# --- Execution ---
if __name__ == "__main__":
    df_features = process_musicbrainz_dump(MB_EVENT_FILE)
    
    if not df_features.empty:
        # Save the structured data to Parquet
        df_features.to_parquet("data/processed/musicbrainz_structured_features.parquet", index=False)
        print("\n--- Feature Extraction Complete ---")
        print(df_features.head().to_markdown())
    else:
        print("No valid events found after filtering.")

Successfully processed 98389 events.

--- Feature Extraction Complete ---
|    | mbid                                 | name                                                         | artist_names   | start_date   | place_name   | tags   | type    |
|---:|:-------------------------------------|:-------------------------------------------------------------|:---------------|:-------------|:-------------|:-------|:--------|
|  0 | b971d65c-8a56-485f-971f-e2d22bf2b88b | Jazzchor der Universität Bonn at Trinitatiskirche            |                | 2014-07-23   |              | []     | Concert |
|  1 | f5d30095-5c4d-4a6a-8a50-65d7c56d8515 | Jazzchor der Universität Bonn at Theatersaal des Augustinums |                | 2014-07-25   |              | []     | Concert |
|  2 | 054e5968-5f94-435b-a07a-785bf17636a9 | Markus Land Quintett at Bonn Hbf/Thomas-Mann-Straße          |                | 2014-09-26   |              | []     | Concert |
|  3 | 29d57bf5-2aba-40fa-9596-9638728a3df6 | Sax i