In [207]:
import os
import jams

GUITARSET_ROOT = "../data/guitarset"
AUDIO_DIR = os.path.join(GUITARSET_ROOT, "audio_mono-pickup_mix")
ANNOTATION_DIR = os.path.join(GUITARSET_ROOT, "annotation")

In [208]:
jams_files = []

for f in os.listdir(ANNOTATION_DIR):
    full_path = os.path.join(ANNOTATION_DIR, f)
    jams_files.append(full_path)

print("Total JAMS files:", len(jams_files))
print(jams_files[0])

Total JAMS files: 360
../data/guitarset/annotation/03_SS1-100-C#_comp.jams


In [209]:
jam = jams.load(jams_files[0])
print(jam.file_metadata)

{
  "title": "03_SS1-100-C#_comp",
  "artist": "",
  "release": "",
  "duration": 28.80000000000001,
  "identifiers": {},
  "jams_version": "0.3.1"
}


In [210]:
def normalize_chord_annotations(label):
    # TODO:
    #  - Add 7ths to the chord
    #  - Normalize chords based on the key mode;
    #    if the excerpt is in Eb, then D# should be rewritten as Eb (e.g., 04_jazz3-137-eb).

    """
    Normalize GuitarSet chord labels to triads.

    Parameters
    ----------
    label : str
        expected format: root:quality(...)/bass

    Returns
    -------
    str
        root:maj | root:min | root:dim | root:aug | N

    Input format:
        root:quality(...)/bass

    Output format:
        root:maj | root:min | root:dim | root:aug | N
    """

    try:
        root, rest = label.split(":", 1)
    except ValueError:
        return "N"

    rest = rest.lower().strip()
    if rest.startswith("min"):
        q = "min"
    elif rest.startswith("dim"):
        q = "dim"
    elif rest.startswith("aug"):
        q = "aug"
    else:
        q = "maj"

    return f"{root}:{q}"

def extract_chord_annotations(jam, normalized=True):
    """
    Extract performed chord annotations from a GuitarSet JAMS object.

    Parameters
    ----------
    jam : jams.JAMS
        A JAMS object containing GuitarSet annotations.

    normalized : bool, optional
        If True (default), normalize chords to triads (maj/min/dim/aug).
        If False, keep the original chord labels from the JAMS file.

    Returns
    -------
    List[Dict[str, Any]]
        A list of chord segments, where each segment is represented as a dictionary:
            - 'start': float, start time in seconds
            - 'end': float, end time in seconds
            - 'label': str, chord label (normalized or original)
    """
    # Use the second "chord" annotation namespace (performed chords)
    chord_ann = jam.annotations.search(namespace="chord")[1]
    chords = []

    for c in chord_ann:
        label = c.value
        normalized_label = normalize_chord_annotations(label)

        chords.append({
            "start": c.time,
            "end": c.time + c.duration,
            "label": normalized_label if normalized else label
        })

    return chords

In [211]:
def extract_beat_position(jam):
    """
    Extract beat_position annotations from a GuitarSet JAMS object.

    Parameters
    ----------
    jam : jams.JAMS
        A JAMS object containing GuitarSet annotations.

    Returns
    -------
    List[Dict[str, Any]]
        A list of beat_position segments, where each segment is represented as a dictionary:
            - 'time': float, start time in seconds
            - 'duration': float, duration of the beat in seconds
            - 'measure': int, the measure number
            - 'position': int, the position within the measure (in beats)
            - 'time_signature': str, e.g., "4/4", "6/8", "2/2"
    """
    beat_position_ann = jam.annotations.search(namespace="beat_position")
    if not beat_position_ann:
        return []

    beat_position_data = beat_position_ann[0].data
    beat_positions = []

    for bp in beat_position_data:
        value = bp.value
        beat_positions.append({
            "time": bp.time,
            "duration": bp.duration,
            "measure": value['measure'],
            "position": value['position'],
            "time_signature": f"{value['num_beats']}/{value['beat_units']}"
        })

    return beat_positions

In [212]:
def extract_key_mode(jam):
    """
    Extract key_mode annotations from a GuitarSet JAMS object.

    Parameters
    ----------
    jam : jams.JAMS
        A JAMS object containing GuitarSet annotations.

    Returns
    -------
    List[Dict[str, Any]]
        A list of key_mode segments, where each segment is represented as a dictionary:
            - 'value': str, key_mode of the excerpt
            - 'tonic': str, home pitch of the excerpt
            - 'mode': str, a scale that being used
    """
    key_mode_ann = jam.annotations.search(namespace="key_mode")
    if not key_mode_ann:
        return []

    key_mode_data = key_mode_ann[0].data
    key_modes = []

    for km in key_mode_data:
        tonic, mode = km.value.split(":")
        key_modes.append({
            "value": km.value,
            "tonic": tonic,
            "mode": mode,
        })

    if len(key_modes) != 1:
        print("WARNING: multiple key_mode annotations found!")

    return key_modes

In [213]:
dataset_index = []

for jams_path in jams_files:
    jam = jams.load(jams_path)

    basename = os.path.basename(jams_path).replace(".jams", "")
    audio_path = os.path.join(AUDIO_DIR, basename + "_mix" + ".wav")

    if not os.path.exists(audio_path):
        continue

    dataset_index.append({
        "id": basename,
        "audio_path": audio_path,
        "jams_path": jams_path,
        "chords": extract_chord_annotations(jam, True),
        "beat_position": extract_beat_position(jam),
        "key_mode": extract_key_mode(jam),
    })

print("Total usable tracks:", len(dataset_index))

Total usable tracks: 360


In [215]:
import pandas as pd

songs = dataset_index

with pd.ExcelWriter("../data/xlsx_excerpts/excerpts.xlsx", engine="openpyxl") as writer:
    for i, song in enumerate(songs):
        meta = {
            "id": song["id"],
            "audio_path": song["audio_path"],
            "jams_path": song["jams_path"],
        }

        chords_df = pd.DataFrame(song["chords"])
        beats_df = pd.DataFrame(song["beat_position"])
        key_modes_df = pd.DataFrame(song["key_mode"])

        # make sure both have the same length by outer-joining on index
        merged = pd.concat(
            [key_modes_df, chords_df, beats_df],
            axis=1
        )

        # add meta-columns (repeat automatically)
        for k, v in meta.items():
            merged[k] = v

        # reorder columns (meta first)
        cols = ["id", "audio_path", "jams_path"] + \
               [c for c in merged.columns if c not in ("id","audio_path","jams_path")]
        merged = merged[cols]

        sheet = f"{song['id']}"
        merged.to_excel(writer, index=False, sheet_name=sheet)

        # autosize columns
        ws = writer.sheets[sheet]
        for col in ws.columns:
            max_len = 0
            col_letter = col[0].column_letter
            for cell in col:
                try:
                    val = str(cell.value)
                except:
                    val = ""
                max_len = max(max_len, len(val))
            ws.column_dimensions[col_letter].width = max_len + 2