While the dataset is relatively clean, some entries still have issues, such as invalid modes and unsupported characters.

In [3]:
import re
import pandas as pd

In [7]:
import re

# Single-letter ABC header field codes. rebuild_header() emits X, T, M and K;
# "L" is presumably the ABC unit-note-length field (not written by this file).
ABC_HEADER_FIELDS = [
    "X",
    "T",
    "M",
    "L",
    "K",
]

# Mode names considered supported; these are the same four alternatives that
# the pattern in is_valid_key() accepts after the tonic.
VALID_MODES = {
    "major",
    "minor",
    "dorian",
    "mixolydian",
}

def strip_inline_key_changes(abc: str) -> str:
    """Return *abc* with every bracketed inline ``[K:...]`` field deleted.

    Only fields with at least one character between ``[K:`` and ``]`` are
    matched; an empty ``[K:]`` is left in place.
    """
    inline_key_field = re.compile(r'\[K:[^\]]+\]')
    return inline_key_field.sub('', abc)


def strip_existing_header_key(abc: str) -> str:
    """Delete every line of *abc* that begins with ``K:``.

    The match is anchored at the start of each line (multiline mode) and
    consumes the line's trailing newline when there is one, so no blank line
    is left behind.
    """
    key_header_line = re.compile(r'^K:[^\n]*\n?', flags=re.MULTILINE)
    return key_header_line.sub('', abc)


def is_valid_key(mode: str) -> bool:
    """Check whether *mode* is a supported key string such as 'Dminor' or 'Gmixolydian'.

    A valid key is a tonic letter A-G, an optional accidental (``b`` or
    ``#``), and one of the mode names major, minor, dorian or mixolydian,
    with nothing before or after. Matching is case-insensitive.

    Args:
        mode: Candidate key string.

    Returns:
        True if the whole string is a supported key, otherwise False.
        Non-string values (e.g. a missing value read from a CSV) are
        reported as invalid rather than raising.
    """
    if not isinstance(mode, str):
        return False
    # fullmatch rather than match + '$': '$' also matches just before a
    # trailing newline, which would let 'Dminor\n' through as valid.
    key_match = re.fullmatch(
        r'([A-G][b#]?)(major|minor|dorian|mixolydian)',
        mode,
        re.IGNORECASE,
    )
    return key_match is not None


def get_transpose_distance(mode_str: str) -> int:
    """Return the semitone shift needed to bring the tonic of *mode_str* to C.

    The tonic is read from the start of the string: a letter A-G followed by
    an optional accidental (``#`` or ``b``), matched case-insensitively. The
    rest of the string (the mode name) is ignored.

    Args:
        mode_str: Key string such as 'Dminor', 'F#dorian' or 'Bbmajor'.

    Returns:
        A value in the range -11..0: the number of semitones to shift
        (downwards) so the tonic lands on C. 'Cmajor' -> 0, 'Dminor' -> -2,
        'F#minor' -> -6, 'Bbmajor' -> -10.

    Raises:
        ValueError: If the string does not start with a note letter A-G.
    """
    tonic_match = re.match(r'^([A-G])([b#]?)', mode_str, re.IGNORECASE)
    if not tonic_match:
        raise ValueError(f"Cannot parse tonic from mode string: {mode_str}")

    # Normalise the two parts separately: upper-casing the whole tonic would
    # turn a flat sign 'b' into 'B' and make it indistinguishable from a note.
    letter = tonic_match.group(1).upper()
    accidental = tonic_match.group(2).lower()

    semitones_above_c = {
        "C": 0, "D": 2, "E": 4, "F": 5,
        "G": 7, "A": 9, "B": 11,
    }
    pitch_class = semitones_above_c[letter]
    if accidental == "#":
        pitch_class += 1
    elif accidental == "b":
        pitch_class -= 1

    # Wrap enharmonic edge cases (Cb == B, B# == C) into 0..11 before negating
    # so the result stays in the same -11..0 range used for natural tonics.
    return -(pitch_class % 12)


def sanitize_abc_tokens(abc_str):
    """
    Drop accidental tokens this pipeline does not support, then drop inline keys.

    Two passes, in this order:
    1. Delete every occurrence of ``^=``, ``==``, ``^^`` and any remaining
       ``=`` character. A lone ``^`` or ``_`` is left untouched.
    2. Delete every bracketed ``[K:...]`` field that has non-empty content.

    Returns the resulting string.
    """
    unsupported_accidentals = re.compile(r'\^=|==|\^\^|=')
    inline_key_field = re.compile(r'\[K:[^\]]+\]')

    without_accidentals = unsupported_accidentals.sub('', abc_str)
    return inline_key_field.sub('', without_accidentals)


def rebuild_header(row: dict) -> str:
    """
    Build a four-line ABC header from a row's metadata fields.

    Reads row['setting_id'], row['name'], row['meter'] and row['mode'] and
    emits them as the X:, T:, M: and K: lines respectively, in that order.
    Every line, including the last, ends with a newline.
    """
    header_lines = [
        f"X:{row['setting_id']}",
        f"T:{row['name']}",
        f"M:{row['meter']}",
        f"K:{row['mode']}",
    ]
    return "\n".join(header_lines) + "\n"


def clean_abc(row: dict) -> str:
    """
    Normalize one tune: clean its ABC body and put a rebuilt header on top.

    Steps applied to row['abc'], in order:
    1. strip_existing_header_key  - drop lines starting with ``K:``
    2. strip_inline_key_changes   - drop inline ``[K:...]`` fields
    3. sanitize_abc_tokens        - drop unsupported accidental tokens

    The header comes from rebuild_header(row). The cleaned body has its
    surrounding whitespace stripped and a single newline appended.
    """
    body = row['abc']
    body = strip_existing_header_key(body)
    body = strip_inline_key_changes(body)
    body = sanitize_abc_tokens(body)

    header = rebuild_header(row)
    return header + body.strip() + "\n"


In [4]:
# Load the raw tune table. The path is absolute and specific to one dev
# container. clean_abc() reads the 'abc', 'setting_id', 'name', 'meter' and
# 'mode' fields of each row, so the CSV is presumably expected to provide
# those columns - TODO confirm against the file.
df= pd.read_csv('/home/devcontainers/git/Tune_Similarity/raw_data/tunes.csv')

In [9]:
# Run clean_abc on every row (axis=1 passes one row at a time) and store the
# normalized ABC text in a new 'full_abc' column.
df['full_abc'] = df.apply(clean_abc, axis=1)

In [12]:
# Write the table, including the new 'full_abc' column, to a path relative to
# the current working directory.
# NOTE(review): to_csv is called with default options, so the DataFrame index
# is presumably written as an extra leading column - confirm that is wanted.
df.to_csv('data/cleaned_data.csv')