[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gomar0801/BachGen/blob/main/notebooks/Pipeline%20complet%20BachGen.ipynb)

In [2]:
%%capture
# Clone the BachGen repository
!rm -rf BachGen && git clone https://github.com/gomar0801/BachGen.git
!chmod +x ./BachGen/scripts/setup.sh
!./BachGen/scripts/setup.sh

In [None]:
!rm -rf BachGen && git clone https://github.com/gomar0801/BachGen.git

Cloning into 'BachGen'...
remote: Enumerating objects: 414, done.[K
remote: Counting objects:   0% (1/224)[Kremote: Counting objects:   1% (3/224)[Kremote: Counting objects:   2% (5/224)[Kremote: Counting objects:   3% (7/224)[Kremote: Counting objects:   4% (9/224)[Kremote: Counting objects:   5% (12/224)[Kremote: Counting objects:   6% (14/224)[Kremote: Counting objects:   7% (16/224)[Kremote: Counting objects:   8% (18/224)[Kremote: Counting objects:   9% (21/224)[Kremote: Counting objects:  10% (23/224)[Kremote: Counting objects:  11% (25/224)[Kremote: Counting objects:  12% (27/224)[Kremote: Counting objects:  13% (30/224)[Kremote: Counting objects:  14% (32/224)[Kremote: Counting objects:  15% (34/224)[Kremote: Counting objects:  16% (36/224)[Kremote: Counting objects:  17% (39/224)[Kremote: Counting objects:  18% (41/224)[Kremote: Counting objects:  19% (43/224)[Kremote: Counting objects:  20% (45/224)[Kremote: Counting objects:  21% (4

In [3]:
from bachgen.download_data import download_all
from bachgen.extract import extract_archive
from bachgen.data_filter import load_and_filter_piano_classical
from bachgen.mxl_to_musicxml import convert_mxl_to_musicxml
from bachgen.display_and_play_partition import display_and_play
from bachgen.score_to_tokens_solution_all2 import MusicXML_to_tokens
from bachgen.tokens_to_musicxml import convert_tokens_to_musicxml

  import pkg_resources


# Telechargement des données

In [None]:
download_all()

⬇️  Téléchargement depuis https://zenodo.org/records/15571083/files/mxl.tar.gz?download=1 ...
✅ Fichier téléchargé : data/mxl.tar.gz
⬇️  Téléchargement depuis https://zenodo.org/records/15571083/files/PDMX.csv?download=1 ...
✅ Fichier téléchargé : data/PDMX.csv


In [None]:
extract_folder=extract_archive("/content/data/mxl.tar.gz")

📦 Extraction de /content/data/mxl.tar.gz vers data/mxl ...
✅ Extraction terminée dans : data/mxl


# Filtrage

In [None]:
df_piano_classical = load_and_filter_piano_classical()

🎹 24008 partitions de piano (genre classique) trouvées.


# MXL to MusicXML

In [None]:
import warnings
from music21 import musicxml
warnings.simplefilter("ignore", musicxml.xmlToM21.MusicXMLWarning)

In [None]:
import os
from pathlib import Path
from tqdm import tqdm
import concurrent.futures

MXL_ROOT = Path("data/mxl")
OUT_DIR  = Path("data/musicxml_classical_piano_convert")
OUT_DIR.mkdir(parents=True, exist_ok=True)

PATH_COL = "mxl"  # ou "mxl_path" selon ton CSV
TIMEOUT  = 10     # secondes max par fichier

# 1) Nettoyage du dataframe
df_clean = (
    df_piano_classical
    .loc[df_piano_classical[PATH_COL].notna(), [PATH_COL]]
    .copy()
)
df_clean[PATH_COL] = df_clean[PATH_COL].astype(str)
df_clean = df_clean[df_clean[PATH_COL].str.lower().str.endswith(".mxl")]
df_clean = df_clean.drop_duplicates(subset=[PATH_COL])

errors = []
done = 0

# Fonction wrapper pour conversion
def convert_with_timeout(in_path, out_path):
    convert_mxl_to_musicxml(str(in_path), str(out_path))

# Boucle principale
for rel in tqdm(df_clean[PATH_COL].values, total=len(df_clean)):
    in_path = MXL_ROOT / rel

    if not in_path.exists():
        errors.append(("missing", str(in_path)))
        continue

    out_name = in_path.stem + ".musicxml"
    out_path = OUT_DIR / out_name

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(convert_with_timeout, in_path, out_path)
        try:
            future.result(timeout=TIMEOUT)
            done += 1
        except concurrent.futures.TimeoutError:
            errors.append((str(in_path), f"Timeout > {TIMEOUT}s"))
        except Exception as e:
            errors.append((str(in_path), str(e)))

print(f"✅ Conversion terminée: {done} fichiers créés dans {OUT_DIR}")
if errors:
    print(f"⚠️ {len(errors)} problème(s). Exemples:")
    for e in errors[:10]:
        print("  -", e)


 53%|█████▎    | 12723/24004 [2:27:27<2:10:44,  1.44it/s]


KeyboardInterrupt: 

In [None]:
!cp -r /content/data/musicxml_classical_piano_convert /content/drive/MyDrive/Stage\ MusicXML/

In [None]:
from pathlib import Path
import shutil

def pack_musicxml_dir(src="data/musicxml_classical_piano_convert",
                      out_zip="musicxml_classical_piano_convert.zip"):
    src = Path(src)
    assert src.exists() and src.is_dir(), f"Dossier introuvable: {src}"

    # shutil.make_archive ajoute l’extension, donc on lui passe le chemin sans .zip
    out_base = Path(out_zip).with_suffix("")
    archive_path = shutil.make_archive(str(out_base), 'zip', root_dir=str(src), base_dir=".")
    print(f"✅ Archive créée: {archive_path}")

pack_musicxml_dir()

✅ Archive créée: /content/musicxml_classical_piano_convert.zip


# MusicXML to Tokens (+statistics)

## Sans statistique

In [None]:
from pathlib import Path

SRC_DIR = Path("data/musicxml_classical_piano")  # dossier avec les .musicxml
OUT_DIR = Path("data/tokens_classical_piano")
OUT_DIR.mkdir(parents=True, exist_ok=True)

for xml_file in SRC_DIR.glob("*.musicxml"):
    try:
        tokens = MusicXML_to_tokens(str(xml_file))
        out_file = OUT_DIR / (xml_file.stem + ".txt")
        with open(out_file, "w", encoding="utf-8") as f:
            f.write(" ".join(tokens))
        print(f"✅ {xml_file.name} → {out_file.name}")
    except Exception as e:
        print(f"❌ {xml_file.name} -> {e}")


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
  → Backup : -40320
    • Note: Rest, Voice: 2, Start: 0, Duration: 30240, Chord: False
    • Note: G4, Voice: 2, Start: 30240, Duration: 10080, Chord: False
[Résultat] 10 notes extraites avec positions.


  → 10 notes extraites avec positions
  → Notes triées par position :
    • Pos     0, Voice 1, Note: G4
    • Pos     0, Voice 1, Note: D5
    • Pos     0, Voice 2, Note: Rest
    • Pos 10080, Voice 1, Note: A4
    • Pos 10080, Voice 1, Note: D5
    • Pos 20160, Voice 1, Note: G4
    • Pos 20160, Voice 1, Note: D5
    • Pos 30240, Voice 1, Note: C5
    • Pos 30240, Voice 2, Note: G4
    • Pos 35280, Voice 1, Note: D5

  → Position 0 : 3 élément(s)
    ⚠ Silences superposés détectés, ils sont ignorés.
    ↳ Durée harmonisée de l'accord : 10080

  → Position 10080 : 2 élément(s)
    ↳ Durée harmonisée de l'accord : 10080

  → Position 20160 : 2 élément(s)
    ↳ Durée harmonisée de l'accord : 10

KeyboardInterrupt: 

## Avec Statistiques

In [None]:
from pathlib import Path
from io import StringIO
from contextlib import redirect_stdout
import csv

from bachgen.score_to_tokens_solution_all2 import MusicXML_to_tokens

SRC_DIR = Path("data/musicxml_classical_piano_convert")   # your .musicxml folder
TOK_DIR = Path("data/tokens_classical_piano_convert")
TOK_DIR.mkdir(parents=True, exist_ok=True)

STATS_CSV = Path("data/tokenization_stats.csv")

def tokenize_with_stats(xml_path: Path, note_name=True):
    """
    Runs MusicXML_to_tokens while capturing its debug prints,
    then parses those prints to compute per-piece statistics.
    Returns (tokens, stats_dict).
    """
    buf = StringIO()
    with redirect_stdout(buf):
        tokens = MusicXML_to_tokens(str(xml_path), note_name=note_name)
    log = buf.getvalue().splitlines()

    # --- Parse debug lines ---
    # Anchors based on the messages you already print:
    # - "Note transparente détectée, ignorée"
    # - "Silences superposés détectés, ils sont ignorés."
    # - "↳ Durée harmonisée de l'accord :"
    # - "[note_to_tokens] Traitement d'une note ou d'un groupe"
    # - "  → Rest detected"
    total_notes_seen      = 0   # every time note_to_tokens starts processing a note/rest
    rests_kept            = 0   # rests that reached note_to_tokens and were kept
    rests_ignored_overlap = 0   # rests ignored in rewrite step (overlapped)
    transparent_ignored   = 0   # notes with print-object="no", ignored
    harmonize_events      = 0   # positions where harmonization line was printed

    for line in log:
        if "[note_to_tokens] Traitement d'une note ou d'un groupe" in line:
            total_notes_seen += 1
        elif "Note transparente détectée, ignorée" in line:
            transparent_ignored += 1
        elif "→ Rest detected" in line:
            rests_kept += 1
        elif "Silences superposés détectés, ils sont ignorés" in line:
            rests_ignored_overlap += 1
        elif "Durée harmonisée de l'accord" in line:
            harmonize_events += 1

    # --- Compute percentages (defensive against /0) ---
    # 1) Transparent notes % over all items that hit note_to_tokens (includes transparents)
    trans_pct = (transparent_ignored / total_notes_seen * 100.0) if total_notes_seen else 0.0

    # 2) Overlapped rests % = ignored rests / (ignored + kept rests)
    total_rests_considered = rests_kept + rests_ignored_overlap
    overlap_rest_pct = (rests_ignored_overlap / total_rests_considered * 100.0) if total_rests_considered else 0.0

    # 3) Harmonized chord duration % = harmonize_events / total accord positions observed in logs.
    #    We approximate denominator with harmonize_events (i.e., report the rate as count per piece),
    #    and still expose the raw count so you can interpret it.
    #    If you later add a line like "→ Position ... : n élément(s)" we could refine this.
    harmonize_rate_pct = None  # unknown denominator; we’ll report the raw count instead

    stats = {
        "file": xml_path.name,
        "total_items_seen": total_notes_seen,
        "transparent_ignored": transparent_ignored,
        "rests_kept": rests_kept,
        "rests_ignored_overlap": rests_ignored_overlap,
        "harmonize_events": harmonize_events,
        "transparent_pct": round(trans_pct, 3),
        "overlap_rest_pct": round(overlap_rest_pct, 3),
        # leave harmonize_rate_pct as None; we’ll put just the count
    }
    return tokens, stats

# ---- Run over all files and save tokens + stats ----
all_stats = []
for xml_file in sorted(SRC_DIR.rglob("*.musicxml")):  # rglob for subfolders; use glob if flat
    try:
        tokens, stats = tokenize_with_stats(xml_file, note_name=True)
        # save tokens
        out_txt = TOK_DIR / (xml_file.stem + ".txt")
        out_txt.write_text(" ".join(tokens), encoding="utf-8")
        all_stats.append(stats)
        print(f"✅ {xml_file.relative_to(SRC_DIR)}  "
              f"[transp {stats['transparent_pct']}% | overl.rest {stats['overlap_rest_pct']}% | "
              f"harmonize_events={stats['harmonize_events']}]")
    except Exception as e:
        print(f"❌ {xml_file} -> {e}")

# write CSV summary
with STATS_CSV.open("w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(
        f,
        fieldnames=[
            "file", "total_items_seen", "transparent_ignored",
            "rests_kept", "rests_ignored_overlap", "harmonize_events",
            "transparent_pct", "overlap_rest_pct"
        ],
    )
    writer.writeheader()
    writer.writerows(all_stats)

print(f"\n📊 Stats écrites dans: {STATS_CSV}")
print(f"🧾 Tokens enregistrés dans: {TOK_DIR}")


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
✅ QmYWqrym1rdScFw6CQ21CijuSseutGiudRJvsutL4VbPNY.musicxml  [transp 0.0% | overl.rest 69.231% | harmonize_events=38]
✅ QmYWrvqfjZ8oWf7reSCCdW9iwCZrdPQZ4Go6YP1y7wPTJ4.musicxml  [transp 0.0% | overl.rest 100.0% | harmonize_events=65]
✅ QmYWun4B1NopTox1tUcSbaDDnGwzxczQ7uRr9BfXN2j6Hy.musicxml  [transp 3.571% | overl.rest 87.5% | harmonize_events=20]
✅ QmYX2aqk5cHmV7CkJhHVj6P8jEvyykTSGepymXXiuLmfy6.musicxml  [transp 0.0% | overl.rest 0.0% | harmonize_events=71]
✅ QmYX3wgQXBaxVcFX1eRnqXP9oKamToqBGCjkJRFuaeSFed.musicxml  [transp 0.0% | overl.rest 63.636% | harmonize_events=23]
✅ QmYX3xRhGw6Z8G53zLY5LJMkZfdyqHfsMaYK7ftPGmWtw7.musicxml  [transp 1.439% | overl.rest 91.304% | harmonize_events=84]
✅ QmYX445hkG73EMyP5QkjSg2qHFon7CUYTSK2dArSXvxCEn.musicxml  [transp 7.843% | overl.rest 0.0% | harmonize_events=129]
✅ QmYX6tHEXwTVgiRiXroNTuFHZ5A4uZFanFPtEWpwxxhDjm.musicxml  [transp 0.0% | overl.rest 0.0% | harmon

In [None]:
!cp -r /content/data/tokens_classical_piano_convert /content/drive/MyDrive/Stage\ MusicXML/

In [None]:
!cp -r /content/data/tokenization_stats.csv /content/drive/MyDrive/Stage\ MusicXML/

# Training (Model GPT2 from scratch)

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
!cp -r /content/drive/MyDrive/Stage\ MusicXML/tokens_classical_piano_convert /content/data/