[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gomar0801/BachGen/blob/main/notebooks/Pipeline%20complet%20BachGen.ipynb)

In [1]:
%%capture
# Clone the BachGen repository, starting from a clean slate: any previous
# checkout in the working directory is deleted first.
# NOTE: %%capture hides everything this cell prints, including error messages
# from the commands below — remove it temporarily when debugging the setup.
!rm -rf BachGen && git clone https://github.com/gomar0801/BachGen.git
# Make the repository's setup script executable, then run it.
!chmod +x ./BachGen/scripts/setup.sh
!./BachGen/scripts/setup.sh

In [None]:
# NOTE(review): this repeats the clone already done in the setup cell, and it
# runs *after* setup.sh, so the checkout that setup.sh ran in is deleted and
# replaced. It looks like a leftover — confirm it is still needed.
!rm -rf BachGen && git clone https://github.com/gomar0801/BachGen.git

Cloning into 'BachGen'...
remote: Enumerating objects: 414, done.[K
remote: Counting objects:   0% (1/224)[Kremote: Counting objects:   1% (3/224)[Kremote: Counting objects:   2% (5/224)[Kremote: Counting objects:   3% (7/224)[Kremote: Counting objects:   4% (9/224)[Kremote: Counting objects:   5% (12/224)[Kremote: Counting objects:   6% (14/224)[Kremote: Counting objects:   7% (16/224)[Kremote: Counting objects:   8% (18/224)[Kremote: Counting objects:   9% (21/224)[Kremote: Counting objects:  10% (23/224)[Kremote: Counting objects:  11% (25/224)[Kremote: Counting objects:  12% (27/224)[Kremote: Counting objects:  13% (30/224)[Kremote: Counting objects:  14% (32/224)[Kremote: Counting objects:  15% (34/224)[Kremote: Counting objects:  16% (36/224)[Kremote: Counting objects:  17% (39/224)[Kremote: Counting objects:  18% (41/224)[Kremote: Counting objects:  19% (43/224)[Kremote: Counting objects:  20% (45/224)[Kremote: Counting objects:  21% (4

In [2]:
from bachgen.download_data import download_all
from bachgen.extract import extract_archive
from bachgen.data_filter import load_and_filter_piano_classical
from bachgen.mxl_to_musicxml import convert_mxl_to_musicxml
from bachgen.display_and_play_partition import display_and_play
from bachgen.score_to_tokens_solution_all2 import MusicXML_to_tokens
from bachgen.tokens_to_musicxml import convert_tokens_to_musicxml

  import pkg_resources


# Téléchargement des données

In [None]:
# Fetch the project's data. The recorded output shows an mxl.tar.gz archive
# being downloaded from Zenodo.
download_all()

⬇️  Téléchargement depuis https://zenodo.org/records/15571083/files/mxl.tar.gz?download=1 ...


In [None]:
# Unpack the downloaded archive; the return value is kept in `extract_folder`
# (presumably the extraction directory — TODO confirm against extract_archive).
# NOTE(review): this is an absolute Colab path, while the conversion cell reads
# from the relative "data/mxl"; the two only agree when the working directory
# is /content.
extract_folder=extract_archive("/content/data/mxl.tar.gz")

# Filtrage

In [None]:
# Build the metadata table used by the conversion step below. Judging by the
# function name it is restricted to classical piano scores — TODO confirm the
# exact filter in bachgen.data_filter.
df_piano_classical = load_and_filter_piano_classical()

In [None]:
import warnings
from music21 import musicxml
# Ignore music21's MusicXMLWarning for the rest of the session (presumably to
# keep the output of the bulk conversion readable).
warnings.simplefilter("ignore", musicxml.xmlToM21.MusicXMLWarning)

In [10]:
import os
from pathlib import Path
from tqdm import tqdm

# --- Configuration -----------------------------------------------------------
MXL_ROOT = Path("data/mxl")                       # root that the relative .mxl paths are joined onto
OUT_DIR  = Path("data/musicxml_classical_piano")  # destination of the converted files
OUT_DIR.mkdir(parents=True, exist_ok=True)

PATH_COL = "mxl"   # or "mxl_path", depending on the CSV
OVERWRITE = False  # True: reconvert everything; False: resume, skipping files already converted

# 1) Keep only the path column, for rows where the path is present, as strings.
df_clean = (
    df_piano_classical
    .loc[df_piano_classical[PATH_COL].notna(), [PATH_COL]]
    .copy()
)
df_clean[PATH_COL] = df_clean[PATH_COL].astype(str)

# 2) Keep only .mxl entries (drops .xml/.json/.mid, etc.).
df_clean = df_clean[df_clean[PATH_COL].str.lower().str.endswith(".mxl")]

# 3) Drop duplicated paths so each input is converted at most once.
df_clean = df_clean.drop_duplicates(subset=[PATH_COL])

errors = []              # (input path, reason) pairs — always in that order
done = 0                 # files converted during this run
skipped = 0              # files whose output already existed (resume)
claimed_outputs = set()  # output paths already assigned to an input in this run

for rel in tqdm(df_clean[PATH_COL].values, total=len(df_clean)):
    in_path = MXL_ROOT / rel  # pathlib join (note: "\" is only a separator on Windows)

    if not in_path.exists():
        errors.append((str(in_path), "missing"))
        continue

    out_path = OUT_DIR / (in_path.stem + ".musicxml")

    # Outputs are named after the input stem only, so two inputs stored in
    # different folders can map to the same file. Report the clash instead of
    # silently overwriting the first result.
    if out_path in claimed_outputs:
        errors.append((str(in_path), f"output name collision: {out_path.name}"))
        continue
    claimed_outputs.add(out_path)

    # The full conversion takes hours (the recorded run was interrupted at 25%),
    # so a re-run resumes where the previous one stopped.
    if not OVERWRITE and out_path.exists() and out_path.stat().st_size > 0:
        skipped += 1
        continue

    try:
        convert_mxl_to_musicxml(str(in_path), str(out_path))
    except BaseException as e:
        # Never leave a half-written file behind: the resume check above would
        # mistake it for a finished conversion on the next run.
        out_path.unlink(missing_ok=True)
        if not isinstance(e, Exception):
            raise  # KeyboardInterrupt / SystemExit must still stop the loop
        errors.append((str(in_path), str(e)))
    else:
        done += 1

print(f"✅ Conversion terminée: {done} fichiers créés dans {OUT_DIR}")
if skipped:
    print(f"⏭️ {skipped} fichier(s) déjà converti(s), ignoré(s)")
if errors:
    print(f"⚠️ {len(errors)} problème(s). Exemples:")
    for e in errors[:10]:
        print("  -", e)

 25%|██▌       | 6111/24004 [1:02:42<3:03:36,  1.62it/s]


KeyboardInterrupt: 

In [11]:
from pathlib import Path
import shutil

def pack_musicxml_dir(src="data/musicxml_classical_piano",
                      out_zip="musicxml_classical_piano.zip"):
    """Zip the contents of ``src`` into ``out_zip`` and return the archive path.

    Args:
        src: Directory whose contents are archived. Entries are stored relative
            to this directory (the directory itself is not a prefix).
        out_zip: Destination archive. A trailing ``.zip`` is optional; any other
            dots in the name are preserved (``"bundle.v2"`` -> ``bundle.v2.zip``).

    Returns:
        pathlib.Path: location of the archive, as reported by
        ``shutil.make_archive``.

    Raises:
        AssertionError: if ``src`` does not exist or is not a directory.
    """
    src = Path(src)
    # is_dir() is already False for a missing path, so one check covers both cases.
    assert src.is_dir(), f"Dossier introuvable: {src}"

    # shutil.make_archive appends ".zip" itself, so give it the name without
    # that extension — but strip only ".zip", never another suffix.
    out_zip = Path(out_zip)
    out_base = out_zip.with_suffix("") if out_zip.suffix.lower() == ".zip" else out_zip

    archive_path = shutil.make_archive(str(out_base), 'zip', root_dir=str(src), base_dir=".")
    print(f"✅ Archive créée: {archive_path}")
    return Path(archive_path)

# Zip the converted scores using the default source folder and archive name.
pack_musicxml_dir()

✅ Archive créée: /content/musicxml_classical_piano.zip
