In [None]:
from miditok import REMI, TokenizerConfig
from symusic import Score

# Create tokenizer with configuration
config = TokenizerConfig(num_velocities=16, use_chords=True, use_programs=True)
tokenizer = REMI(config)

# Convert MusicXML to tokens
midi = Score("musicxml_sample/minimal.musicxml") 
tokens = tokenizer(midi)

# Train BPE for efficiency
tokenizer.train(vocab_size=30000, files_paths=files_paths)

In [6]:
!pip install torch
!pip install transformers
!pip install miditok


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting miditok
  Downloading miditok-3.0.6.post1-py3-none-any.whl.metadata (10 kB)
Collecting symusic>=0.5.0 (from miditok)
  Downloading symusic-0.5.8-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.0 kB)
Collecting pySmartDL (from symusic>=0.5.0->miditok)
  Downloading pySmartDL-1.3.4-py3-none-any.whl.metadata (2.8 kB)
Downloading miditok-3.0.6.post1-py3-none-any.whl (159 kB)
Downloading symusic-0.5.8-cp313-cp313-macosx_11_0_arm64.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
import torch
from transformers import AutoModelForCausalLM
from miditok import REMI
from symusic import Score

# Load the pretrained tokenizer
tokenizer = REMI.from_pretrained("Natooz/Maestro-REMI-bpe20k")

# Use with model
model = AutoModelForCausalLM.from_pretrained("Natooz/Maestro-REMI-bpe20k", 
                                           trust_remote_code=True, 
                                           torch_dtype="auto")

  return cls(**input_dict, **kwargs)


In [9]:
import math
from collections import Counter
import random

def calculate_vocabulary_explosion():
    """Demonstrate the combinatorial explosion problem with simple vocabularies"""
    print("=" * 60)
    print("VOCABULARY SIZE EXPLOSION ANALYSIS")
    print("=" * 60)
    
    # Basic music parameters
    pitch_range = 88  # Piano keys (21-108)
    velocity_levels = 32  # Common discretization
    duration_values = 20  # Common note durations (whole, half, quarter, etc.)
    position_values = 32  # Beat positions within a bar
    
    print(f"Basic musical attributes:")
    print(f"  Pitches: {pitch_range}")
    print(f"  Velocities: {velocity_levels}")
    print(f"  Durations: {duration_values}")
    print(f"  Positions: {position_values}")
    
    # Simple vocabulary approach
    simple_vocab_size = pitch_range + velocity_levels + duration_values + position_values
    print(f"\n📋 SIMPLE VOCABULARY:")
    print(f"  Size (sum of attributes): {simple_vocab_size} tokens")
    
    # But this doesn't capture note combinations!
    # Each note needs ALL attributes simultaneously
    note_combinations = pitch_range * velocity_levels * duration_values
    print(f"\n💥 COMBINATORIAL EXPLOSION:")
    print(f"  Possible single notes: {note_combinations:,} combinations")
    print(f"  This is just for ONE note at ONE position!")
    
    # Multi-note scenarios (chords, sequences)
    chord_combinations = note_combinations ** 3  # 3-note chords
    print(f"  Possible 3-note chords: {chord_combinations:e} combinations")
    print(f"  Storage required: {chord_combinations * 4 / (1024**3):.1f} TB just for vocabulary!")
    
    return simple_vocab_size, note_combinations

def demonstrate_sequence_length_problem():
    """Show how sequence length becomes prohibitive without BPE"""
    print("\n" + "=" * 60)
    print("SEQUENCE LENGTH PROBLEM")
    print("=" * 60)
    
    # Example: Simple C major scale
    c_major_scale = [
        ("C4", 60, 80, 1.0),  # Note: (name, pitch, velocity, duration)
        ("D4", 62, 80, 1.0),
        ("E4", 64, 80, 1.0),
        ("F4", 65, 80, 1.0),
        ("G4", 67, 80, 1.0),
        ("A4", 69, 80, 1.0),
        ("B4", 71, 80, 1.0),
        ("C5", 72, 80, 2.0),
    ]
    
    print("Example: C Major Scale (8 notes)")
    print("\n🔴 WITHOUT BPE (Simple vocabulary):")
    
    tokens_without_bpe = []
    for i, (name, pitch, velocity, duration) in enumerate(c_major_scale):
        note_tokens = [
            f"Bar_1",
            f"Position_{i*2}",  # Assuming 8th note positions
            f"Pitch_{pitch}",
            f"Velocity_{velocity}",
            f"Duration_{duration}"
        ]
        tokens_without_bpe.extend(note_tokens)
        print(f"  Note {name}: {note_tokens}")
    
    print(f"\nTotal tokens: {len(tokens_without_bpe)}")
    print(f"Average tokens per note: {len(tokens_without_bpe) / len(c_major_scale):.1f}")
    
    print("\n🟢 WITH BPE (Learned patterns):")
    # Simulate BPE learning common patterns
    bpe_patterns = {
        ("Pitch_60", "Velocity_80", "Duration_1.0"): "NOTE_C4_normal",
        ("Pitch_62", "Velocity_80", "Duration_1.0"): "NOTE_D4_normal", 
        ("Pitch_64", "Velocity_80", "Duration_1.0"): "NOTE_E4_normal",
        ("Position_0", "NOTE_C4_normal"): "START_C4_normal",
        ("Bar_1", "Position_0"): "BAR1_POS0",
    }
    
    # This is simplified - real BPE would learn many more patterns
    tokens_with_bpe = ["BAR1_POS0", "NOTE_C4_normal", "Position_2", "NOTE_D4_normal", 
                      "Position_4", "NOTE_E4_normal", "Position_6", "Pitch_65", 
                      "Velocity_80", "Duration_1.0"]  # Mix of BPE and base tokens
    
    print(f"  Optimized tokens: {tokens_with_bpe}")
    print(f"Total tokens: {len(tokens_with_bpe)}")
    print(f"Reduction: {((len(tokens_without_bpe) - len(tokens_with_bpe)) / len(tokens_without_bpe) * 100):.1f}%")

def analyze_music_patterns():
    """Analyze why musical patterns matter for BPE"""
    print("\n" + "=" * 60)
    print("MUSICAL PATTERN ANALYSIS")
    print("=" * 60)
    
    # Simulate a dataset of musical patterns
    print("Common musical patterns that BPE can learn:")
    
    patterns = {
        "I-V-vi-IV progression": ["Chord_C", "Chord_G", "Chord_Am", "Chord_F"],
        "Ascending scale": ["Pitch_60", "Pitch_62", "Pitch_64", "Pitch_65"],
        "Common rhythm": ["Duration_0.5", "Duration_0.5", "Duration_1.0"],
        "Forte dynamics": ["Velocity_100", "Velocity_105", "Velocity_110"],
        "Beat pattern": ["Position_0", "Position_2", "Position_4", "Position_6"],
    }
    
    print("\n🎵 Frequent musical sequences in training data:")
    for pattern_name, tokens in patterns.items():
        print(f"  {pattern_name}:")
        print(f"    Individual tokens: {tokens}")
        print(f"    BPE could learn: '{pattern_name.replace(' ', '_').upper()}'")
        print(f"    Compression: {len(tokens)} → 1 token ({len(tokens)}x reduction)")
    
    # Demonstrate sparsity problem
    print(f"\n📊 SPARSITY PROBLEM:")
    print(f"  Most 3-token combinations appear < 5 times in dataset")
    print(f"  But some combinations (like C-E-G chord) appear 1000+ times")
    print(f"  BPE learns frequent patterns, ignores rare ones")

def compare_efficiency():
    """Compare computational efficiency"""
    print("\n" + "=" * 60)
    print("COMPUTATIONAL EFFICIENCY COMPARISON")
    print("=" * 60)
    
    # Transformer complexity is O(n²) where n is sequence length
    seq_lengths = {
        "Simple vocab": 1000,
        "BPE vocab": 300,
    }
    
    print("Transformer attention complexity (O(n²)):")
    for method, length in seq_lengths.items():
        complexity = length ** 2
        relative_time = complexity / (seq_lengths["BPE vocab"] ** 2)
        print(f"  {method:12s}: {length:4d} tokens → {complexity:,} operations ({relative_time:.1f}x time)")
    
    # Memory usage
    print(f"\nMemory usage (attention matrices):")
    for method, length in seq_lengths.items():
        memory_mb = (length ** 2 * 4) / (1024 ** 2)  # 4 bytes per float
        print(f"  {method:12s}: {memory_mb:.1f} MB per layer")

def show_real_bpe_example():
    """Show how BPE actually works step by step for music"""
    print("\n" + "=" * 60)
    print("BPE TRAINING PROCESS FOR MUSIC")
    print("=" * 60)
    
    # Simulated corpus of tokenized music
    corpus = [
        ["Pitch_60", "Velocity_80", "Duration_1.0", "Pitch_64", "Velocity_80", "Duration_1.0"],
        ["Pitch_60", "Velocity_80", "Duration_0.5", "Pitch_62", "Velocity_80", "Duration_0.5"],
        ["Pitch_64", "Velocity_80", "Duration_1.0", "Pitch_67", "Velocity_80", "Duration_1.0"],
        ["Pitch_60", "Velocity_80", "Duration_1.0", "Pitch_64", "Velocity_80", "Duration_1.0"],
    ]
    
    print("Step 1: Initial corpus (tokenized music pieces)")
    for i, piece in enumerate(corpus):
        print(f"  Piece {i+1}: {piece}")
    
    # Count bigrams
    bigram_counts = Counter()
    for piece in corpus:
        for i in range(len(piece) - 1):
            bigram = (piece[i], piece[i+1])
            bigram_counts[bigram] += 1
    
    print(f"\nStep 2: Count all bigrams (adjacent token pairs)")
    for bigram, count in bigram_counts.most_common():
        print(f"  {bigram}: {count} times")
    
    # Most frequent merge
    most_frequent = bigram_counts.most_common(1)[0]
    print(f"\nStep 3: Merge most frequent pair")
    print(f"  Merging: {most_frequent[0]} → 'Pitch_60_Velocity_80'")
    print(f"  This pattern appeared {most_frequent[1]} times")
    
    print(f"\nStep 4: Repeat process...")
    print(f"  After many iterations, we get complex patterns like:")
    print(f"    'C_major_triad' ← Pitch_60 + Velocity_80 + Duration_1.0 + Pitch_64 + ...")
    print(f"    'eighth_note_run' ← Duration_0.5 + Duration_0.5 + Duration_0.5 + ...")



In [10]:
"""Run all analyses"""
print("MUSIC BPE vs SIMPLE VOCABULARY: COMPREHENSIVE ANALYSIS")
print("=" * 80)

calculate_vocabulary_explosion()
demonstrate_sequence_length_problem()
analyze_music_patterns()
compare_efficiency()
show_real_bpe_example()

print("\n" + "=" * 80)
print("SUMMARY: WHY SIMPLE VOCABULARIES DON'T WORK")
print("=" * 80)
print("🔴 Problems with simple vocabularies:")
print("  1. Combinatorial explosion (millions of possible note combinations)")
print("  2. Very long sequences (5+ tokens per note)")
print("  3. Sparsity (most combinations are rare)")
print("  4. No pattern capture (can't learn musical idioms)")
print("  5. Computational inefficiency (O(n²) transformer complexity)")

print("\n🟢 How BPE solves these problems:")
print("  1. Learns only frequent patterns (manageable vocabulary)")
print("  2. Compresses sequences (fewer tokens per musical idea)")
print("  3. Captures musical structure (chords, scales, rhythms)")
print("  4. Better semantic embeddings (tokens represent musical concepts)")
print("  5. Faster training and inference (shorter sequences)")

print(f"\n💡 Key insight: Music isn't just sequences of attributes—")
print(f"   it's sequences of MUSICAL PATTERNS that BPE can discover!")

MUSIC BPE vs SIMPLE VOCABULARY: COMPREHENSIVE ANALYSIS
VOCABULARY SIZE EXPLOSION ANALYSIS
Basic musical attributes:
  Pitches: 88
  Velocities: 32
  Durations: 20
  Positions: 32

📋 SIMPLE VOCABULARY:
  Size (sum of attributes): 172 tokens

💥 COMBINATORIAL EXPLOSION:
  Possible single notes: 56,320 combinations
  This is just for ONE note at ONE position!
  Possible 3-note chords: 1.786438e+14 combinations
  Storage required: 665500.0 TB just for vocabulary!

SEQUENCE LENGTH PROBLEM
Example: C Major Scale (8 notes)

🔴 WITHOUT BPE (Simple vocabulary):
  Note C4: ['Bar_1', 'Position_0', 'Pitch_60', 'Velocity_80', 'Duration_1.0']
  Note D4: ['Bar_1', 'Position_2', 'Pitch_62', 'Velocity_80', 'Duration_1.0']
  Note E4: ['Bar_1', 'Position_4', 'Pitch_64', 'Velocity_80', 'Duration_1.0']
  Note F4: ['Bar_1', 'Position_6', 'Pitch_65', 'Velocity_80', 'Duration_1.0']
  Note G4: ['Bar_1', 'Position_8', 'Pitch_67', 'Velocity_80', 'Duration_1.0']
  Note A4: ['Bar_1', 'Position_10', 'Pitch_69', 'Velo

🎹 PITCH (0-127)

What it is: How high or low the note sounds
MIDI range: 0-127 (Middle C = 60)
Examples:

Low pitch (40): Deep bass note
Middle pitch (60): Middle C on piano
High pitch (96): Soprano singing high note



💥 VELOCITY (1-127)

What it is: How hard/loud the note is played
MIDI range: 1-127 (0 = note off)
Examples:

Low velocity (20): Gentle whisper, soft piano
Medium velocity (64): Normal playing
High velocity (120): Forte, powerful strike



⏱️ DURATION (in beats)

What it is: How long the note plays
Common values:

0.25 = Sixteenth note (very fast)
1.0 = Quarter note (standard beat)
4.0 = Whole note (very long)



Complete Musical Note = All Three Together
Example: Piano melody note
├── Pitch: 72 (C5 - high C)
├── Velocity: 45 (soft, gentle)
└── Duration: 2.0 (half note - sustained)
Result: A high, soft, long note