MIDI TOKENIZATION FOR MACHINE LEARNING

In [1]:
!pip install miditok
!pip install tokenizers



In [2]:
from miditok import REMI
from miditok.utils import get_midi_programs
from miditoolkit import MidiFile
from pathlib import Path
from miditok.constants import CHORD_MAPS


In [3]:
from google.colab import drive

# Mount Google Drive, where the dataset is located, at /content/gdrive.
# force_remount=True presumably replaces an already existing mount - TODO confirm.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


Initialize the tokenizer:

In [3]:
# --- Tokenizer settings ---------------------------------------------------
pitch_range = range(21, 109)  # 88 pitch values: 21 through 108
beat_res = {
    (0, 4): 8,
    (4, 12): 4,
}
nb_velocities = 32

# Optional token families and their settings, handed to REMI as one dict.
additional_tokens = dict(
    Chord=True,
    Rest=True,
    Tempo=True,
    rest_range=(2, 8),  # (half, 8 beats)
    nb_tempos=32,  # number of tempo bins
    TimeSignature=False,
    tempo_range=(40, 250),  # (min, max)
    Program=False,
    chord_maps=CHORD_MAPS,
    chord_tokens_with_root_note=True,
    chord_unknown=False,
)
special_tokens = ["PAD", "BOS", "EOS"]

# --- Build the REMI tokenizer from the settings above ---------------------
tokenizer = REMI(
    pitch_range,
    beat_res,
    nb_velocities,
    additional_tokens,
    special_tokens=special_tokens,
)

Paths to the dataset

In [4]:
# Recursively gather every *.mid file below the dataset folder on the mounted
# drive, then report how many were found.
midi_paths = [*Path("/content/gdrive/MyDrive/ala/MIDI/").glob('**/*.mid')]
print(len(midi_paths))

2507


Validating the MIDI files: discard data that is not usable for machine learning


In [5]:
def midi_valid(midi) -> bool:
    """Decide whether a MIDI object is usable for the dataset.

    Args:
        midi: object exposing ``time_signature_changes`` (an iterable of items
            with a ``numerator`` attribute), ``max_tick`` and
            ``ticks_per_beat``.

    Returns:
        ``False`` if any time signature change has a numerator other than 4,
        or if the file spans fewer than 10 beats worth of ticks; ``True``
        otherwise.
    """
    # Reject anything that is not 4/* (4 beats per bar) at any point.
    for signature in midi.time_signature_changes:
        if signature.numerator != 4:
            return False

    # Reject files shorter than 10 beats.
    shortest_allowed = 10 * midi.ticks_per_beat
    is_too_short = midi.max_tick < shortest_allowed
    return not is_too_short

Converting MIDI to .json tokens

In [6]:
# Data augmentation offsets: augment over 2 pitch octaves,
# 2 velocity values and 1 duration value.
data_augmentation_offsets = [2, 2, 1]
# Tokenize the collected MIDI files into the tokens_noBPE folder on the drive.
# midi_valid is passed as the validation callback and the offsets above as the
# augmentation setting (both positionally - order matters for this API).
tokenizer.tokenize_midi_dataset(
    midi_paths,
    Path("/content/gdrive/MyDrive/ala/REMI/tokens_noBPE"),
    midi_valid,
    data_augmentation_offsets
)

Tokenizing MIDIs (Desktop/tokens_noBPE): 100%|██████████| 2507/2507 [00:42<00:00, 59.68it/s] 
Performing data augmentation: 100%|██████████| 2488/2488 [00:46<00:00, 53.28it/s]


Learning the vocabulary with Byte Pair Encoding (BPE)

In [8]:
# Learn a BPE vocabulary of 1000 entries from every .json token file found
# (recursively) under tokens_noBPE.
# NOTE(review): out_dir points at tokens_BPE; whether learn_bpe itself writes
# anything there depends on the installed miditok version - confirm.
tokenizer.learn_bpe(
    vocab_size=1000,
    tokens_paths=list(Path("/content/gdrive/MyDrive/ala/REMI/tokens_noBPE").glob('**/*.json')),
    out_dir=Path("/content/gdrive/MyDrive/ala/REMI/tokens_BPE"),
)

Loading token files: 100%|██████████| 24385/24385 [00:05<00:00, 4100.43it/s]


Converting the tokenized music into BPE tokens


In [9]:
# Apply the learned BPE to the token files: first argument is the folder with
# the non-BPE tokens, second is the folder for the BPE-encoded result.
tokenizer.apply_bpe_to_dataset(
    Path("/content/gdrive/MyDrive/ala/REMI/tokens_noBPE"),
    Path("/content/gdrive/MyDrive/ala/REMI/tokens_BPE")
)



Applying BPE to dataset: 100%|██████████| 24385/24385 [01:11<00:00, 341.49it/s]


In [13]:
# Persist the tokenizer's parameters to params.json on the drive so the same
# tokenizer can be recreated later (presumably includes the learned BPE
# vocabulary - TODO confirm for the installed miditok version).
tokenizer.save_params("/content/gdrive/MyDrive/ala/REMI/params.json")