MIDI TOKENIZATION FOR MACHINE LEARNING

In [None]:
!pip install miditok
!pip install tokenizers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.2


In [1]:
from miditok import REMI, MIDILike
from miditok.utils import get_midi_programs
from miditoolkit import MidiFile
from pathlib import Path
from miditok.constants import CHORD_MAPS


In [2]:
from google.colab import drive

drive.mount('/content/gdrive', force_remount=True) #mounts the google drive where the dataset is located

ModuleNotFoundError: No module named 'google.colab'

Initiate the tokenizer:

In [3]:
# Our parameters
pitch_range = range(21, 109)
beat_res = {(0, 4): 8, (4, 12): 4}
nb_velocities = 32
additional_tokens = {'Chord': True, 'Rest': True, 'Tempo': True,
                     'rest_range': (2, 8),  # (half, 8 beats)
                     'nb_tempos': 32,  # nb of tempo bins
                     'tempo_range': (40, 250),  # (min, max)
                     'Program': False,
                     "chord_maps": CHORD_MAPS,
                     "chord_tokens_with_root_note": True,
                     "chord_unknown": False}
special_tokens = ["PAD", "BOS", "EOS"]

In [14]:
#creates tokenizer
tokenizer = MIDILike(pitch_range, beat_res, nb_velocities, additional_tokens, special_tokens=special_tokens)

Paths to the dataset

In [15]:
midi_paths = list(Path('C:/Users/simas/Downloads/dummy dataset').glob('**/*.mid'))

Checking length of the dataset

In [16]:
print(len(midi_paths))

12


A validation of MIDI files - discarding data that is not usable for machine learning


In [17]:

def midi_valid(midi) -> bool:
    if any(ts.numerator != 4 for ts in midi.time_signature_changes):
        return False  # time signature different from 4/*, 4 beats per bar
    if midi.max_tick < 10 * midi.ticks_per_beat:
        return False  # this MIDI is too short
    return True

Converting MIDI to .json tokens

In [18]:
data_augmentation_offsets = [2, 2, 1]   # will perform data augmentation on 2 pitch octaves,
tokenizer.tokenize_midi_dataset(        # 2 velocity and 1 duration values
    midi_paths,
    Path('C:/Users/simas/Downloads/dummy dataset/noBPE'),
    midi_valid
)

Tokenizing MIDIs (dummy dataset/noBPE): 100%|██████████| 12/12 [00:00<00:00, 214.86it/s]


Learning the vocabulary with Byte-Paired Encoding

In [19]:
tokenizer.learn_bpe(
    vocab_size=50,
    tokens_paths=list(Path('C:/Users/simas/Downloads/dummy dataset/noBPE').glob('**/*.json')),
    out_dir=Path('C:/Users/simas/Downloads/dummy dataset/BPE')
)

Loading token files: 100%|██████████| 12/12 [00:00<00:00, 12032.43it/s]


Converting the tokenized musics into tokens with BPE


In [20]:
tokenizer.apply_bpe_to_dataset(
    Path('C:/Users/simas/Downloads/dummy dataset/noBPE'),
    Path('C:/Users/simas/Downloads/dummy dataset/BPE')
)



Applying BPE to dataset: 100%|██████████| 12/12 [00:00<00:00, 859.37it/s]
