In [1]:
import os
import numpy as np
import pandas as pd
import miditok
from tqdm import tqdm

import torch

Initialize tokenizer

In [None]:
# Keyword arguments forwarded unchanged to miditok.TokenizerConfig.
# The boolean flags switch on the tokenizer's optional tempo, program and
# time-signature features (see the miditok docs for their exact semantics).
TOKENIZER_PARAMS = dict(
    special_tokens=["PAD", "BOS", "EOS", "MASK"],
    use_tempos=True,
    use_programs=True,
    one_token_stream_for_programs=True,
    use_time_signatures=True,
)

# Build the configuration first, then hand it to the REMI tokenizer.
tokenizer_config = miditok.TokenizerConfig(**TOKENIZER_PARAMS)
tokenizer = miditok.REMI(tokenizer_config)

Load MIDI file paths

In [None]:
MIDI_DIR = "giant_midi_piano"

# os.listdir returns entries in arbitrary order and lists every entry in the
# folder, so keep only MIDI files and sort them: this makes the path list (and
# any numbering derived from it) reproducible across runs and machines.
midi_paths = sorted(
    os.path.join(MIDI_DIR, midi_file)
    for midi_file in os.listdir(MIDI_DIR)
    if midi_file.lower().endswith((".mid", ".midi"))
)
files_len = len(midi_paths)
print(f"Midi dataset len: {files_len}")

In [4]:
# Optional: rename the MIDI files to track_<n>.mid, as the tokenizer sometimes does not work with unusual filenames.
# This renames files on disk, so keep a backup copy of the dataset before executing.
#
# Safety properties of this cell:
#   * files already named track_<n>.mid are left untouched, and a name that already exists in the
#     folder is never used as a rename target, so re-running the cell cannot overwrite one track
#     with another (os.rename replaces an existing destination on POSIX systems);
#   * midi_paths is rebound to the new locations, so later cells do not work with stale paths.
taken_names = set(os.listdir("giant_midi_piano"))
renamed_paths = []
next_track_idx = 0
for midi_path in midi_paths:
    current_name = os.path.basename(midi_path)
    index_part = current_name[len("track_"):-len(".mid")]
    already_renamed = (
        current_name.startswith("track_")
        and current_name.endswith(".mid")
        and index_part.isdigit()
    )
    if already_renamed:
        renamed_paths.append(midi_path)
        continue

    # Advance to the first index whose file name is not present in the folder.
    while f"track_{next_track_idx}.mid" in taken_names:
        next_track_idx += 1
    new_name = f"track_{next_track_idx}.mid"
    new_path = os.path.join("giant_midi_piano", new_name)

    os.rename(midi_path, new_path)
    taken_names.add(new_name)
    renamed_paths.append(new_path)

midi_paths = renamed_paths

Split midi tracks into chunks of the same length, optionally with overlap

In [10]:
def split_tokens(input_list, n, overlap=0):
    """Split a sequence into windows of at most ``n`` items.

    Consecutive windows start ``n - overlap`` items apart, so each window
    repeats the last ``overlap`` items of the previous one.

    Args:
        input_list: Sliceable sequence (e.g. a list of token ids).
        n: Window length; must be positive.
        overlap: Number of items shared by consecutive windows; must satisfy
            ``0 <= overlap < n``.

    Returns:
        A list of windows. If ``input_list`` is shorter than ``n`` it is
        returned as the single element of the list (the same object, not a
        copy). Otherwise the trailing windows may be shorter than ``n``;
        callers that need fixed-size windows must filter them out.

    Raises:
        ValueError: If ``n`` is not positive or ``overlap`` is outside
            ``[0, n)``. Without this check a step of zero fails with an
            unrelated ``range()`` error, a negative step silently yields no
            windows, and a negative overlap silently skips items.
    """
    if n <= 0:
        raise ValueError(f"n must be positive, got {n}")
    if not 0 <= overlap < n:
        raise ValueError(f"overlap must satisfy 0 <= overlap < n, got overlap={overlap}, n={n}")

    if len(input_list) < n:
        return [input_list]

    step = n - overlap
    return [input_list[start:start + n] for start in range(0, len(input_list), step)]

In [11]:
# Base chunk length in tokens. The chunking loop requests windows of
# CHUNK_MAX_LEN + 1 tokens from split_tokens and keeps only full-length ones.
CHUNK_MAX_LEN = 512

In [None]:
# Ids prepended/appended to every track before chunking.
# NOTE(review): presumably these are the ids the tokenizer assigns to the "BOS" and "EOS"
# special tokens — confirm against the tokenizer vocabulary.
BOS_TOKEN_ID = 1
EOS_TOKEN_ID = 2
# Number of tokens shared by consecutive chunks of the same track.
CHUNK_OVERLAP = 32
# Every stored chunk holds CHUNK_MAX_LEN + 1 tokens.
chunk_len = CHUNK_MAX_LEN + 1

midi_tokens_dataset_list = []
df_chunks_list = []
chunk_idx = 0  # running index over all kept chunks, across files
for midi_path in tqdm(midi_paths):
    # splitext strips only the final extension; str.replace(".mid", "") would also
    # delete ".mid" in the middle of a name and leave a stray "i" for ".midi" files.
    filename = os.path.splitext(os.path.basename(midi_path))[0]
    try:
        token_ids = tokenizer.encode(midi_path).ids
    except Exception as exc:
        # Catch Exception rather than everything so that interrupting the kernel
        # (KeyboardInterrupt) still stops this long-running loop; report the cause.
        print("Error tokenizing file", filename, "-", repr(exc))
        continue

    token_ids = [BOS_TOKEN_ID, *token_ids, EOS_TOKEN_ID]
    chunks = split_tokens(token_ids, chunk_len, overlap=CHUNK_OVERLAP)
    for i, chunk in enumerate(chunks):
        # For now, no padding is applied, so incomplete chunks (the tail of a track,
        # or a whole track shorter than chunk_len) are discarded.
        if len(chunk) < chunk_len:
            continue

        df_chunks_list.append({
            "filename": filename,
            "chunk_idx": chunk_idx,      # position of the chunk in the dataset array
            "chunk_sentence_idx": i,     # position of the chunk within its track
        })
        midi_tokens_dataset_list.append(np.array(chunk))
        chunk_idx += 1

midi_tokens_dataset = np.array(midi_tokens_dataset_list)
print("midi_tokens_dataset shape:", midi_tokens_dataset.shape)
midi_df = pd.DataFrame(df_chunks_list, columns=["filename", "chunk_idx", "chunk_sentence_idx"])
print(f"Total chunks: {len(midi_df)}")
midi_df.head()

Save the dataset as a NumPy array together with its descriptive CSV

In [13]:
# Derive the file names from CHUNK_MAX_LEN so that changing the chunk length does
# not silently overwrite (or mislabel) a dataset built with a different length.
# With CHUNK_MAX_LEN = 512 the names are the same as before.
np.save(f"midi_tokens_dataset_{CHUNK_MAX_LEN}.npy", midi_tokens_dataset)
midi_df.to_csv(f"midi_dataset_{CHUNK_MAX_LEN}.csv", index=False)

Save unique tokens

In [None]:
# Convert the chunk array to an int64 tensor (the name is rebound to the tensor).
midi_tokens_dataset = torch.tensor(midi_tokens_dataset, dtype=torch.int64)

# Distinct token ids that actually occur in the dataset, as a flat tensor.
unique_tokens = midi_tokens_dataset.unique()
vocab_size = unique_tokens.numel()

# Persist the ids and report how many there are.
np.save("unique_tokens.npy", unique_tokens.numpy())
print("vocab_size", vocab_size)
unique_tokens