In [26]:
from typing import List, Tuple, Dict, Callable, Any, Union
from functools import partial
from pathlib import Path
from copy import deepcopy
import json

from miditok import MIDILike, MIDITokenizer
from miditoolkit import MidiFile
from tqdm import tqdm
from transformers.data.data_collator import DataCollatorMixin
import numpy as np
import tensorflow as tf
from tensorflow import keras

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation, BatchNormalization
from keras.optimizers import RMSprop, Adam
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint


In [39]:
# Directory holding the pre-tokenized (BPE) JSON files, searched recursively.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
tokens_path = Path("C:/Users/simas/OneDrive/Desktop/alawais midi/tokens_BPE")
tokens_paths = [*tokens_path.rglob("*.json")]

# Tokenizer built with its default configuration.
tokenizer = MIDILike()
vocab_size = 500  # presumably the BPE vocabulary size — TODO confirm

In [3]:
class MIDIDataset(tf.keras.utils.Sequence):
    r"""Dataset of token-id windows for generator training.

    Every input file is reduced to the token ids of its first track, which are then
    cut into consecutive, non-overlapping windows of at most ``max_seq_len`` tokens.
    A new window is only started while strictly more than ``min_seq_len`` tokens
    remain, so the short tail of a file is dropped.

    :param files_paths: list of paths to files to load. Files whose extension is
        ``.mid`` / ``.midi`` (any case) are tokenized with ``tokenizer``; every other
        file is read as a JSON token file whose ``"ids"`` entry is indexed by track.
    :param min_seq_len: a window is only created when more than this many tokens remain.
    :param max_seq_len: maximum number of tokens per sample, must be >= 1.
    :param tokenizer: tokenizer object, to use to load MIDIs instead of tokens. (default: None)
    :raises ValueError: if ``max_seq_len`` is smaller than 1, or if a MIDI file is
        given while ``tokenizer`` is None.
    """

    # `Path.suffix` includes the leading dot, so the extensions are stored with it.
    _MIDI_SUFFIXES = (".mid", ".midi")

    def __init__(self, files_paths: List[Path], min_seq_len: int, max_seq_len: int, tokenizer: MIDITokenizer = None):
        # A non-positive window size would yield empty windows and never advance.
        if max_seq_len < 1:
            raise ValueError(f"max_seq_len must be >= 1, got {max_seq_len}")
        super().__init__()

        samples = []

        # Guard the progress-bar label: an empty list has no first element to describe.
        source_dir = files_paths[0].parent if files_paths else "no files"
        for file_path in tqdm(files_paths, desc=f'Loading data: {source_dir}'):
            tokens = self._load_tokens(file_path, tokenizer)
            # Window starts are multiples of max_seq_len; the strict bound skips a
            # tail of `min_seq_len` tokens or fewer (last sample is too short).
            first_invalid_start = len(tokens) - max(min_seq_len, 0)
            for start in range(0, first_invalid_start, max_seq_len):
                samples.append(np.array(tokens[start:start + max_seq_len]))

        self.samples = samples

    @staticmethod
    def _load_tokens(file_path: Path, tokenizer: MIDITokenizer = None):
        """Return the first track's token ids of ``file_path`` (MIDI or JSON token file)."""
        if file_path.suffix.lower() in MIDIDataset._MIDI_SUFFIXES:
            if tokenizer is None:
                raise ValueError(f"A tokenizer is required to load the MIDI file {file_path}")
            midi = MidiFile(file_path)
            del midi.instruments[1:]  # removes all tracks except first one
            return tokenizer.midi_to_tokens(midi)[0].ids
        with open(file_path) as json_file:
            return json.load(json_file)['ids'][0]  # first track

    def __getitem__(self, idx) -> Dict[str, np.ndarray]:
        # Inputs and labels are the same array object for every sample.
        return {"input_ids": self.samples[idx], "labels": self.samples[idx]}

    def __len__(self) -> int: return len(self.samples)

    def __repr__(self): return self.__str__()

    def __str__(self) -> str: return 'No data loaded' if len(self) == 0 else f'{len(self.samples)} samples'


def _pad_batch(examples: List[Dict[str, np.ndarray]], pad_token: int) -> np.ndarray:
    """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""

    length_of_first = examples[0]["input_ids"].shape[0]

    # Check if padding is necessary.
    are_tensors_same_length = all(x["input_ids"].shape[0] == length_of_first for x in examples)
    if are_tensors_same_length:
        return np.stack([e["input_ids"] for e in examples], axis=0)

    # Creating the full tensor and filling it with our data.
    return tf.keras.preprocessing.sequence.pad_sequences([e["input_ids"] for e in examples], padding='post', value=pad_token)

class DataCollatorGen:
    def __init__(self, pad_token: int):
        """Collator that simply pad the input sequences.
        Input_ids will be padded with the pad token given, while labels will be
        padded with -100.

        :param pad_token: pas token
        """
        self.pad_token = pad_token

    def __call__(self, batch: List[Dict[str, Any]]) -> Dict[str, np.ndarray]:
        x, y = _pad_batch(batch, self.pad_token), _pad_batch(batch, -100)
        return {"input_ids": x, "labels": y}


In [4]:
# Build the samples from the JSON token files found above; no tokenizer is
# passed because the inputs are already tokenized.
dataset = MIDIDataset(files_paths=tokens_paths, min_seq_len=256, max_seq_len=512)

Loading data: C:\Users\simas\OneDrive\Desktop\alawais midi\tokens_BPE: 100%|██████████| 198390/198390 [02:56<00:00, 1121.87it/s]


In [13]:
# Split the samples into train (first 80 %) and validation (remaining 20 %).
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Every element is a dict of two variable-length 1-D int32 sequences.
# `output_signature` replaces the deprecated `output_types` argument.
sample_signature = {
    "input_ids": tf.TensorSpec(shape=(None,), dtype=tf.int32),
    "labels": tf.TensorSpec(shape=(None,), dtype=tf.int32),
}

# One generator-backed dataset feeds both splits: `take` keeps the first
# `train_size` samples and `skip` yields everything after them.
full_dataset = tf.data.Dataset.from_generator(lambda: dataset, output_signature=sample_signature)
train_dataset = full_dataset.take(train_size)
val_dataset = full_dataset.skip(train_size)

In [14]:
# Report how many samples ended up on each side of the train/validation split.
print(f"Train dataset size: {train_size}, validation dataset size: {val_size}")

Train dataset size: 64844, validation dataset size: 16211


In [50]:
# Sanity check: print the shape of the first 5 training samples
# (sequences are unpadded here, so their lengths differ).
for x in train_dataset.take(5):
    print(x["input_ids"].shape)



(338,)
(363,)
(369,)
(362,)
(388,)
