In [None]:
!pip install miditok
!pip install tokenizers
!pip install transformers

import tensorflow as tf
from tensorflow import keras
import numpy
import miditok
import tqdm
from miditok import MIDILike, MIDITokenizer, OctupleMono
import pathlib
from miditok.constants import CHORD_MAPS
from transformers import TFGPT2LMHeadModel, GPT2Config, Trainer, TrainingArguments, GenerationConfig, pipeline
from miditoolkit import MidiFile


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting miditok
  Downloading miditok-2.0.5-py3-none-any.whl (94 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.4/94.4 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting miditoolkit>=0.1.16
  Downloading miditoolkit-0.1.16-py3-none-any.whl (20 kB)
Collecting mido>=1.1.16
  Downloading mido-1.2.10-py2.py3-none-any.whl (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.1/51.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mido, miditoolkit, miditok
Successfully installed miditok-2.0.5 miditoolkit-0.1.16 mido-1.2.10
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; force_remount=True re-mounts
# even if a mount is already present at this location.
DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/gdrive


In [None]:
# Creates the tokenizer from a parameter file stored on the mounted drive
tokenizer = OctupleMono(params = "/content/gdrive/MyDrive/dummy/config.txt")

# Creates a freshly initialised (untrained) GPT-2 language model from an explicit config
config = GPT2Config(
    vocab_size=1000,
    # NOTE(review): n_positions is the maximum sequence length the model supports.
    # 4 is shorter than the 11-step prompt and the max_new_tokens=512 used further
    # down in this notebook; confirm this value against the trained checkpoint.
    n_positions=4,
    n_embd=64,
    n_layer=4,
    n_head=4,
    n_inner=128,
    resid_pdrop=.1,
    embd_pdrop=.1,
    attn_pdrop=.1,
    # The config field is `pad_token_id`; the previous `padding_token_id` keyword is
    # not a recognised field, so the padding id was never actually configured.
    pad_token_id=tokenizer[0, 'PAD_None'],
    bos_token_id=tokenizer[0, 'BOS_None'],
    eos_token_id=tokenizer[0, 'EOS_None'],
)
model = TFGPT2LMHeadModel(config)

In [None]:
# Directory on the mounted drive that holds the saved checkpoint
model_path = "/content/gdrive/MyDrive/dummy/models/OCTMONO1"

# from_pretrained is a classmethod: it builds and RETURNS a new model loaded from
# the checkpoint and does not modify the instance it is called on. The result must
# be assigned, otherwise `model` keeps its randomly initialised weights.
model = TFGPT2LMHeadModel.from_pretrained(model_path)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at /content/gdrive/MyDrive/dummy/models/OCTMONO1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


<transformers.models.gpt2.modeling_tf_gpt2.TFGPT2LMHeadModel at 0x7fb6415eb3d0>

In [None]:
# defining our optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# definining our loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# defining our metric which we want to observe
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# compiling the model
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])

In [None]:
# from pathlib import Path
# from miditok.utils import get_midi_programs
# from miditoolkit import MidiFile

# tokenizer.learn_bpe(
#     vocab_size=1000,
#     tokens_paths=list(Path("/content/gdrive/MyDrive/tokens_noBPE").glob('**/*.json')),
#     out_dir=Path('/content/gdrive/MyDrive/tokens_BPE')
# )

In [None]:
# MIDI file whose tokens are later wrapped into the generation prompt
midi_path = '/content/gdrive/MyDrive/dummy/test midi/1.mid'

# Load the file with miditoolkit, then run it through the tokenizer.
# As printed in the next cell, the result is a list holding a TokSequence.
midi = MidiFile(midi_path)
token = tokenizer(midi)  # automatically detects MidiFile, paths or tokens before converting them

In [None]:
# Show the full tokenizer output, then the integer ids of its first sequence
first_sequence = token[0]
print(token)
print(first_sequence.ids)

# Number of entries in the first sequence (displayed as the cell result)
len(first_sequence.ids)

[TokSequence(tokens=[['Pitch_78', 'Velocity_99', 'Duration_0.2.8', 'Position_0', 'Bar_0', 'Tempo_141'], ['Pitch_83', 'Velocity_99', 'Duration_0.2.8', 'Position_0', 'Bar_0', 'Tempo_141'], ['Pitch_86', 'Velocity_99', 'Duration_0.2.8', 'Position_0', 'Bar_0', 'Tempo_141'], ['Pitch_78', 'Velocity_99', 'Duration_0.2.8', 'Position_12', 'Bar_0', 'Tempo_141'], ['Pitch_82', 'Velocity_99', 'Duration_0.2.8', 'Position_12', 'Bar_0', 'Tempo_141'], ['Pitch_86', 'Velocity_99', 'Duration_0.2.8', 'Position_12', 'Bar_0', 'Tempo_141'], ['Pitch_79', 'Velocity_99', 'Duration_0.2.8', 'Position_24', 'Bar_0', 'Tempo_141'], ['Pitch_83', 'Velocity_99', 'Duration_0.2.8', 'Position_24', 'Bar_0', 'Tempo_141'], ['Pitch_86', 'Velocity_99', 'Duration_0.2.8', 'Position_24', 'Bar_0', 'Tempo_141'], ['Pitch_76', 'Velocity_99', 'Duration_0.2.8', 'Position_8', 'Bar_1', 'Tempo_141'], ['Pitch_82', 'Velocity_99', 'Duration_0.2.8', 'Position_8', 'Bar_1', 'Tempo_141']], ids=[[60, 27, 4, 3, 3, 18], [65, 27, 4, 3, 3, 18], [68, 27,

11

In [None]:
# savetok = Path('/content/gdrive/MyDrive/example midi/token2.json')

# tokenizer.save_tokens(token, savetok)

In [None]:
# import os, json

# json_token = '/content/gdrive/MyDrive/example midi/token2.json'

# i = open(json_token)

# data = json.load(i)

In [None]:
# import json
# import os

# directory = '/content/gdrive/MyDrive/example midi/tokens/'


# for fname in os.listdir(directory):                     # for each file in the directory
#     with open(os.path.join(directory, fname)) as i:     # open the file
#         data = json.load(i)   

In [None]:
# print(data)

In [None]:
# Wrap the first sequence's ids in an outer list, which adds a leading dimension of size 1
first_sequence_ids = token[0].ids
prompt = [first_sequence_ids]



In [None]:
# Display the shape of the nested prompt list (`prompt` itself is left unchanged)
tf.shape(prompt)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 1, 11,  6], dtype=int32)>

In [None]:
prompt = tf.constant(prompt)  # Convert prompt to a tensor; note it is 3-D here (shape (1, 11, 6) per the tf.shape output above), not 2-D

In [None]:
# Round-trip through a tf.Variable and NumPy to get plain nested Python lists
prompt_as_variable = tf.Variable(prompt)
prompt = prompt_as_variable.numpy().tolist()

# Display the resulting nested list
prompt

[[[60, 27, 4, 3, 3, 18],
  [65, 27, 4, 3, 3, 18],
  [68, 27, 4, 3, 3, 18],
  [60, 27, 4, 15, 3, 18],
  [64, 27, 4, 15, 3, 18],
  [68, 27, 4, 15, 3, 18],
  [61, 27, 4, 27, 3, 18],
  [65, 27, 4, 27, 3, 18],
  [68, 27, 4, 27, 3, 18],
  [58, 27, 4, 11, 4, 18],
  [64, 27, 4, 11, 4, 18]]]

In [None]:
# Sampling settings used for generation, collected in one place
sampling_options = dict(
    max_new_tokens=512,  # extends samples by 512 tokens
    num_beams=1,         # no beam search
    do_sample=True,      # but sample instead
    temperature=0.9,
    top_k=15,
    top_p=0.95,
    epsilon_cutoff=3e-4,
    eta_cutoff=1e-3,
)

generation_config = GenerationConfig(**sampling_options)

In [None]:
# A GPT-2 LM head takes token ids shaped (batch, sequence_length). Convert the prompt
# to an int32 tensor and check its rank first, so that a wrongly shaped prompt fails
# with an actionable message instead of an opaque error from inside generate().
prompt_ids = tf.convert_to_tensor(prompt, dtype=tf.int32)
if prompt_ids.shape.rank != 2:
    # Earlier in this notebook the prompt has shape (1, 11, 6): six ids per time step.
    # NOTE(review): that layout presumably needs either a tokenizer that yields one id
    # per step or a model built for several ids per step; confirm the intended design.
    raise ValueError(
        "model.generate expects prompt ids of shape (batch, sequence_length), "
        f"but got shape {tuple(prompt_ids.shape)}. Each time step must be a single "
        "token id, not a group of ids."
    )

outputs = model.generate(prompt_ids, generation_config=generation_config)

ValueError: ignored

In [None]:
import copy

# Convert the generated ids to plain nested Python lists. tf.convert_to_tensor accepts
# both a tensor and an already-converted list, so re-running this cell is safe.
outputs = tf.convert_to_tensor(outputs).numpy().tolist()
print('old token:', token[0].ids)

# Work on a deep copy: `new_data = token` would only alias the same objects, so
# assigning the generated ids would also overwrite the original prompt ids in `token`
# (and a re-run would then print the generated ids as the "old token").
new_data = copy.deepcopy(token)
new_data[0].ids = outputs

print('new token:', new_data[0].ids)

old token: [328, 15, 205, 27, 205, 51, 343, 39, 201, 42, 427, 430, 54, 204, 231, 886, 130, 714, 991, 614, 11, 205, 23, 615, 35, 202, 400, 53, 700, 42, 204, 231, 123, 683, 127, 738, 99, 111, 8, 205, 20, 615, 294, 32, 205, 35, 359, 39, 751, 700, 637, 224, 56, 351, 54, 344, 475, 96, 108, 10, 205, 22, 205, 53, 344, 142, 142, 216, 120, 974, 127, 130, 217, 141, 605, 226, 98, 110, 139, 15, 205, 27, 615, 39, 201, 42, 427, 430, 54, 204, 231, 886, 130, 714, 991, 142, 11, 205, 23, 615, 35, 202, 400, 56, 700, 42, 204, 231, 123, 753, 127, 738, 99, 111, 8, 205, 20, 615, 294, 32, 205, 35, 359, 39, 751, 700, 58, 207, 232, 96, 108, 10, 205, 22, 615, 120, 974, 127, 738, 146, 232, 98, 110, 15, 205, 27, 205, 51, 343, 39, 201, 42, 427, 430, 54, 204, 231, 886, 130, 714, 991, 614, 11, 205, 23, 615, 35, 202, 400, 53, 700, 42, 204, 231, 123, 683, 127, 738, 99, 111, 8, 205, 20, 615, 294, 32, 205, 35, 359, 39, 751, 700, 637, 224, 56, 351, 54, 344, 475, 96, 108, 10, 205, 22, 205, 53, 344, 142, 142, 216, 120, 974,

I am not sure what the issue is, as the data appears to be of the correct type.



I tried changing the architecture, as suggested; however, I am now getting this error.


In [None]:
# get_midi_programs is not imported anywhere else in this notebook (its only import
# is commented out in an earlier cell), so bring it into scope here.
from miditok.utils import get_midi_programs

# Read the programs of the source MIDI and pass them to the tokenizer when
# converting the generated ids back into a MIDI object
programs = get_midi_programs(midi)
generated_midi = tokenizer(new_data[0].ids, programs=programs)

# Write the generated MIDI to the mounted drive
generated_midi.dump('/content/gdrive/MyDrive/example midi/file.mid')