In [1]:
import collections
import datetime
import os
import numpy as np
import pathlib
import pretty_midi
import tensorflow as tf
import json

In [2]:
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

# Sampling rate for audio playback
# _SAMPLING_RATE = 16000

In [3]:
dir_path = os.path.abspath("./Jsb16thSeparated.json")
print(dir_path)

file = open(dir_path)

# returns JSON object as 
# a dictionary
data = json.load(file)

/Users/tonyfernandes/Desktop/Projects/ML/BachHarmonization/Jsb16thSeparated.json


In [4]:
rawTEST = data["test"]
rawTRAIN = data["train"]
rawCV = data["valid"]
def data_to_numpy(list):
    gigalist = np.array(list[0])
    for i in range(1, len(list)):
        gigalist = np.append(gigalist, np.array(list[i]), axis=0)
    return gigalist

TRAIN = data_to_numpy(rawTRAIN)
TEST = data_to_numpy(rawTEST)
CV = data_to_numpy(rawCV)

print(TRAIN.shape)
print(CV.shape)
print(TEST.shape)


def min_max_find(TRAIN):
    min = TRAIN[0,0]
    max = TRAIN[0,0]
    for time in range(TRAIN.shape[0]):
        for voicing in range(4):
            if(TRAIN[time, voicing] < min):
                min = TRAIN[time, voicing]
            if(TRAIN[time, voicing] > max):
                max = TRAIN[time, voicing]
    return [min, max]

print(min_max_find(TRAIN))
print(min_max_find(np.add(TRAIN, 1)))
#min_max_find(TEST)
#min_max_find(CV)

TRAIN = np.add(TRAIN, 1)
TEST = np.add(TEST, 1)
CV = np.add(CV, 1)


(55228, 4)
(18408, 4)
(18900, 4)
[-1, 81]
[0, 82]


In [5]:
seq_length = 64

ids_dataset = tf.data.Dataset.from_tensor_slices(TRAIN)
sequences = ids_dataset.batch(seq_length, drop_remainder=True)

def split_input_target(sequence):
    input = sequence[:,0]
    target = sequence[:, 1:]
    
    return input, target

dataset = sequences.map(split_input_target)

In [6]:
#batch the dataset for input
batch_size = 60
buffer_size = TRAIN.shape[0]//10  # the number of items in the dataset
batched_ds = (dataset
            .shuffle(buffer_size)
            .batch(batch_size, drop_remainder=True)
            .cache()
            .prefetch(tf.data.experimental.AUTOTUNE))

In [7]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense, Input, TimeDistributed
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

input_shape = (seq_length,)
num_notes = 83
embed_dim = 32
lstm_lyrs = 128

input_ = Input(shape=input_shape)
x = Embedding(num_notes, embed_dim, input_length=seq_length)(input_)
x2 = LSTM(lstm_lyrs, return_sequences=True, activation='tanh')(x)

outputs = []
for _ in range(3):  # Alto, Tenor, Bass
    output = TimeDistributed(Dense(num_notes, activation='softmax'))(x2)
    outputs.append(output)

# Stack the outputs into a single tensor
output_ = tf.stack(outputs, axis=2)

model = Model(inputs=input_, outputs=output_)

def custom_loss(y_true, y_pred):
    # y_true shape: (batch_size, seq_length, 3)
    # y_pred shape: (batch_size, seq_length, 3, num_notes)
    return SparseCategoricalCrossentropy()(y_true, y_pred)

# In the create_model function:
model.compile(
    optimizer='adam',
    loss=custom_loss
)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 64)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 64, 32)       2656        ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    (None, 64, 128)      82432       ['embedding[0][0]']              
                                                                                                  
 time_distributed (TimeDistribu  (None, 64, 83)      10707       ['lstm[0][0]']                   
 ted)                                                                                         

2024-08-18 13:01:59.528181: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-08-18 13:01:59.528873: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-08-18 13:01:59.529228: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [8]:
 history = model.fit(batched_ds, epochs=100)

Epoch 1/100


2024-08-18 13:02:03.034663: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [55228,4]
	 [[{{node Placeholder/_0}}]]
2024-08-18 13:02:03.035027: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [55228,4]
	 [[{{node Placeholder/_0}}]]
2024-08-18 13:02:03.135590: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_d

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [9]:
def manual_sequencing(raw_data):
    num_sequences = (raw_data.shape[0] - seq_length) // seq_length
    
    inputs = []
    targets = []
    
    for i in range(num_sequences):
        start = i * seq_length
        end = start + seq_length
        
        inputs.append(raw_data[start:end, 0])  # Soprano voice
        targets.append(raw_data[start:end, 1:])  # Alto, Tenor, Bass voices
        
    return np.array(inputs), np.array(targets)

In [10]:
CV_in, CV_target = manual_sequencing(CV)
CV_logits = model.predict(CV_in)

CV_pred = np.argmax(CV_logits, axis=-1)

CV_dataset = tf.data.Dataset.from_tensor_slices((CV_in, CV_target))
CV_dataset = CV_dataset.batch(batch_size)  

loss = model.evaluate(CV_dataset, verbose=0)
print(f'CV loss: {loss}')

# Calculate accuracy for each voice
accuracies = []
for voice in range(3):  # Alto, Tenor, Bass
    correct = np.sum(CV_pred[:, :, voice] == CV_target[:, :, voice])
    total = np.prod(CV_target[:, :, voice].shape)
    accuracy = correct / total
    accuracies.append(accuracy)

print(f'Percentage of correct notes for alto,tenor,bass in CV: {accuracies}')



2024-08-18 13:03:13.865149: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-08-18 13:03:13.865614: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-08-18 13:03:13.866086: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

CV loss: 2.029677629470825
Percentage of correct notes for alto,tenor,bass in CV: [0.3462631118881119, 0.3541848776223776, 0.28114073426573427]


2024-08-18 13:03:14.081978: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-08-18 13:03:14.082380: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-08-18 13:03:14.082776: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [11]:
#Our data being divided by time sequences works well for processing and neural networks but 
#to play these actual notes back we need to convert each repeated number in each voicing into 
#a note with (note pitch, note duration)
#The output of this function should be a list of the 4 voicings
#Each voicing is a list of sequences
#Each sequence is a list of notes
#Each note is a tuple: (note pitch, note duration)
#So the shape of the output is [4, num_sequences, num_notes (could be different for each sequence)] and the data type of each
#note is a tuple of size 2
def playback(input_soprano, ATB_predictions):
    
    #convert each sequence of 64 16th notes to the actual note/silence played for a duration
    def to_real_notes(voicing):
        num_sequences = voicing.shape[0]
        res = [[] for _ in range(num_sequences)]
        for sequence_index in range(num_sequences):
            
            seqs = voicing[sequence_index]
            curDur = 0
            curNote = seqs[0]
            for i in range(seqs.shape[0]):
                if(seqs[i] == curNote):
                    curDur += 1
                else:
                    #We subtract 1 from current note because we added 1 to all notes while preprocessing input
                    this_note = (curNote-1, curDur)
                    res[sequence_index].append(this_note)
                    curNote = seqs[i]
                    curDur = 1
                #if last note in sequence, we must save it regardless
                if(i == seqs.shape[0] - 1):
                    this_note = (curNote-1, curDur)
                    res[sequence_index].append(this_note)
            # print(f'seq_index: {sequence_index}')
            # print(seqs)
            # print(res[sequence_index])
            # print("-"*50)
        #Returns a list of sequences. Each sequence contains some number of notes. 
        #Each note is represented by (note pitch, note duration) where pitch is the MIDI number and duration the number of 16th notes
        return res
    
    soprano_notes = to_real_notes(input_soprano)
    alto_notes = to_real_notes(ATB_predictions[:, :, 0])
    tenor_notes = to_real_notes(ATB_predictions[:, :, 1])
    bass_notes = to_real_notes(ATB_predictions[:, :, 2])

    return [soprano_notes, alto_notes, tenor_notes, bass_notes]

In [12]:
def write_MIDI_File(NOTES, BPM, seq_indices, out_file_name):
    midi_data = pretty_midi.PrettyMIDI()
    piano_program = pretty_midi.instrument_name_to_program('Acoustic Grand Piano')
    sop_piano = pretty_midi.Instrument(program=piano_program)
    alto_piano = pretty_midi.Instrument(program=piano_program)
    ten_piano = pretty_midi.Instrument(program=piano_program)
    bass_piano = pretty_midi.Instrument(program=piano_program)

    pianos = [sop_piano, alto_piano, ten_piano, bass_piano]
    
    #SPS = sixteenth notes per second
    SPS = (BPM/60)*4

    for voice_index in range(4):
        time = 0

        for j in range(len(seq_indices)):
            cur_notes = NOTES[voice_index][seq_indices[j]]
        
            for i in range(len(cur_notes)):
                note_pitch = cur_notes[i][0]
                note_duration = cur_notes[i][1]/SPS
                if(note_pitch != -1):
                    note = pretty_midi.Note(velocity=100, pitch=note_pitch, start=time, end=time+note_duration)
                    pianos[voice_index].notes.append(note)
                time += note_duration
    
    midi_data.instruments.append(sop_piano)
    midi_data.instruments.append(alto_piano)
    midi_data.instruments.append(ten_piano)
    midi_data.instruments.append(bass_piano)
    midi_data.write(out_file_name)

In [13]:
CV_PRED_NOTES = playback(CV_in, CV_pred)
CV_ACTUAL_NOTES = playback(CV_in, CV_target)

write_MIDI_File(CV_PRED_NOTES, 80, [0, 1, 2], 'CVPred012.mid')
write_MIDI_File(CV_ACTUAL_NOTES, 80, [0, 1, 2], 'CVActual012.mid')

In [14]:
TEST_in, TEST_targets = manual_sequencing(TEST)
TEST_logits = model.predict(TEST_in)



In [15]:
def temperature_sampling(gye, temperatures):
    #don't modify the original array
    data_logits = np.copy(gye)
    for voice in range(3):
        data_logits[:,:,voice,:] = np.log(data_logits[:,:,voice,:]) / temperatures[voice]
    
    data_logits -= np.max(data_logits, axis=-1, keepdims=True)
    data_logits = np.exp(data_logits)
    data_logits /= np.sum(data_logits, axis=-1, keepdims=True)
    
    sampling = np.zeros((data_logits.shape[0], data_logits.shape[1], 3), dtype=int)
    
    for seq in range(data_logits.shape[0]):
        for step in range(data_logits.shape[1]):
            for voice in range(data_logits.shape[2]):
                sampling[seq, step, voice] = np.random.choice(data_logits.shape[3], p=data_logits[seq, step, voice])
    return sampling

temps = [0.001,0.001,0.002]

TEST_pred = temperature_sampling(TEST_logits, temps)
print("pred             actual")
for i in range(15):
    print(TEST_pred[0,i], "     ", TEST_targets[0,i])
#print(TEST_pred.shape)

pred             actual
[63 58 51]       [61 58 54]
[63 58 51]       [61 58 54]
[63 58 51]       [61 58 54]
[63 58 51]       [61 58 54]
[66 58 54]       [61 56 53]
[66 58 54]       [61 56 53]
[66 63 47]       [61 56 53]
[66 63 47]       [61 56 53]
[66 61 54]       [61 54 54]
[66 61 54]       [61 54 54]
[66 61 49]       [61 56 53]
[65 61 49]       [61 56 53]
[61 58 42]       [63 58 51]
[61 58 42]       [63 58 51]
[61 58 42]       [65 59 51]


In [16]:
TEST_PRED_NOTES = playback(TEST_in, TEST_pred)
TEST_ACTUAL_NOTES = playback(TEST_in, TEST_targets)

sequences = [[0,1,2],[3,4,5,6,7]]
BPM = 80

for i in range(len(sequences)):
    write_MIDI_File(TEST_PRED_NOTES, BPM, sequences[i], 'TESTPred' + str(i) + '.mid')
    write_MIDI_File(TEST_ACTUAL_NOTES, BPM, sequences[i], 'TESTActual' + str(i) + '.mid')