In [11]:
import numpy as np
import tensorflow as tf
import pandas as pd
import collections
import fluidsynth
import glob
import pretty_midi
from IPython import display
from typing import Dict, List, Optional, Sequence, Tuple

In [12]:
sampling_rate = 44100

def display_audio(pm, seconds=30):
	waveform = pm.fluidsynth(fs=sampling_rate)
  # Take a sample of the generated waveform to mitigate kernel resets
	waveform_short = waveform[:seconds*sampling_rate]
	return display.Audio(waveform_short, rate=sampling_rate)

pm = pretty_midi.PrettyMIDI()
# Create an instrument instance and add it to the PrettyMIDI object
instrument = pretty_midi.Instrument(program=0, is_drum=False, name='acoustic grand piano')
pm.instruments.append(instrument)
print(pm.instruments)
instrument = pm.instruments[0]

[Instrument(program=0, is_drum=False, name="acoustic grand piano")]


In [13]:
def midi_to_notes(midi_file):
	pm = pretty_midi.PrettyMIDI(midi_file)
	instrument = pm.instruments[0]
	notes = collections.defaultdict(list)
	sorted_notes = sorted(instrument.notes , key=lambda note:note.start)
	prev_start = sorted_notes[0].start

	for note in sorted_notes:
		start = note.start
		end = note.end
		notes["pitch"].append(note.pitch)
		notes["start"].append(start)
		notes["end"].append(end)
		notes["step"].append(start - prev_start)
		notes["duration"].append(end - start)
		prev_start = start
	return pd.DataFrame({name:np.array(value) for name,value in notes.items()})

raw_notes = midi_to_notes('x (43).mid')
note_names = np.vectorize(pretty_midi.note_number_to_name)
sample_note_names = note_names(raw_notes["pitch"])

In [14]:
def notes_to_midi(
  notes: pd.DataFrame,
  out_file: str,
  instrument_name: str,
  velocity: int = 100,  # note loudness
) -> pretty_midi.PrettyMIDI:

  pm = pretty_midi.PrettyMIDI()
  instrument = pretty_midi.Instrument(
      program=pretty_midi.instrument_name_to_program(
          instrument_name))

  prev_start = 0
  for i, note in notes.iterrows():
    start = float(prev_start + note['step'])
    end = float(start + note['duration'])
    note = pretty_midi.Note(
        velocity=velocity,
        pitch=int(note['pitch']),
        start=start,
        end=end,
    )
    instrument.notes.append(note)
    prev_start = start

  pm.instruments.append(instrument)
  pm.write(out_file)
  return pm

In [15]:
num_files = 5
all_notes = []
filenames = glob.glob('*.mid') # Get a list of all MIDI files in the current directory
for f in filenames[:num_files] :
	notes = midi_to_notes(f)
	all_notes.append(notes)
all_notes = pd.concat(all_notes)
print(all_notes)
key_order = ["pitch" , "step" , "duration"]
train_notes = np.stack([all_notes[key] for key in key_order] , axis = 1)
notes_ds=tf.data.Dataset.from_tensor_slices(train_notes)
notes_ds.element_spec

     pitch       start         end       step  duration
0       79    0.800000    0.831250   0.000000  0.031250
1       79    1.759375    1.791667   0.959375  0.032292
2       79    2.079167    2.111458   0.319792  0.032292
3       79    2.527083    2.559375   0.447917  0.032292
4       79    3.007292    3.039583   0.480208  0.032292
..     ...         ...         ...        ...       ...
175     48  181.215625  181.247917   2.880208  0.032292
176     48  196.063542  196.095833  14.847917  0.032292
177     47  197.791667  197.823958   1.728125  0.032292
178     48  198.335417  198.367708   0.543750  0.032292
179     47  206.943750  206.975000   8.608333  0.031250

[180 rows x 5 columns]


TensorSpec(shape=(3,), dtype=tf.float64, name=None)

In [16]:
seq_length = 20
vocab_size = 128
def create_sequences(dataset,seq_length,vocab_size=128):
	sequences = []
	targets = []
	num_seq = train_notes.shape[0] - seq_length
	for i in range(num_seq):
		sequence = train_notes[i:i+seq_length - 1,:] / [vocab_size, 1 ,1]
		target = train_notes[i+seq_length] / vocab_size
		sequences.append(sequence)
		targets.append(target)
	sequences = np.array(sequences)
	targets = np.array(targets)
	print(sequences.shape , targets.shape)
	dataset = tf.data.Dataset.from_tensor_slices((sequences,{"pitch":targets[:,0] , "step":targets[:,1] ,"duration" :targets[:,2]}))
	return dataset
seq_ds = create_sequences(notes_ds, 21, vocab_size)
batch_size =64
buffer_size = 5000
train_ds = seq_ds.shuffle(buffer_size).batch(batch_size)
train_ds.element_spec

(159, 20, 3) (159, 3)


(TensorSpec(shape=(None, 20, 3), dtype=tf.float64, name=None),
 {'pitch': TensorSpec(shape=(None,), dtype=tf.float64, name=None),
  'step': TensorSpec(shape=(None,), dtype=tf.float64, name=None),
  'duration': TensorSpec(shape=(None,), dtype=tf.float64, name=None)})

In [17]:
layer = tf.keras.layers
learning_rate = 0.005
input_data = tf.keras.Input(shape=(seq_length , 3))
x= layer.LSTM(128)(input_data)
outputs = {
	"pitch":tf.keras.layers.Dense(64 , name = "pitch")(x),
	"step":tf.keras.layers.Dense(1 , name = "step")(x),
	"duration":tf.keras.layers.Dense(1 , name = "duration")(x),
}
model = tf.keras.Model(input_data , outputs)

loss ={
	"pitch" : tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
	"step": tf.keras.losses.MeanSquaredError(),
	"duration":tf.keras.losses.MeanSquaredError(),
}
optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
model.compile(loss=loss , loss_weights={
		'pitch': 0.05,
		'step': 1.0,
		'duration':1.0,
	}, optimizer = optimizer)

model.summary()

In [18]:
model.fit(train_ds , epochs = 200)

hist = model.predict(train_ds)
print(hist["duration"].shape)

Epoch 1/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - duration_loss: 0.0802 - loss: 0.3892 - pitch_loss: 4.2234 - step_loss: 0.0691
Epoch 2/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - duration_loss: 0.0148 - loss: 0.2171 - pitch_loss: 3.9878 - step_loss: 0.0051
Epoch 3/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - duration_loss: 0.0053 - loss: 0.2013 - pitch_loss: 3.8260 - step_loss: 0.0034
Epoch 4/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - duration_loss: 0.0068 - loss: 0.1904 - pitch_loss: 3.6262 - step_loss: 0.0021
Epoch 5/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - duration_loss: 0.0025 - loss: 0.1725 - pitch_loss: 3.3096 - step_loss: 0.0025
Epoch 6/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - duration_loss: 0.0047 - loss: 0.1388 - pitch_loss: 2.5491 - step_loss: 0.0029
Epoch 7/200
[1m

In [19]:
def predict_next_note(
notes , keras_model , temperature):

	assert temperature > 0
	inputs = np.expand_dims(notes , 0)
	predictions = model.predict(inputs)
	pitch_logits = predictions['pitch']
	step = predictions["step"]
	duration = predictions["duration"]
	pitch_logits /= temperature
	pitch = tf.random.categorical(pitch_logits , num_samples = 1)
	pitch = tf.squeeze(pitch , axis = -1)
	duration = tf.squeeze(duration , axis =-1)
	step = tf.squeeze(step,axis = -1)
	step = tf.maximum(0,step)
	duration = tf.maximum(0 , duration)
	return int(pitch) , float(step) , float(duration)

temperature = 2.0
num_predictions = 1200

sample_notes = np.stack([raw_notes[key] for key in key_order], axis=1)

# The initial sequence of notes and the pitch is normalized similar to training sequences
input_notes = (
	sample_notes[:seq_length] / np.array([vocab_size, 1, 1]))

generated_notes = []
prev_start = 0
for _ in range(num_predictions):
	pitch, step, duration = predict_next_note(input_notes, model, temperature)
	start = prev_start + step
	end = start + duration
	input_note = (pitch, step, duration)
	generated_notes.append((*input_note, start, end))
	input_notes = np.delete(input_notes, 0, axis=0)
	input_notes = np.append(input_notes, np.expand_dims(input_note, 0), axis=0)
	prev_start = start

generated_notes = pd.DataFrame(
	generated_notes, columns=(*key_order, 'start', 'end'))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 291ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6

In [20]:
out_file = 'gfgmusicgnerate.mid'
instrument_name= pretty_midi.program_to_instrument_name(instrument.program)
out_pm = notes_to_midi(
	generated_notes, out_file=out_file, instrument_name=instrument_name)
display_audio(out_pm , 500)

