# Interactive Machine Learning -- Music Accompaniment

### Yinmiao Li

### Load Packages

In [268]:
from pylab import *
import copy
import pretty_midi
import os
import librosa             # The librosa library
import librosa.display     # librosa's display module (for plotting features)
import IPython.display     # IPython's display module (for in-line audio)
import matplotlib.pyplot as plt # matplotlib plotting functions
import matplotlib.style as ms   # plotting style
import numpy as np              # numpy numerical functions
ms.use('seaborn-muted')         # fancy plot designs
from __future__ import print_function # use the print() function from Python3
import random

import tensorflow as tf #import tensorflow
from tensorflow import keras #import keras from tensorflow
from tensorflow.keras.models import Model 
from tensorflow.keras.layers import Input, LSTM, Dense

In [191]:
# try with pretty_midi
midi_test = pretty_midi.PrettyMIDI('Nottingham/train/hpps_simple_chords_37.mid') #load one midi data for trying
for itm in midi_test.instruments:
    print(itm)
    print(type(itm.notes[2]))
    print(itm.notes)

Instrument(program=0, is_drum=False, name="Aunt Hessie's White Horse")
<class 'pretty_midi.containers.Note'>
[Note(start=0.000000, end=0.498958, pitch=74, velocity=105), Note(start=0.500000, end=0.998958, pitch=67, velocity=105), Note(start=1.000000, end=1.498958, pitch=69, velocity=80), Note(start=1.500000, end=1.998958, pitch=71, velocity=95), Note(start=2.000000, end=2.498958, pitch=72, velocity=80), Note(start=2.500000, end=2.748958, pitch=74, velocity=105), Note(start=2.750000, end=3.248958, pitch=74, velocity=80), Note(start=3.250000, end=3.498958, pitch=74, velocity=80), Note(start=3.500000, end=3.998958, pitch=74, velocity=95), Note(start=4.000000, end=4.498958, pitch=74, velocity=80), Note(start=4.500000, end=4.748958, pitch=74, velocity=105), Note(start=4.750000, end=5.248958, pitch=74, velocity=80), Note(start=5.250000, end=5.498958, pitch=74, velocity=80), Note(start=5.500000, end=5.998958, pitch=74, velocity=95), Note(start=6.000000, end=6.498958, pitch=74, velocity=80), N

In [44]:
batch_size = 64  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.
# Path to the data txt file on disk.
data_path = 'Nottingham/train'
# look for all the files in the folder
itemPath = [] # store all the files path in list
for file in os.walk(data_path):
    for itm in file:
        for item in itm:
            if item.endswith(".mid"):
                itemPath.append(str(data_path+"/" + item)) 
print(itemPath)

['Nottingham/train/jigs_simple_chords_36.mid', 'Nottingham/train/jigs_simple_chords_159.mid', 'Nottingham/train/jigs_simple_chords_165.mid', 'Nottingham/train/jigs_simple_chords_171.mid', 'Nottingham/train/reels_simple_chords_279.mid', 'Nottingham/train/jigs_simple_chords_213.mid', 'Nottingham/train/jigs_simple_chords_207.mid', 'Nottingham/train/playford_simple_chords_13.mid', 'Nottingham/train/reels_simple_chords_127.mid', 'Nottingham/train/reels_simple_chords_133.mid', 'Nottingham/train/xmas_simple_chords_9.mid', 'Nottingham/train/hpps_simple_chords_21.mid', 'Nottingham/train/reels_simple_chords_319.mid', 'Nottingham/train/reels_simple_chords_325.mid', 'Nottingham/train/reels_simple_chords_457.mid', 'Nottingham/train/reels_simple_chords_331.mid', 'Nottingham/train/reels_simple_chords_456.mid', 'Nottingham/train/reels_simple_chords_330.mid', 'Nottingham/train/reels_simple_chords_324.mid', 'Nottingham/train/reels_simple_chords_318.mid', 'Nottingham/train/hpps_simple_chords_20.mid', 'No

In [201]:
# differentiate the lead melody and the accompanied chords here
lead_input = []# lead melody data
accp_input = []# accompaniment data
for i in itemPath:
    midi_file_read = pretty_midi.PrettyMIDI(i)
    if len(midi_file_read.instruments) >= 2:
        lead_input.append(midi_file_read.instruments[0].notes)
        accp_input.append(midi_file_read.instruments[1].notes)




[Note(start=0.500000, end=1.998958, pitch=58, velocity=75), Note(start=0.500000, end=1.998958, pitch=62, velocity=75), Note(start=0.500000, end=1.998958, pitch=65, velocity=75), Note(start=2.000000, end=3.498958, pitch=58, velocity=75), Note(start=2.000000, end=3.498958, pitch=62, velocity=75), Note(start=2.000000, end=3.498958, pitch=65, velocity=75), Note(start=3.500000, end=4.998958, pitch=58, velocity=75), Note(start=3.500000, end=4.998958, pitch=62, velocity=75), Note(start=3.500000, end=4.998958, pitch=65, velocity=75), Note(start=5.000000, end=6.498958, pitch=51, velocity=75), Note(start=5.000000, end=6.498958, pitch=55, velocity=75), Note(start=5.000000, end=6.498958, pitch=58, velocity=75), Note(start=6.500000, end=7.998958, pitch=53, velocity=75), Note(start=6.500000, end=7.998958, pitch=57, velocity=75), Note(start=6.500000, end=7.998958, pitch=60, velocity=75), Note(start=6.500000, end=7.998958, pitch=63, velocity=75), Note(start=8.000000, end=9.498958, pitch=53, velocity=7

In [203]:
print(len(accp_input))

685


### Preprocessing the Data

In [417]:
# deal the lead melody and accompaniment data according to the time and notes.
# first find the maximum time duration in all the pieces, and set the interval as 0.5s

max_time = max(i[-1].start for i in lead_input)
print(max_time)
min_time = min(i[-1].start for i in lead_input)
print(min_time)



446.5
9.75


In [74]:
# Vectorize the data
# the 3d list should include first:the number of training pieces(should be the same for both input and output) 
# the second number of elements should be the maximum length of time, the interval should be 0.5s
# the third parameter is 128, suggesting the number of possible pitches from 0 to 127
train_piece = len(lead_input)
print(train_piece)

# initialize the matrix
encoder_input_data = np.zeros((train_piece,int(max_time*2), 128))
decoder_input_data = np.zeros((train_piece,int(max_time*2), 128))
decoder_target_data = np.zeros((train_piece,int(max_time*2), 128))

# set the matrix
# for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
#     for t, char in enumerate(input_text):
#         encoder_input_data[i, t, input_token_index[char]] = 1.
#     for t, char in enumerate(target_text):
#         # decoder_target_data is ahead of decoder_input_data by one timestep
#         decoder_input_data[i, t, target_token_index[char]] = 1.
#         if t > 0:
#             # decoder_target_data will be ahead by one timestep
#             # and will not include the start character.
#             decoder_target_data[i, t - 1, target_token_index[char]] = 1.

# set the encoder input data                 
for i in range(train_piece):
    piece = lead_input[i]
    l = len(piece)
#     t_max = piece[-1].start
    for j in range(l):
        min_bound = (j+1)/2
        max_bound = (j+2)/2
        for p in piece:
            if min_bound <= p.start < max_bound:
                idx = p.pitch
                encoder_input_data[i][j][idx] = 1

# set the decoder input data
for i in range(train_piece):
    piece2 = accp_input[i]
    l2 = len(piece)
    for j in range(l2):
        min_bound = (j+1)/2
        max_bound = (j+2)/2
        for p2 in piece2:
            if min_bound <= p2.start < max_bound:
                idx2 = p2.pitch
                decoder_input_data[i][j][idx2] = 1
# set the decoder target data
for i in range(train_piece):
    piece3 = accp_input[i]
    l3 = len(piece)
    for j in range(l3):
        min_bound = (j+2)/2
        max_bound = (j+3)/2
        for p3 in piece3:
            if min_bound <= p3.start < max_bound:
                idx3 = p3.pitch
                decoder_target_data[i][j][idx3] = 1

685


In [206]:
print(decoder_input_data[0][0])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]


In [479]:
# train the data
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, 128))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [480]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, 128))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(128, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [481]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)
# Save model
model.save('s2s.h5')

Train on 548 samples, validate on 137 samples
Epoch 1/100

KeyboardInterrupt: 

In [467]:
# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs] + decoder_states)


In [474]:
idx = random.randint(0,len(encoder_input_data) - 2)
print(idx)


671


In [475]:
input_seq = encoder_input_data[idx:idx+1]
print(input_seq)
states_value = encoder_model.predict(input_seq)
print(len(lead_input[idx]))

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
119


In [476]:
target_seq = np.zeros((1, 1, 128))
# print(target_seq)
stop_condition = False
decoded_notes = []
times = 0
# print(lead_input[idx][0].end - lead_input[idx][0].start)
# print(int(len(lead_input[idx])*(0.5/(lead_input[idx][0].end - lead_input[idx][0].start))))
while not stop_condition:
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    output_copy = copy.deepcopy(output_tokens[0,-1,:])
    output_copy.sort()
    chord = output_copy[-3:]
#     print(chord)
    d_notes = []
    for i in chord:
        d_notes.append(output_tokens[0,-1,:].tolist().index(i))
        
    decoded_notes.append(d_notes)
    
    if(times > int(len(lead_input[idx])*(0.5/(lead_input[idx][0].end - lead_input[idx][0].start)))):
        stop_condition = True
    times += 1
    target_seq = np.zeros((1, 1, 128))
    states_value = [h,c]
# #     print(output_tokens[0, -1, :])
# sampled_token_index = np.argmax(output_tokens[0, -1, :])
# # sampled_token_index = np.argmax(output_tokens[0, :, :])

# print(sampled_token_index)
print(decoded_notes)

print(len(decoded_notes))

[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]

In [477]:
print(itemPath[idx])

Nottingham/train/ashover_simple_chords_44.mid


In [478]:
def turn_list_to_accp(lst):
    cello_music = pretty_midi.PrettyMIDI()
    # Create an Instrument instance for a cello instrument
    cello = pretty_midi.Instrument(program=0)
    start_time = 0
    for i in range(len(lst)):
        start_time += 0.5
        print(start_time)
        if i%4 == 0:
            decodeNotes = lst[i]
            print(decodeNotes)
            for n in decodeNotes:
                note = pretty_midi.Note(velocity=64, pitch=n, start=start_time, end=start_time+2)
                cello.notes.append(note)
    cello_music.instruments.append(cello)
    cello_music.write('Accompaniment6.mid')


In [473]:
turn_list_to_accp(decoded_notes)

0.5
[0, 0, 0]
1.0
1.5
2.0
2.5
[0, 0, 0]
3.0
3.5
4.0
4.5
[0, 0, 0]
5.0
5.5
6.0
6.5
[0, 0, 0]
7.0
7.5
8.0
8.5
[0, 0, 0]
9.0
9.5
10.0
10.5
[0, 0, 0]
11.0
11.5
12.0
12.5
[0, 0, 0]
13.0
13.5
14.0
14.5
[0, 0, 0]
15.0
15.5
16.0
16.5
[0, 0, 0]
17.0
17.5
18.0
18.5
[0, 0, 0]
19.0
19.5
20.0
20.5
[0, 0, 0]
21.0
21.5
22.0
22.5
[0, 0, 0]
23.0
23.5
24.0
24.5
[0, 0, 0]
25.0
25.5
26.0
26.5
[0, 0, 0]
27.0
27.5
28.0
28.5
[0, 0, 0]
29.0
29.5
30.0
30.5
[0, 0, 0]
31.0
31.5
32.0
32.5
[0, 0, 0]
33.0
33.5
34.0
34.5
[0, 0, 0]
35.0
35.5
36.0
36.5
[0, 0, 0]
37.0
37.5
38.0
38.5
[0, 0, 0]
39.0
39.5
40.0
40.5
[0, 0, 0]
41.0
41.5
42.0
42.5
[0, 0, 0]
43.0
43.5
44.0
44.5
[0, 0, 0]
45.0
45.5
46.0
46.5
[0, 0, 0]
47.0
47.5
48.0
48.5
[0, 0, 0]
49.0
49.5
50.0
50.5
[0, 0, 0]
51.0
51.5
52.0
52.5
[0, 0, 0]
53.0
53.5
54.0
54.5
[0, 0, 0]
55.0
55.5
56.0
56.5
[0, 0, 0]
57.0
57.5
58.0
58.5
[0, 0, 0]
59.0
59.5
60.0
60.5
[0, 0, 0]
61.0
61.5
62.0
62.5
[0, 0, 0]
63.0
63.5
64.0
64.5
[0, 0, 0]
65.0
65.5
66.0
66.5
[0, 0, 0]
67.0
67.5
68.0