In [72]:
import glob
import os

import matplotlib.pyplot as plt

from IPython.display import SVG

from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.layers import Input, GRU, LSTM, Dense, Masking, Dropout, Embedding, Flatten, Dense
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l1_l2
from keras.utils import to_categorical
from keras.utils.vis_utils import model_to_dot
import numpy as np
from sklearn.externals import joblib

# Config

In [7]:
# --- Event quantisation ------------------------------------------------------
SIG_DIGITS = 4
QUANTIZATION = 12  # smallest unit is 1/12 of a beat
MAX_EVENT_BEATS = 4
MAX_EVENT_SUBBEATS = QUANTIZATION * MAX_EVENT_BEATS

# --- MIDI pitch range (both ends inclusive) -----------------------------------
MIDI_MIN = 21
MIDI_MAX = 108
MIDI_LEN = MIDI_MAX - MIDI_MIN + 1  # 88 keys

# --- One-hot vector lengths ---------------------------------------------------
NUM_COMMAND_CLASSES = 3
NUM_MIDI_CLASSES = MIDI_LEN + 1                # + 1 for "0" case
NUM_DURATION_CLASSES = MAX_EVENT_SUBBEATS + 1  # + 1 for "0" case

# --- Layout of one feature vector: [command | midi | duration] ----------------
# Start of range is inclusive, end of range is exclusive, and each segment
# begins where the previous one ends.
COMMAND_VEC_RANGE = (0, NUM_COMMAND_CLASSES)
MIDI_VEC_RANGE = (COMMAND_VEC_RANGE[1], COMMAND_VEC_RANGE[1] + NUM_MIDI_CLASSES)
DURATION_VEC_RANGE = (MIDI_VEC_RANGE[1], MIDI_VEC_RANGE[1] + NUM_DURATION_CLASSES)
VEC_LENGTH = DURATION_VEC_RANGE[1]

# --- Window sizes, in notes and in timesteps ----------------------------------
INPUT_NOTES = 30
OUTPUT_NOTES = 10
SLIDING_WINDOW_NOTES = 5

# NOTE(review): the factor 4 is presumably the number of timesteps used to
# encode one note -- confirm against the preprocessing code.
INPUT_TIMESTEPS = 4 * INPUT_NOTES
OUTPUT_TIMESTEPS = 4 * OUTPUT_NOTES
SLIDING_WINDOW_TIMESTEPS = 4 * SLIDING_WINDOW_NOTES

# Cheap invariants: the three segments tile the vector with no gap or overlap.
assert COMMAND_VEC_RANGE[1] == MIDI_VEC_RANGE[0]
assert MIDI_VEC_RANGE[1] == DURATION_VEC_RANGE[0]
assert VEC_LENGTH == NUM_COMMAND_CLASSES + NUM_MIDI_CLASSES + NUM_DURATION_CLASSES

# Kept as the LAST expression of the cell so the notebook actually renders it;
# in the middle of the cell this expression was evaluated and thrown away.
COMMAND_VEC_RANGE, MIDI_VEC_RANGE, DURATION_VEC_RANGE, VEC_LENGTH

In [None]:
# --- Optimisation -------------------------------------------------------------
# NOTE(review): the compile cell hard-codes Adam(lr=.001) and does not read LR,
# and DROPOUT is not referenced by any layer in the cells shown -- confirm
# whether these two are meant to be wired in.
LR = 1e-2
DROPOUT = 0.3
BATCH_SIZE = 64
NUM_EPOCHS = 100

# --- Network size -------------------------------------------------------------
EMBEDDING_DIM = 100        # width of the embedding layer for input words
CONTEXT_VECTOR_SIZE = 256  # context vector size (LSTM nodes in the final LSTM layer)
NUM_LSTM_NODES = 256       # number of nodes in the intermediate LSTM layers

# Set up for Training

## Data generator

In [16]:
# Glob patterns for the pickled example files; they are expanded with
# glob.glob() into TRAIN_FILES / TEST_FILES and each match is read by
# load_dataset().
# NOTE(review): these are absolute, machine-specific paths -- consider deriving
# them from a configurable data directory.
TRAIN_PATH = '/data/mirex2018/train_pkl/train*pkl'
TEST_PATH = '/data/mirex2018/test_pkl/test*pkl'

In [47]:
def load_dataset(pickle_filename):
    """Load one pickled example file and stack its contents into two arrays.

    The file is read with ``joblib.load`` and must unpack into a pair
    ``(x_list, y_list)``. Each list is stacked along the first axis with
    ``np.vstack``.

    Returns:
        Tuple ``(x, y)`` of stacked arrays.
        Ex: two matrix shapes returned: (561, 120, 141), (561, 40, 141).
        dims are: (example #, timestep, feature)

    NOTE: joblib files are pickles; loading one can execute arbitrary code, so
    only point this at files you trust.
    """
    inputs, targets = joblib.load(pickle_filename)
    stacked_inputs = np.vstack(inputs)
    stacked_targets = np.vstack(targets)
    return stacked_inputs, stacked_targets

In [41]:
# Expand each glob pattern once and keep the matches in sorted order, so the
# generators visit the files in a stable sequence.
TRAIN_FILES = sorted(glob.iglob(TRAIN_PATH))
TEST_FILES = sorted(glob.iglob(TEST_PATH))

In [51]:
def example_generator(train=True):
    """Endlessly yield ``(x, y)`` array pairs, one pickled file per step.

    Args:
        train: when true (the default) cycle over ``TRAIN_FILES``, otherwise
            cycle over ``TEST_FILES``.

    Yields:
        The ``(x, y)`` pair produced by ``load_dataset`` for each file, in list
        order, starting over from the first file after the last one.

    Raises:
        ValueError: on the first ``next()`` if the selected file list is empty.
    """
    paths = TRAIN_FILES if train else TEST_FILES

    if not paths:
        # Without this guard the `while True` below would loop over an empty
        # list forever and never yield, hanging whoever consumes the generator.
        raise ValueError(
            'No %s files found; check the glob pattern.' % ('training' if train else 'test'))

    while True:
        for path in paths:
            inputs, targets = load_dataset(path)
            yield inputs, targets

In [74]:
# One endless generator per split: training cycles over TRAIN_FILES,
# validation over TEST_FILES.
training_generator = example_generator(train=True)
validation_generator = example_generator(train=False)

## Utils

In [56]:
def plot_matrix(x):
    """Display `x` as an image on a 20x10 inch figure.

    `x` is handed straight to ``imshow`` with ``origin='lower'``, i.e. the
    first row is drawn at the bottom of the axes.
    """
    _, ax = plt.subplots(figsize=(20, 10))
    ax.imshow(x, origin='lower')
    plt.show()

## Build Model

In [None]:
# Encoder half of the sequence-to-sequence RNN.
# See also: https://machinelearningmastery.com/define-encoder-decoder-sequence-sequence-model-neural-machine-translation-keras/
#
# NOTE(review): max_seq_len_X, max_seq_len_y, num_words_X and num_words_y are
# not defined in any cell shown in this notebook -- they must exist in the
# kernel before this cell can run.
encoding_size = CONTEXT_VECTOR_SIZE  # not read again in the cells shown
max_input_seq_len, max_output_seq_len = max_seq_len_X, max_seq_len_y
num_input_words, num_output_words = num_words_X, num_words_y

# Input is a fixed-length sequence with one scalar per timestep, fed to an
# Embedding layer.
encoder_inputs = Input(name='encoder_input', shape=(max_input_seq_len,))

# NOTE(review): the Embedding below already sets mask_zero=True, so this
# separate Masking layer looks redundant -- confirm before removing it.
encoder_masking = Masking(name='encoder_masking', mask_value=0)
encoder_inputs_masked = encoder_masking(encoder_inputs)

encoder_embedding = Embedding(input_dim=num_input_words, output_dim=EMBEDDING_DIM,
                              mask_zero=True, name='encoder_embedding')
encoder_inputs_embedded = encoder_embedding(encoder_inputs_masked)

encoder_lstm = LSTM(units=NUM_LSTM_NODES, name='encoder_lstm_1',
                    return_sequences=True, return_state=True)
encoder_outputs1, state_h1, state_c1 = encoder_lstm(encoder_inputs_embedded)

# Discard `encoder_outputs1` and only keep the final hidden/cell states; they
# become the decoder's initial state.
encoder_states1 = [state_h1, state_c1]

In [None]:
# Decoder half (training graph).
# The decoder LSTM is initialised with the encoder's final states
# (encoder_states1).
decoder_inputs = Input(name='decoder_input', shape=(None,))

# NOTE(review): the Embedding below already sets mask_zero=True, so this
# separate Masking layer looks redundant -- confirm before removing it.
decoder_masking = Masking(name='decoder_masking', mask_value=0)
decoder_inputs_masked = decoder_masking(decoder_inputs)

decoder_embedding = Embedding(input_dim=num_output_words, output_dim=EMBEDDING_DIM,
                              mask_zero=True, name='decoder_embedding')
decoder_inputs_embedded = decoder_embedding(decoder_inputs_masked)

# decoder_lstm and decoder_dense are kept as layer objects because the
# inference cell further down applies the same layers again.
decoder_lstm = LSTM(units=NUM_LSTM_NODES, name='decoder_lstm_1',
                    return_sequences=True, return_state=True)
z, _, _ = decoder_lstm(decoder_inputs_embedded, initial_state=encoder_states1)

# Softmax over the output vocabulary at every decoder timestep.
decoder_dense = Dense(units=num_output_words, activation='softmax', name='decoder_output')
decoder_outputs = decoder_dense(z)

# The model defined in the next cell turns
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`.

In [None]:
# Training model: takes the encoder and decoder inputs and produces the
# decoder's per-timestep softmax output.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.summary()

In [None]:
# Render the training model's layer graph (with tensor shapes) inline as SVG.
model_graph = model_to_dot(model, show_shapes=True)
SVG(model_graph.create(prog='dot', format='svg'))

# Train

In [None]:
# NOTE(review): X_train, y_train, y_train_one_hot and y_test_one_hot are not
# defined in any cell shown in this notebook -- they must exist in the kernel
# before this cell can run.
encoder_input_data = X_train
decoder_input_data = y_train

# decoder_target_data will be ahead by one timestep
# and will not include the start token: target[:, t] = one_hot[:, t + 1].
# The last timestep has no successor and stays all-zero.
decoder_target_data = np.zeros(y_train_one_hot.shape)
decoder_target_data[:, :-1] = y_train_one_hot[:, 1:, :]

# Same one-step shift for the held-out targets.
decoder_target_data_test = np.zeros(y_test_one_hot.shape)
decoder_target_data_test[:, :-1] = y_test_one_hot[:, 1:, :]

In [None]:
# Adam optimiser with categorical cross-entropy over the softmax outputs.
# NOTE(review): the learning rate is hard-coded here; the LR constant (0.01)
# from the config cell is not used -- confirm which value is intended.
optimizer = Adam(lr=1e-3)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
# Halve the learning rate whenever val_loss has gone 4 epochs without
# improving; no cooldown and no lower bound on the rate.
lr_callback = ReduceLROnPlateau(
    monitor='val_loss',
    mode='auto',
    factor=0.5,
    patience=4,
    cooldown=0,
    min_lr=0,
    verbose=1,
)

In [None]:
# Stop training once val_loss has gone 10 epochs without any improvement.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    mode='auto',
    min_delta=0,
    patience=10,
    verbose=1,
)

In [None]:
# Run training.
#
# `batch_size` is intentionally NOT passed: fit_generator() has no such
# parameter (passing it raises TypeError); the batch size is whatever the
# generator yields per step.
#
# NOTE(review): example_generator yields a single (x, y) pair per step, while
# `model` was built with two inputs ([encoder_inputs, decoder_inputs]) --
# confirm the generator output matches what the model expects.
model.fit_generator(
    training_generator,
    # TODO: each generator step yields one file's worth of examples, so one
    # full pass over the data is presumably len(TRAIN_FILES) steps -- confirm.
    steps_per_epoch=50000,
    epochs=NUM_EPOCHS,
    validation_data=validation_generator,
    # TODO: likewise presumably len(TEST_FILES) -- confirm.
    validation_steps=20000,
    callbacks=[lr_callback, early_stopping_callback],
    verbose=1,
    workers=1,
    use_multiprocessing=False,
)

In [None]:
# Save the trained training-time `model` to 's2s.h5', relative to the current
# working directory. The separate encoder_model / decoder_model used for
# sampling are built in later cells and are not written here.
model.save('s2s.h5')

# Run Model

In [None]:
# Sampling (inference) models.
# Encoder: maps an input sequence to the final [h, c] states of the encoder
# LSTM, which seed the decoder during sampling.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states1)
encoder_model.summary()

In [None]:
# Render the sampling encoder's layer graph (with tensor shapes) inline as SVG.
encoder_graph = model_to_dot(encoder_model, show_shapes=True)
SVG(encoder_graph.create(prog='dot', format='svg'))

In [None]:
# Sampling (inference) decoder.
# It reuses the trained decoder_lstm / decoder_dense layers, but takes the LSTM
# state as explicit inputs and returns the updated state, so decoding can be
# driven one step at a time.
decoder_state_input_h = Input(shape=(NUM_LSTM_NODES,))
decoder_state_input_c = Input(shape=(NUM_LSTM_NODES,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Deliberately NOT reusing the names state_h1 / state_c1 / decoder_outputs:
# those belong to the training graph, and rebinding them here would make the
# training-model cell build a disconnected graph if it were re-run afterwards.
decoder_outputs1, decoder_state_h1, decoder_state_c1 = decoder_lstm(
    decoder_inputs_embedded, initial_state=decoder_states_inputs)
decoder_states1 = [decoder_state_h1, decoder_state_c1]

decoder_outputs_inference = decoder_dense(decoder_outputs1)

# Inputs:  [decoder input, h, c]   Outputs: [softmax output, new h, new c]
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs_inference] + decoder_states1)
decoder_model.summary()

In [None]:
# Render the sampling decoder's layer graph (with tensor shapes) inline as SVG.
decoder_graph = model_to_dot(decoder_model, show_shapes=True)
SVG(decoder_graph.create(prog='dot', format='svg'))

In [None]:
def translate_sequence(input_seq):
    """Greedily decode one input sequence into a string of output words.

    The input is encoded once with ``encoder_model``; ``decoder_model`` is then
    called one step at a time, each step being fed the previously chosen token
    and the previous LSTM state, and the highest-scoring token is taken.

    Args:
        input_seq: a batch containing exactly one encoder input, as accepted by
            ``encoder_model.predict``.

    Returns:
        The decoded words in order, each followed by a single space (so the
        string ends with a space). Decoding stops after the end marker
        ``'</S>'`` is produced (it is included in the result) or once
        ``max_output_seq_len`` words have been produced, whichever is first.
        At least one word is always produced.
    """
    # Encode the input as state vectors.
    h1, c1 = encoder_model.predict(input_seq)
    states_value1 = [h1, c1]

    # Decoder input for a batch of one: a (1, 1) array holding the index of
    # the current token, starting with the start symbol.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word_to_index2['<S>']

    decoded_words = []
    while True:
        output_tokens, h1, c1 = decoder_model.predict(
            [target_seq] + states_value1)

        # Greedy choice: most probable token of the single predicted step.
        sampled_token_index = np.argmax(output_tokens[0, 0, :])
        sampled_word = index_to_word2[sampled_token_index]
        decoded_words.append(sampled_word)

        # Exit condition: either hit max length or find the stop marker.
        # `>=` (not `>`) so that at most max_output_seq_len words are emitted.
        if sampled_word == '</S>' or len(decoded_words) >= max_output_seq_len:
            break

        # Feed the chosen token and the new state into the next step.
        target_seq[0, 0] = sampled_token_index
        states_value1 = [h1, c1]

    return ''.join(word + ' ' for word in decoded_words)

In [None]:
# Decode the first ten test inputs; each one is wrapped into a batch of size 1
# because translate_sequence decodes a single sequence at a time.
for idx in range(10):
    single_example_batch = np.expand_dims(X_test[idx], axis=0)
    print(translate_sequence(single_example_batch))