In [1]:
import sys
import tensorflow as tf
from model import build_model
from audio_dataset_generator import AudioDatasetGenerator

In [2]:
# Settings
# Every tunable value used by the later cells lives here.

# --- Data ---
audio_data_path      = "assets"
sample_rate          = 44100
sequence_length      = 16
force_new_dataset    = False

# --- Spectral analysis ---
# fft_settings is ordered as [fft_size, window_size, hop_size].
fft_settings         = [2048, 1024, 512]
fft_size, window_size, hop_size = fft_settings

# --- Model ---
rnn_type             = "lstm"
number_rnn_layers    = 2
# fft_size // 2 + 1 == 1025 for the default fft_size; presumably the number of
# one-sided FFT bins per frame -- TODO confirm against AudioDatasetGenerator.
rnn_number_units     = fft_size // 2 + 1
fully_connected_dim  = fft_size // 2 + 1
activation           = tf.nn.relu
# NOTE(review): this value is fed to the `keep_prob` placeholder during training,
# so 0.2 is used as a keep probability there -- confirm that is intended.
dropout              = 0.2

# --- Optimisation ---
learning_rate        = 0.001
amount_epochs        = 200
batch_size           = 64
loss_type            = "mse"
optimiser            = tf.train.RMSPropOptimizer(learning_rate=learning_rate)

In [3]:
# Dataset generation / loading
# Build the generator from the spectral/sequence settings, then load the audio
# found under `audio_data_path`.
# NOTE(review): `force_new_dataset` from the settings cell is not passed to
# anything here -- confirm whether `load` should receive it.
dataset = AudioDatasetGenerator(
    fft_size, window_size, hop_size, sequence_length, sample_rate
)
dataset.load(audio_data_path)

In [None]:
# Tensor graph
# Placeholders for one batch of inputs/targets; shapes come from the dataset.
x = tf.placeholder(tf.float32, dataset.get_x_shape())
y = tf.placeholder(tf.float32, dataset.get_y_shape())
# Fed at run time alongside the batch (also handed to dataset.generate_samples).
training = tf.placeholder(tf.bool)
keep_prob = tf.placeholder(tf.float32)

# Randomly initialised parameters handed to build_model under the 'out' key.
weights = {
    'out': tf.Variable(tf.random_normal([fully_connected_dim, fully_connected_dim]))
}
biases = {
    'out': tf.Variable(tf.random_normal([fully_connected_dim]))
}

prediction = build_model(x, rnn_type, number_rnn_layers, rnn_number_units,
                         weights, biases, dataset.get_y_shape()[1], keep_prob)

# Element-wise error shared by both loss variants.
error = tf.subtract(y, prediction)

if loss_type == "mse":
    cost = tf.reduce_mean(tf.pow(error, 2))
elif loss_type == "l2":
    # tf.nn.l2_loss takes a single tensor (sum(t ** 2) / 2); its second
    # positional argument is the op name, so the difference must be passed in.
    cost = tf.nn.l2_loss(error)
else:
    # Fail here with a clear message instead of a NameError on `cost` below.
    raise ValueError(
        "Unknown loss_type {!r}; expected 'mse' or 'l2'.".format(loss_type))

train = optimiser.minimize(cost)

init = tf.global_variables_initializer()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [None]:
# Training loop: one optimiser step per batch, with the cost reported each
# time the dataset signals that a new epoch has started.
with tf.Session() as sess:
    sess.run(init)
    dataset.reset_epoch()

    while not dataset.completed_all_epochs(amount_epochs):
        batch_x, batch_y = dataset.get_next_batch(batch_size)

        # The same feed is used for the update step and for the cost report.
        # NOTE(review): `dropout` (0.2) is fed as the keep probability -- confirm.
        feed = {x: batch_x,
                y: batch_y,
                training: True,
                keep_prob: dropout}

        sess.run(train, feed_dict=feed)

        if not dataset.is_new_epoch():
            continue

        # Cost is re-evaluated on the batch just trained on, after the update,
        # with the training-time feed values.
        epoch_cost = sess.run(cost, feed_dict=feed)

        # '\r' rewrites the same output line on every report.
        sys.stdout.write('Epoch: {}, Cost: {}  \r'.format(dataset.get_epoch(), epoch_cost))
        sys.stdout.flush()

Epoch: 93, Cost: 0.5956626534461975  

In [None]:
# Sample generation settings.
# The generation length gets its own name: rebinding `sequence_length` here
# would overwrite the training setting (16) that the dataset cell reads, so
# re-running that cell afterwards would silently build a different dataset.
amount_samples            = 5
generated_sequence_length = 1000
impulse_scale             = 666
griffin_iterations        = 200  # presumably Griffin-Lim iterations -- TODO confirm

# NOTE(review): this runs after the training cell's `with tf.Session()` block
# has exited -- confirm generate_samples can still reach the trained variables.
audio = dataset.generate_samples(prediction, x, training, keep_prob, amount_samples,
                                 generated_sequence_length, impulse_scale,
                                 griffin_iterations)

In [None]:
from IPython.display import Audio

# Index of the generated clip to listen to; the spectrogram cell reads `i` too.
i = 0

# Last expression of the cell, so the audio player is rendered as its output.
Audio(data=audio[i], rate=sample_rate)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.specgram(audio[i], NFFT=2048, Fs=44100, noverlap=512)

# Plot a spectrogram
plt.xlabel('Time (seconds)')
plt.ylabel('Frequency')