In [22]:
#-*- coding: utf_8 -*-
import matplotlib as plt
import cPickle
import numpy as np
import os
import sys
import librosa
from scipy.misc import imsave
import SongsDL.Auralisation.auralise as auralise
import lasagne
import pickle
import scipy
import sklearn
import theano
import theano.tensor as T
from lasagne.utils import floatX
import librosa # Librosa for audio
import urllib
import seaborn # And seaborn to make it look nice
seaborn.set(style='ticks')
# and IPython.display for audio output
import IPython.display
from IPython.display import Audio
%matplotlib inline

CUDA = False

if CUDA:
    from lasagne.layers.dnn import Conv2DDNNLayer as ConvLayer
else:
    from lasagne.layers import Conv2DLayer as ConvLayer

from lasagne.layers import InputLayer, DenseLayer, NonlinearityLayer, FlattenLayer
from lasagne.layers import MaxPool2DLayer as PoolLayer
from lasagne.nonlinearities import softmax

In [23]:
def load_weights():
    ''' Load keras config file and return W
    '''
    import h5py
    model_name = "vggnet5"
    keras_filename = "SongsDL/Auralisation/vggnet5_local_keras_model_CNN_stft_11_frame_173_freq_257_folding_0_best.keras"
    W = []
    f = h5py.File(keras_filename)
    for idx in xrange(f.attrs['nb_layers']):
        key = 'layer_%d' % idx
        if f[key].keys() != []:
            W.append(f[key]['param_0'][:,:,:,:])
        if len(W) == 5:
            break
    layer_names = []
    for idx in xrange(5):
        layer_names.append('Convolution2D')
        layer_names.append('MaxPooling2D')
    layer_names.append('Flatten')

    return W, layer_names

In [24]:
yPop, srPop = librosa.load('SongsDL/Auralisation/src_songs/dream.wav')
yReggae, srReggae = librosa.load('SongsDL/Auralisation/src_songs/toy.wav')
yNoise, srNoise = librosa.load('whitenoise.wav')
path_SRC = 'storage/'
popSong = 'pop'
reggaeSong = 'reggae'
N_FFT = 512
SAMPLE_RATE = 22050
POP, POPphase = librosa.core.magphase(librosa.stft(yPop, n_fft=N_FFT, hop_length=N_FFT/2))
REGGAE, REGGAEphase = librosa.core.magphase(librosa.stft(yReggae, n_fft=N_FFT, hop_length=N_FFT/2))
noise, noisePhase = librosa.core.magphase(librosa.stft(yNoise, n_fft=N_FFT, hop_length=N_FFT/2))

# print POP.shape, POPphase.shape
# print REGGAE.shape, REGGAEphase.shape
# Audio(librosa.istft(REGGAE, hop_length=N_FFT/2), rate=22050)

In [25]:
# load learned weights
W, layer_names = load_weights()
num_conv_layer = len(W)
filenames_SRC = []

def shrink(SRC, length, name=None):
    if SRC.shape[1] > length:
        SRC = SRC[:, :length]
    elif SRC.shape[1] < length:
        temp = np.zeros((257, length))
        temp[:, :SRC.shape[1]] = SRC
        SRC = temp
    if name:
        np.save(path_SRC + name + '.npy', SRC)
        filenames_SRC.append(name +'.npy')
    return SRC

AUDIO_LENGTH = 800
popShort = shrink(POP, AUDIO_LENGTH, popSong)
reggaeShort = shrink(REGGAE, AUDIO_LENGTH, reggaeSong)

In [27]:
# VGG-19, 19-layer model from the paper:
# "Very Deep Convolutional Networks for Large-Scale Image Recognition"
# Original source: https://gist.github.com/ksimonyan/3785162f95cd2d5fee77
# License: non-commercial use only
# Note: tweaked to use average pooling instead of maxpooling
def build_model(weights = load_weights()[0]):
    net = {}
    net['input'] = InputLayer((1,1,257, AUDIO_LENGTH))
    
    net['conv1'] = ConvLayer(net['input'], 64, 3)#, W = weights[0])
    net['pool1'] = PoolLayer(net['conv1'], 2)

    net['conv2'] = ConvLayer(net['pool1'], 64, 3)#, W = weights[1])
    net['pool2'] = PoolLayer(net['conv2'], 2)

    net['conv3'] = ConvLayer(net['pool2'], 64, 3)#, W = weights[2])
    net['pool3'] = PoolLayer(net['conv3'], 2)

    net['conv4'] = ConvLayer(net['pool3'], 64, 3)#, W = weights[3])
    net['pool4'] = PoolLayer(net['conv4'], 2)

    net['conv5'] = ConvLayer(net['pool4'], 64, 3)#, W = weights[4])
    net['pool5'] = PoolLayer(net['conv5'], 2)
    
    net['flatten'] = FlattenLayer(net['pool5'], 2)

    return net

In [28]:
# build net and load weights
net = build_model()
W, layer_names = load_weights()

for i in reversed(range(1,6)):
    W.insert(i, np.ndarray(64,))

lasagne.layers.set_all_param_values(net['conv5'], W) 

In [29]:
def gram_matrix(x):
    x = x.flatten(ndim=3)
    g = T.tensordot(x, x, axes=([2], [2]))
    return g


def content_loss(P, X, layer):  
    p = P[layer]
    x = X[layer]
#     print 'content p ', np.isnan(p.eval()).any().any()
#     print 'content x ', np.isnan(x.eval()).any().any()
    loss = 1./2 * ((x - p)**2).sum()
    print 'content loss ', loss.eval()
    return loss


def style_loss(A, X, layer):
    a = A[layer]
    x = X[layer]
    
#     print 'style a ', np.isnan(a.eval()).any().any()
#     print 'style x ', np.isnan(x.eval()).any().any()
    
    A = gram_matrix(a)
    G = gram_matrix(x)
    
    N = a.shape[1]
    M = a.shape[2] * a.shape[3]
    
#     print 'style A ', np.isnan(A.eval()).any().any()
#     print 'style G ', np.isnan(G.eval()).any().any()
#     print 'style N ', np.isnan(N.eval())
#     print 'style M ', np.isnan(M.eval())
    
    
    loss = 1./(4 * N**2 * M**2) * ((G - A)**2).sum()
    print 'style loss ', loss.eval()
    return loss

def total_variation_loss(x):
    return (((x[:,:,:-1,:-1] - x[:,:,1:,:-1])**2 + (x[:,:,:-1,:-1] - x[:,:,:-1,1:])**2)**1.25).sum()

In [30]:
layers = ['conv1', 'conv2', 'conv3', 'conv4', 'conv5']
layers = {k: net[k] for k in layers}
print layers.values()

[<lasagne.layers.conv.Conv2DLayer object at 0x7f095bfc00d0>, <lasagne.layers.conv.Conv2DLayer object at 0x7f097aa4d610>, <lasagne.layers.conv.Conv2DLayer object at 0x7f09530e3d10>, <lasagne.layers.conv.Conv2DLayer object at 0x7f095bfc05d0>, <lasagne.layers.conv.Conv2DLayer object at 0x7f095bfc0350>]


In [31]:
# Precompute layer activations for photo and artwork
input_im_theano = T.tensor4()
outputs = lasagne.layers.get_output(layers.values(), input_im_theano)

popShort3D = np.expand_dims(popShort, axis=0)
popShort4D = np.expand_dims(popShort3D, axis=0)

reggaeShort3D = np.expand_dims(reggaeShort, axis=0)
reggaeShort4D = np.expand_dims(reggaeShort3D, axis=0)

content_features = {k: theano.shared(output.eval({input_im_theano: popShort4D}))
                  for k, output in zip(layers.keys(), outputs)}

style_features = {k: theano.shared(output.eval({input_im_theano: reggaeShort4D}))
                for k, output in zip(layers.keys(), outputs)}

  border_mode=border_mode)


In [32]:
# Get expressions for layer activations for generated image
amplitude = 3.0
generated_image = theano.shared(float(amplitude) * floatX(np.random.uniform(-1, 1, (1, 1, 257, AUDIO_LENGTH))))
# print generated_image.eval().shape
# generated_image = theano.shared(floatX(np.random.uniform(0, 128, (1, 1, 257, AUDIO_LENGTH))))

# gen_features = lasagne.layers.get_output(layers.values(), generated_image)
# print gen_features

# # print [e.eval() for e in gen_features]
# gen_features = {k: v for k, v in zip(layers.keys(), gen_features)}

################ V2 ####################
noise3D = np.expand_dims(noise, axis=0)
noise4D = np.expand_dims(noise3D, axis=0)
# outputs = lasagne.layers.get_output(layers.values(), noise)#input_im_theano)
gen_features = {k: theano.shared(output.eval({input_im_theano: noise4D}))
                  for k, output in zip(layers.keys(), outputs)}
# print outputs[0].eval()
# print gen_features['conv1'].eval()

In [33]:
# Define loss function
losses = []

# content loss
losses.append(0.001 * content_loss(content_features, gen_features, 'conv1'))
losses.append(0.001 * content_loss(content_features, gen_features, 'conv2'))
losses.append(0.001 * content_loss(content_features, gen_features, 'conv3'))
losses.append(0.001 * content_loss(content_features, gen_features, 'conv4'))
losses.append(0.001 * content_loss(content_features, gen_features, 'conv5'))


# style loss
losses.append(0.2e6 * style_loss(style_features, gen_features, 'conv1'))
losses.append(0.2e6 * style_loss(style_features, gen_features, 'conv2'))
losses.append(0.2e6 * style_loss(style_features, gen_features, 'conv3'))
losses.append(0.2e6 * style_loss(style_features, gen_features, 'conv4'))
losses.append(0.2e6 * style_loss(style_features, gen_features, 'conv5'))

# total variation penalty
losses.append(0.1e-7 * total_variation_loss(generated_image))

total_loss = sum(losses)

content loss  1313580.1606
content loss  nan
content loss  nan
content loss  nan
content loss  nan
style loss  0.000950107335349
style loss  nan
style loss  nan
style loss  nan
style loss  nan


In [34]:
grad = T.grad(total_loss, generated_image)

# Theano functions to evaluate loss and gradient
f_loss = theano.function([], total_loss)
f_grad = theano.function([], grad)

In [35]:
# Helper functions to interface with scipy.optimize
def eval_loss(x0):
    x0 = floatX(x0.reshape((1, 1, 257, AUDIO_LENGTH)))
    generated_image.set_value(x0)
    return f_loss().astype('float64')

def eval_grad(x0):
    x0 = floatX(x0.reshape((1, 1, 257, AUDIO_LENGTH)))
    generated_image.set_value(x0)
    return np.array(f_grad()).flatten().astype('float64')

In [36]:
# x0 = generated_image.get_value().astype('float64')
# x0 = librosa.stft(x0[0][0], n_fft=N_FFT, hop_length=N_FFT/2)
src = librosa.stft(float(amplitude)*np.random.uniform(-1, 1, (661504)), n_fft=N_FFT, hop_length=N_FFT/2)
srcShort = shrink(src, AUDIO_LENGTH)
magnitude, phase = librosa.magphase(srcShort)
#x0 = librosa.stft(float(amplitude)*np.random.uniform(-1, 1, (661504)), n_fft=N_FFT, hop_length=N_FFT/2)
x0 = magnitude
xs = []
xs.append(x0)

print eval_loss(x0.flatten())

print x0.shape

soun = librosa.istft(popShort, hop_length=N_FFT/2)
Audio(soun, rate=22050)

nan
(257, 800)


In [37]:
def inspect_inputs(i, node, fn):
    print 'inspect input'
    print(i, node, "input(s) value(s):", [input[0] for input in fn.inputs])

def inspect_outputs(i, node, fn):
    print 'inspect output'
    print(" output(s) value(s):", [output[0] for output in fn.outputs])

content_loss(content_features, gen_features, 'conv1')
test = theano.function([], style_loss(content_features,gen_features,'conv2'))
#                        , mode=theano.compile.MonitorMode(
#                         pre_func=inspect_inputs,
#                         post_func=inspect_outputs))
test()
# f_loss()

content loss  1313580.1606
style loss  nan


array(nan)

In [38]:
# Optimize, saving the result periodically
res = []
for i in range(8):
    print(i)
    x, mini, d = scipy.optimize.fmin_l_bfgs_b(eval_loss, x0.flatten(), fprime=eval_grad, maxfun=40)
    print x.shape
    print mini
    res.append(x)
    src = librosa.stft(float(amplitude)*np.random.uniform(-1, 1, (661504)), n_fft=N_FFT, hop_length=N_FFT/2)
    srcShort = shrink(src, AUDIO_LENGTH)
    magnitude, phase = librosa.magphase(srcShort)
    x0 = magnitude
    xs.append(x0)

0


KeyboardInterrupt: 

In [None]:
song = res[0].reshape(257, AUDIO_LENGTH)
song = librosa.istft(song, hop_length=N_FFT/2)
Audio(song, rate=22050)
librosa.output.write_wav('output1.wav', song,22050, norm=True)

In [None]:
song = res[1].reshape(257, AUDIO_LENGTH)
song = librosa.istft(song, hop_length=N_FFT/2)
Audio(song, rate=22050)
librosa.output.write_wav('output2.wav', song,22050, norm=True)