In [46]:
#-*- coding: utf_8 -*-
import matplotlib as plt
import cPickle
import numpy as np
import os
import sys
import librosa
from scipy.misc import imsave
import SongsDL.Auralisation.auralise as auralise
import lasagne
import pickle
import scipy
import sklearn
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
import librosa # Librosa for audio
import urllib
import seaborn # And seaborn to make it look nice
seaborn.set(style='ticks')
# and IPython.display for audio output
import IPython.display
from IPython.display import Audio
%matplotlib inline

CUDA = False
if CUDA:
    from lasagne.layers.dnn import Conv2DDNNLayer as ConvLayer
else:
    from lasagne.layers import Conv2DLayer as ConvLayer
from lasagne.utils import floatX
from lasagne.layers import InputLayer, DenseLayer, NonlinearityLayer, FlattenLayer
from lasagne.layers import Pool2DLayer as PoolLayer
from lasagne.nonlinearities import softmax

In [47]:
def load_weights():
    '''Read the pre-trained Keras weight file and return its layer parameters.

    Returns:
        (W, layer_names) where ``W`` is a flat list that alternates the
        ``param_0`` and ``param_1`` arrays of every non-empty ``layer_<idx>``
        group, read in index order and capped at 10 arrays, and
        ``layer_names`` is the fixed list of five
        'Convolution2D'/'MaxPooling2D' pairs followed by 'Flatten'.
    '''
    import h5py
    keras_filename = "SongsDL/Auralisation/vggnet5_local_keras_model_CNN_stft_11_frame_173_freq_257_folding_0_best.keras"
    max_param_arrays = 10  # stop once this many arrays have been collected
    W = []
    # Open read-only and close the handle deterministically; the slices below
    # copy the datasets into in-memory numpy arrays before the file is closed.
    with h5py.File(keras_filename, 'r') as f:
        for idx in xrange(f.attrs['nb_layers']):
            group = f['layer_%d' % idx]
            # len() works whether keys() returns a list or a view object.
            if len(group) > 0:
                W.append(group['param_0'][:,:,:,:])
                W.append(group['param_1'][:])
            if len(W) == max_param_arrays:
                break
    layer_names = ['Convolution2D', 'MaxPooling2D'] * 5 + ['Flatten']

    return W, layer_names

In [75]:
yPop, srPop = librosa.load('SongsDL/songsAndCovers/SmoothCriminal/MichaelJacksonShort.wav')
yReggae, srReggae = librosa.load('SongsDL/songsAndCovers/SmoothCriminal/AlienAntFarmShort.wav')
path_SRC = 'storage/'
popSong = 'pop'
reggaeSong = 'reggae'
N_FFT = 512
print srPop, srReggae
SAMPLE_RATE = 22050
POP, POPphase = librosa.core.magphase(librosa.stft(yPop, n_fft=N_FFT, hop_length=N_FFT/2))
REGGAE, REGGAEphase = librosa.core.magphase(librosa.stft(yReggae, n_fft=N_FFT, hop_length=N_FFT/2))
REGGAE.shape
POPphase = np.angle(POPphase)
REGGAEphase = np.angle(REGGAEphase)

22050 22050


In [76]:
# Play the loaded clip at the sample rate librosa reported for it.
Audio(yPop, rate=srPop)

In [77]:
# Resynthesise both clips from their magnitude spectrograms only (the phase
# is not used here). Each inverse STFT is computed once and reused.
pop_magnitude_audio = librosa.istft(POP, hop_length=N_FFT // 2)
reggae_magnitude_audio = librosa.istft(REGGAE, hop_length=N_FFT // 2)

librosa.output.write_wav('outputs/SmoothCriminalMichaelJackson.wav', pop_magnitude_audio, SAMPLE_RATE, norm=True)
# NOTE(review): the output name says DavidGarrett, but REGGAE was computed
# from AlienAntFarmShort.wav -- confirm which name is intended.
librosa.output.write_wav('outputs/SmoothCriminalDavidGarrett.wav', reggae_magnitude_audio, SAMPLE_RATE, norm=True)
Audio(reggae_magnitude_audio, rate=SAMPLE_RATE)

In [78]:
# Names of the .npy files written by shrink(); filled in as arrays are saved.
filenames_SRC = []

# Learned parameter arrays and the matching list of layer-type names.
W, layer_names = load_weights()
# NOTE(review): W holds two arrays per layer (param_0 and param_1), so this
# counts parameter arrays rather than convolution layers -- confirm intent.
num_conv_layer = len(W)

def shrink(SRC, length, name=None):
    '''Crop or zero-pad a 2-D array along axis 1 to exactly `length` columns.

    Args:
        SRC: 2-D array; only its second axis is resized.
        length: target number of columns.
        name: optional base name. When truthy, the result is saved to
            ``path_SRC + name + '.npy'`` and ``name + '.npy'`` is appended to
            the module-level ``filenames_SRC`` list.

    Returns:
        SRC itself when it already has `length` columns, a slice of it when
        it is longer, or a new zero-padded float array when it is shorter.
    '''
    n_rows, n_cols = SRC.shape[0], SRC.shape[1]
    if n_cols > length:
        SRC = SRC[:, :length]
    elif n_cols < length:
        # Size the padding buffer from the input rather than assuming 257 rows.
        padded = np.zeros((n_rows, length))
        padded[:, :n_cols] = SRC
        SRC = padded
    if name:
        np.save(path_SRC + name + '.npy', SRC)
        filenames_SRC.append(name + '.npy')
    return SRC

# Number of STFT frames every spectrogram is cropped or padded to.
AUDIO_LENGTH = 400 #1652

# Magnitudes are also saved to disk under their song names ...
popShort = shrink(POP, AUDIO_LENGTH, name=popSong)
reggaeShort = shrink(REGGAE, AUDIO_LENGTH, name=reggaeSong)

# ... while the phases are only resized in memory.
popPhaseShort = shrink(POPphase, AUDIO_LENGTH)
reggaePhaseShort = shrink(REGGAEphase, AUDIO_LENGTH)

In [79]:
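# NOTE(review): this header describes VGG-19, but the network built below is
# much smaller: five 3x3 convolution layers with 64 filters each, every one
# followed by 2x2 average pooling, then a flatten layer.
# VGG-19, 19-layer model from the paper:
# "Very Deep Convolutional Networks for Large-Scale Image Recognition"
# Original source: https://gist.github.com/ksimonyan/3785162f95cd2d5fee77
# License: non-commercial use only
# Note: tweaked to use average pooling instead of max pooling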
def build_model(weights=None):
    '''Build the five-stage convolutional network as a dict of Lasagne layers.

    Each stage is a convolution with 64 filters of size 3 followed by
    size-2 pooling in 'average_exc_pad' mode; the last pooling layer is then
    flattened to 2 dimensions.

    Args:
        weights: accepted for backward compatibility and ignored. The
            default used to be ``load_weights()[0]``, which read the weight
            file when the function was defined even though the value was
            never used. Parameters must be loaded into the layers by the
            caller.

    Returns:
        dict mapping 'input', 'conv1'..'conv5', 'pool1'..'pool5' and
        'flatten' to the corresponding layer objects.
    '''
    net = {}
    net['input'] = InputLayer((1, 1, 257, AUDIO_LENGTH))

    # Stages are created in order so parameters are initialised in the same
    # sequence as when each stage was written out by hand.
    previous = net['input']
    for stage in range(1, 6):
        conv_name = 'conv%d' % stage
        pool_name = 'pool%d' % stage
        net[conv_name] = ConvLayer(previous, 64, 3)
        net[pool_name] = PoolLayer(net[conv_name], 2, mode='average_exc_pad')
        previous = net[pool_name]

    net['flatten'] = FlattenLayer(net['pool5'], 2)
    return net

# Read the learned parameters, build the network, then copy the parameters
# into every layer up to and including conv5.
W, layer_names = load_weights()
net = build_model()
lasagne.layers.set_all_param_values(net['conv5'], W)

In [80]:
def gram_matrix(x):
    '''Return the tensor product of `x` with itself over its flattened trailing axis.

    `x` is first collapsed to 3 dimensions; axis 2 of that result is then
    contracted against itself with ``T.tensordot``.
    '''
    flat = x.flatten(ndim=3)
    return T.tensordot(flat, flat, axes=([2], [2]))

def content_loss(P, X, layer):  
    p = P[layer]
    x = X[layer]
    loss = 1./2 * ((x - p)**2).sum()
    print 'content loss ', loss.eval()
    return loss

def style_loss(A, X, layer):
    a = A[layer]
    x = X[layer]
    A = gram_matrix(a)
    G = gram_matrix(x)
    N = a.shape[1]
    M = a.shape[2] * a.shape[3]
    loss = 1./(4 * N**2 * M**2) * ((G - A)**2).sum()
    print 'style loss ', loss.eval()
    return loss

def total_variation_loss(x):
    '''Total-variation style smoothness penalty on a 4-D array.

    For every position except the last row and column of the two trailing
    axes, the squared differences to the next element along axis 2 and along
    axis 3 are added, raised to the power 1.25, and summed over everything.
    '''
    anchor = x[:, :, :-1, :-1]
    step_axis2 = anchor - x[:, :, 1:, :-1]
    step_axis3 = anchor - x[:, :, :-1, 1:]
    return ((step_axis2**2 + step_axis3**2)**1.25).sum()

In [81]:
# Layers whose activations are compared by the content and style losses.
layers = {k: net[k] for k in ['conv1', 'conv2', 'conv3', 'conv4', 'conv5']}

# Symbolic outputs of those layers for an arbitrary 4-D input.
input_im_theano = T.tensor4()
outputs = lasagne.layers.get_output(layers.values(), input_im_theano)


def _layer_activations(spectrogram4D):
    '''Evaluate every selected layer on one input; wrap each result in a shared variable.'''
    return {k: theano.shared(output.eval({input_im_theano: spectrogram4D}))
            for k, output in zip(layers.keys(), outputs)}


# Add two leading singleton axes to each (257, AUDIO_LENGTH) spectrogram.
popShort3D = popShort[np.newaxis]
popShort4D = popShort3D[np.newaxis]
popPhaseShort3D = popPhaseShort[np.newaxis]
popPhaseShort4D = popPhaseShort3D[np.newaxis]

reggaeShort3D = reggaeShort[np.newaxis]
reggaeShort4D = reggaeShort3D[np.newaxis]
reggaePhaseShort3D = reggaePhaseShort[np.newaxis]
reggaePhaseShort4D = reggaePhaseShort3D[np.newaxis]

# Precompute layer activations for the content (pop) and style (reggae)
# spectrograms, separately for magnitude and phase.
content_features = _layer_activations(popShort4D)
content_phase_features = _layer_activations(popPhaseShort4D)

style_features = _layer_activations(reggaeShort4D)
style_phase_features = _layer_activations(reggaePhaseShort4D)

# Random starting points for the optimisation, drawn uniformly from
# [-amplitude, amplitude]; the magnitude image is drawn before the phase image.
amplitude = 3.0
generated_image = theano.shared(floatX(np.random.uniform(-amplitude, amplitude, (1, 1, 257, AUDIO_LENGTH))))
x0 = generated_image.get_value()
gen_features = dict(zip(layers.keys(),
                        lasagne.layers.get_output(layers.values(), generated_image)))

generated_phase_image = theano.shared(floatX(np.random.uniform(-amplitude, amplitude, (1, 1, 257, AUDIO_LENGTH))))
x0_phase = generated_phase_image.get_value()
gen_phase_features = dict(zip(layers.keys(),
                              lasagne.layers.get_output(layers.values(), generated_phase_image)))

In [82]:
# Define the loss for the magnitude image as a weighted sum of terms.
losses = []

# Content terms: conv1-conv3 only (conv4 and conv5 are not used for content).
for _layer in ('conv1', 'conv2', 'conv3'):
    losses.append(0.001 * content_loss(content_features, gen_features, _layer))

# Style terms: conv3-conv5 only (conv1 and conv2 are not used for style).
for _layer in ('conv3', 'conv4', 'conv5'):
    losses.append(2e5 * style_loss(style_features, gen_features, _layer))

# Total variation penalty on the generated image itself.
losses.append(0.1e-7 * total_variation_loss(generated_image))

total_loss = sum(losses)
grad = T.grad(total_loss, [generated_image])

# Compiled Theano functions that evaluate the loss and its gradient at the
# current value of generated_image.
f_loss = theano.function([], total_loss)
f_grad = theano.function([], grad)

content loss  632082.534942
content loss  29793.8218428
content loss  2185.51320928
style loss  4.47497541852e-06
style loss  1.50230495564e-05
style loss  6.02518409089e-05


In [83]:
# Define the loss for the phase image as a weighted sum of terms.
losses_phase = []

# Content terms: conv1-conv3 only (conv4 and conv5 are not used for content).
for _layer in ('conv1', 'conv2', 'conv3'):
    losses_phase.append(0.001 * content_loss(content_phase_features, gen_phase_features, _layer))

# Style terms: conv3-conv5 only (conv1 and conv2 are not used for style).
for _layer in ('conv3', 'conv4', 'conv5'):
    losses_phase.append(2e5 * style_loss(style_phase_features, gen_phase_features, _layer))

# Total variation penalty on the generated phase image itself.
losses_phase.append(0.1e-7 * total_variation_loss(generated_phase_image))

total_phase_loss = sum(losses_phase)
grad_phase = T.grad(total_phase_loss, [generated_phase_image])

# Compiled Theano functions that evaluate the phase loss and its gradient at
# the current value of generated_phase_image.
f_phase_loss = theano.function([], total_phase_loss)
f_phase_grad = theano.function([], grad_phase)

content loss  878066.67907
content loss  16803.3920737
content loss  812.454400173
style loss  6.20836304532e-07
style loss  1.35736923533e-06
style loss  2.14192346519e-06


In [84]:
# Helper functions to interface with scipy.optimize
def eval_loss(x0):
    '''Store `x0` as the generated magnitude image and return the total loss as float64.

    `x0` is reshaped to (1, 1, 257, AUDIO_LENGTH) and converted with floatX
    before being written to the shared variable.
    '''
    image = x0.reshape((1, 1, 257, AUDIO_LENGTH))
    generated_image.set_value(floatX(image))
    loss_value = f_loss()
    return loss_value.astype('float64')

def eval_grad(x0):
    '''Store `x0` as the generated magnitude image and return the flat float64 gradient.

    `x0` is reshaped to (1, 1, 257, AUDIO_LENGTH) and converted with floatX
    before being written to the shared variable.
    '''
    image = x0.reshape((1, 1, 257, AUDIO_LENGTH))
    generated_image.set_value(floatX(image))
    gradient = np.array(f_grad())
    return gradient.flatten().astype('float64')

def eval_phase_loss(x0):
    '''Store `x0` as the generated phase image and return the phase loss as float64.

    `x0` is reshaped to (1, 1, 257, AUDIO_LENGTH) and converted with floatX
    before being written to the shared variable.
    '''
    image = x0.reshape((1, 1, 257, AUDIO_LENGTH))
    generated_phase_image.set_value(floatX(image))
    loss_value = f_phase_loss()
    return loss_value.astype('float64')

def eval_phase_grad(x0):
    '''Store `x0` as the generated phase image and return the flat float64 gradient.

    `x0` is reshaped to (1, 1, 257, AUDIO_LENGTH) and converted with floatX
    before being written to the shared variable.
    '''
    image = x0.reshape((1, 1, 257, AUDIO_LENGTH))
    generated_phase_image.set_value(floatX(image))
    gradient = np.array(f_phase_grad())
    return gradient.flatten().astype('float64')

In [85]:
# Loss at the random starting image, before any optimisation step.
error = eval_loss(x0.flatten())
print 'error', error

# Listen to the starting point: inverse STFT of the single 2-D slice of x0.
soun = librosa.istft(x0[0, 0], hop_length=N_FFT // 2)
Audio(soun, rate=22050)

error 680.037380705


In [86]:
xs = []
xs.append(x0)
xs_phase = []
xs_phase.append(x0_phase)
# Optimize, saving the result periodically
for i in range(1):
    x, mini, d = scipy.optimize.fmin_l_bfgs_b(eval_loss, x0.flatten(), fprime=eval_grad, maxfun=40)
    print i, mini
    x, mini, d = scipy.optimize.fmin_l_bfgs_b(eval_phase_loss, x0_phase.flatten(), fprime=eval_phase_grad, maxfun=40)
    print i, mini
    x0 = generated_image.get_value().astype('float64')
    xs.append(x0)
    x0_phase = generated_phase_image.get_value().astype('float64')
    xs_phase.append(x0_phase)

0 0.408504750696
0 0.0350447228793


In [91]:
mag = xs[1].reshape(257, AUDIO_LENGTH)
phase = xs_phase[1].reshape(257,AUDIO_LENGTH)
print np.mean(phase)
song = mag * np.exp(1j*phase)
song = librosa.istft(song, hop_length=N_FFT/2)
# librosa.output.write_wav('outputs/MAGANDPHASE.wav', song,22050, norm=True)
Audio(song, rate=22050)

0.0188458788663


In [87]:
# Invert the optimised magnitude snapshot on its own (no phase applied).
magnitude_only = xs[1].reshape((257, AUDIO_LENGTH))
song = librosa.istft(magnitude_only, hop_length=N_FFT // 2)
# librosa.output.write_wav('outputs/hiphop-style4.wav', song,22050, norm=True)
Audio(song, rate=22050)

In [90]:
# Number of stored snapshots for the magnitude and phase images; each list
# holds the initial point plus one entry per optimisation round.
print len(xs), len(xs_phase)

 2 2


In [33]:
# Write every magnitude snapshot to its own WAV file, numbered by position.
for i, snapshot in enumerate(xs):
    spectrogram = snapshot.reshape((257, AUDIO_LENGTH))
    song = librosa.istft(spectrogram, hop_length=N_FFT // 2)
    out_path = 'outputs/styleReconstruction/hiphopstyle-' + str(i) + '.wav'
    librosa.output.write_wav(out_path, song, 22050, norm=True)