#'CROSS-SYNTHESIS' VIA AUTOCODER


Open this notebook in [Google Colab](https://colab.research.google.com). You can do this by using the Chrome plugin [Open in Colab](https://chrome.google.com/webstore/detail/open-in-colab/iogfkhleblhcpcekbiedikdehleodpjo),<br>
or by downloading it from GitHub and uploading it to Colab.<br><br>
Make sure to set the runtime type to GPU (Runtime->Change Runtime Type) or training will take<br>an eternity.<br><br>
You can run each code section by pressing the play button which shows up when you hover over the<br> two brackets with/without a number on the top left of the codebox.
<br>

#1. UPLOAD TWO SOUNDFILES
Click on the folder icon on the left to expand the file browser and drag and drop a sound-file there.<br>Then edit the filename below to match the uploaded file. Make sure that the file finishes uploading<br> before continuing. The first one is the modulator and the second the carrier.

In [None]:
# Modulator: its mel frames are encoded and remapped, its phase is reused for
# resynthesis, and the saved model/output files are named after it.
filename_1 = "modulator.wav"
# Carrier: trained on together with the modulator; its latent range is the
# target range the modulator's latent values are mapped onto.
filename_2 = "carrier.wav"


#2. SELECT ONE OF THE FOLLOWING TO SET THE TRAINING PARAMETERS
Some of these presets may not produce usable models with specific datasets,
so some patience <br>and experimentation may be needed. More training is not always better, and some datasets may<br> need different (lower) values for regression patience to avoid artifacts.<br>

In [None]:
# LOW QUALITY - TRAINING TIME ROUGHLY .8x DURATION – GOOD ENOUGH FOR EXPLORATION
# batch_size          -> batch size handed to vae.fit()
# regression_patience -> EarlyStopping patience (epochs without improvement)
# learning_rate       -> Adam optimizer learning rate
# min_delta           -> smallest loss decrease EarlyStopping counts as improvement
# quality             -> label only; not read by the code in this notebook
batch_size = 1024
regression_patience = 200
learning_rate = .0001
min_delta = .00001
quality = "low"

In [None]:
# MEDIUM QUALITY – TRAINING TIME ROUGHLY 3x DURATION – GOOD ENOUGH FOR MOST THINGS
# batch_size          -> batch size handed to vae.fit()
# regression_patience -> EarlyStopping patience (epochs without improvement)
# learning_rate       -> Adam optimizer learning rate
# min_delta           -> smallest loss decrease EarlyStopping counts as improvement
# quality             -> label only; not read by the code in this notebook
batch_size = 512
regression_patience = 500
learning_rate = .0001
min_delta = .00001
quality = "medium"

In [None]:
# HIGH QUALITY – TRAINING TIME ROUGHLY 5x DURATION
# batch_size          -> batch size handed to vae.fit()
# regression_patience -> EarlyStopping patience (epochs without improvement)
# learning_rate       -> Adam optimizer learning rate
# min_delta           -> smallest loss decrease EarlyStopping counts as improvement
# quality             -> label only; not read by the code in this notebook
batch_size = 256
regression_patience = 1000 
learning_rate = .0001
min_delta = .00001
quality = "high"

In [None]:
# EXTREME QUALITY – TRAINING TAKES A VERY LONG TIME
# batch_size          -> batch size handed to vae.fit()
# regression_patience -> EarlyStopping patience; this value exceeds the
#                        50000-epoch cap used in train(), so early stopping
#                        cannot trigger and training runs for all epochs
# learning_rate       -> Adam optimizer learning rate
# min_delta           -> smallest loss decrease EarlyStopping counts as improvement
# quality             -> label only; not read by the code in this notebook
batch_size = 256
regression_patience = 10000000
learning_rate = .0001
min_delta = 0
quality = "extreme"

#3. RUN THE CODE
Depending on the training settings selected above, this can take anywhere from
.5x to 5x or more<br> of the duration of the input file so be patient.

In [None]:

!pip install python_speech_features
import librosa
import numpy as np
import scipy
from scipy.signal import hann
from python_speech_features.base import get_filterbanks
import tensorflow as tf
import tensorflow.keras.backend as K
import math
import time

# ANALYSIS SETTINGS
# fftsize:    number of samples in each analysis frame (FFT length).
# windowskip: hop, in samples, between the starts of successive frames.
# NOTE: analyze_data() hard-codes 16 as the number of hops per frame, which
# matches fftsize / windowskip for the values below.
fftsize = 16384
windowskip = 1024

# MODEL STRUCTURE
# input_dim:        number of mel bands per frame (encoder input width).
# intermediate_dim: width of the hidden Dense layer in encoder and decoder.
# encoded_dim:      size of the latent vector.
# output_fft_size:  FFT size used to build the mel inversion filter that the
#                   'decoder' model applies to its output.
input_dim = 512
intermediate_dim = 1000
encoded_dim = 8
output_fft_size = 16384

def create_mel_filter(fft_size, n_freq_components = 64, start_freq = 300, end_freq = 8000, samplerate = 44100):
    """Build a mel filterbank pair for analysis and inversion.

    The filterbank comes from python_speech_features' ``get_filterbanks`` and
    is truncated to its first ``fft_size / 2`` frequency bins.

    Returns:
        (mel_filter, mel_inversion_filter) where ``mel_inversion_filter`` is
        the truncated filterbank and ``mel_filter`` is its transpose with
        every filter divided by that filter's sum. A filter whose sum is zero
        produces NaN/inf entries here; callers are expected to clean those up.
    """
    half_bins = int(fft_size / 2)
    banks = get_filterbanks(nfilt=n_freq_components,
                            nfft=fft_size,
                            samplerate=samplerate,
                            lowfreq=start_freq,
                            highfreq=end_freq)

    # Keep only the first half_bins columns (drops the trailing bin(s)).
    mel_inversion_filter = np.ascontiguousarray(banks[:, 0:half_bins])

    # Normalise each filter by its total weight; broadcasting runs over the
    # transposed (bins x filters) layout.
    band_sums = mel_inversion_filter.sum(axis=1)
    mel_filter = np.ascontiguousarray(mel_inversion_filter.T / band_sums)

    return mel_filter, mel_inversion_filter


def initialize(size, melsize):
    """Create the analysis window and the mel filters for one FFT size.

    Args:
        size: FFT length in samples.
        melsize: number of mel bands.

    Returns:
        (mel_filter, mel_inversion_filter, window). ``window`` is a Hann
        window stored as a (1, size) array. Both filters span 0-22050 Hz at a
        44100 Hz sample rate and have their NaNs replaced by 0.0 in place
        (np.nan_to_num also clamps infinities to large finite values).
    """
    # Row-vector window so it broadcasts against (1, size) frames.
    window = np.ascontiguousarray(hann(size)[np.newaxis, :])

    mel_filter, mel_inversion_filter = create_mel_filter(size, melsize, 0, 22050, 44100)
    for filt in (mel_filter, mel_inversion_filter):
        # copy=False: clean the arrays in place.
        np.nan_to_num(filt, copy=False, nan=0.0)

    return (mel_filter, mel_inversion_filter, window)


def convertToBin(data):
    """Return the magnitude sqrt(re^2 + im^2) of complex ``data``, element-wise."""
    re_sq = np.square(data.real)
    im_sq = np.square(data.imag)
    return np.sqrt(re_sq + im_sq)


def spectrogram_to_mel(spectrogram, filter):
    """Project a magnitude spectrum onto mel bands.

    Computes ``filter^T . spectrogram^T``; for a 1-D spectrum of length
    n_bins and a (n_bins, n_mels) filter the result has length n_mels.
    """
    return np.dot(np.transpose(filter), np.transpose(spectrogram))


def get_aminmax(X):
    """Return ``(minimum, maximum)`` taken over every element of ``X``."""
    lowest = np.min(X)
    highest = np.max(X)
    return (lowest, highest)


def scale_array_by_amax(X):
    """Rescale ``X`` linearly so its minimum maps to 0 and its maximum to 1.

    A constant array has zero range and therefore divides by zero.
    """
    lowest = np.amin(X)
    span = np.amax(X) - lowest
    return (X - lowest) / span

def scale_array(X, minin, maxin):
    """Map ``X`` linearly so that ``minin`` becomes 0 and ``maxin`` becomes 1."""
    span = maxin - minin
    shifted = X - minin
    return shifted / span

def analyze(data, window, mel_filter):
    """Analyse one windowed frame.

    Args:
        data: frame of samples, indexed as (1, n) by this function.
        window: window multiplied element-wise with ``data``.
        mel_filter: filter passed on to spectrogram_to_mel().

    Returns:
        (mel_frame, magnitudes, phases); magnitudes and phases cover the first
        n/2 bins of the real FFT (the last rfft bin is dropped).
    """
    windowed = np.multiply(data, window)
    half = int(windowed.shape[1] / 2)

    spectrum = scipy.fft.rfft(windowed)
    magnitudes = np.ascontiguousarray(convertToBin(spectrum))[0, 0:half]
    phases = np.angle(spectrum)[0, 0:half]

    mel_frame = spectrogram_to_mel(magnitudes, mel_filter)
    return (mel_frame, magnitudes, phases)


def analyze_data(data, filename, fftsize, windowskip, melsize, window, mel_filter):
    """Slice ``data`` into overlapping frames and analyse each one.

    Args:
        data: 1-D array of audio samples.
        filename: unused; kept for interface compatibility.
        fftsize: samples per analysis frame.
        windowskip: hop between frame starts, in samples.
        melsize: number of mel bands per frame.
        window: analysis window, forwarded to analyze().
        mel_filter: mel filter, forwarded to analyze().

    Returns:
        (minin, maxin, output, phase_output): the global min and max of the
        mel frames, the mel frames themselves (n_frames + 1, melsize) and the
        per-frame phases (n_frames + 1, fftsize / 2). The final row of both
        arrays is never written and stays zero; that extra row is kept so the
        frame count seen by downstream code does not change.

    Raises:
        ValueError: if ``data`` is too short to yield any output row.
    """
    # Hops per frame, rounded up. This replaces the literal 16, which was
    # only correct for fftsize / windowskip == 16 (16384 / 1024); a larger
    # ratio made the frame slice run past the end of ``data``.
    hops_per_frame = -(-fftsize // windowskip)

    n_slizes = round(len(data) / windowskip)
    n_frames = n_slizes - hops_per_frame
    if n_frames < 0:
        raise ValueError(
            "input of %d samples is too short for fftsize=%d, windowskip=%d"
            % (len(data), fftsize, windowskip))

    output = np.ascontiguousarray(np.zeros((n_frames + 1, melsize)))
    phase_output = np.ascontiguousarray(np.zeros((n_frames + 1, int(fftsize / 2))))

    # Reusable (1, fftsize) buffer for the current frame.
    in_slize = np.ascontiguousarray(np.zeros((1, fftsize)))
    for i in range(n_frames):
        start = i * windowskip
        in_slize[0] = data[start:start + fftsize]
        # The magnitude spectrum is not returned by this function, so it is
        # discarded instead of being stored in a full-size matrix.
        output[i, :], _, phase_output[i, :] = analyze(in_slize, window, mel_filter)

    # Explicit keywords: the old call passed 0. positionally, which numpy
    # reads as ``copy`` rather than ``nan``.
    output = np.nan_to_num(output, copy=False, nan=0.0)
    minin, maxin = get_aminmax(output)
    return (minin, maxin, output, phase_output)


def sampling(args):
    """Produce the latent vector ``z`` from the encoder's mean and log-variance.

    Has the form of the VAE reparameterization trick, but the noise term is
    the fixed constant 1e-06 rather than a random draw, so the result is
    deterministic and almost equal to ``z_mean``.

    # Arguments
        args (tensor): mean and log of variance of Q(z|X)
    # Returns
        z (tensor): z_mean + exp(0.5 * z_log_var) * 1e-06
    """
    mean, log_var = args
    noise_scale = 1e-06
    std_dev = K.exp(0.5 * log_var)
    return mean + std_dev * noise_scale


def init_autoencoder_shallow(input_dim, intermediate_dim, encoded_dim, learning_rate):
    """Build and compile the single-hidden-layer VAE-style autoencoder.

    Args:
        input_dim: width of the input (mel bands per frame).
        intermediate_dim: width of the hidden Dense layer (encoder and decoder).
        encoded_dim: size of the latent vector.
        learning_rate: learning rate for the Adam optimizer.

    Returns:
        (vae, encoder, decoder, training_decoder):
        - encoder maps inputs to [z_mean, z_log_var, z];
        - training_decoder maps a latent vector back to ``input_dim`` values
          through a sigmoid;
        - decoder shares those layers and additionally applies the mel
          inversion filter built for the module-level ``output_fft_size``;
        - vae chains encoder and both decoders and carries the loss
          (binary cross-entropy * input_dim + KL term), compiled with Adam.
    """

    input_shape = (input_dim, )
    latent_dim = encoded_dim

    print("input_shape: ", input_shape)

    # VAE model = encoder + decoder
    # build encoder model
    inputs = tf.keras.Input(shape=input_shape, name='encoder_input')
    x = tf.keras.layers.Dense(intermediate_dim, activation='relu', activity_regularizer=tf.keras.regularizers.l1(10e-5), kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-5, l2=1e-5))(inputs)

    z_mean = tf.keras.layers.Dense(latent_dim, name='z_mean')(x)
    z_log_var = tf.keras.layers.Dense(latent_dim, name='z_log_var')(x)

    # use reparameterization trick to push the sampling out as input
    # note that "output_shape" isn't necessary with the TensorFlow backend
    z = tf.keras.layers.Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

    # instantiate encoder model
    encoder = tf.keras.Model(inputs, [z_mean, z_log_var, z], name='encoder')
    encoder.summary()

    # build decoder model
    latent_inputs = tf.keras.Input(shape=(latent_dim,), name='z_sampling')
    x = tf.keras.layers.Dense(intermediate_dim, activation='relu')(latent_inputs)
    outputs = tf.keras.layers.Dense(input_dim, activation='sigmoid')(x)

    # instantiate decoder models: 'decoder' appends the mel inversion filter
    # to the sigmoid output, 'training_decoder' stops at the sigmoid output
    _,mel_inversion_filter = create_mel_filter(output_fft_size, input_dim, 0, 22050, 44100)
    mel = K.expand_dims(tf.constant(mel_inversion_filter), 0)
    transformed_outputs = tf.keras.layers.Dot(axes=(1,1)) ([outputs, mel])
    decoder = tf.keras.Model(latent_inputs, transformed_outputs, name='decoder')
    training_decoder = tf.keras.Model(latent_inputs, outputs, name='training_decoder')
    decoder.summary()
    training_decoder.summary()
    # instantiate VAE model; index 2 of the encoder outputs is z
    training_outputs = training_decoder(encoder(inputs)[2])
    outputs = decoder(encoder(inputs)[2])


    vae = tf.keras.Model(inputs, [training_outputs,outputs], name='vae_mlp')

    # Only the training decoder's output enters the reconstruction loss.
    reconstruction_loss = tf.keras.losses.binary_crossentropy(inputs, training_outputs)

    reconstruction_loss *= input_dim
    kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
    kl_loss = K.sum(kl_loss, axis=-1)
    kl_loss *= -0.5
    vae_loss = K.mean(reconstruction_loss + kl_loss)
    vae.add_loss(vae_loss)
    # 'learning_rate' is the supported keyword; the former 'lr' alias is
    # deprecated and triggers a warning (or an error on newer Keras).
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    vae.compile(optimizer=opt)
    return(vae, encoder, decoder, training_decoder)


def get_minmax(encoder, input):
    """Measure the per-dimension range of the encoder's first output.

    Runs ``encoder.predict(input)``, takes the first returned output (z_mean
    for the encoder built in this notebook) as float32, and reduces over
    axis 0.

    Returns:
        (scale_mult, scale_subtract): the per-dimension range (max - min) and
        the per-dimension minimum.
    """
    latent = np.asarray(encoder.predict(input)[0], dtype=np.float32)
    lowest = latent.min(axis=0)
    highest = latent.max(axis=0)
    return (np.subtract(highest, lowest), lowest)

# Holds the Keras History object of the most recent train() call.
history = 0

def train(filename, vae, encoder, decoder, training_decoder, input, min_delta, regression_patience = 1, batch_size = 4096, deep = 0):
    """Fit the VAE, then save its weights and TFLite exports of its parts.

    Training runs for at most 50000 epochs and stops early once the training
    loss has not improved by ``min_delta`` for ``regression_patience`` epochs.
    The fit history is stored in the module-level ``history``.

    Files written (all prefixed with ``filename``):
        .h5       weights of ``vae``
        .enc      TFLite encoder
        .fft.dec  TFLite ``decoder``
        .dec      TFLite ``training_decoder``

    ``deep`` is accepted but not used.

    Returns:
        (vae, encoder, decoder, training_decoder, scale_mult, scale_subtract),
        the last two coming from get_minmax(encoder, input).
    """
    global history
    tf.executing_eagerly()  # return value is not used

    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='loss', mode='min', min_delta=min_delta, patience=regression_patience)
    history = vae.fit(input, batch_size=batch_size, epochs=50000, verbose=0,
                      callbacks=[early_stop])

    vae.save_weights(filename + ".h5")
    scale_mult, scale_subtract = get_minmax(encoder, input)

    # Create every converter, run every conversion, and only then write the
    # files: the same order as three hand-written sequences.
    exports = (('.enc', encoder), ('.fft.dec', decoder), ('.dec', training_decoder))
    converters = [tf.lite.TFLiteConverter.from_keras_model(model) for _, model in exports]
    flatbuffers = [converter.convert() for converter in converters]

    # Save the models
    for (suffix, _), blob in zip(exports, flatbuffers):
        with open(filename + suffix, 'wb') as f:
            f.write(blob)

    return (vae, encoder, decoder, training_decoder, scale_mult, scale_subtract)


def write_mm(filename, minin, maxin, scale_mult, scale_subtract, input_dim, intermediate_dim, encoded_dim, deep):
    """Write scaling and model metadata to ``filename + ".mm"`` as text.

    The file is a 3 x 8 comma-separated table with six decimals per value:
        row 0: scale_mult
        row 1: scale_subtract
        row 2: minin, maxin, input_dim, intermediate_dim, encoded_dim, deep,
               followed by two zeros

    NOTE(review): the width of 8 is hard-coded, so scale_mult and
    scale_subtract must fit 8 columns (presumably encoded_dim == 8).
    """
    table = np.zeros([3, 8])
    table[0, :] = scale_mult
    table[1, :] = scale_subtract
    header = (minin, maxin, input_dim, intermediate_dim, encoded_dim, deep)
    for column, value in enumerate(header):
        table[2, column] = value
    np.savetxt(filename + ".mm", table, delimiter = ", ", fmt="%1.6f")
  
# Load both sound files as mono at 44100 Hz (1 = modulator, 2 = carrier).
imported_data_1,_ = librosa.load(filename_1, sr = 44100, mono = True)
imported_data_2,_ = librosa.load(filename_2, sr = 44100, mono = True)
print("  ...ANALYZING 1")
print("")
print("  ####################################")
print("  #   number of samples:  ", len(imported_data_1))
print("  ####################################")
print("")

# Window and mel filters are shared by both analyses.
mel_filter, mel_inversion_filter, window = initialize(fftsize, input_dim)
minin_1, maxin_1, input_data_1, phase_data_1 = analyze_data(imported_data_1, filename_1, fftsize, windowskip, input_dim, window, mel_filter)
print("  ...ANALYZING 2")
print("")
print("  ####################################")
print("  #   number of samples:  ", len(imported_data_2))
print("  ####################################")
print("")
minin_2, maxin_2, input_data_2, phase_data_2 = analyze_data(imported_data_2, filename_2, fftsize, windowskip, input_dim, window, mel_filter)
print(np.amin(input_data_1), np.amax(input_data_1))
print(np.amin(input_data_2), np.amax(input_data_2))
print(input_data_1.shape)
# Stack the mel frames of both files into one training set.
output = np.append(input_data_1, input_data_2, axis = 0)
print(output.shape)
# NOTE: the positional 0. lands on nan_to_num's `copy` parameter; NaNs are
# still replaced by the default value 0.0.
output  = np.nan_to_num(output, 0.)
# Scale the combined set and each individual set with the SAME global
# min/max so that all three share one value range.
minin, maxin = get_aminmax(output)
input_data = scale_array(output, minin, maxin)
input_data_1 = scale_array(input_data_1, minin, maxin)
input_data_2 = scale_array(input_data_2, minin, maxin)

# Build one model and train it on the combined frames; the saved model files
# are named after the modulator file.
vae, encoder, decoder, training_decoder = init_autoencoder_shallow(input_dim, intermediate_dim, encoded_dim, learning_rate)
vae, encoder, decoder, training_decoder, scale_mult, scale_subtract = train(filename_1, vae, encoder, decoder, training_decoder, input_data, min_delta, regression_patience, batch_size, 0)

# ONCE THEY ARE TRAINED, RUN EACH MODEL THROUGH TO GET THE SCALE_MULT / SCALE_SUBTRACT FOR EACH

def get_each_min_max(encoder, input):
    """Return the per-dimension ``(min, max)`` of the encoder's first output.

    ``encoder.predict(input)[0]`` (z_mean for the encoder built in this
    notebook) is converted to float32 and reduced over axis 0.
    """
    latent = np.asarray(encoder.predict(input)[0], dtype=np.float32)
    return (latent.min(axis=0), latent.max(axis=0))


# Per-dimension latent range of each sound under the shared encoder
# (1 = modulator, 2 = carrier); used later to remap one range onto the other.
scale_min_1, scale_max_1 = get_each_min_max(encoder, input_data_1)
scale_min_2, scale_max_2 = get_each_min_max(encoder, input_data_2)


  ...ANALYZING 1

  ####################################
  #   number of samples:   395611
  ####################################

  ...ANALYZING 2

  ####################################
  #   number of samples:   3175200
  ####################################

0.0 674.3337549224135
0.0 493.52865883643676
(371, 512)
(3457, 512)
input_shape:  (512,)
Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 512)]        0           []                               
                                                                                                  
 dense_3 (Dense)                (None, 1000)         513000      ['encoder_input[0][0]']          
                                                                                                  
 z_mean (Dense)                 (None

  super(Adam, self).__init__(name, **kwargs)


# CROSS SYNTHESIZE
This is in no way an actual cross synthesis. It simply trains a model on two soundfiles and fits the latent space of the modulator onto the latent space of the carrier, using the phase of the modulator for resynthesis.

In [None]:
import scipy.io.wavfile


def apply_one_to_the_other(encoder, decoder, input, min_1, max_1, min_2, max_2):
    """Encode ``input``, remap its latent range onto another one, and decode.

    Each latent dimension is normalised from [min_1, max_1] to [0, 1] and
    then stretched to [min_2, max_2] before being passed to ``decoder``.

    Args:
        encoder: model whose ``predict`` returns a sequence; element 0 is used.
        decoder: model whose ``predict`` maps latent vectors to output frames.
        input: frames to encode.
        min_1, max_1: per-dimension latent range of the source material.
        min_2, max_2: per-dimension latent range to map onto.

    Returns:
        Whatever ``decoder.predict`` returns for the remapped latent vectors.
    """
    z_encoded = np.asarray(encoder.predict(input)[0], dtype=np.float32)

    source_span = np.subtract(max_1, min_1)
    # A dimension that never varies in the source has zero span; dividing by
    # it gave 0/0 = NaN, which then spread through the decoder. Dividing by 1
    # instead leaves such a dimension at 0, i.e. pinned to min_2.
    safe_span = np.where(source_span == 0, np.ones_like(source_span), source_span)

    normalised = np.divide(np.subtract(z_encoded, min_1), safe_span)
    remapped = np.add(np.multiply(normalised, np.subtract(max_2, min_2)), min_2)
    return decoder.predict(remapped)

# Encode the modulator frames, remap them from the modulator's latent range
# onto the carrier's, and decode them with the FFT-domain decoder.
decoded_and_applied = apply_one_to_the_other(encoder, decoder, input_data_1, scale_min_1, scale_max_1, scale_min_2, scale_max_2)

# Optional debugging plots (disabled):
#x = range(0, 512)
#import matplotlib.pyplot as plt
#plt.plot(x,input_data_1[5000,])
#plt.show()
#x = range(0, 8192)
#plt.plot(x,decoded_and_applied[5000,])
#plt.show()

# Rebuild the window; callback() reads `window`, `sout` and `se` as globals.
mel_filter, mel_inversion_filter, window = initialize(fftsize, input_dim)
# Output signal buffer: one hop (windowskip samples) per decoded frame.
output = np.zeros(decoded_and_applied.shape[0] * windowskip)
# Overlap-add accumulator covering one full FFT frame.
sout = np.zeros((fftsize), dtype=np.float32)
# Block of zeros shifted into the tail of `sout` at every hop.
se = np.zeros(windowskip, dtype=np.float32)

def callback(amp_frame, phase_frame, frame_size):
    """Resynthesise one frame and return the next ``frame_size`` samples.

    Builds a complex spectrum from ``amp_frame`` and ``phase_frame``, inverse
    transforms and windows it, and overlap-adds the result into the
    module-level accumulator ``sout``.

    Relies on the module-level ``fftsize``, ``window`` (shape (1, fftsize)),
    ``se`` (zeros shifted in at every hop; its length must equal
    ``frame_size``) and ``sout``, which is rebound on every call.

    Args:
        amp_frame: fftsize / 2 magnitudes.
        phase_frame: fftsize / 2 phases in radians.
        frame_size: hop size in samples.

    Returns:
        float32 array holding the first ``frame_size`` accumulated samples.
    """
    global sout

    half = int(fftsize / 2)

    # CONVERT BACK TO A SIGNAL
    # The spectrum is rebuilt from the supplied phase. The unused scratch
    # copy of the magnitudes and the unused phase alias were dropped, along
    # with the comment that claimed noise was used as phase. The last rfft
    # bin stays zero.
    co = np.zeros(int(fftsize / 2 + 1), dtype='complex64')
    co.real[0:half] = np.multiply(amp_frame, np.cos(phase_frame))
    co.imag[0:half] = np.multiply(amp_frame, np.sin(phase_frame))
    cs = np.multiply(np.fft.irfft(co), window)

    # Advance the accumulator by one hop and clear the new tail.
    sout[0:(fftsize - frame_size)] = sout[frame_size:fftsize]
    sout[(fftsize - frame_size):fftsize] = se
    # `cs` is (1, fftsize) because of the row-vector window; take row 0.
    sout = np.add(sout, cs)[0,]
    return(np.multiply(sout[0:frame_size].astype(np.float32), 1.))

# Resynthesise frame by frame; every call yields the next hop of samples.
# The hop passed to callback() is `windowskip` (previously the literal 1024)
# so it always matches the slice being written.
for i in range(0, decoded_and_applied.shape[0]):
    output[i * windowskip:(i + 1) * windowskip] = callback(decoded_and_applied[i,], phase_data_1[i,], windowskip)

# Peak-normalise on the absolute maximum so negative peaks are covered too,
# and skip the division for an all-zero signal (it would produce NaNs).
peak = np.amax(np.abs(output))
normalized = np.divide(output, peak) if peak > 0 else output
scipy.io.wavfile.write(filename_1 + "-resynth.wav", 44100, normalized)


<-- You can download the resynthesized sound file (named after your modulator file with "-resynth.wav" appended, e.g. modulator.wav-resynth.wav) from the file browser on the left.