<a href="https://colab.research.google.com/github/eyaler/autocoder/blob/main/colab/autocoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Autocoder: a Variational Autoencoder for Spectral Synthesis [Franzson, et al.]
### An accessible Google Colab for training and (interactive) generation!

#### Colab by [Eyal Gruss](https://eyalgruss.com) ([@eyaler](https://twitter.com/eyaler?lang=en)\)
#### With a lot of inspiration and guidance from [Nicholas Shaheed](https://nicholasshaheed.com)
#### Made at [Stochastic Labs](http://stochasticlabs.org)


"Our motivation for the Autocoder is the artistic need for a simple tool aimed at creative individuals that allows for the manipulation and exploration of any input sound –– harmonic or in-harmonic, monophonic or polyphonic –– in real-time, with minimal time wasted on training and adjustment of network parameters while emphasizing the quality of the synthesized output." [Franzson, et al.]


##### Paper: https://www.dropbox.com/s/meewllm1rc243ad/icmc_paper_autocoder_final.pdf
##### Original repo: https://github.com/franzson/autocoder 
##### Original Colab: https://colab.research.google.com/github/franzson/autocoder_training/blob/main/autoencoder_simple.ipynb
##### Original examples: https://soundcloud.com/david-brynjar-franzson/sets/autocoder-examples
##### Recommended for use in Max/MSP with Nick's plugin: https://github.com/nshaheed/autocoder-expanded
##### Shortcut to this Colab: https://bit.ly/autocod

In [None]:
#@title Upload audio file
#@markdown Enter a URL to an audio file OR a YouTube or similar site OR leave empty for manual upload (canceling the upload will use a default file)
audio = '' #@param {type:'string'}

import os
from google.colab import files

# Start from an empty upload folder on every run and work inside it, so the
# downloaded/uploaded file lands there.
upload_folder = '/content/upload'
!rm -rf "$upload_folder"
!mkdir "$upload_folder"
%cd "$upload_folder"

# NOTE(review): `cancelled` is assigned here but never read in this notebook.
cancelled = False
if not audio:
  try:
    # Manual upload; the first uploaded file name is used.
    uploaded = files.upload()
    input_filename = list(uploaded)[0]
  except Exception:
    # Any failure (including an empty/cancelled upload) falls back to this default URL.
    audio = 'https://www.youtube.com/watch?v=GrNue18uWWE'
if audio:
  # A URL whose text after the last '.' is a known audio extension is fetched
  # directly; anything else is handed to youtube-dl.
  if audio.rsplit('.')[-1] in ['mp3','wav','m4a','aac','ogg','flac','wma','aiff','opus','amr','ac3','mp4']:
    !wget "$audio" 
    input_filename = audio.rsplit('/')[-1]
  else:
    !pip install -q youtube-dl
    !youtube-dl --no-playlist --extract-audio --audio-format mp3 "$audio" -o "audio.%(ext)s"
    input_filename = 'audio.mp3'
# From here on input_filename is an absolute path inside the upload folder.
input_filename = os.path.join(upload_folder, input_filename)
print(input_filename)
%cd /content

In [None]:
#@title Train model
quality = 'MEDIUM QUALITY - TRAINING TIME ROUGHLY 3x DURATION - GOOD ENOUGH FOR MOST THINGS' #@param ['LOW QUALITY - TRAINING TIME ROUGHLY .8x DURATION - GOOD ENOUGH FOR EXPLORATION', 'MEDIUM QUALITY - TRAINING TIME ROUGHLY 3x DURATION - GOOD ENOUGH FOR MOST THINGS', 'HIGH QUALITY - TRAINING TIME ROUGHLY 5x DURATION', 'EXTREME QUALITY - TRAINING TAKES A VERY LONG TIME']
start_secs = 0 #@param {type:'number'}
duration_from_start_secs = 0 #@param {type:'number'}
#@markdown Note: setting duration_from_start_secs=0 will take the full remaining duration 
# A falsy duration (0) is replaced by None before it is handed to librosa.load.
if not duration_from_start_secs:
  duration_from_start_secs = None

# Form label -> (batch_size, regression_patience, learning_rate, min_delta, short tag).
_QUALITY_PRESETS = {
  'LOW QUALITY - TRAINING TIME ROUGHLY .8x DURATION - GOOD ENOUGH FOR EXPLORATION':
    (1024, 200, .0001, .00001, "low"),
  'MEDIUM QUALITY - TRAINING TIME ROUGHLY 3x DURATION - GOOD ENOUGH FOR MOST THINGS':
    (512, 500, .0001, .00001, "medium"),
  'HIGH QUALITY - TRAINING TIME ROUGHLY 5x DURATION':
    (256, 1000, .0001, .00001, "high"),
  'EXTREME QUALITY - TRAINING TAKES A VERY LONG TIME':
    (256, 10000000, .0001, 0, "extreme"),
}

# An unrecognised label leaves every setting untouched; a recognised one also
# replaces `quality` with its short tag (used later in the zip file name).
if quality in _QUALITY_PRESETS:
  batch_size, regression_patience, learning_rate, min_delta, quality = _QUALITY_PRESETS[quality]


# Work from /content (training artefacts are written relative to the current
# directory) and install the package that provides get_filterbanks.
%cd /content
!pip install -q python_speech_features

import librosa
import numpy as np
import scipy
from scipy.signal import hann
from python_speech_features.base import get_filterbanks
import tensorflow as tf
import tensorflow.keras.backend as K
import math
import time

# Sample rate passed to librosa.load and to the mel filter construction.
rate = 44100


# ANALYSIS SETTINGS
# fftsize: samples per analysis frame; windowskip: hop between frame starts
# (both are consumed by analyze_data).
fftsize = 16384
windowskip = 1024

# MODEL STRUCTURE
# input_dim: number of mel bands fed to the network; intermediate_dim: width of
# the hidden Dense layers; encoded_dim: latent size; output_fft_size: FFT size
# used for the mel inversion filter appended to the exported decoder.
input_dim = 512
intermediate_dim = 1000
encoded_dim = 8
output_fft_size = 8192


def create_mel_filter(fft_size, n_freq_components = 64, start_freq = 300, end_freq = 8000, samplerate = rate):
    """Build a mel filter and its matching inversion filter.

    The filterbank comes from ``get_filterbanks``; only its first
    ``int(fft_size / 2)`` frequency columns are kept.

    # Returns
        (mel_filter, mel_inversion_filter): the inversion filter is the trimmed
        filterbank itself; the forward filter is its transpose with every
        column divided by the sum of the corresponding filterbank row.
        Bands that sum to zero therefore produce NaN/inf entries here.
    """
    n_bins = int(fft_size / 2)
    banks = get_filterbanks(nfilt=n_freq_components,
                            nfft=fft_size,
                            samplerate=samplerate,
                            lowfreq=start_freq,
                            highfreq=end_freq)

    # Drop the trailing frequency column(s) beyond n_bins.
    trimmed = banks.T[0:n_bins]
    mel_inversion_filter = np.ascontiguousarray(trimmed.T)

    band_totals = mel_inversion_filter.sum(axis=1)
    mel_filter = np.ascontiguousarray(np.divide(mel_inversion_filter.T, band_totals))

    return mel_filter, mel_inversion_filter


def initialize(size, melsize):
    """Prepare the analysis window and mel filters for frames of `size` samples.

    # Returns
        (mel_filter, mel_inversion_filter, window): the filters span 0 Hz to
        rate//2 with `melsize` bands and have their NaNs replaced by 0 in
        place; `window` is a Hann window stored as a (1, size) array.
    """
    window = np.ascontiguousarray(np.zeros((1, size)))
    window[0, :] = hann(size)

    filters = create_mel_filter(size, melsize, 0, rate//2, rate)
    for bank in filters:
        # copy=False: clean the array in place, so the return value is not needed.
        np.nan_to_num(bank, copy=False, nan=0.0)

    return (filters[0], filters[1], window)


def convertToBin(data):
    """Return the magnitude sqrt(re^2 + im^2) of each complex value in `data`."""
    re = data.real
    im = data.imag
    squared = np.add(np.multiply(re, re), np.multiply(im, im))
    return np.sqrt(squared)


def spectrogram_to_mel(spectrogram, filter):
    """Project `spectrogram` onto mel bands: transpose(filter) . transpose(spectrogram)."""
    filter_t = np.transpose(filter)
    spectrogram_t = np.transpose(spectrogram)
    return filter_t.dot(spectrogram_t)


def get_aminmax(X):
    """Return (minimum, maximum) taken over every element of `X`."""
    lowest = np.amin(X)
    highest = np.amax(X)
    return (lowest, highest)


def scale_array_by_amax(X):
    """Min-max scale `X` so its smallest element maps to 0 and its largest to 1.

    A constant array (for example the analysis of pure silence) has no range
    to scale by. The plain formula would divide by zero and return all-NaN,
    so an all-zero float array of the same shape is returned instead.
    """
    lowest = np.amin(X)
    span = np.amax(X) - lowest
    if span == 0:
        return np.zeros_like(X, dtype=float)
    return (X - lowest) / span


def analyze(data, window, mel_filter):
    """Analyse one windowed frame.

    `data` is multiplied by `window`, transformed with a real FFT and reduced
    to magnitudes; the first int(data.shape[1] / 2) bins of row 0 are then
    projected onto the mel bands.

    # Returns
        (mel_frame, magnitude_frame): the mel projection and the truncated
        linear-frequency magnitudes it was computed from.
    """
    windowed = np.multiply(data, window)
    magnitudes = np.ascontiguousarray(convertToBin(scipy.fft.rfft(windowed)))
    #phase = np.angle(fftdata)
    n_bins = int(windowed.shape[1] / 2)
    mel_frame = spectrogram_to_mel(magnitudes[0, 0:n_bins], mel_filter)
    return (mel_frame, magnitudes[0, 0:n_bins])


def analyze_data(data, filename, fftsize, windowskip, melsize, window, mel_filter):
    """Turn a mono signal into a min-max scaled matrix of mel frames and save it.

    Frames of `fftsize` samples are taken every `windowskip` samples and
    analysed with `analyze`. The scaled result is written to
    ``filename + ".npy"``.

    # Arguments
        data: 1-D sample array.
        filename: prefix of the output file.
        fftsize, windowskip: frame length and hop, in samples.
        melsize: number of mel bands per frame.
        window, mel_filter: as returned by `initialize`.
    # Returns
        (minin, maxin, output): minimum and maximum of the unscaled mel data,
        and the scaled (n_frames + 1, melsize) matrix.
    # Raises
        ValueError: if `data` is too short to hold a single complete frame.
    """
    n_slizes = round(len(data) / windowskip)
    # Hops covered by one frame (ceil division); 16 for the default 16384/1024
    # settings, which is the value this function previously hard-coded.
    hops_per_frame = -(-fftsize // windowskip)
    n_frames = n_slizes - hops_per_frame
    if n_frames <= 0:
        raise ValueError(
            "audio too short: %d samples do not contain a full %d-sample "
            "analysis frame" % (len(data), fftsize))

    # One extra row is kept after the analysed frames and stays at zero, so the
    # saved matrix has the same layout as before. The linear-FFT matrix that
    # used to be allocated alongside was never used and is no longer built.
    output = np.zeros((n_frames + 1, melsize))

    in_slize = np.zeros((1, fftsize))
    for i in range(n_frames):
        start = i * windowskip
        in_slize[0] = data[start:start + fftsize]
        output[i, :], _ = analyze(in_slize, window, mel_filter)

    output = np.nan_to_num(output, copy=False, nan=0.0)
    minin, maxin = get_aminmax(output)
    output = scale_array_by_amax(output)

    np.save(filename + ".npy", output)
    #np.save(filename + ".fft.npy", fft_output)
    return (minin, maxin, output)


def sampling(args):
    """Combine the encoder's mean and log-variance into the latent vector z.

    The usual reparameterization noise term is replaced here by the fixed
    constant 1e-06, so no random draw takes place:
    z = z_mean + exp(0.5 * z_log_var) * 1e-06.

    # Arguments
        args (tensor): mean and log of variance of Q(z|X)
    # Returns
        z (tensor): latent vector
    """
    mean, log_var = args
    std_dev = K.exp(0.5 * log_var)
    return mean + std_dev * 1e-06


def init_autoencoder_shallow(input_dim, intermediate_dim, encoded_dim, learning_rate):
    """Build and compile the one-hidden-layer VAE plus its encoder/decoder sub-models.

    # Arguments
        input_dim: size of the network input and of the training decoder output.
        intermediate_dim: units in the hidden Dense layer of encoder and decoder.
        encoded_dim: size of the latent vector.
        learning_rate: learning rate given to the Adam optimizer.
    # Returns
        (vae, encoder, decoder, training_decoder):
        `encoder` outputs [z_mean, z_log_var, z]; `training_decoder` maps a
        latent vector to `input_dim` sigmoid outputs; `decoder` additionally
        multiplies those outputs by a mel inversion filter built for the
        module-level `output_fft_size`; `vae` chains encoder and both decoders
        and carries the loss added below.
    """

    input_shape = (input_dim, )
    latent_dim = encoded_dim

    #print("input_shape: ", input_shape)

    # VAE model = encoder + decoder
    # build encoder model
    inputs = tf.keras.Input(shape=input_shape, name='encoder_input')
    # Note: 10e-5 is 1e-4.
    x = tf.keras.layers.Dense(intermediate_dim, activation='relu', activity_regularizer=tf.keras.regularizers.l1(10e-5), kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-5, l2=1e-5))(inputs)

    z_mean = tf.keras.layers.Dense(latent_dim, name='z_mean')(x)
    z_log_var = tf.keras.layers.Dense(latent_dim, name='z_log_var')(x)

    # use reparameterization trick to push the sampling out as input
    # note that "output_shape" isn't necessary with the TensorFlow backend
    z = tf.keras.layers.Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

    # instantiate encoder model
    encoder = tf.keras.Model(inputs, [z_mean, z_log_var, z], name='encoder')
    # encoder.summary()

    # build decoder model
    latent_inputs = tf.keras.Input(shape=(latent_dim,), name='z_sampling')
    x = tf.keras.layers.Dense(intermediate_dim, activation='relu')(latent_inputs)
    outputs = tf.keras.layers.Dense(input_dim, activation='sigmoid')(x)

    # instantiate decoder model
    # The mel inversion filter is baked into `decoder` as a constant, so that
    # model emits linear-frequency bins instead of mel bands.
    _,mel_inversion_filter = create_mel_filter(output_fft_size, input_dim, 0, rate//2, rate)
    mel = K.expand_dims(tf.constant(mel_inversion_filter), 0)
    transformed_outputs = tf.keras.layers.Dot(axes=(1,1)) ([outputs, mel])
    decoder = tf.keras.Model(latent_inputs, transformed_outputs, name='decoder')
    training_decoder = tf.keras.Model(latent_inputs, outputs, name='training_decoder')
    # decoder.summary()
    # training_decoder.summary()
    # instantiate VAE model
    # Index 2 of the encoder output is the latent vector z.
    training_outputs = training_decoder(encoder(inputs)[2])
    outputs = decoder(encoder(inputs)[2])


    vae = tf.keras.Model(inputs, [training_outputs,outputs], name='vae_mlp')

    # Only the mel-domain output (training_outputs) takes part in the loss.
    reconstruction_loss = tf.keras.losses.binary_crossentropy(inputs, training_outputs)

    reconstruction_loss *= input_dim
    # KL term: -0.5 * sum(1 + log_var - mean^2 - exp(log_var)).
    kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
    kl_loss = K.sum(kl_loss, axis=-1)
    kl_loss *= -0.5
    vae_loss = K.mean(reconstruction_loss + kl_loss)
    vae.add_loss(vae_loss)
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # No loss argument: the loss was attached with add_loss above.
    vae.compile(optimizer=opt)
    return(vae, encoder, decoder, training_decoder)


def get_minmax(encoder, input):
    """Measure the per-dimension range of the first encoder output over `input`.

    # Returns
        (scale_mult, scale_subtract): for each latent dimension, the range
        (max - min) and the minimum of ``encoder.predict(input)[0]``, computed
        in float32.
    """
    latent = np.asarray(encoder.predict(input)[0], dtype = np.float32)

    lowest = latent.min(axis = 0)
    highest = latent.max(axis = 0)
    return (np.subtract(highest, lowest), lowest)


# Holds the return value of vae.fit() after train() has run; 0 until then.
history = 0


def train(filename, vae, encoder, decoder, training_decoder, input, min_delta, regression_patience = 1, batch_size = 4096, deep = 0):
    """Fit the VAE with early stopping, then save weights and TFLite exports.

    Files written (all prefixed by `filename`): ``.h5`` (VAE weights),
    ``.enc`` (encoder), ``.fft.dec`` (`decoder`) and ``.dec``
    (`training_decoder`). The fit result is stored in the global `history`.
    `deep` is accepted but not used here.

    # Returns
        (vae, encoder, decoder, training_decoder, scale_mult, scale_subtract),
        the last two coming from `get_minmax(encoder, input)`.
    """
    global history
    tf.executing_eagerly()  # return value is not used
    stopper = tf.keras.callbacks.EarlyStopping(monitor='loss', mode='min', min_delta=min_delta, patience = regression_patience)
    history = vae.fit(input, batch_size = batch_size, epochs=50000, verbose = 0, callbacks=[stopper])

    vae.save_weights(filename + ".h5")
    scale_mult, scale_subtract = get_minmax(encoder, input)

    # Convert every model first, then write the files, in this fixed order.
    exports = (('.enc', encoder), ('.fft.dec', decoder), ('.dec', training_decoder))
    converters = [tf.lite.TFLiteConverter.from_keras_model(model) for _, model in exports]
    payloads = [converter.convert() for converter in converters]

    # Save the models
    for (suffix, _), payload in zip(exports, payloads):
        with open(filename + suffix, 'wb') as f:
            f.write(payload)

    return (vae, encoder, decoder, training_decoder, scale_mult, scale_subtract)


def write_mm(filename, minin, maxin, scale_mult, scale_subtract, input_dim, intermediate_dim, encoded_dim, deep):
    """Write the model metadata table to ``filename + ".mm"``.

    The file is a 3 x 8 comma-separated table with six decimals per value:
    row 0 is `scale_mult`, row 1 is `scale_subtract`, and row 2 holds minin,
    maxin, input_dim, intermediate_dim, encoded_dim and deep followed by
    two zeros.
    """
    table = np.zeros([3, 8])
    table[0] = scale_mult
    table[1] = scale_subtract
    scalars = (minin, maxin, input_dim, intermediate_dim, encoded_dim, deep)
    for column, value in enumerate(scalars):
        table[2][column] = value
    np.savetxt(filename + ".mm", table, delimiter = ", ", fmt="%1.6f")
  

# Load the selected excerpt as mono audio at `rate`; model files are named
# after the audio file's base name and written to the current directory.
imported_data,_ = librosa.load(input_filename, sr = rate, offset=start_secs, duration=duration_from_start_secs, mono = True)
filename = os.path.basename(input_filename)

#print("  ...ANALYZING")
#print("")
#print("  ####################################")
print("  #   number of samples:  ", len(imported_data))
#print("  ####################################")
#print("")

mel_filter, mel_inversion_filter, window = initialize(fftsize, input_dim)
minin, maxin, input_data = analyze_data(imported_data, filename, fftsize, windowskip, input_dim, window, mel_filter)
# HAVE THIS GUY RETURN THE INPUT_DATA AND THE FFT DATA
np.savetxt(filename + ".minmax", np.asarray([[minin, maxin]]), delimiter = ", ")
# print("  ...INITIALIZING AUTOENCODER")
vae, encoder, decoder, training_decoder = init_autoencoder_shallow(input_dim, intermediate_dim, encoded_dim, learning_rate)
# print("  ...TRAINING")
at = time.time()
vae, encoder, decoder, training_decoder, scale_mult, scale_subtract = train(filename, vae, encoder, decoder, training_decoder, input_data, min_delta, regression_patience, batch_size, 0)
write_mm(filename, minin, maxin, scale_mult, scale_subtract, input_dim, intermediate_dim, encoded_dim, 0)
print("   ... time spent training:", time.time()-at)

#print("")
#print(quality, "quality")

# Round-trip the training data (first encoder output -> training decoder) and
# report the absolute reconstruction error in the scaled mel domain.
z_encoded = encoder.predict(input_data)
output_data = training_decoder.predict(z_encoded[0])
difference = np.abs(np.subtract(input_data, output_data))

#print("")
print("    ... maximum reconstruction error:", np.amax(difference))
print("    ... average reconstruction error:", np.average(difference))

In [None]:
#@title Download trained model

fileout = "/content/" + filename + "." + quality + ".zip"
print(fileout)
file1 = input_filename
file2 = "/content/" + filename + ".fft.dec"
file3 = "/content/" + filename + ".mm"
!zip -jq $fileout $file1 $file2 $file3

from google.colab import files
files.download(fileout)

In [None]:
#@title Interact!
duration_secs = 3 #@param {type:"integer"}
window_size = 1024 #@param {type:"integer"}
continuous_update = False #@param {type:"boolean"}
loop = True #@param {type:"boolean"}

fftsize = 16384
rate = 44100
window_size1 = window_size
# Number of window_size1-sample blocks needed to cover duration_secs.
number_of_windows1 = int(round(rate * duration_secs / window_size1))

import numpy as np
import os
from IPython.display import display, Audio
from ipywidgets import interactive_output, FloatSlider, HBox
# Fetch the helper library once and import it from its code directory.
# NOTE(review): `math` and `filename` used below are defined by the training cell.
%cd /content
if not os.path.exists('autocoder'):
  !git clone --depth 1 -q https://github.com/eyaler/autocoder
%cd /content/autocoder/code
import autocoderlib as ac
%cd /content

def gen_phase1(sz_):
    """Return random phases in [-pi, pi) as a float32 array of shape (1, sz_/2 + 1).

    The first bin and bin int(sz_ / 2) are forced to a phase of zero.
    """
    half = int(sz_ / 2)
    uniform = np.random.rand(1, half + 1)
    phase = (uniform * math.pi * 2. - math.pi).astype(np.float32)
    for fixed_bin in (0, half):
        phase[0, fixed_bin] = 0.
    return phase

def set_brightness1(fftsize, brightness):
    """Rebuild the global `brightness1_` weighting curve.

    Bin i of the int(fftsize / 2) bins gets the float32 weight
    10 * (i / n_bins) ** brightness.
    """
    global brightness1_
    n_bins = (int)(fftsize / 2)
    curve = np.zeros(n_bins, dtype=np.float32)
    for index in range(n_bins):
        curve[index] = pow(index / n_bins, brightness)
    brightness1_ = np.multiply(curve, 10.)

def jit_lopas1(a,b, f):
    """Blend two values: f * a + (1 - f) * b (one-pole low-pass step)."""
    weighted_new = np.multiply(a, f)
    weighted_old = np.multiply(b, 1. - f)
    return np.add(weighted_new, weighted_old)

# Load the metadata, TFLite decoder and analysis filters for the trained model
# named `filename`.
minin1, maxin1, scale_mult1, scale_subtract1, input_dim1, intermediate_dim1, encoded_dim1, deep1 = ac.read_mm(filename)
decoder1, input_details1, output_details1 = ac.load_lite(filename, "decoder")
mel_filter1, mel_inversion_filter1, window1 = ac.initialize(fftsize, input_dim1)

# Buffers shared with signal(): the latent vector fed to the decoder and the
# rendered audio (number_of_windows1 blocks of window_size1 samples).
internal_vector1 = np.zeros((1, encoded_dim1))
output1 = np.zeros(number_of_windows1 * window_size1)


# Block of silence shifted into the end of the overlap-add buffer on each step.
se1 = np.zeros(window_size1, dtype=np.float32)
 
    
def signal(brightness=0, smoothing=0.1, filter=1, l1=0, l2=0, l3=0, l4=0, l5=0, l6=0, l7=0, l8=0):
    """Render and play audio for one setting of the interactive sliders.

    The latent vector (l1..l8) is decoded once into a magnitude spectrum,
    which is clipped to [0, filter] and weighted by the brightness curve.
    `number_of_windows1` blocks are then synthesised with fresh random phases
    and overlap-added into the global `output1`, which is displayed as an
    autoplaying (optionally looping) Audio widget.

    # Arguments
        brightness: exponent of the spectral weighting (see set_brightness1).
        smoothing: low-pass factor applied to the spectrum between blocks.
        filter: upper clipping bound for the decoded magnitudes.
        l1..l8: latent vector components passed to the decoder.
    """
    set_brightness1(fftsize, brightness)
    internal_vector1[0,] = [l1,l2,l3,l4,l5,l6,l7,l8]
    p_m = ac.decode(decoder1, deep1, scale_mult1, scale_subtract1, internal_vector1)
    ca1 = np.zeros((1, int(fftsize / 2 + 1)), dtype=np.float32)
    ca1[0,0:int(fftsize/2)] = p_m.dot(mel_inversion_filter1)
    ca1 = ca1.clip(0., filter)
    ca1[0,0:int(fftsize/2)] = np.abs(np.multiply(ca1[0,0:int(fftsize/2)], brightness1_))
    ca1[0,0] = 0.

    smooth1 = np.zeros((1, int(fftsize / 2 + 1)), dtype=np.float32)
    sout1 = np.zeros((fftsize), dtype=np.float32)
    first = True

    def callback1():
        """Synthesise the next window_size1 samples."""
        nonlocal smooth1
        nonlocal sout1
        nonlocal first

        # DECODE
        # The very first block takes the target spectrum unsmoothed.
        smooth1 = jit_lopas1(ca1, smooth1, smoothing if not first else 1)
        first = False

        # GENERATE NOISE TO USE AS RECONSTRUCTION PHASE
        ph = gen_phase1(fftsize)

        # CONVERT BACK TO A SIGNAL
        # Polar -> rectangular: magnitude * (cos(ph) + i sin(ph)). smooth1 is a
        # real array, so its .imag is all zeros; using it for the sine term
        # threw the imaginary half of the random phase away.
        co = np.zeros(int(fftsize / 2 + 1), dtype='complex64')
        co.real = np.multiply(smooth1, np.cos(ph))
        co.imag = np.multiply(smooth1, np.sin(ph))
        cs = np.multiply(np.fft.irfft(co), window1)

        # Overlap-add: shift the buffer left by one block, pad with silence,
        # add the new windowed frame and emit the oldest block.
        sout1[0:(fftsize - window_size1)] = sout1[window_size1:fftsize]
        sout1[(fftsize - window_size1):fftsize] = se1
        sout1 = np.add(sout1, cs)[0,]
        return np.multiply(sout1[0:window_size1].astype(np.float32), 250.)

    for i in range(number_of_windows1):
        output1[i * window_size1:(i + 1) * window_size1] = callback1()
    a = Audio(data=output1, rate=rate, autoplay=True)
    if loop:
        a.autoplay_attr = lambda: 'autoplay="autoplay" loop="loop"'
    display(a)

def get_slider(desc, value=0.0, min_val=0, max_val=1):
  """Create a vertical FloatSlider with step 0.01 labelled `desc`.

  The slider's continuous_update flag follows the global form setting.
  """
  options = dict(
    description=desc,
    value=value,
    min=min_val,
    max=max_val,
    step=.01,
    continuous_update=continuous_update,
    orientation='vertical',
  )
  return FloatSlider(**options)

# Three synthesis controls followed by one slider per latent dimension (0..1).
brightness1=get_slider('brightness', max_val=2)
smoothing1=get_slider('smoothing', 0.1)
filter1=get_slider('filter', 1, min_val=0.01, max_val=2)
l1=get_slider('l1')
l2=get_slider('l2')
l3=get_slider('l3')
l4=get_slider('l4')
l5=get_slider('l5')
l6=get_slider('l6')
l7=get_slider('l7')
l8=get_slider('l8')
ui = HBox([brightness1, smoothing1, filter1, l1,l2,l3,l4,l5,l6,l7,l8])
# signal() is re-run with the slider values as keyword arguments whenever a slider changes.
out = interactive_output(signal, {'brightness':brightness1, 'smoothing':smoothing1, 'filter':filter1, 'l1':l1,'l2':l2,'l3':l3,'l4':l4,'l5':l5,'l6':l6,'l7':l7,'l8':l8})
display(ui, out)

In [None]:
#@title Generate Random Walk
#@markdown Note: granular mode is experimental/WIP
mode = 'spectral' #@param ['spectral', 'granular']
# latent_random_max / latent_random_pow: step divisor and exponent of the
# latent random walk (see gen_internal_vector_).
latent_random_max = 10 #@param {type: 'number'}
latent_random_pow = 1 #@param {type: 'number'}
brightness = 0 #@param {type: 'slider', min:0, max:2, step:0.01}
smoothing = 0.1 #@param {type: 'slider', min:0.01, max:1, step:0.01}
# NOTE(review): this module-level name shadows the builtin filter(); it is read
# as the upper clipping bound for decoded magnitudes.
filter = 1 #@param {type:"number"}
duration_secs = 30 #@param {type: 'number'}
window_size = 1024 #@param {type: 'integer'}
# The remaining parameters are only passed on in granular mode.
grain_size = 1024 #@param {type: 'integer'}
training_skip = 1024 #@param {type: 'integer'}
max_rand_step = 10 #@param {type: 'integer'}
rand_pow = 1 #@param {type: 'number'}
group_n = 100 #@param {type: 'integer'}


# Fetch the helper library once and import it from its code directory.
# NOTE(review): `os` is not imported in this cell; it relies on an earlier cell.
%cd /content
if not os.path.exists('autocoder'):
  !git clone --depth 1 -q https://github.com/eyaler/autocoder
%cd /content/autocoder/code
import autocoderlib as ac
%cd /content

import sys
import numpy as np
import math
import scipy
import time
import random
from IPython.display import display, Audio

fftsize = 16384
rate = 44100
# Number of window_size-sample blocks needed to cover duration_secs.
number_of_windows = int(round(rate * duration_secs / window_size))

# Build a command-line style argument list from the form values; the branches
# further down read their settings from argv by position.
if mode == 'spectral':
  mode = 'render'
argv = [None, '-' + mode, filename, number_of_windows]
if mode == 'render':
  argv += [fftsize, window_size]
elif mode == 'granular':
  argv += [grain_size, training_skip, window_size, max_rand_step, rand_pow, group_n]

# Print floats with three decimals.
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

#internal_vector = 0
#brightness_ = 0
#smooth = 0
#ca = 0
#sout = 0
#se = 0
color = ac.color

# Usage text of the command-line script this cell was adapted from.
# NOTE(review): the form above only ever produces '-render' or '-granular', so
# this branch does not appear to be reachable from the notebook.
if(argv[1] == "-help" or argv[1] == '-h'):
    print()
    print("       python3 ./autocoder_generate.py "+color.CYAN+"-play[-p]"+color.END+" "+color.GREEN+"input_file.wav"+color.END+" fftsize[4096] dac_buffersize[512] ")
    print()
    print("                Generates and plays an output based on the input_file.wav ")
    print("                model via pyAudio.")
    print()
    print("       python3 ./autocoder_generate.py "+color.CYAN+"-render[-r]"+color.END+" "+color.GREEN+"input_file.wav"+color.END+" n fftsize[4096] skip[512]")
    print()
    print("                Generates and saves n windows based on the input_file.wav ")
    print("                model to a file named input_file.wav-out.wav.")
    print()
    print("       python3 ./autocoder_generate.py "+color.CYAN+"-granular[-g]"+color.END+" "+color.GREEN+"input_file.wav"+color.END+" n grainsize trainingskip windowskip rand_n rand_p")
    print()
    print("                Generates and saves n grains based on similarities within")
    print("                the input_file.wav model to a file named input_file.wav-out.wav.")
    print()
    print("                Each new grain is selected from the rand_n most similar")
    print("                candidates. rand_p weights the probability either closer")
    print("                (p > 1.) or further (p < 1.) away from the start point")
    print("                along the similarity metric axis.")
    print()
    print("       python3 ./autocoder_generate.py "+color.CYAN+"-autocode[-a]"+color.END+" "+color.GREEN+"carrier_file.wav modulator_file.wav"+color.END+" fftsize windowskip *args:manipulations")
    print()
    print("                The autocoder takes an input model (carrier_file.wav) and")
    print("                a modulator file (modulator_file.wav), and encodes and  ")
    print("                decodes the modulator using the carrier model. A vector ")
    print("                of the same size as the encoded dimension can be provided")
    print("                to reorder and invert the mapping of each dimension in the")
    print("                input file onto the output model.")
    print()
    exit()

def gen_internal_vector_(random_max, random_pow):
    """Advance the global latent vector by one clipped random-walk step.

    Every component moves by ``(u ** random_pow - 0.5) / random_max`` with
    u drawn uniformly from [0, 1), and the result is clipped to [0, 1].
    `internal_vector` is updated in place.

    The noise is drawn with the shape of `internal_vector`, so latent sizes
    other than 8 work as well; the shape used to be hard-coded to (1, 8).
    """
    global internal_vector
    noise = np.power(np.random.rand(*internal_vector.shape), random_pow)
    step = np.divide(np.subtract(noise, .5), random_max)
    internal_vector[0,] = np.clip(np.add(internal_vector, step), 0., 1.)

def gen_phase(sz_):
    """Return random phases in [-pi, pi) as a float32 array of shape (1, sz_/2 + 1).

    The first bin and bin int(sz_ / 2) are forced to a phase of zero.
    """
    half = int(sz_ / 2)
    uniform = np.random.rand(1, half + 1)
    phase = (uniform * math.pi * 2. - math.pi).astype(np.float32)
    for fixed_bin in (0, half):
        phase[0, fixed_bin] = 0.
    return phase

def set_brightness(fftsize, brightness):
    """Rebuild the global `brightness_` weighting curve.

    Bin i of the int(fftsize / 2) bins gets the float32 weight
    10 * (i / n_bins) ** brightness.
    """
    global brightness_
    n_bins = (int)(fftsize / 2)
    curve = np.zeros(n_bins, dtype=np.float32)
    for index in range(n_bins):
        curve[index] = pow(index / n_bins, brightness)
    brightness_ = np.multiply(curve, 10.)

def jit_lopas(a,b, f):
    """Blend two values: f * a + (1 - f) * b (one-pole low-pass step)."""
    weighted_new = np.multiply(a, f)
    weighted_old = np.multiply(b, 1. - f)
    return np.add(weighted_new, weighted_old)

def callback(in_data, frame_count, time_info, status):
    """Synthesise the next `frame_count` samples of the latent random walk.

    The latent vector takes one random step, is decoded to a magnitude
    spectrum (clipped to [0, filter] and brightness-weighted), smoothed
    against the previous spectrum, given a fresh random phase and
    overlap-added into the global `sout` buffer.

    `in_data`, `time_info` and `status` are accepted but not used.

    # Returns
        (samples, 0): `frame_count` float32 samples scaled by 250, and a
        constant 0.
    """
    global smooth
    global sout

    # DECODE
    gen_internal_vector_(latent_random_max, latent_random_pow)
    p_m = ac.decode(decoder, deep, scale_mult, scale_subtract, internal_vector)
    ca = np.zeros((1, int(fftsize / 2 + 1)), dtype=np.float32)
    ca[0,0:int(fftsize/2)] = p_m.dot(mel_inversion_filter)
    ca = ca.clip(0., filter)
    ca[0,0:int(fftsize/2)] = np.abs(np.multiply(ca[0,0:int(fftsize/2)], brightness_))
    ca[0,0] = 0.

    smooth = jit_lopas(ca, smooth, smoothing)

    # GENERATE NOISE TO USE AS RECONSTRUCTION PHASE
    ph = gen_phase(fftsize)

    # CONVERT BACK TO A SIGNAL
    # Polar -> rectangular: magnitude * (cos(ph) + i sin(ph)). `smooth` is a
    # real array, so its .imag is all zeros; using it for the sine term threw
    # the imaginary half of the random phase away.
    co = np.zeros(int(fftsize / 2 + 1), dtype='complex64')
    co.real = np.multiply(smooth, np.cos(ph))
    co.imag = np.multiply(smooth, np.sin(ph))
    cs = np.multiply(np.fft.irfft(co), window)

    # Overlap-add: shift the buffer left by one block, pad with the silent
    # block `se`, add the new windowed frame and emit the oldest block.
    sout[0:(fftsize - frame_count)] = sout[frame_count:fftsize]
    sout[(fftsize - frame_count):fftsize] = se
    sout = np.add(sout, cs)[0,]
    return(np.multiply(sout[0:frame_count].astype(np.float32), 250.), 0)


# Running per-dimension minimum/maximum of the encoded modulator, seeded with
# sentinels that any real value replaces (see autocode_norm_factors).
min_a = np.full(8, 1000000.)
max_a = np.full(8, -1000000.)

def autocode_norm_factors(in_data):
    """Fold one frame's encoding into the running extremes `min_a` / `max_a`.

    The frame is analysed and encoded with the loaded model, and the globals
    are replaced by the element-wise minimum / maximum of their previous
    value and the new encoding.
    """
    global min_a
    global max_a

    amp, _phase, _norm_factor = ac.analyze_normalized(in_data, window, mel_filter)
    encoded = ac.encode(encoder, deep, scale_mult, scale_subtract, amp)

    # Row 0: the new encoding; row 1: the running extreme being updated.
    pair = np.zeros((2, 8))
    pair[0,] = encoded
    pair[1,] = min_a
    min_a = pair.min(axis = 0)
    pair[1,] = max_a
    max_a = pair.max(axis = 0)


def autocode(in_data, offset, scale, reorder, norm_min, norm_max):
    """Re-synthesise one modulator frame through the carrier model.

    The frame is analysed and encoded, the latent values are normalised with
    (norm_min, norm_max), reordered, scaled, offset and clipped to [0, 1],
    then decoded and converted back to a windowed time-domain frame using
    the phase returned by the analysis.

    # Arguments
        in_data: one windowed frame of the modulator signal.
        offset, scale: per-dimension offset and gain applied after reordering.
        reorder: index array selecting which latent dimension feeds each slot.
        norm_min, norm_max: per-dimension extremes used for normalisation.
    # Returns
        The windowed output frame (inverse real FFT multiplied by `window`).
    """
    global smooth

    # DECODE
    amp, ph, norm_factor = ac.analyze_normalized(in_data, window, mel_filter)
    in_data = ac.encode(encoder, deep, scale_mult, scale_subtract, amp)
    in_data = np.divide(np.subtract(in_data,norm_min), np.subtract(norm_max, norm_min))


    in_data = in_data[reorder]
    #in_data = np.add(np.multiply(in_data, invert_mult), invert_add)
    in_data = np.clip(np.add(np.multiply(in_data, scale), offset), 0, 1)
    print(in_data)
    #print(np.add(np.multiply(in_data, scale), offset))
    p_m = np.multiply(ac.decode(decoder, deep, scale_mult, scale_subtract, in_data), norm_factor)
    ca = np.zeros((1, int(fftsize / 2 + 1)), dtype=np.float32)
    ca[0,0:int(fftsize/2)] = p_m.dot(mel_inversion_filter)
    ca = ca.clip(0., filter)
    ca[0,0:int(fftsize/2)] = np.abs(np.multiply(ca[0,0:int(fftsize/2)], brightness_))
    ca[0,0] = 0.

    #smooth = jit_lopas(ca, smooth, smoothing)
    smooth = ca

    # CONVERT BACK TO A SIGNAL
    # Polar -> rectangular: magnitude * (cos(ph) + i sin(ph)). `smooth` is a
    # real array, so its .imag is all zeros; using it for the sine term
    # discarded the imaginary part of the analysed phase.
    co = np.zeros(int(fftsize / 2 + 1), dtype='complex64')
    co.real = np.multiply(smooth, np.cos(ph))
    co.imag = np.multiply(smooth, np.sin(ph))

    return(np.multiply(np.fft.irfft(co), window))


# Diagnostic mode: load both exported decoders and print their tensor details
# and their output for an all-zero latent vector.
# NOTE(review): the form above never produces '-test', so this branch does not
# appear to be reachable from the notebook.
if(argv[1] == '-test'):
    # read parameters
    minin, maxin, scale_mult, scale_subtract, input_dim, intermediate_dim, encoded_dim, deep = ac.read_mm(argv[2])
    # load both models
    decoder, input_details, output_details = ac.load_lite(argv[2], "decoder")
    fft_decoder, fft_input_details, fft_output_details = ac.load_lite(argv[2] + ".fft", "decoder")
    mel_filter, mel_inversion_filter, window = ac.initialize(4096, input_dim)
    print("")
    print(input_details)
    print("")
    print(output_details)
    print("")
    print(fft_input_details)
    print("")
    print(fft_output_details)
    print("")
    p_m = ac.decode(decoder, 0, scale_mult, scale_subtract, [0, 0, 0, 0, 0, 0, 0, 0])
    print(p_m)
    print("")
    p_n = ac.decode(fft_decoder, 2, scale_mult, scale_subtract, [0, 0, 0, 0, 0, 0, 0, 0])
    print(p_n[0:256])

# Dispatch on the requested mode. Every branch writes its result to
# argv[2] + "-out.wav" as 16-bit PCM normalised to the peak sample, and the
# file is played back after the chain.
if(argv[1] == "-render" or argv[1] == "-r"):
    '''
    print("")
    print("------------------------------")
    print("|         RENDERING          |")
    print("------------------------------")
    print("")
    '''

    # Spectral mode: n blocks of `skip` samples produced by callback().
    filename_ = argv[2]
    n = int(argv[3])
    fftsize = int(argv[4])
    skip = int(argv[5])

    minin, maxin, scale_mult, scale_subtract, input_dim, intermediate_dim, encoded_dim, deep = ac.read_mm(argv[2])
    decoder, input_details, output_details = ac.load_lite(argv[2], "decoder")
    set_brightness(fftsize, brightness)
    mel_filter, mel_inversion_filter, window = ac.initialize(fftsize, input_dim)

    ### INITIALIZE GLOBAL BUFFERS, TRY TO GET RID OF THESE
    output = np.zeros(n * skip)
    smooth = np.zeros((1, int(fftsize / 2 + 1)), dtype=np.float32)
    internal_vector = np.zeros((1, encoded_dim))
    sout = np.zeros((fftsize), dtype=np.float32)
    se = np.zeros(skip, dtype=np.float32)

    for i in range(n):
        output[i * skip:(i + 1) * skip], paval = callback(0, skip, 0, 0)

    scipy.io.wavfile.write(argv[2] + "-out.wav", rate, (32767 * np.divide(output, np.amax(np.abs(output)))).astype(np.int16))


elif(argv[1] == "-granular" or argv[1] == "-g"):
    '''
    print("")
    print("------------------------------")
    print("| ORDERED GRANULAR RENDERING |")
    print("------------------------------")
    print("")
    '''
    # Granular mode: a random walk over grains of the original audio, where
    # each step jumps to one of the nearest neighbours in latent space.
    filename_ = argv[2]
    n = int(argv[3])
    grainsize = int(argv[4])
    trainingskip = int(argv[5])   ### GET THIS FROM THE MM FILE,
    windowskip = int(argv[6])
    max_rand_step = int(argv[7])
    rand_pow =  float(argv[8])
    group_n =  int(argv[9])


    minin, maxin, scale_mult, scale_subtract, input_dim, intermediate_dim, encoded_dim, deep = ac.read_mm(argv[2])
    decoder, input_details, output_details = ac.load_lite(argv[2], "decoder")
    set_brightness(grainsize, brightness)
    mel_filter, mel_inversion_filter, window = ac.initialize(grainsize, input_dim)

    output = np.zeros(n * windowskip)

    # READ THE WAVEFILE
    wavefile = librosa.load(input_filename, sr = rate, offset=start_secs, duration=duration_from_start_secs, mono = True)[0]
    #print("   WAVEFILE LENGTH:", wavefile.shape)

    group = group_n > 1

    input_data = ac.import_training_data(filename_)

    minin, maxin, scale_mult, scale_subtract, input_dim, intermediate_dim, encoded_dim, deep = ac.read_mm(argv[2])
    encoder, input_details, output_details = ac.load_lite(filename_, "encoder")

    # Encode every training frame into latent space.
    encoded_input = np.zeros((input_data.shape[0], encoded_dim))

    for i in range(0, encoded_input.shape[0]):
        encoded_input[i,] = ac.encode(encoder, deep, scale_mult, scale_subtract, input_data[i,])

    # Optionally replace runs of group_n consecutive frames by their mean
    # (t) and remember their standard deviation (sd).
    if(group):
        t = np.zeros((int(encoded_input.shape[0]/group_n), encoded_input.shape[1]))
        sd = np.zeros((int(encoded_input.shape[0]/group_n), encoded_input.shape[1]))
        for i in range(0, int(encoded_input.shape[0]/group_n)):
            t[i,] = np.sum(encoded_input[i * group_n:(i + 1) * group_n,], axis = 0) / group_n
            sd[i,] = np.std(encoded_input[i * group_n:(i + 1) * group_n,], axis = 0)
        encoded_input = t

    # Pairwise L1 distances; in grouped mode each term is weighted by
    # (1 - sd) of the candidate group.
    distances = np.zeros((encoded_input.shape[0], encoded_input.shape[0]))

    for i in range(0, distances.shape[0]):
        for j in range(0, distances.shape[0]):
            if(group):
                distances[i,j] = np.sum(np.multiply(np.abs(np.subtract(t[i,], t[j,])), np.subtract(1, sd[j,])))
            else:
                distances[i,j] = np.sum(np.abs(np.subtract(encoded_input[i,], encoded_input[j,])))

    n_returns = min([int(max_rand_step), distances.shape[0] - 1])

    # For every entry keep the n_returns closest candidates; position 0 of the
    # sorted order is skipped.
    if(group):
        order = np.zeros((t.shape[0], n_returns), dtype='int')
        for i in range(0, t.shape[0]):
            order[i,] = np.argsort(distances[i,])[1:n_returns + 1]
    else:
        order = np.zeros((encoded_input.shape[0], n_returns), dtype='int')
        for i in range(0, encoded_input.shape[0]):
            order[i,] = np.argsort(distances[i,])[1:n_returns + 1]

    #print("   ORDER FILE SHAPE:", order.shape)

    reconstructed = np.zeros((n, grainsize))

    index_ =  np.random.randint(0, order.shape[0])

    for i in range(n):
        # 2. GET THE GRAIN AND WINDOW IT
        # NOTE(review): in grouped mode index_ is a group index but is used
        # here as a frame index - confirm this is intended.
        reconstructed[i,] = np.multiply(wavefile[(index_ * trainingskip):((index_ * trainingskip)+ grainsize)], window)

        # 3. RANDOM WALK
        index_ = order[index_, int(pow(random.random(), rand_pow) * n_returns)]

    # Overlap-add the grains at a spacing of windowskip samples.
    output = np.zeros(reconstructed.shape[0] * windowskip + grainsize)

    for i in range(reconstructed.shape[0]):
        output[i * windowskip:i * windowskip + grainsize] = np.add(output[i * windowskip:i * windowskip + grainsize], reconstructed[i,])

    scipy.io.wavfile.write(argv[2] + "-out.wav", rate, (32767 * np.divide(output, np.amax(np.abs(output)))).astype(np.int16))

elif(argv[1] == "-autocode" or argv[1] == "-a"):
    '''
    print("")
    print("------------------------------")
    print("|         AUTOCODING         |")
    print("------------------------------")
    print("")
    '''

    # Autocode mode: encode a modulator file with the carrier model and decode
    # it again, with per-dimension reorder / scale / offset read from argv.
    model_filename = argv[2]
    input_filename = argv[3]
    fftsize = int(argv[4])
    windowskip = int(argv[5])
    offset = 0 #float(argv[6])
    scale = 1 #float(argv[7])

    minin, maxin, scale_mult, scale_subtract, input_dim, intermediate_dim, encoded_dim, deep = ac.read_mm(model_filename)
    decoder, input_details, output_details = ac.load_lite(model_filename, "decoder")
    encoder, input_details, output_details = ac.load_lite(model_filename, "encoder")
    set_brightness(fftsize, brightness)
    mel_filter, mel_inversion_filter, window = ac.initialize(fftsize, input_dim)

    # The np.int alias was removed in NumPy 1.24; the builtin int is the
    # documented replacement.
    reorder = np.zeros((encoded_dim), dtype = int)
    scale = np.zeros((encoded_dim), dtype = np.float32)
    offset = np.zeros((encoded_dim), dtype = np.float32)


    for i in range(0, encoded_dim):
        reorder[i] = int(argv[6 + i])
        scale[i] = float(argv[6 + encoded_dim + i])
        offset[i] = float(argv[6 + encoded_dim + encoded_dim + i])


    #reorder = np.subtract(np.abs(reorder), 1)

    # READ THE WAVEFILE
    wavefile = librosa.load(input_filename, sr = rate, offset=start_secs, duration=duration_from_start_secs,mono = True)[0]

    n = int((wavefile.shape[0] - fftsize) / windowskip)

    # First pass: collect the latent extremes over the whole file.
    for i in range(n):
        # 2. GET THE GRAIN AND WINDOW IT
        autocode_norm_factors(np.multiply(wavefile[(i * windowskip):((i * windowskip)+ fftsize)], window))

    #np.divide(np.subtract(a, np.amin(a,axis=0)), np.subtract(np.amax(a, axis = 0), np.amin(a, axis = 0)))
    #quit()

    reconstructed = np.zeros((n, fftsize))

    # NORMALIZE THE INPUT
    # Second pass: re-synthesise each frame using those extremes.
    for i in range(n):
        # 2. GET THE GRAIN AND WINDOW IT
        reconstructed[i,] = autocode(np.multiply(wavefile[(i * windowskip):((i * windowskip)+ fftsize)], window), offset, scale, reorder, min_a, max_a)


    # Overlap-add the frames at a spacing of windowskip samples.
    output = np.zeros(reconstructed.shape[0] * windowskip + fftsize)

    for i in range(reconstructed.shape[0]):
        output[i * windowskip:i * windowskip + fftsize] = np.add(output[i * windowskip:i * windowskip + fftsize], reconstructed[i,])

    scipy.io.wavfile.write(argv[2] + "-out.wav", rate, (32767 * np.divide(output, np.amax(np.abs(output)))).astype(np.int16))
display(Audio(argv[2] + "-out.wav", rate=rate, autoplay=True))

In [None]:
#@title Download output file

# Download the file written by the generation cell above.
from google.colab import files
files.download(argv[2] + "-out.wav")