## Imports

In [None]:
%load_ext autoreload
%autoreload 2

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, PReLU
from tensorflow.keras.optimizers import Adam

from music_generator.prefabs.random_walk_track import generate_dataset
from music_generator.signalproc.signalproc import SamplingInfo
from music_generator.music.timing import Tempo
from music_generator.music.scales import GenericScale
from music_generator.signalproc.signalproc import mix_at
from music_generator.analysis import preprocessing

from music_generator.music import scales
import numpy as np
from multiprocessing import Pool
from functools import partial

import matplotlib.pyplot as plt
from IPython.display import Audio
%matplotlib inline

# Filtering lead instrument

## Goal

We are going to generate some music with more than one synthesizer.

We will then filter the lead instrument out of the mix (i.e. isolate it) using a feed-forward neural network.

```
Model: input wave with 3 instruments -> output wave 1 instrument
```

We will use an auto-encoder-like setup. In the diagram below, replace the image by a short fragment of 1024 samples (~1/40th of a second) of sound data.

<img src="images/ae.png">

[Image source](https://medium.com/@curiousily/credit-card-fraud-detection-using-autoencoders-in-keras-tensorflow-for-hackers-part-vii-20e0c85301bd)

## Generating the data...

In [None]:
# Sample rate (samples per second) shared by audio generation and playback.
sr = 44100
sampling_info = SamplingInfo(sr)

In [None]:
# Generate in all keys: use the note symbols of the chromatic scale on C as roots.
# NOTE(review): generate(0, 1) presumably spans a single octave — confirm against the scales API.
all_roots = scales.chromatic_scale('C')
roots = [n.get_symbol() for n in all_roots.generate(0, 1)]
print(roots)

def generate_dataset_for_root(root):
    """Generate a 32-measure data set in the minor scale built on ``root``.

    Takes a single argument so it can be mapped directly over the list of
    roots by a process pool.

    Args:
        root: Root note symbol passed to ``GenericScale``.

    Returns:
        Whatever ``generate_dataset`` returns for these settings.
    """
    tempo = Tempo(120)
    # Interval pattern of the (natural) minor scale.
    minor_scale = GenericScale(root, [0, 2, 3, 5, 7, 8, 10])
    return generate_dataset(
        n_measures=32,
        tempo=tempo,
        scale=minor_scale,
        sampling_info=sampling_info,
    )
    
# Render one data set per root in parallel, using 8 worker processes.
with Pool(8) as pool:
    datasets = pool.map(generate_dataset_for_root, roots)

# Make one big data set and make sure data is of same size
audio_tracks, mix = preprocessing.combine_datasets(datasets)

In [None]:
# Inspect the shape of the combined mix.
mix.shape

In [None]:
# Inspect the shape of the combined per-instrument tracks.
audio_tracks.shape

## Training and target

In [None]:
# Arguments for create_training_data_set below.
# NOTE(review): presumably the number of training fragments and the length of
# each fragment in audio samples — confirm against the preprocessing module.
n_samples = 1024 * 40
fragment_length = 1024 * 1
# The network input is the full mix; the target is a single track.
# NOTE(review): audio_tracks[2] is presumably the lead instrument — confirm track order.
input_track = mix
target_track = audio_tracks[2]

In [None]:
# Listen to the first 8 seconds of the input (full mix).
Audio(input_track[0:8*sr], rate=sr)

In [None]:
# Listen to the first 8 seconds of the target track.
Audio(target_track[0:8*sr], rate=sr)

<img width='75%' src="images/ae.png">

# Create training set

In [None]:
# Build the training pairs: x holds fragments of the input track, y the targets.
# NOTE(review): presumably x[i] and y[i] are cut from the same position in both tracks — confirm.
x, y = preprocessing.create_training_data_set(n_samples, 
                                              fragment_length, 
                                              input_track, 
                                              target_track)

In [None]:
# Listen to the first training input fragment (very short: a single fragment).
Audio(x[0], rate=sr)

In [None]:
def x_fade_profile(batch_dim):
    """Return a triangular cross-fade window of length ``batch_dim``.

    The window is 0 at index 0, rises linearly to 1 at ``batch_dim / 2`` and
    falls linearly again towards the end.

    Args:
        batch_dim: Number of samples in the window.

    Returns:
        1-D float array with ``batch_dim`` weights.
    """
    half_width = batch_dim / 2
    positions = np.arange(batch_dim)
    distance_from_centre = np.abs(positions - half_width)
    return 1 - distance_from_centre / half_width

def model_predict(model, input_track):
    """Run ``model`` over a whole track and cross-fade the overlapping outputs.

    The track is cut into consecutive fragments whose length equals the
    model's input width, and a second time into fragments shifted by half a
    fragment. Both sets are predicted, weighted with the triangular window
    from ``x_fade_profile`` and combined with ``mix_at`` at an offset of half
    a fragment.

    Args:
        model: Keras model; its last input dimension is used as the fragment
            length. The output width is assumed to equal the input width.
        input_track: Array of audio samples (assumed 1-D).

    Returns:
        The result of ``mix_at`` applied to the two weighted prediction streams.

    Raises:
        ValueError: If ``input_track`` is too short to yield at least one
            fragment plus the extra half-fragment shift.
    """
    # Read the fragment length from the model itself rather than from a
    # notebook-level global, so the function does not depend on cell order.
    dim = int(model.input_shape[-1])
    half = dim // 2

    # One fragment is held back so the half-shifted slicing stays in range.
    n_batches = len(input_track) // dim - 1
    if n_batches < 1:
        raise ValueError(
            "input_track has %d samples; need at least %d for fragment length %d"
            % (len(input_track), 2 * dim, dim)
        )

    pred_batches = input_track[:n_batches * dim].reshape((-1, dim))
    pred_batches_shifted = input_track[half:n_batches * dim + half].reshape((-1, dim))

    xfp = x_fade_profile(dim)

    # Broadcasting applies the window to every predicted fragment at once.
    x0 = (model.predict(pred_batches) * xfp).reshape(-1)
    x1 = (model.predict(pred_batches_shifted) * xfp).reshape(-1)

    return mix_at(x0, x1, half)

## Time for some (deep) learning: build an auto-encoder-like network

The model is just a simple feed-forward neural network.

The architecture is that of a simple auto-encoder: the output dimension equals the input dimension. However, the data that we present is different: targets $\neq$ inputs.

<img src="images/ae.png">

[Image source](https://medium.com/@curiousily/credit-card-fraud-detection-using-autoencoders-in-keras-tensorflow-for-hackers-part-vii-20e0c85301bd)

In [None]:
# Shape of a single input fragment, taken from the first training example.
input_shape = x[0].shape
# Width of the output layer is taken from the targets (y), not from the inputs.
output_shape = y[0].shape[0]

In [None]:
# Feed-forward network: two hidden Dense layers (1024 and 512 units), each
# followed by a PReLU activation, and a Dense output layer of width
# output_shape. Trained with Adam on mean squared error.
model = tf.keras.models.Sequential([
    Dense(1024, input_shape=input_shape),
    PReLU(),
    Dense(512),
    PReLU(),
    Dense(output_shape),
])
model.compile(optimizer=Adam(), loss='mse')
model.summary()

## How does the network sound before training?

In [None]:
# Play the first 15 seconds of the untrained model's output on the mix.
Audio(model_predict(model, mix)[0:15*sr], rate=sr)

## Fit the model for two epochs

In [None]:
# Train on the (input fragment, target fragment) pairs for two epochs.
model.fit(x, y, epochs=2)

## Let's test the model

In [None]:
# Compare seconds 40-45: first player is the original mix, second is the model output.
display(Audio(mix[40*sr:45*sr], rate=sr))
display(Audio(model_predict(model, mix)[40*sr:45*sr], rate=sr))

## Is it overfitted?

Of course it is overfitted to this particular synth sound and backing track, and it will not work for any other sounds. But how well can it predict if we generate a completely new data set using a different scale (Phrygian Dominant instead of minor)?

In [None]:
# Fresh 64-measure data set rooted on E with a different interval pattern
# than the training data; used only to evaluate the trained model.
score_tracks_test, audio_tracks_test, mix_test = \
    generate_dataset(n_measures=64,
                     tempo=Tempo(120),
                     scale=GenericScale('E', [0, 1, 4, 5, 7, 8, 10]),
                     sampling_info=sampling_info)

In [None]:
# Play the model's output for the first 15 seconds of the unseen test mix.
# The slice uses sr rather than a literal so it stays in sync with the playback rate.
Audio(model_predict(model, mix_test[0:15*sr]), rate=sr)