<a href="https://colab.research.google.com/github/magenta/ddsp/blob/main/ddsp/colab/tutorials/3_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


##### Copyright 2021 Google LLC.

Licensed under the Apache License, Version 2.0 (the "License");





In [None]:
# Copyright 2021 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# DDSP Training

This notebook demonstrates the libraries in [https://github.com/magenta/ddsp/tree/main/ddsp/training](https://github.com/magenta/ddsp/tree/main/ddsp/training). It is a simple example that overfits a single audio sample, for educational purposes.

_For a full training pipeline please use [ddsp/training/ddsp_run.py](https://github.com/magenta/ddsp/blob/main/ddsp/training/README.md#train-1) as in the [train_autoencoder.ipynb](https://github.com/magenta/ddsp/blob/main/ddsp/colab/demos/train_autoencoder.ipynb)_.



In [None]:
#@title Install DDSP

#@markdown Install ddsp in a conda environment with Python 3.9 for compatibility.

# Start from a clean slate: remove any Miniconda left by a previous run of this
# cell (presumably so the installer does not hit an existing prefix -- confirm).
!rm -rf /content/miniconda
# Download the pinned Miniconda (Python 3.9) installer; -L follows redirects.
!curl -L https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-2-Linux-x86_64.sh -o miniconda.sh
!chmod +x miniconda.sh
# Install into /content/miniconda (-p) without interactive prompts (-b).
!sh miniconda.sh -b -p /content/miniconda
# Use the Miniconda pip, so the pinned packages are installed into
# /content/miniconda rather than into the notebook kernel's environment.
# Later cells run DDSP code through /content/miniconda/bin/python.
!/content/miniconda/bin/pip install tensorflow==2.11 tensorflow-probability==0.19.0 tensorflow-datasets==4.9.0 ddsp==3.7.0
print('\nDone installing DDSP in conda environment!')

In [None]:
#@title Import display helpers

import warnings
warnings.filterwarnings("ignore")

import base64
import io

import numpy as np
import matplotlib.pyplot as plt
from IPython import display
from scipy.io import wavfile
from scipy import signal as scipy_signal

# Sample rate used as the default for `play` (WAV header) and as `fs` for the
# STFT in `specplot`.
sample_rate = 16000


def play(array_of_floats, sample_rate=sample_rate):
  """Play audio in colab using HTML5 audio widget.

  The samples are converted to 16-bit PCM, written to an in-memory WAV file,
  and embedded in an `<audio>` element as a base64 data URI.

  Args:
    array_of_floats: Audio samples as a list or array of floats, expected to
      lie in [-1.0, 1.0]. Values outside that range are clipped. If the input
      is 2-D, only the first row is played.
    sample_rate: Sample rate written into the WAV header.
  """
  samples = np.asarray(array_of_floats)
  if samples.ndim == 2:
    samples = samples[0]
  normalizer = float(np.iinfo(np.int16).max)
  # Clip before casting: floats outside [-1, 1] would otherwise exceed the
  # int16 range and wrap around, which is heard as loud clicks/distortion.
  array_of_ints = (np.clip(samples, -1.0, 1.0) * normalizer).astype(np.int16)
  # The context manager closes the buffer even if writing the WAV fails.
  with io.BytesIO() as memfile:
    wavfile.write(memfile, sample_rate, array_of_ints)
    encoded_wav = base64.b64encode(memfile.getvalue()).decode('ascii')
  html = """<audio controls>
              <source controls src="data:audio/wav;base64,{base64_wavfile}"
              type="audio/wav" />
              Your browser does not support the audio element.
            </audio>"""
  html = html.format(base64_wavfile=encoded_wav)
  display.display(display.HTML(html))


def specplot(audio, vmin=-5, vmax=1, rotate=True, size=512 + 256):
  """Draw the log10-magnitude STFT of `audio` in a new matshow figure.

  Args:
    audio: Samples as a list or array. If 2-D, only the first row is plotted.
    vmin: Lower color limit passed to `plt.matshow`.
    vmax: Upper color limit passed to `plt.matshow`.
    rotate: If True, flip the spectrogram vertically (`np.flipud`) before
      plotting.
    size: STFT segment length (`nperseg`); the overlap is three quarters of it.
  """
  if isinstance(audio, list):
    audio = np.array(audio)
  if np.ndim(audio) == 2:
    audio = audio[0]

  overlap = size * 3 // 4
  # Only the complex STFT matrix is needed; the frequency and time vectors
  # returned alongside it are not used.
  _, _, stft_matrix = scipy_signal.stft(
      audio, fs=sample_rate, nperseg=size, noverlap=overlap)

  # The small offset keeps log10 finite where the magnitude is zero.
  logmag = np.log10(np.abs(stft_matrix) + 1e-7)
  logmag = np.flipud(logmag) if rotate else logmag

  plt.matshow(logmag, vmin=vmin, vmax=vmax, cmap=plt.cm.magma, aspect='auto')
  plt.xticks([])
  plt.yticks([])
  plt.xlabel('Time')
  plt.ylabel('Frequency')


print('Helpers imported!')

# Get a batch of data

In [None]:
#@title Load NSynth data

#@markdown This loads a single example from NSynth and saves it for use in training.

# The loading code is kept as source text: it is written to a file and executed
# with the Miniconda interpreter below rather than in the notebook kernel.
# It saves one NSynth test example's audio to
# /content/training_outputs/original_audio.npy.
SCRIPT = r'''
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
from ddsp.training import data

output_dir = '/content/training_outputs'
os.makedirs(output_dir, exist_ok=True)

# Get a single example from NSynth.
data_provider = data.NSynthTfds(split='test')
dataset = data_provider.get_batch(batch_size=1, shuffle=False).take(1).repeat()
batch = next(iter(dataset))

# Save audio for display
audio = batch['audio'].numpy()
np.save(os.path.join(output_dir, 'original_audio.npy'), audio)
print('Audio shape:', audio.shape)
print('Data loaded and saved.')
'''

# Write the script to disk so the Miniconda Python can run it.
with open('/content/training_data.py', 'w') as f:
  f.write(SCRIPT)

# PYTHONPATH/PYTHONHOME are unset first -- presumably so the kernel's Python
# settings do not leak into the Miniconda interpreter (confirm).
!unset PYTHONPATH PYTHONHOME && /content/miniconda/bin/python /content/training_data.py

In [None]:
# Load the example saved by the data-loading script, then show its spectrogram
# and an audio player for it.
audio = np.load('/content/training_outputs/original_audio.npy')
specplot(audio)
play(audio)

# Get model and trainer

## Python configuration

# Train

In [None]:
#@title Build model and train (Python config, 300 steps)

#@markdown This builds the Autoencoder model using Python and trains for 300
#@markdown steps on a single NSynth example. It saves:
#@markdown - Original audio, resynthesized audio, noise audio
#@markdown - Model parameters (amps, harmonic_distribution, noise_magnitudes, f0_hz, loudness)

SCRIPT = r'''
import os
import time
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import ddsp
from ddsp.training import (data, decoders, encoders, models, preprocessing,
                           train_util, trainers)
import tensorflow.compat.v2 as tf

output_dir = '/content/training_outputs'
os.makedirs(output_dir, exist_ok=True)
sample_rate = 16000

# ===========================================================================
# Load data
# ===========================================================================
print('Loading data...')
data_provider = data.NSynthTfds(split='test')
dataset = data_provider.get_batch(batch_size=1, shuffle=False).take(1).repeat()
batch = next(iter(dataset))
audio = batch['audio']
n_samples = audio.shape[1]

# ===========================================================================
# Build model (Python config)
# ===========================================================================
print('Building model...')
strategy = train_util.get_strategy()

TIME_STEPS = 1000

# Create Neural Networks.
preprocessor = preprocessing.F0LoudnessPreprocessor(time_steps=TIME_STEPS)

decoder = decoders.RnnFcDecoder(rnn_channels = 256,
                                rnn_type = 'gru',
                                ch = 256,
                                layers_per_stack = 1,
                                input_keys = ('ld_scaled', 'f0_scaled'),
                                output_splits = (('amps', 1),
                                                 ('harmonic_distribution', 45),
                                                 ('noise_magnitudes', 45)))

# Create Processors.
harmonic = ddsp.synths.Harmonic(n_samples=n_samples,
                                sample_rate=sample_rate,
                                name='harmonic')

noise = ddsp.synths.FilteredNoise(window_size=0,
                                  initial_bias=-10.0,
                                  name='noise')
add = ddsp.processors.Add(name='add')

# Create ProcessorGroup.
dag = [(harmonic, ['amps', 'harmonic_distribution', 'f0_hz']),
       (noise, ['noise_magnitudes']),
       (add, ['noise/signal', 'harmonic/signal'])]

processor_group = ddsp.processors.ProcessorGroup(dag=dag,
                                                 name='processor_group')

# Loss functions
spectral_loss = ddsp.losses.SpectralLoss(loss_type='L1',
                                         mag_weight=1.0,
                                         logmag_weight=1.0)

with strategy.scope():
  model = models.Autoencoder(preprocessor=preprocessor,
                             encoder=None,
                             decoder=decoder,
                             processor_group=processor_group,
                             losses=[spectral_loss])
  trainer = trainers.Trainer(model, strategy, learning_rate=1e-3)

# ===========================================================================
# Build and train
# ===========================================================================
print('Building model with first batch...')
dataset_dist = trainer.distribute_dataset(dataset)
trainer.build(next(iter(dataset_dist)))

print('Training for 300 steps...')
dataset_iter = iter(dataset_dist)
for i in range(300):
    losses = trainer.train_step(dataset_iter)
    if i % 50 == 0 or i == 299:
        res_str = 'step: {}	'.format(i)
        for k, v in losses.items():
            res_str += '{}: {:.2f}	'.format(k, v)
        print(res_str)

# ===========================================================================
# Run inference and save results
# ===========================================================================
print('Running inference...')
start_time = time.time()
controls = model(next(dataset_iter))
audio_gen = model.get_audio_from_outputs(controls)
print('Prediction took %.1f seconds' % (time.time() - start_time))

# Save original audio
audio_np = audio.numpy()
np.save(os.path.join(output_dir, 'original_audio.npy'), audio_np)

# Save generated audio
audio_gen_np = audio_gen.numpy() if hasattr(audio_gen, 'numpy') else np.array(audio_gen)
np.save(os.path.join(output_dir, 'audio_gen.npy'), audio_gen_np)

# Save noise signal
audio_noise = controls['noise']['signal']
audio_noise_np = audio_noise.numpy() if hasattr(audio_noise, 'numpy') else np.array(audio_noise)
np.save(os.path.join(output_dir, 'audio_noise.npy'), audio_noise_np)

# Save synth parameters for plotting
batch_idx = 0
get = lambda key: ddsp.core.nested_lookup(key, controls)[batch_idx]

params = {
    'amps': get('harmonic/controls/amplitudes'),
    'harmonic_distribution': get('harmonic/controls/harmonic_distribution'),
    'noise_magnitudes': get('noise/controls/magnitudes'),
    'f0_hz': get('f0_hz'),
    'loudness_db': get('loudness_db'),
}
for k, v in params.items():
    v_np = v.numpy() if hasattr(v, 'numpy') else np.array(v)
    np.save(os.path.join(output_dir, f'{k}.npy'), v_np)

print('\nDone! All training outputs saved.')
'''

with open('/content/training_run.py', 'w') as f:
  f.write(SCRIPT)

!unset PYTHONPATH PYTHONHOME && /content/miniconda/bin/python /content/training_run.py

# Analyze results

In [None]:
# Load the arrays that the training script saved to disk.
audio = np.load('/content/training_outputs/original_audio.npy')
audio_gen = np.load('/content/training_outputs/audio_gen.npy')
audio_noise = np.load('/content/training_outputs/audio_noise.npy')

labeled_clips = (
    ('Original Audio', audio),
    ('Resynthesized Audio', audio_gen),
    ('Filtered Noise Audio', audio_noise),
)

# Audio players first, each preceded by its label ...
for label, clip in labeled_clips:
  print(label)
  play(clip)

# ... then one spectrogram per clip, in the same order.
for label, clip in labeled_clips:
  specplot(clip)

In [None]:
# Load the per-example arrays written by the training script.
amps = np.load('/content/training_outputs/amps.npy')
harmonic_distribution = np.load('/content/training_outputs/harmonic_distribution.npy')
noise_magnitudes = np.load('/content/training_outputs/noise_magnitudes.npy')
f0_hz = np.load('/content/training_outputs/f0_hz.npy')
loudness = np.load('/content/training_outputs/loudness_db.npy')

# Figure 1: input features (loudness and f0).
fig_inputs, (ax_loudness, ax_f0) = plt.subplots(1, 2, figsize=(14, 4))
fig_inputs.suptitle('Input Features', fontsize=16)
ax_loudness.plot(loudness)
ax_loudness.set_ylabel('Loudness')
ax_f0.plot(f0_hz)
ax_f0.set_ylabel('F0_Hz')

# Figure 2: harmonic synth controls (amplitude curve and harmonic distribution).
fig_synth, (ax_amps, ax_harm) = plt.subplots(1, 2, figsize=(14, 4))
fig_synth.suptitle('Synth Params', fontsize=16)
ax_amps.semilogy(amps)
ax_amps.set_ylabel('Amps')
ax_amps.set_ylim(1e-5, 2)
# The small offset keeps log10 finite for zero entries.
log_harmonics = np.rot90(np.log10(harmonic_distribution + 1e-6))
ax_harm.matshow(log_harmonics, cmap=plt.cm.magma, aspect='auto')
ax_harm.set_ylabel('Harmonic Distribution')
ax_harm.set_xticks([])
_ = ax_harm.set_yticks([])

# Figure 3: filtered-noise magnitudes.
fig_noise, ax_noise = plt.subplots(1, 1, figsize=(7, 4))
log_noise = np.rot90(np.log10(noise_magnitudes + 1e-6))
ax_noise.matshow(log_noise, cmap=plt.cm.magma, aspect='auto')
ax_noise.set_ylabel('Filtered Noise Magnitudes')
ax_noise.set_xticks([])
_ = ax_noise.set_yticks([])


## Alternative: [`gin`](https://github.com/google/gin-config) configuration

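The model above can also be configured using gin. The cell below builds an Autoencoder from a gin config string, trains it in the same way, and saves its outputs to `/content/training_outputs_gin`, so it does not overwrite the results of the Python-config run above.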

In [None]:
#@title Build model and train (Gin config, 300 steps)

SCRIPT = r'''
import os
import time
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import ddsp
from ddsp.training import (data, decoders, encoders, models, preprocessing,
                           train_util, trainers)
import gin
import tensorflow.compat.v2 as tf

output_dir = '/content/training_outputs_gin'
os.makedirs(output_dir, exist_ok=True)
sample_rate = 16000

# Load data
print('Loading data...')
data_provider = data.NSynthTfds(split='test')
dataset = data_provider.get_batch(batch_size=1, shuffle=False).take(1).repeat()
batch = next(iter(dataset))
audio = batch['audio']
n_samples = audio.shape[1]

# Gin config
gin_string = """
import ddsp
import ddsp.training

# Preprocessor
models.Autoencoder.preprocessor = @preprocessing.F0LoudnessPreprocessor()
preprocessing.F0LoudnessPreprocessor.time_steps = 1000

# Encoder
models.Autoencoder.encoder = None

# Decoder
models.Autoencoder.decoder = @decoders.RnnFcDecoder()
decoders.RnnFcDecoder.rnn_channels = 256
decoders.RnnFcDecoder.rnn_type = 'gru'
decoders.RnnFcDecoder.ch = 256
decoders.RnnFcDecoder.layers_per_stack = 1
decoders.RnnFcDecoder.input_keys = ('ld_scaled', 'f0_scaled')
decoders.RnnFcDecoder.output_splits = (('amps', 1),
                                       ('harmonic_distribution', 20),
                                       ('noise_magnitudes', 20))

# ProcessorGroup
models.Autoencoder.processor_group = @processors.ProcessorGroup()

processors.ProcessorGroup.dag = [
  (@harmonic/synths.Harmonic(),
    ['amps', 'harmonic_distribution', 'f0_hz']),
  (@noise/synths.FilteredNoise(),
    ['noise_magnitudes']),
  (@add/processors.Add(),
    ['noise/signal', 'harmonic/signal']),
]

# Harmonic Synthesizer
harmonic/synths.Harmonic.name = 'harmonic'
harmonic/synths.Harmonic.n_samples = 64000
harmonic/synths.Harmonic.scale_fn = @core.exp_sigmoid

# Filtered Noise Synthesizer
noise/synths.FilteredNoise.name = 'noise'
noise/synths.FilteredNoise.n_samples = 64000
noise/synths.FilteredNoise.window_size = 0
noise/synths.FilteredNoise.scale_fn = @core.exp_sigmoid
noise/synths.FilteredNoise.initial_bias = -10.0

# Add
add/processors.Add.name = 'add'

models.Autoencoder.losses = [
    @losses.SpectralLoss(),
]
losses.SpectralLoss.loss_type = 'L1'
losses.SpectralLoss.mag_weight = 1.0
losses.SpectralLoss.logmag_weight = 1.0
"""

strategy = train_util.get_strategy()

with gin.unlock_config():
    gin.parse_config(gin_string)

with strategy.scope():
    model = ddsp.training.models.Autoencoder()
    trainer = trainers.Trainer(model, strategy, learning_rate=1e-4)

# Build and train
print('Building model...')
dataset_dist = trainer.distribute_dataset(dataset)
trainer.build(next(iter(dataset_dist)))

print('Training for 300 steps...')
dataset_iter = iter(dataset_dist)
for i in range(300):
    losses = trainer.train_step(dataset_iter)
    if i % 50 == 0 or i == 299:
        res_str = 'step: {}	'.format(i)
        for k, v in losses.items():
            res_str += '{}: {:.2f}	'.format(k, v)
        print(res_str)

# Inference
print('Running inference...')
controls = model(next(dataset_iter))
audio_gen = model.get_audio_from_outputs(controls)

# Save
np.save(os.path.join(output_dir, 'original_audio.npy'), audio.numpy())
audio_gen_np = audio_gen.numpy() if hasattr(audio_gen, 'numpy') else np.array(audio_gen)
np.save(os.path.join(output_dir, 'audio_gen.npy'), audio_gen_np)

audio_noise = controls['noise']['signal']
audio_noise_np = audio_noise.numpy() if hasattr(audio_noise, 'numpy') else np.array(audio_noise)
np.save(os.path.join(output_dir, 'audio_noise.npy'), audio_noise_np)

print('\nDone! Gin training outputs saved.')
'''

with open('/content/training_gin.py', 'w') as f:
  f.write(SCRIPT)

!unset PYTHONPATH PYTHONHOME && /content/miniconda/bin/python /content/training_gin.py