In [None]:
# Reload edited modules automatically before each cell runs (autoreload mode 2).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Move the working directory up one level.
# NOTE(review): this is relative, so re-running the cell moves up again; run it
# once per kernel session. Presumably needed so `wavenet` is importable — confirm.
%cd ..

In [None]:
import os

import numpy as np
import torch
import librosa
from torch.nn import functional as F
import matplotlib.pyplot as plt
import librosa.display
import IPython.display as ipd

from wavenet import model, train, sample, audio, datasets, utils, viz

# Train on simple Sines

A very simple dataset that the model should be able to fit easily: sine waves, all at 440 Hz and amplitude 1.0, but with different phases.

We aim to train in around 30 minutes. With batches of 16 running at about 2.35 s/it, 30*60 = 1800 seconds gives roughly 765 steps over batches of 16 one-second clips. That is around 200 minutes of audio to iterate over, so 20 epochs of 10 minutes each seems reasonable. Each epoch is then around 40 steps, i.e. 40*16 = 640 examples.

In [None]:
# Toggle: when `dry` is True, the WANDB_MODE environment variable is set to
# 'dryrun'; when False the environment is left untouched.
dry = False
if dry:
    os.environ.update({'WANDB_MODE': 'dryrun'})

In [None]:
# Model hyperparameters, with n_audio_chans set to 2.
p = model.HParams(n_audio_chans=2)

# Train and test sets are built with identical arguments (hz=440, amp=1.0).
# The leading 640 and 1 are presumably the example count and clip length in
# seconds — TODO confirm against datasets.Sines.
X = datasets.Sines(640, 1, p, hz=440, amp=1.0)
X_test = datasets.Sines(640, 1, p, hz=440, amp=1.0)

# Tensor form of the training data, used by the plot/playback cells.
# NOTE(review): the meaning of the second argument (20) is not visible here.
batch = datasets.to_tensor(X, 20)

In [None]:
# Build the network from the model hyperparameters.
m = model.Wavenet(p)

# Wire the model and both datasets into a Trainer.
# NOTE(review): the purpose of the trailing positional None is not visible
# from this notebook — confirm against train.Trainer.
t = train.Trainer(
    m,
    X,
    X_test,
    train.HParams(
        max_epochs=20,
        batch_size=16,
        num_workers=10,
        learning_rate=0.0021,
    ),
    None,
)

In [None]:
# Plot a track from the batch, limited to one receptive field of samples, and
# keep plot_track's return value; it is used to index `batch` for playback.
track_i = viz.plot_track(
    batch,
    n_samples=p.receptive_field_size(),
)

In [None]:
# Play the selected batch entry: convert it to numpy, pass it through
# audio.mu_expand, then hand the result to the notebook audio widget.
ipd.Audio(
    data=audio.mu_expand(batch[track_i].numpy(), p),
    rate=p.sampling_rate,
)

In [None]:
# Start training with the Trainer `t`; per the notes above, the run is budgeted
# at roughly 30 minutes.
t.train()

In [None]:
# Sample from the model with batch_size=10 and n_samples=32000, using the
# decoder returned by utils.decode_nucleus(). The first element of the
# returned pair is discarded.
_, track = sample.sample(
    m,
    decoder=utils.decode_nucleus(),
    n_samples=32000,
    batch_size=10,
)

In [None]:
# Plot a sampled track, showing one sixth of a receptive field of samples.
viz.plot_track(
    track,
    n_samples=p.receptive_field_size() // 6,
)

In [None]:
# Play the first sampled track.
# NOTE(review): unlike the batch playback cell, this does not call
# audio.mu_expand, and it plays index 0 rather than the index returned by
# plot_track — confirm that sample.sample already returns playable audio.
ipd.Audio(data=track[0], rate=p.sampling_rate)