In [None]:
# Notebook setup: enable automatic module reloading and inline matplotlib
# output, then change the working directory two levels up (presumably to the
# repository root so the `wavenet` import below resolves — confirm).
%load_ext autoreload
%autoreload 2
%matplotlib inline
# NOTE(review): this path is relative, so re-running the cell without a kernel
# restart moves two more directories up each time.
%cd ../..

In [None]:
import copy
import os
import pprint

import numpy as np
import torch
import librosa
from torch.nn import functional as F
import matplotlib.pyplot as plt

import librosa.display
import IPython.display as ipd

from wavenet import model, train, sample, audio, datasets, utils, viz, debug

In [None]:
# Shared pretty-printer (2-space indent) used by later cells to show hyperparameters.
pp = pprint.PrettyPrinter(indent=2)

# Train on 440 hz Sines with random phases

A very simple dataset that the model should be able to fit easily: sines all at 440 Hz, with unit amplitude and random phases. Some ways of reducing the size of the problem:

- 4 cycles of a sine should do, so there is no need for a full second of audio: 160 samples
- Mu compress to 7 bits
- Fast generate only 160 samples
- Mono

We will try to train in around 10 minutes. 

In [None]:
# This cell contains papermill-tagged parameters.
# They can be overridden from the CLI when training:
# papermill in.ipynb out.ipynb -p batch_norm True
# NOTE(review): the example overrides `batch_norm`, which is not one of the
# parameters defined in this cell — confirm the example is still accurate.

learning_rate = 0.015  # passed to train.HParams for the 440 Hz run
batch_size = 16  # passed to train.HParams for the 440 Hz run
max_epochs = 10  # passed to train.HParams for the 440 Hz run

In [None]:
# Model hyperparameters for the small network used in the fixed-frequency
# experiments (440 Hz and 20 Hz).
p_small = model.HParams(
    mixed_precision=True,
    embed_inputs=True,
    n_audio_chans=1,  # mono, per the intro
    n_classes=2**7,  # 7 bits -> 128 classes
    compress=False,  # NOTE(review): the intro says "Mu compress to 7 bits" — confirm False is intended
    sample_length=160,  # 160 samples, per the intro
    dilation_stacks=1,
    n_layers=7,
    sample_from_gpu=True,
    seed=32455,  # presumably the value utils.seed(p_small) reads — confirm
    n_chans=32,
    n_chans_embed=256,
    n_chans_skip=256,
    n_chans_res=32,
    n_chans_end=64,
)

# Show the full hyperparameter set as a plain dict.
pp.pprint(dict(p_small))

In [None]:
# Training hyperparameters for the 440 Hz run, taken from the papermill
# parameters cell so they can be overridden from the CLI.
tp = train.HParams(
    max_epochs=max_epochs, 
    batch_size=batch_size, 
    num_workers=1, 
    learning_rate=learning_rate
)

# Show the training hyperparameters as a plain dict.
pp.pprint(dict(tp))

In [None]:
# Build and plot the 440 Hz, unit-amplitude sine dataset. utils.seed is called
# first, presumably so the random phases are reproducible — confirm what it resets.
utils.seed(p_small)
ds = datasets.Sines(640, p_small, hz=440, amp=1.0)  # 640: presumably the number of examples — confirm
viz.plot_audio_dataset(ds, p_small)

In [None]:
# Re-seed immediately before constructing the model, then summarize it.
utils.seed(p_small)
m = model.Wavenet(p_small)
debug.summarize(m)

In [None]:
# Wrap the model, dataset and training hyperparameters in a Trainer.
# NOTE(review): the meaning of the two positional `None` arguments is not
# visible from this notebook — confirm against train.Trainer's signature.
t = train.Trainer(m, ds, None, tp, None)
t.metrics  # bare expression: displays the trainer's metrics object

In [None]:
# Re-seed, then run training on the trainer built from `m`, `ds` and `tp`.
utils.seed(p_small)
t.train()

In [None]:
# Plot model samples with each sampler in turn (`sample.fast`, then
# `sample.simple`), re-seeding before each call so both start from the same seed.
for sampler in [sample.fast, sample.simple]:
    utils.seed(p_small)
    viz.plot_model_samples(m, ds.transforms, sampler, p_small)

In [None]:
# Finish metrics tracking for this run (presumably flushes/closes the logger — confirm).
t.metrics.finish()

# Train on random Sinusoids

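Harder: the frequency now varies between examples (the dataset below uses `minhz=20`, `maxhz=400`) instead of being fixed at 440 Hz.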

In [None]:
# Derive a larger configuration from p_small: one more layer and twice the
# skip channels.
# NOTE(review): copy.copy is a shallow copy; that is fine only if HParams holds
# no mutable values shared with p_small — confirm.
p_moderate = copy.copy(p_small)
p_moderate.n_layers = 8  # p_small uses 7
p_moderate.n_chans_skip = 512  # p_small uses 256
# NOTE(review): other cells display hyperparameters with pp.pprint(dict(...));
# confirm `__dict__` shows the same full set of values.
p_moderate.__dict__

In [None]:
# Build and plot the random-sinusoid dataset: unit amplitude, with frequency
# bounds minhz=20 and maxhz=400. Seeded first, presumably for reproducibility.
utils.seed(p_moderate)
ds = datasets.Sines(6400, p_moderate, amp=1.0, minhz=20, maxhz=400)  # 6400 vs 640 in the 440 Hz dataset
viz.plot_audio_dataset(ds, p_moderate)

In [None]:
# Training hyperparameters for the random-sinusoid run. These are hard-coded
# here rather than taken from the papermill parameters cell.
tp = train.HParams(
    max_epochs=50,  # papermill default for the 440 Hz run is 10
    batch_size=128,  # papermill default for the 440 Hz run is 16
    num_workers=1, 
    learning_rate=0.03  # papermill default for the 440 Hz run is 0.015
)

# Show the training hyperparameters as a plain dict.
pp.pprint(dict(tp))

In [None]:
# Seed immediately before building the model, as the 440 Hz section does, so
# that (assuming utils.seed resets the RNGs used for weight initialisation)
# the initial weights do not depend on how much random state earlier cells
# consumed or on the order in which cells were re-run.
utils.seed(p_moderate)
m = model.Wavenet(p_moderate)
debug.summarize(m)

In [None]:
# Wrap the moderate model, the random-sinusoid dataset and `tp` in a Trainer.
# NOTE(review): the meaning of the two positional `None` arguments is not
# visible from this notebook — confirm against train.Trainer's signature.
t = train.Trainer(m, ds, None, tp, None)
t.metrics  # bare expression: displays the trainer's metrics object

In [None]:
# Re-seed, then run training on the trainer built from `m`, `ds` and `tp`.
utils.seed(p_moderate)
t.train()

In [None]:
# Plot model samples with each sampler in turn (`sample.fast`, then
# `sample.simple`), re-seeding before each call so both start from the same seed.
for sampler in [sample.fast, sample.simple]:
    utils.seed(p_moderate)
    viz.plot_model_samples(m, ds.transforms, sampler, p_moderate)

In [None]:
# Finish metrics tracking for this run (presumably flushes/closes the logger — confirm).
t.metrics.finish()

## 🚨 Observations and Questions

- This seems to need much longer to converge. Using 2x epochs
- Using 2x params
- Why do we need to scale up the model and the training so much? It is a harder problem, but surely not that much harder?
- The problem seems to come up with low frequency sinusoids

# Train on 20 hz Sines

Here's a bit of a puzzle. Given the same resources as training on 440 hz sines with random phases, training on 20 hz sines performs much worse:

In [None]:
# Build and plot the 20 Hz dataset with the same arguments as the 440 Hz one
# apart from `hz`. Seeded first, presumably so the data is reproducible.
utils.seed(p_small)
ds = datasets.Sines(640, p_small, hz=20, amp=1.0)  # only `hz` differs from the 440 Hz dataset call
viz.plot_audio_dataset(ds, p_small)

In [None]:
# Same training budget as the 440 Hz experiment. Reuse the papermill
# parameters (learning_rate, batch_size, max_epochs) instead of repeating
# their default values, so that a CLI override applies to both runs and the
# 20 Hz vs 440 Hz comparison stays like-for-like.
tp = train.HParams(
    max_epochs=max_epochs,
    batch_size=batch_size,
    num_workers=1,
    learning_rate=learning_rate,
)

# Show the training hyperparameters as a plain dict.
pp.pprint(dict(tp))

In [None]:
# Seed immediately before building the model, exactly as the 440 Hz section
# does. Assuming utils.seed resets the RNGs used for weight initialisation,
# this makes both experiments start from the same initial weights, so the
# signal frequency is the only thing that differs between them.
utils.seed(p_small)
m = model.Wavenet(p_small)
debug.summarize(m)

In [None]:
# Wrap the small model, the 20 Hz dataset and `tp` in a Trainer.
# NOTE(review): the meaning of the two positional `None` arguments is not
# visible from this notebook — confirm against train.Trainer's signature.
t = train.Trainer(m, ds, None, tp, None)
t.metrics  # bare expression: displays the trainer's metrics object

In [None]:
# Re-seed, then run training on the trainer built from `m`, `ds` and `tp`.
utils.seed(p_small)
t.train()

In [None]:
# Plot model samples with each sampler in turn (`sample.fast`, then
# `sample.simple`), re-seeding before each call so both start from the same seed.
for sampler in [sample.fast, sample.simple]:
    utils.seed(p_small)
    viz.plot_model_samples(m, ds.transforms, sampler, p_small)

In [None]:
# Finish metrics tracking for this run (presumably flushes/closes the logger — confirm).
t.metrics.finish()

## 🚨 Observations and Questions

- Why is this doing so much worse, given the only difference is 20 Hz vs 440 Hz?