<a href="https://colab.research.google.com/github/coryellj4/4540/blob/main/Wavenet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/Vichoko/pytorch-wavenet.git

fatal: destination path 'pytorch-wavenet' already exists and is not an empty directory.


In [6]:
import os
import sys
# Put the cloned 'pytorch-wavenet' directory on the module search path so the
# project modules imported below can be found.
sys.path.append('pytorch-wavenet')

import torch
# NOTE(review): the star imports below are the only visible source of several
# names used later in this notebook (WaveNetModel, WavenetTrainer, Logger,
# load_latest_model_from, generate_audio, and presumably `tf`). Which module
# provides which name cannot be determined from this notebook alone; confirm
# before replacing them with explicit imports.
from wavenet_model import *
from audio_data import WavenetDataset
from wavenet_training import *
from model_logging import *



In [7]:
# Pick the tensor types for data and labels: the CUDA variants when a GPU is
# available, the CPU variants otherwise.
use_cuda = torch.cuda.is_available()
if use_cuda:
    print('use gpu')

dtype = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor  # data type
ltype = torch.cuda.LongTensor if use_cuda else torch.LongTensor  # label type

use gpu


In [8]:
# Hyper-parameters of the network, gathered in one mapping so they can be
# read (and tweaked) at a glance before the model is built.
model_config = {
    'layers': 5,
    'blocks': 2,
    'dilation_channels': 32,
    'residual_channels': 32,
    'skip_channels': 512,
    'end_channels': 256,
    'output_length': 16,
    'dtype': dtype,
    'bias': True,
}
model = WaveNetModel(**model_config)
# Alternative: resume from the most recent snapshot instead of a fresh model.
# model = load_latest_model_from('snapshots', use_cuda=use_cuda)

# Summarize the architecture and its size.
print('model: ', model)
print('receptive field: ', model.receptive_field)
print('parameter count: ', model.parameter_count())

model:  WaveNetModel(
  (filter_convs): ModuleList(
    (0-9): 10 x Conv1d(32, 32, kernel_size=(2,), stride=(1,))
  )
  (gate_convs): ModuleList(
    (0-9): 10 x Conv1d(32, 32, kernel_size=(2,), stride=(1,))
  )
  (residual_convs): ModuleList(
    (0-9): 10 x Conv1d(32, 32, kernel_size=(1,), stride=(1,))
  )
  (skip_convs): ModuleList(
    (0-9): 10 x Conv1d(32, 512, kernel_size=(1,), stride=(1,))
  )
  (start_conv): Conv1d(256, 32, kernel_size=(1,), stride=(1,))
  (end_conv_1): Conv1d(512, 256, kernel_size=(1,), stride=(1,))
  (end_conv_2): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
)
receptive field:  63
parameter count:  426464


In [9]:
# Both the serialized dataset and the folder holding its source audio live in
# the cloned repository. Derive the two paths from one directory so they
# cannot drift apart: previously `file_location` was relative to the
# notebook's working directory while `dataset_file` pointed into the clone.
# NOTE(review): `file_location` presumably only matters when `dataset_file`
# has to be (re)built from audio files; confirm in audio_data.py.
dataset_dir = os.path.join('pytorch-wavenet', 'train_samples', 'bach_chaconne')

data = WavenetDataset(dataset_file=os.path.join(dataset_dir, 'dataset.npz'),
                      # each item spans the receptive field plus the extra
                      # samples needed to predict `output_length` targets
                      item_length=model.receptive_field + model.output_length - 1,
                      target_length=model.output_length,
                      file_location=dataset_dir,
                      test_stride=500)
print('the dataset has ' + str(len(data)) + ' items')

one hot input
the dataset has 598464 items


In [None]:
def generate_and_log_samples(step):
    """Generate audio from the latest snapshot and log it at two temperatures.

    Loads the newest model from the 'snapshots' directory with
    ``use_cuda=False``, then for temperature 0.5 and 1.0 in turn generates a
    clip of ``length=32000``, converts it to a float32 tensor and passes it to
    ``logger.audio_summary`` (``sr=16000``) under the tags 'temperature_0.5'
    and 'temperature_1.0'.

    Args:
        step: value forwarded to ``logger.audio_summary`` for each clip.

    NOTE(review): depends on the module-level names ``logger`` and ``tf``.
    ``tf`` is never imported explicitly in this notebook; it presumably comes
    in through one of the star imports. Confirm before relying on this.
    """
    clip_length = 32000
    snapshot_model = load_latest_model_from('snapshots', use_cuda=False)
    print("start generating...")

    # One (tag, temperature) pair per logged clip, processed in this order.
    for tag, temperature in (('temperature_0.5', 0.5), ('temperature_1.0', 1.)):
        clip = generate_audio(snapshot_model,
                              length=clip_length,
                              temperatures=[temperature])
        clip_tensor = tf.convert_to_tensor(clip, dtype=tf.float32)
        logger.audio_summary(tag, clip_tensor, step, sr=16000)

    print("audio clips generated")
# Move the model to the GPU only when one is available. `dtype` and `ltype`
# were selected from `use_cuda`, so an unconditional model.cuda() would raise
# on CPU-only machines and contradict those settings.
if use_cuda:
    model.cuda()

# Reporting cadence, in training steps.
# NOTE(review): generate_and_log_samples is defined in this notebook but is not
# passed to this Logger, so nothing in this cell calls it.
logger = Logger(log_interval=200,
                validation_interval=400,
                generate_interval=1000)

# Snapshots are written to ./snapshots; create it up front so saving cannot
# fail on a missing directory.
os.makedirs('snapshots', exist_ok=True)
trainer = WavenetTrainer(model=model,
                         dataset=data,
                         lr=0.001,
                         snapshot_path='snapshots',
                         snapshot_name='chaconne_model',
                         snapshot_interval=1000,
                         logger=logger,
                         dtype=dtype,
                         ltype=ltype)

print('start training...')
trainer.train(batch_size=16,
              epochs=1)



start training...
epoch 0
one training step does take approximately 0.16564294576644897 seconds)
loss at step 200: tensor(5.0171, device='cuda:0', grad_fn=<DivBackward0>)
loss at step 400: tensor(4.4602, device='cuda:0', grad_fn=<DivBackward0>)
validation loss: tensor(4.5600, device='cuda:0', grad_fn=<DivBackward0>)
validation accuracy: tensor(5.4003, device='cuda:0')%
loss at step 600: tensor(4.1217, device='cuda:0', grad_fn=<DivBackward0>)
loss at step 800: tensor(3.7348, device='cuda:0', grad_fn=<DivBackward0>)
validation loss: tensor(3.9258, device='cuda:0', grad_fn=<DivBackward0>)
validation accuracy: tensor(7.6626, device='cuda:0')%
loss at step 1000: tensor(3.6181, device='cuda:0', grad_fn=<DivBackward0>)
loss at step 1200: tensor(3.5487, device='cuda:0', grad_fn=<DivBackward0>)
validation loss: tensor(3.8477, device='cuda:0', grad_fn=<DivBackward0>)
validation accuracy: tensor(7.6835, device='cuda:0')%
loss at step 1400: tensor(3.5066, device='cuda:0', grad_fn=<DivBackward0>)
l

In [None]:


# Seed the generator with real audio: take the first element of dataset item
# 250000 and turn its one-hot vectors into integer class indices by taking
# the position of the maximum along dim 0.
start_data = data[250000][0].max(dim=0).indices

def prog_callback(step, total_steps):
    """Print generation progress as a whole-number percentage.

    Args:
        step: number of steps completed so far.
        total_steps: total number of steps; must be non-zero.
    """
    percent_done = 100 * step // total_steps  # floor division: no decimals
    print(f"{percent_done}% generated")

# Generation is run with the model moved to the CPU.
model.cpu()

# Settings for the generation run: 160000 new samples continuing from
# `start_data`, reporting progress through `prog_callback` every 1000 steps.
generation_options = {
    'num_samples': 160000,
    'first_samples': start_data,
    'progress_callback': prog_callback,
    'progress_interval': 1000,
    'temperature': 1.0,
    'regularize': 0.,
}
generated = model.generate_fast(**generation_options)



In [None]:
import IPython.display as ipd

# Render an inline audio player for the generated samples (rate=16000).
# This must stay the last expression of the cell so the player is displayed.
ipd.Audio(data=generated, rate=16000)