[View in Colaboratory](https://colab.research.google.com/github/m-toman/SALB2/blob/master/Lisa_tacotron_wavenet.ipynb)

# Get prerequisites

In [1]:
# Setup cell: installs TensorFlow (capped at 1.9.0), clones the WaveNet vocoder
# and Tacotron-2 repositories, installs their dependencies and downloads the
# pre-trained Tacotron and WaveNet model files.
! pip install "tensorflow<=1.9.0"

import os
from os.path import exists, join, expanduser

# Work from the home directory and delete any checkouts left by an earlier run.
os.chdir(expanduser("~"))
! rm -rf wavenet_vocoder
! rm -rf Tacotron-2

# NOTE(review): both directories were just removed above, so the `exists`
# guards on the clones below look like they are always true - confirm intent.
wavenet_dir = "wavenet_vocoder"
if not exists(wavenet_dir):
  ! git clone https://github.com/r9y9/$wavenet_dir
    
# The Tacotron-2 checkout is switched to the `wavenet3` branch right after cloning.
taco2_dir = "Tacotron-2"
if not exists(taco2_dir):
  ! git clone https://github.com/r9y9/$taco2_dir
  ! cd $taco2_dir && git checkout -B wavenet3 origin/wavenet3
  

# Install dependencies
# Tacotron-2 lists its requirements in requirements.txt; the vocoder is
# installed in editable mode together with its `train` extras.
os.chdir(join(expanduser("~"), taco2_dir))
! pip install -q -r requirements.txt

os.chdir(join(expanduser("~"), wavenet_dir))
! pip install -q -e '.[train]'


# Get models
# The pre-trained Tacotron archive is downloaded into logs-Tacotron/pretrained,
# unpacked there, and the zip file is deleted afterwards.
os.chdir(join(expanduser("~"), taco2_dir))
! mkdir -p logs-Tacotron/pretrained
os.chdir(join(expanduser("~"), taco2_dir, "logs-Tacotron", "pretrained"))
! curl -O -L "http://www.neuratec.com/download/taco-pretrained.zip"
! unzip taco-pretrained.zip
! rm taco-pretrained.zip
# Show the working directory and its contents as a quick check of the unpack step.
! echo $PWD
! ls

# The WaveNet preset and checkpoint are stored inside the vocoder checkout;
# both file names are kept in variables for use by later cells.
os.chdir(join(expanduser("~"), wavenet_dir))
wn_preset = "wavenet_config.json"
wn_checkpoint_path = "checkpoint_step000375000_ema.pth"

# Each file is only downloaded when it is not already present.
if not exists(wn_preset):
  !curl -O -L "http://www.neuratec.com/download/wavenet_config.json"
if not exists(wn_checkpoint_path):
  !curl -O -L "http://www.neuratec.com/download/checkpoint_step000375000_ema.pth"
  
# Leave the kernel in the Tacotron-2 checkout.
os.chdir(join(expanduser("~"), taco2_dir))

print("Done loading prerequisites")

Cloning into 'wavenet_vocoder'...
remote: Counting objects: 1056, done.[K
remote: Total 1056 (delta 0), reused 0 (delta 0), pack-reused 1056[K
Receiving objects: 100% (1056/1056), 20.11 MiB | 20.31 MiB/s, done.
Resolving deltas: 100% (528/528), done.
Cloning into 'Tacotron-2'...
remote: Counting objects: 570, done.[K
remote: Total 570 (delta 0), reused 0 (delta 0), pack-reused 570[K
Receiving objects: 100% (570/570), 8.08 MiB | 19.15 MiB/s, done.
Resolving deltas: 100% (352/352), done.
Branch wavenet3 set up to track remote branch wavenet3 from origin.
Switched to a new branch 'wavenet3'
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  288M  100  288M    0     0  24.0M      0  0:00:12  0:00:12 --:--:-- 24.8M
Archive:  taco-pretrained.zip
  inflating: checkpoint              
  inflating: model.ckpt-202500.data-00000-of-00001  
  inflating: model.ckpt-202500.index  
  in

# Initialize

In [2]:
import librosa.display
import IPython
from IPython.display import Audio
import numpy as np
import torch
from glob import glob
from tqdm import tqdm


# Wavenet
# Switch into the vocoder checkout before importing its modules
# (hparams, train, synthesis) and before opening the relative preset and
# checkpoint paths; presumably the imports are resolved from the current
# directory - confirm if the checkout is ever installed elsewhere.
os.chdir(join(expanduser("~"), wavenet_dir))

# Setup WaveNet vocoder hparams: override the defaults with the JSON preset.
from hparams import hparams
with open(wn_preset) as f:
    hparams.parse_json(f.read())

# Setup WaveNet vocoder
from train import build_model
from synthesis import wavegen

# Use the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = build_model().to(device)

print("Load checkpoint from {}".format(wn_checkpoint_path))
# torch.load restores tensors onto the device they were saved from unless
# map_location says otherwise. On a CPU-only runtime a checkpoint written from
# CUDA tensors would fail to deserialize, so remap every storage to the CPU
# in that case. The GPU path is unchanged.
if use_cuda:
    checkpoint = torch.load(wn_checkpoint_path)
else:
    checkpoint = torch.load(wn_checkpoint_path,
                            map_location=lambda storage, loc: storage)
model.load_state_dict(checkpoint["state_dict"])

# Return to the Tacotron-2 checkout for the following cells.
os.chdir(join(expanduser("~"), taco2_dir))


This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py", line 657, in launch_instance
    app.initialize(argv)
  File "<decorator-gen-121>", line 2, in initialize
  File "/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py", line 87, in catch_config_error
    return method(app, *args, **kwargs)
  File "/usr/local/lib/python3.6/dist-pac

Load checkpoint from checkpoint_step000375000_ema.pth


# Enter sentence(s)

In [0]:
# Make the Tacotron-2 checkout the working directory for the cells below.
os.chdir(os.path.join(os.path.expanduser("~"), taco2_dir))

In [4]:
%%bash
# Write the sentence(s) to synthesize into text_list.txt in the current
# directory (presumably one sentence per line - confirm against synthesize.py).
# The heredoc delimiter is quoted so the shell copies the text literally:
# with an unquoted delimiter, any `$`, backtick or backslash typed into a
# sentence would be expanded instead of being passed through unchanged.
cat << 'EOS' > text_list.txt
This is really awesome, let's do it!
EOS

# Echo the file back so the cell output shows exactly what was written.
cat text_list.txt

This is really awesome, let's do it!


# Synthesis

In [0]:
# Tacotron
os.chdir(join(expanduser("~"), taco2_dir))

# Remove old files if exist
! rm -rf tacotron_output
! python synthesize.py --model='Tacotron' --mode='eval' \
  --hparams='symmetric_mels=False,max_abs_value=4.0,power=1.1,outputs_per_step=1' \
  --text_list=./text_list.txt



# Wavenet
os.chdir(join(expanduser("~"), wavenet_dir))

with open("../Tacotron-2/tacotron_output/eval/map.txt") as f:
  maps = f.readlines()
maps = list(map(lambda x:x[:-1].split("|"), maps))
# filter out invalid ones
maps = list(filter(lambda x:len(x) == 2, maps))

print("List of texts to be synthesized")
for idx, (text,_) in enumerate(maps):
  print(idx, text)
  
waveforms = []

for idx, (text, mel) in enumerate(maps):
  print("\n", idx, text)
  mel_path = join("../Tacotron-2", mel)
  c = np.load(mel_path)
  if c.shape[1] != hparams.num_mels:
    np.swapaxes(c, 0, 1)
  # Range [0, 4] was used for training Tacotron2 but WaveNet vocoder assumes [0, 1]
  c = np.interp(c, (0, 4), (0, 1))
 
  # Generate
  waveform = wavegen(model, c=c, fast=True, tqdm=tqdm)
  
  waveforms.append(waveform)

  # Audio
  IPython.display.display(Audio(waveform, rate=hparams.sample_rate))

loaded model at logs-Tacotron/pretrained/model.ckpt-202500
Hyperparameters:
  allow_clipping_in_normalization: True
  attention_dim: 128
  attention_filters: 32
  attention_kernel: (31,)
  cleaners: english_cleaners
  cumulative_weights: True
  decoder_layers: 2
  decoder_lstm_units: 1024
  embedding_dim: 512
  enc_conv_channels: 512
  enc_conv_kernel_size: (5,)
  enc_conv_num_layers: 3
  encoder_lstm_units: 256
  fft_size: 1024
  fmax: 7600
  fmin: 125
  frame_shift_ms: None
  griffin_lim_iters: 60
  hop_size: 256
  impute_finished: False
  input_type: raw
  log_scale_min: -32.23619130191664
  mask_encoder: False
  mask_finished: False
  max_abs_value: 4.0
  max_iters: 2500
  min_level_db: -100
  num_freq: 513
  num_mels: 80
  outputs_per_step: 1
  postnet_channels: 512
  postnet_kernel_size: (5,)
  postnet_num_layers: 5
  power: 1.1
  predict_linear: False
  prenet_layers: [256, 256]
  quantize_channels: 65536
  ref_level_db: 20
  rescale: True

 33%|███▎      | 13026/38912 [07:49<15:33, 27.72it/s]

In [15]:
# Replay each stored waveform next to the sentence it was generated from,
# without running the synthesis again.
for position, (sentence, _mel_file) in enumerate(maps):
  print(position, sentence)
  player = Audio(waveforms[position], rate=hparams.sample_rate)
  IPython.display.display(player)

0 This is really awesome, let's do it!
