🐙

Tacotron: Towards End-to-End Speech Synthesis: https://arxiv.org/abs/1703.10135

code: https://github.com/r9y9/tacotron_pytorch

In [2]:
# Path to the trained checkpoint file to load; change this to try another
# training step. The path is relative to the notebook's working directory.
checkpoint_path = "../checkpoints/checkpoint_step46800.pth"

In [3]:
# NOTE: %pylab populates the notebook namespace from numpy and matplotlib
# (see the cell output); the bare figure()/subplot()/imshow() calls used
# later in this notebook rely on it.
%pylab inline
# Default figure size for plots created in this notebook.
rcParams["figure.figsize"] = (16,5)

# Use text & audio modules from existing Tacotron implementation.
# The directory is put first on sys.path so the `text` and `util` imports
# below are looked up there before any other sys.path entry.
import sys
sys.path.insert(0, "../lib/tacotron")
from text import text_to_sequence, symbols
from util import audio

Populating the interactive namespace from numpy and matplotlib
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [4]:
import torch
import numpy as np

from tacotron_pytorch import Tacotron
from synthesis import tts as _tts
from hparams import hparams

import os
import librosa
import librosa.display
import IPython
from IPython.display import Audio

In [5]:
# Sampling rate taken from the hyperparameters; used both for audio playback
# and for scaling the spectrogram's time axis.
fs = hparams.sample_rate
# Hop size passed to librosa.display.specshow when plotting.
# NOTE(review): presumably the STFT frame shift in samples that the model
# was trained with -- confirm against hparams instead of hard-coding.
hop_length = 250

In [6]:
def visualize(alignment, spectrogram):
    """Plot the attention alignment and the spectrogram in one figure.

    Creates a new 16x16 figure with two stacked panels, each with its own
    colorbar: the alignment on top and the spectrogram below.

    Args:
        alignment: Array with a ``.T`` attribute (assumed 2-D -- TODO confirm).
            It is transposed before plotting, and the axes are labelled
            "Decoder timestamp" (x) and "Encoder timestamp" (y).
        spectrogram: Array with a ``.T`` attribute (assumed 2-D -- TODO
            confirm). It is transposed and drawn with
            ``librosa.display.specshow`` on a linear frequency axis; the time
            axis is scaled with the notebook-level ``fs`` and ``hop_length``.
    """
    label_fontsize = 16
    figure(figsize=(16,16))

    # Top panel: attention alignment.
    subplot(2,1,1)
    imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    xlabel("Decoder timestamp", fontsize=label_fontsize)
    ylabel("Encoder timestamp", fontsize=label_fontsize)
    colorbar()

    # Bottom panel: spectrogram.
    subplot(2,1,2)
    librosa.display.specshow(spectrogram.T, sr=fs,
                             hop_length=hop_length, x_axis="time", y_axis="linear")
    xlabel("Time", fontsize=label_fontsize)
    ylabel("Hz", fontsize=label_fontsize)
    colorbar()
    # Run the layout pass only after every axes (including both colorbars)
    # exists; calling it before the last colorbar() leaves that colorbar out
    # of the spacing computation.
    tight_layout()

In [7]:
def tts(model, text, figures=True):
    """Synthesize ``text`` with ``model`` and show the result inline.

    Runs the imported ``_tts`` helper, optionally plots its alignment and
    spectrogram outputs via ``visualize``, and always displays an audio
    player for the generated waveform at the notebook-level rate ``fs``.

    Args:
        model: Model object forwarded unchanged to ``_tts``.
        text: Input passed unchanged to ``_tts``.
        figures: When truthy, draw the alignment/spectrogram figure before
            the audio player.

    Returns:
        None. All results are emitted as notebook display output.
    """
    waveform, alignment, spectrogram = _tts(model, text)

    if figures:
        visualize(alignment, spectrogram)

    player = Audio(waveform, rate=fs)
    IPython.display.display(player)

## Model

In [8]:
# Build the network from the values in hparams. load_state_dict() below
# requires the module layout to match the checkpoint, so these settings must
# be the ones the checkpoint was trained with.
model = Tacotron(n_vocab=len(symbols),
                 embedding_dim=256,
                 mel_dim=hparams.num_mels,
                 linear_dim=hparams.num_freq,
                 r=hparams.outputs_per_step,
                 padding_idx=hparams.padding_idx,
                 use_memory_mask=hparams.use_memory_mask,
                 )

# Remap every stored tensor to CPU while deserializing, so a checkpoint that
# was saved from a GPU can still be loaded on a machine without CUDA.
# load_state_dict() copies the values into the model's own parameters, so the
# model's device is not affected by this remapping.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
checkpoint = torch.load(checkpoint_path,
                        map_location=lambda storage, loc: storage)
model.load_state_dict(checkpoint["state_dict"])

# Set large max_decoder steps to handle long sentence outputs
model.decoder.max_decoder_steps = 500

## TTS samples

### Generated audio and alignment

In [9]:
# Sample 1: two short sentences; figures are drawn (default figures=True).
tts(model, "Hi, my name is Tacotron. I'm still learning a lot from data.")

  alignment = F.softmax(alignment)


In [10]:
# Sample 2: a single short sentence ending in an exclamation mark.
tts(model, "Training neural networks is very hard!")

In [11]:
# Sample 3: a phrase of technical terms, including a hyphenated word.
tts(model, "Generative adversarial network or variational auto-encoder")