[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/tts_realtime_demo.ipynb)

# ESPnet real time E2E-TTS demonstration

This notebook provides a demonstration of real-time E2E-TTS using ESPnet-TTS and ParallelWaveGAN.

- ESPnet: https://github.com/espnet/espnet
- ParallelWaveGAN: https://github.com/kan-bayashi/ParallelWaveGAN

Author: Tomoki Hayashi (@kan-bayashi)

## Install

In [0]:
# install minimal components
# Python-side requirements: the ParallelWaveGAN vocoder package plus the helper
# libraries (YAML, text normalization, argument parsing, G2P, NLTK).
!pip install -q parallel_wavegan PyYaml unidecode ConfigArgparse g2p_en nltk
# ESPnet is used from a source checkout rather than a pip install; the setup
# cells below add the clone to sys.path.
!git clone -q https://github.com/espnet/espnet.git
# Pin the checkout to a fixed commit (on a local branch named v.0.6.0) so the
# demo does not follow the moving master branch.
!cd espnet && git fetch && git checkout -b v.0.6.0 8bfb7ac6974699e9720558a4ef20376805e38d6b



---
## English demo


### Download pretrained models

You can select one of three models. Please run only the cells for the selected model.


#### (a) Tacotron2

In [0]:
# download char based Tacotron2
!./espnet/utils/download_from_google_drive.sh \
    https://drive.google.com/open?id=1tv9GKyRT4CDsvUWKwH3s_OfXkiTi0gw7 downloads/ tar.gz > /dev/null 2>&1
!./espnet/utils/download_from_google_drive.sh \
    https://drive.google.com/open?id=1hiZn14ITUDM1nkn-GkaN_M3oaTOUcn1n downloads/ tar.gz > /dev/null 2>&1

# set path
trans_type = "char"
dict_path = "downloads/data/lang_1char/train_no_dev_units.txt"
model_path = "downloads/exp/train_no_dev_pytorch_train_pytorch_tacotron2.v3/results/model.last1.avg.best"
vocoder_path = "downloads/ljspeech.parallel_wavegan.v1/checkpoint-400000steps.pkl"
vocoder_conf = "downloads/ljspeech.parallel_wavegan.v1/config.yml"
print("sucessfully finished download.")

#### (b) Transformer

In [0]:
# download phoneme based Transformer
!./espnet/utils/download_from_google_drive.sh \
    https://drive.google.com/open?id=1tv9GKyRT4CDsvUWKwH3s_OfXkiTi0gw7 downloads/ tar.gz > /dev/null 2>&1
!./espnet/utils/download_from_google_drive.sh \
    https://drive.google.com/open?id=1M_w7nxI6AfbtSHpMO-exILnAc_aUYvXP downloads/ tar.gz > /dev/null 2>&1

# set path
trans_type = "phn"
dict_path = "downloads/data/lang_1phn/train_no_dev_units.txt"
model_path = "downloads/exp/phn_train_no_dev_pytorch_train_pytorch_transformer.v3/results/model.last1.avg.best"
vocoder_path = "downloads/ljspeech.parallel_wavegan.v1/checkpoint-400000steps.pkl"
vocoder_conf = "downloads/ljspeech.parallel_wavegan.v1/config.yml"
print("sucessfully finished download.")

#### (c) FastSpeech

In [0]:
# download phoneme based FastSpeech
!./espnet/utils/download_from_google_drive.sh \
    https://drive.google.com/open?id=1tv9GKyRT4CDsvUWKwH3s_OfXkiTi0gw7 downloads/ tar.gz > /dev/null 2>&1
!./espnet/utils/download_from_google_drive.sh \
    https://drive.google.com/open?id=1otwFFYiVMcbbgY55xk6DrOfb8Pi5uqjJ downloads/ tar.gz > /dev/null 2>&1

# set path
trans_type = "phn"
dict_path = "downloads/data/lang_1phn/train_no_dev_units.txt"
model_path = "downloads/exp/phn_train_no_dev_pytorch_train_fastspeech.v4/results/model.last1.avg.best"
vocoder_path = "downloads/ljspeech.parallel_wavegan.v1/checkpoint-400000steps.pkl"
vocoder_conf = "downloads/ljspeech.parallel_wavegan.v1/config.yml"
print("sucessfully finished download.")

### Setup

In [0]:
# add path
import sys
# recipe-local directory; presumably where the `text.cleaners` module imported
# further down in this cell lives
sys.path.append("espnet/egs/ljspeech/tts1/local")
# root of the ESPnet clone, so that `import espnet...` resolves to the checkout
sys.path.append("espnet")

# define device
import torch
# Use the GPU when one is available, otherwise fall back to CPU so the cell
# still runs on a runtime without CUDA (synthesis will just be slower).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# define E2E-TTS model
from argparse import Namespace
from espnet.asr.asr_utils import get_model_conf
from espnet.asr.asr_utils import torch_load
from espnet.utils.dynamic_import import dynamic_import
# The training configuration stored with the checkpoint tells us the input /
# output dimensions and which model class to instantiate.
idim, odim, train_args = get_model_conf(model_path)
model_class = dynamic_import(train_args.model_module)
model = model_class(idim, odim, train_args)
torch_load(model_path, model)
model = model.eval().to(device)
# decoding options handed to model.inference() in the synthesis cell
inference_args = Namespace(**{"threshold": 0.5, "minlenratio": 0.0, "maxlenratio": 10.0})

# define neural vocoders
import yaml
from parallel_wavegan.models import ParallelWaveGANGenerator
with open(vocoder_conf) as f:
    # NOTE(review): yaml.Loader can construct arbitrary Python objects. This is
    # acceptable only because the config comes from a trusted archive.
    config = yaml.load(f, Loader=yaml.Loader)
vocoder = ParallelWaveGANGenerator(**config["generator_params"])
# NOTE(review): torch.load unpickles the checkpoint; only load trusted files.
# The weights are loaded on CPU first and moved to `device` below.
vocoder.load_state_dict(torch.load(vocoder_path, map_location="cpu")["model"]["generator"])
vocoder.remove_weight_norm()
vocoder = vocoder.eval().to(device)

# define text frontend
from text.cleaners import custom_english_cleaners
from g2p_en import G2p

# Token dictionary: every line holds exactly one "<symbol> <index>" pair.
with open(dict_path) as f:
    lines = [row.replace("\n", "").split(" ") for row in f.readlines()]
char_to_id = dict((symbol, int(index)) for symbol, index in lines)

# grapheme-to-phoneme converter used by frontend() when trans_type == "phn"
g2p = G2p()
def frontend(text):
    """Turn raw English text into the id tensor consumed by the TTS model.

    The text is first normalized with ``custom_english_cleaners``. Depending on
    the global ``trans_type`` it is then tokenized into phonemes (``"phn"``,
    via ``g2p``) or into single characters (any other value). Tokens are mapped
    through ``char_to_id``: whitespace tokens become ``<space>``, tokens missing
    from the dictionary become ``<unk>``, and ``idim - 1`` is appended as
    ``<eos>``.

    Args:
        text (str): Sentence to synthesize.

    Returns:
        torch.LongTensor: 1-D tensor of token ids placed on ``device``.
    """
    cleaned = custom_english_cleaners(text)

    if trans_type == "phn":
        # drop the bare-space entries from the g2p output, keep everything else
        cleaned = " ".join(token for token in g2p(cleaned) if token != " ")
        print(f"Cleaned text: {cleaned}")
        tokens = cleaned.split(" ")
    else:
        print(f"Cleaned text: {cleaned}")
        tokens = list(cleaned)

    def to_id(token):
        # The special symbols are looked up lazily, so a dictionary without
        # "<space>" only fails if a whitespace token actually shows up.
        if token.isspace():
            return char_to_id["<space>"]
        if token in char_to_id:
            return char_to_id[token]
        return char_to_id["<unk>"]

    ids = [to_id(token) for token in tokens]
    ids.append(idim - 1)  # <eos>
    return torch.LongTensor(ids).view(-1).to(device)

import nltk
# Fetch NLTK's 'punkt' resource up front.
# NOTE(review): presumably required by the text frontend (g2p_en / cleaners) at
# synthesis time; confirm which component actually uses it.
nltk.download('punkt')
print("Now ready to synthesize!")

### Synthesis

In [0]:
import time
print("Input your favorite sentencne in English!")
input_text = input()

with torch.no_grad():
    start = time.time()
    # text -> token ids
    x = frontend(input_text)
    # acoustic model: token ids -> feature sequence (only the first output is used)
    c, _, _ = model.inference(x, inference_args)
    # noise input for the vocoder: hop_size samples per row of c
    z = torch.randn(1, 1, c.size(0) * config["hop_size"]).to(device)
    # add a leading batch axis, swap the last two axes, then replicate-pad the
    # last axis by aux_context_window on both sides
    c = torch.nn.ReplicationPad1d(
        config["generator_params"]["aux_context_window"])(c.unsqueeze(0).transpose(2, 1))
    y = vocoder(z, c).view(-1)
    # CUDA kernels are launched asynchronously, so wait for them to finish.
    # Otherwise the timer could stop before the vocoder has actually produced
    # the waveform and the RTF would be too optimistic.
    if y.is_cuda:
        torch.cuda.synchronize()
    elapsed = time.time() - start
# real-time factor = processing time / duration of the generated audio
rtf = elapsed / (len(y) / config["sampling_rate"])
print(f"RTF = {rtf:5f}")

from IPython.display import display, Audio
# y is already 1-D at this point, so it can be handed to the player directly
display(Audio(y.cpu().numpy(), rate=config["sampling_rate"]))



---

## Japanese demo


### Install Japanese dependencies

In [0]:
# install dependency
!mkdir tools && cd tools && git clone https://github.com/r9y9/hts_engine_API.git
!cd tools/hts_engine_API/src && ./waf configure && ./waf build install
!cd tools && git clone https://github.com/r9y9/open_jtalk.git
!mkdir -p tools/open_jtalk/src/build && cd tools/open_jtalk/src/build && cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON .. && make install
!cp tools/open_jtalk/src/build/*.so* /usr/lib64-nvidia
!cd tools && git clone https://github.com/r9y9/pyopenjtalk.git
!cd tools/pyopenjtalk && pip install .

### Download pretrained models

Here you can select either Tacotron2 or Transformer. Please run only the cells for the selected model.


#### (a) Tacotron 2

In [0]:
# download pretrained models
!./espnet/utils/download_from_google_drive.sh \
    https://drive.google.com/open?id=1OwrUQzAmvjj1x9cDhnZPp6dqtsEqGEJM downloads tar.gz > /dev/null 2>&1
!./espnet/utils/download_from_google_drive.sh \
    https://drive.google.com/open?id=1kp5M4VvmagDmYckFJa78WGqh1drb_P9t downloads tar.gz > /dev/null 2>&1

# set path
dict_path = "downloads/data/lang_1phn/train_no_dev_units.txt"
model_path = "downloads/exp/train_no_dev_pytorch_train_pytorch_tacotron2_phn/results/model.last1.avg.best"
vocoder_path = "downloads/jsut.parallel_wavegan.v1/checkpoint-400000steps.pkl"
vocoder_conf = "downloads/jsut.parallel_wavegan.v1/config.yml"

print("sucessfully finished download.")

#### (b) Transformer

In [0]:
# download pretrained models
!./espnet/utils/download_from_google_drive.sh \
    https://drive.google.com/open?id=1OwrUQzAmvjj1x9cDhnZPp6dqtsEqGEJM downloads tar.gz > /dev/null 2>&1
!./espnet/utils/download_from_google_drive.sh \
    https://drive.google.com/open?id=1mEnZfBKqA4eT6Bn0eRZuP6lNzL-IL3VD downloads tar.gz > /dev/null 2>&1

# set path
dict_path = "downloads/data/lang_1phn/train_no_dev_units.txt"
model_path = "downloads/exp/train_no_dev_pytorch_train_pytorch_transformer_phn/results/model.last1.avg.best"
vocoder_path = "downloads/jsut.parallel_wavegan.v1/checkpoint-400000steps.pkl"
vocoder_conf = "downloads/jsut.parallel_wavegan.v1/config.yml"

print("sucessfully finished download.")

### Setup

In [0]:
# add path
import sys
# NOTE(review): LJSpeech recipe directory; no import in this cell visibly
# relies on it, kept as in the English setup.
sys.path.append("espnet/egs/ljspeech/tts1/local")
# root of the ESPnet clone, so that `import espnet...` resolves to the checkout
sys.path.append("espnet")

# define device
import torch
# Use the GPU when one is available, otherwise fall back to CPU so the cell
# still runs on a runtime without CUDA (synthesis will just be slower).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# define E2E-TTS model
from argparse import Namespace
from espnet.asr.asr_utils import get_model_conf
from espnet.asr.asr_utils import torch_load
from espnet.utils.dynamic_import import dynamic_import
# The training configuration stored with the checkpoint tells us the input /
# output dimensions and which model class to instantiate.
idim, odim, train_args = get_model_conf(model_path)
model_class = dynamic_import(train_args.model_module)
model = model_class(idim, odim, train_args)
torch_load(model_path, model)
model = model.eval().to(device)
# decoding options handed to model.inference() in the synthesis cell
# (threshold is 0.3 here, whereas the English setup uses 0.5)
inference_args = Namespace(**{"threshold": 0.3, "minlenratio": 0.0, "maxlenratio": 10.0})

# define neural vocoders
import yaml
from parallel_wavegan.models import ParallelWaveGANGenerator
with open(vocoder_conf) as f:
    # NOTE(review): yaml.Loader can construct arbitrary Python objects. This is
    # acceptable only because the config comes from a trusted archive.
    config = yaml.load(f, Loader=yaml.Loader)
vocoder = ParallelWaveGANGenerator(**config["generator_params"])
# NOTE(review): torch.load unpickles the checkpoint; only load trusted files.
# The weights are loaded on CPU first and moved to `device` below.
vocoder.load_state_dict(torch.load(vocoder_path, map_location="cpu")["model"]["generator"])
vocoder.remove_weight_norm()
vocoder = vocoder.eval().to(device)

# define text frontend
import pyopenjtalk

# Token dictionary: every line holds exactly one "<symbol> <index>" pair.
with open(dict_path) as f:
    lines = [row.replace("\n", "").split(" ") for row in f.readlines()]
char_to_id = dict((symbol, int(index)) for symbol, index in lines)
def frontend(text):
    """Turn raw Japanese text into the id tensor consumed by the TTS model.

    The text is converted with ``pyopenjtalk.g2p(..., kana=False)`` and the
    result is split on single spaces into tokens. Tokens are mapped through
    ``char_to_id``: whitespace tokens become ``<space>``, tokens missing from
    the dictionary become ``<unk>``, and ``idim - 1`` is appended as ``<eos>``.

    Args:
        text (str): Sentence to synthesize.

    Returns:
        torch.LongTensor: 1-D tensor of token ids placed on ``device``.
    """
    phonemes = pyopenjtalk.g2p(text, kana=False)
    print(f"Cleaned text: {phonemes}")

    def to_id(token):
        # The special symbols are looked up lazily, so a dictionary without
        # "<space>" only fails if a whitespace token actually shows up.
        if token.isspace():
            return char_to_id["<space>"]
        if token in char_to_id:
            return char_to_id[token]
        return char_to_id["<unk>"]

    ids = [to_id(token) for token in phonemes.split(" ")]
    ids.append(idim - 1)  # <eos>
    return torch.LongTensor(ids).view(-1).to(device)

# Warm-up call; the return value is discarded. The Japanese literal reads
# "a first-time dictionary installation is required".
# NOTE(review): presumably pyopenjtalk fetches its dictionary on the first g2p
# call, so doing it here keeps that delay out of the timed synthesis cell.
frontend("初回の辞書のインストールが必要です")
print("Now ready to synthesize!")

### Synthesis

In [0]:
import time
print("日本語で好きな文章を入力してください")
input_text = input()

with torch.no_grad():
    start = time.time()
    # text -> token ids
    x = frontend(input_text)
    # acoustic model: token ids -> feature sequence (only the first output is used)
    c, _, _ = model.inference(x, inference_args)
    # noise input for the vocoder: hop_size samples per row of c
    z = torch.randn(1, 1, c.size(0) * config["hop_size"]).to(device)
    # add a leading batch axis, swap the last two axes, then replicate-pad the
    # last axis by aux_context_window on both sides
    c = torch.nn.ReplicationPad1d(config["generator_params"]["aux_context_window"])(c.unsqueeze(0).transpose(2, 1))
    y = vocoder(z, c).view(-1)
    # CUDA kernels are launched asynchronously, so wait for them to finish.
    # Otherwise the timer could stop before the vocoder has actually produced
    # the waveform and the RTF would be too optimistic.
    if y.is_cuda:
        torch.cuda.synchronize()
    elapsed = time.time() - start
# real-time factor = processing time / duration of the generated audio
rtf = elapsed / (len(y) / config["sampling_rate"])
print(f"RTF = {rtf:5f}")

from IPython.display import display, Audio
# y is already 1-D at this point, so it can be handed to the player directly
display(Audio(y.cpu().numpy(), rate=config["sampling_rate"]))