[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tts_realtime_demo.ipynb)

# ESPnet2-TTS realtime demonstration

This notebook provides a demonstration of the realtime E2E-TTS using ESPnet2-TTS and ParallelWaveGAN (+ MelGAN).

- ESPnet2-TTS: https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1
- ParallelWaveGAN: https://github.com/kan-bayashi/ParallelWaveGAN

Author: Tomoki Hayashi ([@kan-bayashi](https://github.com/kan-bayashi))

## Installation

In [None]:
# NOTE: pip shows incompatibility errors due to preinstalled libraries, but you do not need to care
# NOTE(review): espnet and parallel_wavegan are pinned to exact versions below,
#   while espnet_model_zoo and pyopenjtalk are installed without a version pin.
!pip install -q espnet==0.9.7 parallel_wavegan==0.4.8
!pip install -q espnet_model_zoo
!pip install -q pyopenjtalk

## Single speaker model demo

### Model Selection

Please select a model: English, Japanese, and Mandarin are supported.

You can try Tacotron2, FastSpeech, and FastSpeech2 as the text2mel model.  
And you can use Parallel WaveGAN and Multi-band MelGAN as the vocoder model.

In [None]:
#@title Choose English model { run: "auto" }

# Running this cell sets lang, fs, tag and vocoder_tag. The Japanese and
# Mandarin selection cells assign the same four names, so whichever selection
# cell was executed last decides which models the setup cell loads.
# fs is later used as the audio playback rate and in the RTF computation;
# presumably it must equal the sampling rate of the selected models - confirm
# before changing it.
lang = 'English'
fs = 22050 #@param {type:"integer"}
tag = 'kan-bayashi/ljspeech_conformer_fastspeech2' #@param ["kan-bayashi/ljspeech_tacotron2", "kan-bayashi/ljspeech_fastspeech", "kan-bayashi/ljspeech_fastspeech2", "kan-bayashi/ljspeech_conformer_fastspeech2"] {type:"string"}
vocoder_tag = "ljspeech_parallel_wavegan.v1" #@param ["ljspeech_parallel_wavegan.v1", "ljspeech_full_band_melgan.v2", "ljspeech_multi_band_melgan.v2"] {type:"string"}

In [None]:
#@title Choose Japanese model { run: "auto" }

# Running this cell sets lang, fs, tag and vocoder_tag. The English and
# Mandarin selection cells assign the same four names, so whichever selection
# cell was executed last decides which models the setup cell loads.
# fs is later used as the audio playback rate and in the RTF computation;
# presumably it must equal the sampling rate of the selected models - confirm
# before changing it.
lang = 'Japanese'
fs = 24000 #@param {type:"integer"}
tag = 'kan-bayashi/jsut_conformer_fastspeech2_accent_with_pause' #@param ["kan-bayashi/jsut_tacotron2", "kan-bayashi/jsut_transformer", "kan-bayashi/jsut_fastspeech", "kan-bayashi/jsut_fastspeech2", "kan-bayashi/jsut_conformer_fastspeech2", "kan-bayashi/jsut_conformer_fastspeech2_accent", "kan-bayashi/jsut_conformer_fastspeech2_accent_with_pause"] {type:"string"}
vocoder_tag = "jsut_parallel_wavegan.v1" #@param ["jsut_parallel_wavegan.v1", "jsut_multi_band_melgan.v2"] {type:"string"}

In [None]:
#@title Choose Mandarin model { run: "auto" }

# Running this cell sets lang, fs, tag and vocoder_tag. The English and
# Japanese selection cells assign the same four names, so whichever selection
# cell was executed last decides which models the setup cell loads.
# fs is later used as the audio playback rate and in the RTF computation;
# presumably it must equal the sampling rate of the selected models - confirm
# before changing it.
lang = 'Mandarin'
fs = 24000 #@param {type:"integer"}
tag = 'kan-bayashi/csmsc_conformer_fastspeech2' #@param ["kan-bayashi/csmsc_tacotron2", "kan-bayashi/csmsc_transformer", "kan-bayashi/csmsc_fastspeech", "kan-bayashi/csmsc_fastspeech2", "kan-bayashi/csmsc_conformer_fastspeech2"] {type: "string"}
vocoder_tag = "csmsc_parallel_wavegan.v1" #@param ["csmsc_parallel_wavegan.v1", "csmsc_multi_band_melgan.v2"] {type:"string"}

### Model Setup

In [None]:
import time
import torch
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.tts_inference import Text2Speech
from parallel_wavegan.utils import download_pretrained_model
from parallel_wavegan.utils import load_model
# Use the GPU when one is available; otherwise fall back to the CPU so this
# cell does not crash on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the text2mel inference object from the files that the downloader
# returns for the selected `tag`.
d = ModelDownloader()
text2speech = Text2Speech(
    **d.download_and_unpack(tag),
    device=device,
    # Only for Tacotron 2
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim

# Load the neural vocoder selected by `vocoder_tag` onto the same device and
# switch it to evaluation mode.
# NOTE: Sometimes the download fails with "Permission denied". That is
#   a limitation of Google Drive. Please retry after several hours.
vocoder = load_model(download_pretrained_model(vocoder_tag)).to(device).eval()
# NOTE(review): presumably removes the weight-normalization wrappers, which
#   are not needed at inference time - confirm in the ParallelWaveGAN docs.
vocoder.remove_weight_norm()

### Synthesis

In [None]:
# decide the input sentence by yourself
print(f"Input your favorite sentence in {lang}.")
x = input()

# synthesis: text -> features (text2speech) -> waveform (vocoder)
with torch.no_grad():
    start = time.time()
    # The first value returned by text2speech is not used: it is immediately
    # replaced by the vocoder output generated from the second value, `c`.
    wav, c, *_ = text2speech(x)
    wav = vocoder.inference(c)
    # CUDA kernels run asynchronously with respect to the host, so wait for
    # them to finish before reading the clock; otherwise the measured time
    # can stop before the waveform has actually been computed.
    if wav.is_cuda:
        torch.cuda.synchronize()
    elapsed = time.time() - start
# Real-time factor = processing time / duration of the generated audio.
# assumes len(wav) is the number of samples at `fs` Hz - TODO confirm
rtf = elapsed / (len(wav) / fs)
print(f"RTF = {rtf:5f}")

# let us listen to generated samples
from IPython.display import display, Audio
display(Audio(wav.view(-1).cpu().numpy(), rate=fs))

## Multi-speaker Model Demo

### Model Selection

Please select the text2mel and vocoder models using the dropdown menus in the form below.

Currently, only English multi-speaker pretrained models are provided.

In [None]:
#@title English multi-speaker pretrained model { run: "auto" }

# Running this cell overwrites lang, fs, tag and vocoder_tag, i.e. the same
# names that the single-speaker selection cells assign, so the multi-speaker
# setup cell loads the models chosen here.
# fs is later used as the audio playback rate and in the RTF computation;
# presumably it must equal the sampling rate of the selected models - confirm
# before changing it.
lang = 'English'
fs = 24000 #@param {type:"integer"}
tag = 'kan-bayashi/libritts_gst+xvector_conformer_fastspeech2' #@param ["kan-bayashi/vctk_gst_tacotron2", "kan-bayashi/vctk_gst_transformer", "kan-bayashi/vctk_xvector_tacotron2", "kan-bayashi/vctk_xvector_transformer", "kan-bayashi/vctk_xvector_conformer_fastspeech2", "kan-bayashi/vctk_gst+xvector_tacotron2", "kan-bayashi/vctk_gst+xvector_transformer", "kan-bayashi/vctk_gst+xvector_conformer_fastspeech2", "kan-bayashi/libritts_xvector_transformer", "kan-bayashi/libritts_xvector_conformer_fastspeech2", "kan-bayashi/libritts_gst+xvector_transformer", "kan-bayashi/libritts_gst+xvector_conformer_fastspeech2"] {type:"string"}
vocoder_tag = "libritts_parallel_wavegan.v1.long" #@param ["vctk_parallel_wavegan.v1.long", "vctk_multi_band_melgan.v2", "libritts_parallel_wavegan.v1.long", "libritts_multi_band_melgan.v2"] {type:"string"}

### Model Setup

In [None]:
import time
import torch
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.tts_inference import Text2Speech
from parallel_wavegan.utils import download_pretrained_model
from parallel_wavegan.utils import load_model
# Use the GPU when one is available; otherwise fall back to the CPU so this
# cell does not crash on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the text2mel inference object from the files that the downloader
# returns for the selected multi-speaker `tag`.
d = ModelDownloader()
text2speech = Text2Speech(
    **d.download_and_unpack(tag),
    device=device,
    # Only for Tacotron 2
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim

# Load the neural vocoder selected by `vocoder_tag` onto the same device and
# switch it to evaluation mode.
# NOTE: Sometimes the download fails with "Permission denied". That is
#   a limitation of Google Drive. Please retry after several hours.
vocoder = load_model(download_pretrained_model(vocoder_tag)).to(device).eval()
# NOTE(review): presumably removes the weight-normalization wrappers, which
#   are not needed at inference time - confirm in the ParallelWaveGAN docs.
vocoder.remove_weight_norm()

### Speaker selection

For a multi-speaker model, we need to provide an X-vector and/or reference speech to determine the speaker characteristics.  
To use an X-vector, you can select a speaker from the dumped X-vectors.  
For the reference speech, you can use any speech, but please make sure its sampling rate matches that of the selected model.

In [None]:
import os
import numpy as np
import kaldiio

# Start from "no conditioning" for both inputs. The synthesis cell always
# passes `speech` and `spembs`, so they must exist even when the selected
# model uses only one of them; resetting them here also prevents a value left
# over from a previously selected model from being reused.
spembs = None
speech = None

# X-vector selection
if text2speech.tts.spk_embed_dim is not None:
    # load x-vector
    model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
    xvector_ark = f"{model_dir}/../../dump/xvector/tr_no_dev/spk_xvector.ark"  # training speakers
    # xvector_ark = f"{model_dir}/../../dump/xvector/dev/spk_xvector.ark"  # development speakers
    # xvector_ark = f"{model_dir}/../../dump/xvector/eval1/spk_xvector.ark"  # eval speakers
    # kaldiio.load_ark yields (key, array) pairs; collect them into a dict
    # keyed by speaker id.
    xvectors = dict(kaldiio.load_ark(xvector_ark))
    spks = list(xvectors.keys())

    # randomly select speaker (unseeded, so the choice differs between runs)
    random_spk_idx = np.random.randint(0, len(spks))
    spk = spks[random_spk_idx]
    spembs = xvectors[spk]
    print(f"selected spk: {spk}")

# Reference speech selection for GST
if text2speech.use_speech:
    # you can change here to load your own reference speech
    # e.g.
    # import soundfile as sf
    # speech, fs = sf.read("/path/to/reference.wav")
    # speech = torch.from_numpy(speech).float()
    # Default placeholder: 50000 samples of random noise as the reference.
    speech = torch.randn(50000,)

### Synthesis

In [None]:
# decide the input sentence by yourself
print(f"Input your favorite sentence in {lang}.")
x = input()

# synthesis: text (+ reference speech / x-vector) -> features -> waveform
with torch.no_grad():
    start = time.time()
    # The first value returned by text2speech is not used: it is immediately
    # replaced by the vocoder output generated from the second value, `c`.
    wav, c, *_ = text2speech(x, speech=speech, spembs=spembs)
    wav = vocoder.inference(c)
    # CUDA kernels run asynchronously with respect to the host, so wait for
    # them to finish before reading the clock; otherwise the measured time
    # can stop before the waveform has actually been computed.
    if wav.is_cuda:
        torch.cuda.synchronize()
    elapsed = time.time() - start
# Real-time factor = processing time / duration of the generated audio.
# assumes len(wav) is the number of samples at `fs` Hz - TODO confirm
rtf = elapsed / (len(wav) / fs)
print(f"RTF = {rtf:5f}")

# let us listen to generated samples
from IPython.display import display, Audio
display(Audio(wav.view(-1).cpu().numpy(), rate=fs))