[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tts_realtime_demo.ipynb)

# ESPnet2-TTS realtime demonstration

This notebook provides a demonstration of the realtime E2E-TTS using ESPnet2-TTS and ParallelWaveGAN (+ MelGAN).

- ESPnet2-TTS: https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1
- ParallelWaveGAN: https://github.com/kan-bayashi/ParallelWaveGAN

Author: Tomoki Hayashi ([@kan-bayashi](https://github.com/kan-bayashi))

## Installation

In [None]:
# NOTE: pip may report incompatibility errors caused by preinstalled libraries;
#   they can be ignored for this demo.
!pip install -q espnet==0.9.3 parallel_wavegan==0.4.6

### (Optional)

If you want to try Japanese TTS, please run the following cell to install pyopenjtalk.

In [None]:
!mkdir tools && cd tools && git clone https://github.com/r9y9/hts_engine_API.git
!cd tools/hts_engine_API/src && ./waf configure && ./waf build install
!cd tools && git clone https://github.com/r9y9/open_jtalk.git
!mkdir -p tools/open_jtalk/src/build && cd tools/open_jtalk/src/build && \
    cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON .. && make install
!cp tools/open_jtalk/src/build/*.so* /usr/lib64-nvidia
!cd tools && git clone https://github.com/r9y9/pyopenjtalk.git
!cd tools/pyopenjtalk && pip install .

## Single speaker model demo

### Model Selection

Please select the models by commenting out the lines you do not want and uncommenting the ones you want to use.

English, Japanese, and Mandarin are supported.

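You can try Tacotron2, Transformer, FastSpeech, FastSpeech2, and Conformer-FastSpeech2 as the text2mel model (availability depends on the language).  
And you can use Parallel WaveGAN, full-band MelGAN, and Multi-band MelGAN as the vocoder model (availability depends on the language).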

In [None]:
# =====================================================================
# How to use this cell
#   lang        : language name shown in the input prompt
#   fs          : sampling rate used for the RTF computation and playback
#   tag         : text2mel model to download
#   vocoder_tag : vocoder model to download
# Uncomment the lines of the configuration you want; when several
# assignments to the same name are active, the last one takes effect.
# =====================================================================

# ----------------------------- English ------------------------------
lang, fs = "English", 22050
# tag = "kan-bayashi/ljspeech_tacotron2"
# tag = "kan-bayashi/ljspeech_fastspeech"
# tag = "kan-bayashi/ljspeech_fastspeech2"
tag = "kan-bayashi/ljspeech_conformer_fastspeech2"
vocoder_tag = "ljspeech_parallel_wavegan.v1"
# vocoder_tag = "ljspeech_full_band_melgan.v2"
# vocoder_tag = "ljspeech_multi_band_melgan.v2"

# ----------------------------- Japanese -----------------------------
# lang, fs = "Japanese", 24000
# tag = "kan-bayashi/jsut_tacotron2"
# tag = "kan-bayashi/jsut_transformer"
# tag = "kan-bayashi/jsut_fastspeech"
# tag = "kan-bayashi/jsut_fastspeech2"
# tag = "kan-bayashi/jsut_conformer_fastspeech2"
# vocoder_tag = "jsut_parallel_wavegan.v1"
# vocoder_tag = "jsut_multi_band_melgan.v2"

# ----------------------------- Mandarin -----------------------------
# lang, fs = "Mandarin", 24000
# tag = "kan-bayashi/csmsc_tacotron2"
# tag = "kan-bayashi/csmsc_transformer"
# tag = "kan-bayashi/csmsc_fastspeech"
# tag = "kan-bayashi/csmsc_fastspeech2"
# tag = "kan-bayashi/csmsc_conformer_fastspeech2"
# vocoder_tag = "csmsc_parallel_wavegan.v1"
# vocoder_tag = "csmsc_multi_band_melgan.v2"

### Model Setup

In [None]:
import time
import torch
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.tts_inference import Text2Speech
from parallel_wavegan.utils import download_pretrained_model
from parallel_wavegan.utils import load_model
# Use the GPU when one is present; otherwise fall back to the CPU so this
# cell does not crash on a runtime without CUDA (synthesis will be slower).
device = "cuda" if torch.cuda.is_available() else "cpu"

# Fetch the pretrained text2mel model selected by `tag` and wrap it for inference.
d = ModelDownloader()
text2speech = Text2Speech(
    **d.download_and_unpack(tag),
    device=device,
    # Only for Tacotron 2
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim
# NOTE: Sometimes the download fails with "Permission denied". That is
#   a limitation of Google Drive. Please retry after several hours.
# Fetch the vocoder selected by `vocoder_tag`, place it on the same device
# as the text2mel model, and switch it to evaluation mode.
vocoder = load_model(download_pretrained_model(vocoder_tag)).to(device).eval()
vocoder.remove_weight_norm()

### Synthesis

In [None]:
# decide the input sentence by yourself
print(f"Input your favorite sentence in {lang}.")
x = input()

# CUDA kernels are launched asynchronously, so the GPU must be synchronized
# before reading the clock; otherwise still-pending work (e.g. the vocoder)
# is left out of the measured time and the RTF looks better than it is.
use_cuda = torch.cuda.is_available()

# synthesis
with torch.no_grad():
    if use_cuda:
        torch.cuda.synchronize()
    start = time.time()
    # `c` (second output) is what gets fed to the vocoder; the first output
    # is discarded and replaced by the vocoder's waveform.
    wav, c, *_ = text2speech(x)
    wav = vocoder.inference(c)
    if use_cuda:
        torch.cuda.synchronize()
    elapsed = time.time() - start
# RTF (real-time factor): elapsed time divided by len(wav) / fs,
# i.e. the duration of the generated audio at sampling rate `fs`.
rtf = elapsed / (len(wav) / fs)
print(f"RTF = {rtf:5f}")

# let us listen to generated samples
from IPython.display import display, Audio
display(Audio(wav.view(-1).cpu().numpy(), rate=fs))

## Multi-speaker Model Demo

### Model Selection

Please select the models by commenting out the lines you do not want and uncommenting the ones you want to use.

Currently, only English multi-speaker pretrained models are provided.

In [None]:
# ----------------------------- English ------------------------------
#   lang        : language name shown in the input prompt
#   fs          : sampling rate used for the RTF computation and playback
#   tag         : multi-speaker text2mel model to download
#   vocoder_tag : vocoder model to download
# When several assignments to the same name are active, the last one wins.
lang, fs = "English", 24000
tag = "kan-bayashi/vctk_gst_tacotron2"
# tag = "kan-bayashi/vctk_gst_transformer"
vocoder_tag = "vctk_parallel_wavegan.v1"
# vocoder_tag = "vctk_multi_band_melgan.v2"

### Model Setup

In [None]:
import time
import torch
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.tts_inference import Text2Speech
from parallel_wavegan.utils import download_pretrained_model
from parallel_wavegan.utils import load_model
# Use the GPU when one is present; otherwise fall back to the CPU so this
# cell does not crash on a runtime without CUDA (synthesis will be slower).
device = "cuda" if torch.cuda.is_available() else "cpu"

# Fetch the pretrained multi-speaker text2mel model selected by `tag`
# and wrap it for inference.
d = ModelDownloader()
text2speech = Text2Speech(
    **d.download_and_unpack(tag),
    device=device,
    # Only for Tacotron 2
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim
# NOTE: Sometimes the download fails with "Permission denied". That is
#   a limitation of Google Drive. Please retry after several hours.
# Fetch the vocoder selected by `vocoder_tag`, place it on the same device
# as the text2mel model, and switch it to evaluation mode.
vocoder = load_model(download_pretrained_model(vocoder_tag)).to(device).eval()
vocoder.remove_weight_norm()

### Synthesis

For a multi-speaker model, we need to provide reference speech that determines the speaker characteristics.  
You can use any speech, but please make sure its sampling rate matches that of the selected model (`fs`).

In [None]:
# decide the input sentence by yourself
print(f"Input your favorite sentence in {lang}.")
x = input()

# you can change here to load your own reference speech
# e.g.
# import soundfile as sf
# speech, ref_fs = sf.read("/path/to/reference.wav")
# assert ref_fs == fs, "reference sampling rate must match the model's fs"
# speech = torch.from_numpy(speech)
# NOTE: the rate is read into `ref_fs` rather than `fs` so that the `fs` used
#   below for the RTF computation and playback is not overwritten.
# Placeholder reference: 50000 samples of random noise.
speech = torch.randn(50000,)

# CUDA kernels are launched asynchronously, so the GPU must be synchronized
# before reading the clock; otherwise still-pending work (e.g. the vocoder)
# is left out of the measured time and the RTF looks better than it is.
use_cuda = torch.cuda.is_available()

# synthesis
with torch.no_grad():
    if use_cuda:
        torch.cuda.synchronize()
    start = time.time()
    # `c` (second output) is what gets fed to the vocoder; the first output
    # is discarded and replaced by the vocoder's waveform.
    wav, c, *_ = text2speech(x, speech=speech)
    wav = vocoder.inference(c)
    if use_cuda:
        torch.cuda.synchronize()
    elapsed = time.time() - start
# RTF (real-time factor): elapsed time divided by len(wav) / fs,
# i.e. the duration of the generated audio at sampling rate `fs`.
rtf = elapsed / (len(wav) / fs)
print(f"RTF = {rtf:5f}")

# let us listen to generated samples
from IPython.display import display, Audio
display(Audio(wav.view(-1).cpu().numpy(), rate=fs))