# Mozilla TTS on CPU Real-Time Speech Synthesis 

We use the Tacotron2 and MultiBand-MelGAN models and the LJSpeech dataset.

Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) for only 130K steps (3 days) on a single GPU.

MultiBand-MelGAN is trained for 1.45M steps with real spectrograms.

Note that the performance of both models can be improved with more training.

### Download Models

In [60]:
! mkdir data/
! gdown --id 1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw -O data/tts_model.pth.tar
! gdown --id 1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz -O data/tts_config.json
! gdown --id 1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV -O data/tts_scale_stats.npy

mkdir: cannot create directory 'data/': File exists
Downloading...
From: https://drive.google.com/uc?id=1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV
To: /root/projects/speech/mozilla-TTS_dev/notebooks/data/tts_scale_stats.npy
100%|██████████████████████████████████████| 10.5k/10.5k [00:00<00:00, 18.1MB/s]


In [61]:
# Fetch the vocoder model file, its config and its scale-stats file into data/.
# The data/ directory must already exist for the -O targets to be writable.
! gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar
! gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/vocoder_config.json
! gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/vocoder_scale_stats.npy

Downloading...
From: https://drive.google.com/uc?id=11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU
To: /root/projects/speech/mozilla-TTS_dev/notebooks/data/vocoder_scale_stats.npy
100%|██████████████████████████████████████| 10.5k/10.5k [00:00<00:00, 16.7MB/s]


### Define TTS function

In [62]:
def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, style_wav=None):
    """Synthesize ``text`` to speech, play it inline and print timing figures.

    Besides its arguments, this function reads the notebook globals
    ``speaker_id`` and ``vocoder_model`` and uses the imported ``synthesis``,
    ``torch``, ``time`` and ``IPython`` names.

    Args:
        model: acoustic model, passed straight to ``synthesis``.
        text: sentence to synthesize.
        CONFIG: TTS config; ``CONFIG.enable_eos_bos_chars`` and
            ``CONFIG.audio['sample_rate']`` are read here.
        use_cuda: forwarded to ``synthesis``.
        ap: audio processor; ``ap.sample_rate`` is used for the real-time
            factor.
        use_gl: when true, the waveform produced by ``synthesis`` (Griffin-Lim)
            is used; otherwise ``vocoder_model`` is run on the postnet
            spectrogram.
        figures: unused; kept so existing callers keep working.
        style_wav: forwarded to ``synthesis``.

    Returns:
        Tuple ``(alignment, mel_postnet_spec, stop_tokens, waveform)`` where
        ``waveform`` is a NumPy array.
    """
    t_1 = time.time()
    # NOTE(review): use_griffin_lim is the TTS synthesis() switch that makes it
    # return a waveform itself; without it the use_gl=True path has no audio to
    # work with. Confirm the keyword against the installed TTS version.
    waveform, alignment, _mel_spec, mel_postnet_spec, stop_tokens, _inputs = synthesis(
        model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=style_wav,
        truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,
        use_griffin_lim=use_gl)
    if not use_gl:
        waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
        waveform = waveform.flatten()
    # Only the vocoder path yields a torch tensor; anything else is left as
    # returned by synthesis instead of calling .numpy() on it unconditionally.
    if torch.is_tensor(waveform):
        waveform = waveform.detach().cpu().numpy()

    # Sample the clock once so all three reported figures describe the same
    # elapsed time.
    run_time = time.time() - t_1
    num_samples = len(waveform)
    if num_samples:
        rtf = run_time / (num_samples / ap.sample_rate)
        tps = run_time / num_samples
    else:
        # Nothing was generated; avoid a ZeroDivisionError in the report.
        rtf = tps = float("nan")
    print(waveform.shape)
    print(" > Run-time: {}".format(run_time))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate']))
    return alignment, mel_postnet_spec, stop_tokens, waveform

### Load Models

In [64]:
import os
import torch
import time
import IPython

from TTS.tts.utils.generic_utils import setup_model
from TTS.utils.io import load_config
from TTS.tts.utils.text.symbols import symbols, phonemes, make_symbols
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.synthesis import synthesis

In [33]:
# runtime settings
# Keep inference on the CPU: the model-loading cells only call .cuda() on the
# TTS and vocoder models when this flag is True.
use_cuda = False

In [65]:
# model paths
# Files fetched into data/ by the download cells.
TTS_MODEL = "data/tts_model.pth.tar"
TTS_CONFIG = "data/tts_config.json"

VOCODER_MODEL = "data/vocoder_model.pth.tar"
VOCODER_CONFIG = "data/vocoder_config.json"

# Machine-specific locations that used to be assigned here and were immediately
# overwritten by the values above; kept for reference only:
#   TTS_MODEL      /tank/models/tts/mozilla-TTS/tacotron2-DCC/chinese_mandarin/mandarin_dca_attn_gst_dcc-February-12-2021_03+13PM-5dbb48d/checkpoint_17000.pth.tar
#   TTS_CONFIG     /tank/models/tts/mozilla-TTS/tacotron2-DCC/chinese_mandarin/mandarin_dca_attn_gst_dcc-February-12-2021_03+13PM-5dbb48d/config.json
#   VOCODER_MODEL  /root/.local/share/tts/vocoder_models--en--ljspeech--mulitband-melgan/model_file.pth.tar
#   VOCODER_CONFIG /root/.local/share/tts/vocoder_models--en--ljspeech--mulitband-melgan/config.json

In [66]:
# load configs
# NOTE: each name is rebound from the config file path (a string) to the object
# returned by load_config. Re-running this cell without first re-running the
# "model paths" cell therefore passes that object, not a path, to load_config.
TTS_CONFIG = load_config(TTS_CONFIG)
VOCODER_CONFIG = load_config(VOCODER_CONFIG)


In [67]:
# load the audio processor
# Point the audio settings at the downloaded TTS scale-stats file, then build
# the processor from those settings.
TTS_CONFIG.audio['stats_path'] = 'data/tts_scale_stats.npy'
ap = AudioProcessor(**TTS_CONFIG.audio)

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > stats_path:data/tts_scale_stats.npy
 | > hop_length:256
 | > win_length:1024


In [68]:
# LOAD TTS MODEL
# multi-speaker settings (left empty here: no speaker id, no speakers)
speaker_id = None
speakers = []

# If the config carries its own character set (the chinese_mandarin special
# characters/punctuations live in tts_config.json), rebuild the symbol and
# phoneme tables from it.
_char_cfg = TTS_CONFIG.get("characters")
if _char_cfg:
    symbols, phonemes = make_symbols(
        _char_cfg["characters"],
        _char_cfg["phonemes"],
        punctuations=_char_cfg["punctuations"],
        pad=_char_cfg["pad"],
        eos=_char_cfg["eos"],
        bos=_char_cfg["bos"],
    )

# The input vocabulary is the phoneme table or the symbol table, depending on
# the config.
if TTS_CONFIG.use_phonemes:
    num_chars = len(phonemes)
else:
    num_chars = len(symbols)
model = setup_model(num_chars, len(speakers), TTS_CONFIG)

# Read the checkpoint onto the CPU and restore the model weights from it.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))
model.load_state_dict(cp['model'])

if use_cuda:
    model.cuda()
model.eval()

# set model stepsize when the checkpoint stores one
if 'r' in cp:
    model.decoder.set_r(cp['r'])

 > Using model: tacotron2


In [69]:
from TTS.vocoder.utils.generic_utils import setup_generator

# LOAD VOCODER MODEL
# Build the generator from the vocoder config and restore its weights from the
# checkpoint, read onto the CPU.
vocoder_model = setup_generator(VOCODER_CONFIG)
vocoder_model.load_state_dict(
    torch.load(VOCODER_MODEL, map_location=torch.device("cpu"))["model"]
)
vocoder_model.remove_weight_norm()
vocoder_model.inference_padding = 0

# The vocoder gets its own audio processor, pointed at the vocoder scale-stats
# file.
VOCODER_CONFIG.audio["stats_path"] = "data/vocoder_scale_stats.npy"
ap_vocoder = AudioProcessor(**VOCODER_CONFIG.audio)

if use_cuda:
    vocoder_model.cuda()
vocoder_model.eval()
print("\nVocoder loaded")

 > Generator Model: multiband_melgan_generator
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > stats_path:data/vocoder_scale_stats.npy
 | > hop_length:256
 | > win_length:1024

Vocoder loaded


## Run Inference

In [70]:
# Here are some test sentences for you to play with.
# Each assignment overwrites the previous one, so only the last sentence is
# kept; comment out or reorder the lines to pick another.
sentence = "我从来不会说很标准的中文。"
sentence = "我喜欢听人工智能的博客。"
sentence = "我来自一个法国郊区的地方。"
sentence = "不比不知道，一比吓一跳！"
sentence = "台湾是一个真的很好玩的地方！"
sentence = "干一行，行一行，行行都行。"
sentence = "我要盖被子，好尴尬！"

In [71]:
# You can also experiment with the global style tokens (GST) through style_wav.
# However, the speaker in the Baker dataset shows no emotion in any of the
# sentences, so it is hard to get a nice GST effect with this model.
# That is also why adding "!" or "?" at the end of a sentence changes nothing:
# the dataset has no such prosody.
style_wav = {"2": 0.3, "1": -0.1}


In [78]:
# Synthesize one sentence through the vocoder (use_gl=False) with a custom
# style_wav; tts() also plays the audio and prints the timing figures.
sentence =  "我喜欢听人工智能的博客。"
style_wav = {"2": 0.2, "7": -0.1}

align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True, style_wav= style_wav)

(50688,)
 > Run-time: 1.5945854187011719
 > Real-time factor: 0.6935317513786934
 > Time per step: 3.145291761617468e-05
