# Mozilla-TTS
This notebook contains code for testing the Mozilla Text-To-Speech model for speech synthesis currently being maintained by [Coqui-AI](https://github.com/coqui-ai/TTS).

The model uses the Tacotron 2 model for extracting acoustic features from text, and the MelGAN model for synthesizing speech from the extracted features (voice encoding).

### Install Required Libraries

In [27]:
# install espeak to help with speech synthesis

# MACOS
# !brew install espeak

# LINUX
!sudo apt-get install espeak

In [28]:
# clone fork of coqui-ai repository and install required python modules
!git clone https://github.com/francisohara24/TTS.git

%cd TTS
!pip install -r ./requirements.txt
!python setup.py install
%cd ..

Cloning into 'TTS'...
remote: Enumerating objects: 6937, done.[K
remote: Counting objects: 100% (473/473), done.[Kcts:  69% (327/473)[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 6937 (delta 442), reused 441 (delta 436), pack-reused 6464[K
Receiving objects: 100% (6937/6937), 115.26 MiB | 34.55 MiB/s, done.
Resolving deltas: 100% (4737/4737), done.
/Users/francisohara/DataspellProjects/AI-reader/notebooks/TTS/mozilla-tts/TTS
Collecting numpy>=1.16.0 (from -r ./requirements.txt (line 1))
  Obtaining dependency information for numpy>=1.16.0 from https://files.pythonhosted.org/packages/51/3b/2ba379bf754f13041e3d8b994394e78c69cdb9d1e5dd1dba9404b24afbdf/numpy-1.26.2-cp311-cp311-macosx_10_9_x86_64.whl.metadata
  Using cached numpy-1.26.2-cp311-cp311-macosx_10_9_x86_64.whl.metadata (61 kB)
Collecting torch>=1.5 (from -r ./requirements.txt (line 2))
  Obtaining dependency information for torch>=1.5 from https://files.pythonhosted.org/packages/63/e4/efa1029

### Download Pre-Trained Models

In [29]:
# create directory to contain the pre-trained models
%mkdir models

# download pre-trained Tacotron 2 model and configurations file
!gdown --id 12pTojgg7qoXrsnyMsNl-WOil1eetZh7L -O ./models/tts_model.pth.tar
!gdown --id 12Z5r4rdOx_7LmD-pyXvIyt4vvGpCmOQy -O ./models/config.json

Downloading...
From (uriginal): https://drive.google.com/uc?id=12pTojgg7qoXrsnyMsNl-WOil1eetZh7L
From (redirected): https://drive.google.com/uc?id=12pTojgg7qoXrsnyMsNl-WOil1eetZh7L&confirm=t&uuid=7f68a819-9528-4a31-b687-4ae89a8e8cba
To: /Users/francisohara/DataspellProjects/AI-reader/notebooks/TTS/mozilla-tts/models/tts_model.pth.tar
100%|████████████████████████████████████████| 347M/347M [00:09<00:00, 38.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=12Z5r4rdOx_7LmD-pyXvIyt4vvGpCmOQy
To: /Users/francisohara/DataspellProjects/AI-reader/notebooks/TTS/mozilla-tts/models/config.json
100%|██████████████████████████████████████| 9.54k/9.54k [00:00<00:00, 7.13MB/s]


In [30]:
# download pre-trained MelGAN model, configuration file and model statistics file.
!gdown --id 12YvyBhE17VYIjOg4vYWD_xKAAdB0r4qE -O ./models/vocoder_model.pth.tar
!gdown --id 12npX6u1RbMZzV6LBlnKQcazZbwfFTQlk -O ./models/config_vocoder.json
!gdown --id 12oeQ3slzyr4lyMEfs-OfV_RUiJg8cOz7 -O ./models/scale_stats.npy

Downloading...
From: https://drive.google.com/uc?id=12YvyBhE17VYIjOg4vYWD_xKAAdB0r4qE
To: /Users/francisohara/DataspellProjects/AI-reader/notebooks/TTS/mozilla-tts/models/vocoder_model.pth.tar
100%|██████████████████████████████████████| 82.8M/82.8M [00:02<00:00, 38.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=12npX6u1RbMZzV6LBlnKQcazZbwfFTQlk
To: /Users/francisohara/DataspellProjects/AI-reader/notebooks/TTS/mozilla-tts/models/config_vocoder.json
100%|██████████████████████████████████████| 6.77k/6.77k [00:00<00:00, 11.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=12oeQ3slzyr4lyMEfs-OfV_RUiJg8cOz7
To: /Users/francisohara/DataspellProjects/AI-reader/notebooks/TTS/mozilla-tts/models/scale_stats.npy
100%|██████████████████████████████████████| 10.5k/10.5k [00:00<00:00, 18.3MB/s]


### Define TTS function

In [31]:
# define function for converting given text to speech format using the downloaded models
def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None, truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)
    
    # mel_postnet_spec = ap._denormalize(mel_postnet_spec.T)
    if not use_gl:
        waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
        waveform = waveform.flatten()
    if use_cuda:
        waveform = waveform.cpu()

    waveform = waveform.numpy()
    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)
    tps = (time.time() - t_1) / len(waveform)
    print(waveform.shape)
    print(" > Run-time: {}".format(time.time() - t_1))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate']))
    return alignment, mel_postnet_spec, stop_tokens, waveform

### Load Models

In [32]:
import os
import torch
import time
import IPython

from TTS.utils.generic_utils import setup_model
from TTS.utils.io import load_config
from TTS.utils.text.symbols import symbols, phonemes
from TTS.utils.audio import AudioProcessor
from TTS.utils.synthesis import synthesis

In [33]:
# runtime settings
use_cuda = False

In [34]:
# model paths
TTS_MODEL = "./models/tts_model.pth.tar"
TTS_CONFIG = "./models/config.json"
VOCODER_MODEL = "./models/vocoder_model.pth.tar"
VOCODER_CONFIG = "./models/config_vocoder.json"

In [35]:
# load configs
TTS_CONFIG = load_config(TTS_CONFIG)
VOCODER_CONFIG = load_config(VOCODER_CONFIG)

In [36]:
# load the audio processor
ap = AudioProcessor(**TTS_CONFIG.audio)

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > stats_path:./models/scale_stats.npy
 | > hop_length:256
 | > win_length:1024


In [37]:
# LOAD TTS MODEL
# multi speaker
speaker_id = None
speakers = []

# load the model
num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speakers), TTS_CONFIG)

# load model state
cp =  torch.load(TTS_MODEL, map_location=torch.device('cpu'))

# load the model
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()

# set model stepsize
if 'r' in cp:
    model.decoder.set_r(cp['r'])

 > Using model: Tacotron2


In [38]:
from TTS.vocoder.utils.generic_utils import setup_generator

# LOAD VOCODER MODEL
vocoder_model = setup_generator(VOCODER_CONFIG)
vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
vocoder_model.remove_weight_norm()
vocoder_model.inference_padding = 0

ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])
if use_cuda:
    vocoder_model.cuda()
vocoder_model.eval()

 > Generator Model: multiband_melgan_generator
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > stats_path:./models/scale_stats.npy
 | > hop_length:256
 | > win_length:1024


MultibandMelganGenerator(
  (layers): Sequential(
    (0): ReflectionPad1d((3, 3))
    (1): Conv1d(80, 384, kernel_size=(7,), stride=(1,))
    (2): LeakyReLU(negative_slope=0.2)
    (3): ConvTranspose1d(384, 192, kernel_size=(16,), stride=(8,), padding=(4,))
    (4): ResidualStack(
      (blocks): ModuleList(
        (0): Sequential(
          (0): LeakyReLU(negative_slope=0.2)
          (1): ReflectionPad1d((1, 1))
          (2): Conv1d(192, 192, kernel_size=(3,), stride=(1,))
          (3): LeakyReLU(negative_slope=0.2)
          (4): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
        )
        (1): Sequential(
          (0): LeakyReLU(negative_slope=0.2)
          (1): ReflectionPad1d((3, 3))
          (2): Conv1d(192, 192, kernel_size=(3,), stride=(1,), dilation=(3,))
          (3): LeakyReLU(negative_slope=0.2)
          (4): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
        )
        (2): Sequential(
          (0): LeakyReLU(negative_slope=0.2)
          (1): Reflectio

## Run Inference

In [40]:
sentence =  open("../../../data/colby_affirmation.txt").read()
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)

(2579712,)
 > Run-time: 47.09668207168579
 > Real-time factor: 0.402556224006646
 > Time per step: 1.8256520498475588e-05
