# Demo zero-shot TTS with YourTTS

##TTS Model setup

### Download and install Coqui TTS


In [1]:
!git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS
!pip install -q -e TTS/
!pip install -q torchaudio==0.9.0

Cloning into 'TTS'...
remote: Enumerating objects: 23810, done.[K
remote: Counting objects: 100% (86/86), done.[K
remote: Compressing objects: 100% (58/58), done.[K
remote: Total 23810 (delta 42), reused 51 (delta 28), pack-reused 23724[K
Receiving objects: 100% (23810/23810), 133.41 MiB | 13.87 MiB/s, done.
Resolving deltas: 100% (17329/17329), done.
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 124 kB 3.9 MB/s 
[K     |████████████████████████████████| 11.1 MB 32.7 MB/s 
[K     |████████████████████████████████| 3.4 MB 36.9 MB/s 
[K     |████████████████████████████████| 47.4 MB 89 kB/s 
[K     |████████████████████████████████| 1.3 MB 50.4 MB/s 
[K     |████████████████████████████████| 71 kB 8.0 MB/s 
[K     |████████████████████████████████| 80 kB 7.9 MB/s 
[K     |████████████████████████████████| 183 kB 56.6 MB/

###Download TTS Checkpoint

In [6]:
! gdown --id 1sgEjHt0lbPSEw9-FSbC_mBoOPwNi87YR -O best_model.pth.tar 

Access denied with the following error:

 	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses. 

You may still be able to access the file from the browser:

	 https://drive.google.com/uc?id=1sgEjHt0lbPSEw9-FSbC_mBoOPwNi87YR 



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# TTS checkpoints

# download config  
! gdown --id 1-PfXD66l1ZpsZmJiC-vhL055CDSugLyP
# download language json 
! gdown --id 1_Vb2_XHqcC0OcvRF82F883MTxfTRmerg
# download speakers json
! gdown --id 1SZ9GE0CBM-xGstiXH2-O2QWdmSXsBKdC -O speakers.json
# download checkpoint
! gdown --id 1sgEjHt0lbPSEw9-FSbC_mBoOPwNi87YR -O best_model.pth.tar  

Downloading...
From: https://drive.google.com/uc?id=1-PfXD66l1ZpsZmJiC-vhL055CDSugLyP
To: /content/config.json
100% 12.3k/12.3k [00:00<00:00, 3.92MB/s]
Downloading...
From: https://drive.google.com/uc?id=1_Vb2_XHqcC0OcvRF82F883MTxfTRmerg
To: /content/language_ids.json
100% 47.0/47.0 [00:00<00:00, 55.9kB/s]
Downloading...
From: https://drive.google.com/uc?id=1SZ9GE0CBM-xGstiXH2-O2QWdmSXsBKdC
To: /content/speakers.json
100% 671k/671k [00:00<00:00, 116MB/s]
Access denied with the following error:

 	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses. 

You may still be able to access the file from the browser:

	 https://drive.google.com/uc?id=1sgEjHt0lbPSEw9-FSbC_mBoOPwNi87YR 



### Imports

In [4]:
import sys
TTS_PATH = "TTS/"

# add libraries into environment
sys.path.append(TTS_PATH) # set this if TTS is not installed globally

import os
import string
import time
import argparse
import json

import numpy as np
import IPython
from IPython.display import Audio


import torch

from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
try:
  from TTS.utils.audio import AudioProcessor
except:
  from TTS.utils.audio import AudioProcessor


from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.models.vits import *

### Paths definition

In [5]:
OUT_PATH = 'out/'

# create output path
os.makedirs(OUT_PATH, exist_ok=True)

# model vars 
MODEL_PATH = '/content/drive/MyDrive/NLP Models/best_model_latest.pth.tar'
CONFIG_PATH = 'config.json'
TTS_LANGUAGES = "language_ids.json"
TTS_SPEAKERS = "speakers.json"
USE_CUDA = torch.cuda.is_available()

### Restore model

In [6]:
# load the config
C = load_config(CONFIG_PATH)


# load the audio processor
ap = AudioProcessor(**C.audio)

speaker_embedding = None

C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
# print(model.language_manager.num_languages, model.embedded_language_dim)
# print(model.emb_l)
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
# remove speaker encoder
model_weights = cp['model'].copy()
for key in list(model_weights.keys()):
  if "speaker_encoder" in key:
    del model_weights[key]

model.load_state_dict(model_weights)


model.eval()

if USE_CUDA:
    model = model.cuda()

# synthesize voice
use_griffin_lim = False

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:False
 | > do_amp_to_db_mel:True
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Using model: vits
 > Speaker manager is loaded with 6 speakers: female-en-5, female-en-5
, female-pt-4
, male-en-2, male-en-2
, male-pt-3



##Speaker encoder setup

### Install helper libraries

In [7]:
! pip install -q pydub ffmpeg-normalize

### Paths definition

In [8]:
CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"

# download config 
! gdown --id  19cDrhZZ0PfKf2Zhr_ebB-QASRw844Tn1 -O $CONFIG_SE_PATH
# download checkpoint  
! gdown --id   17JsW6h6TIh7-LkU2EvB_gnNrPcdBxt7X -O $CHECKPOINT_SE_PATH

Downloading...
From: https://drive.google.com/uc?id=19cDrhZZ0PfKf2Zhr_ebB-QASRw844Tn1
To: /content/config_se.json
100% 3.49k/3.49k [00:00<00:00, 5.12MB/s]
Downloading...
From: https://drive.google.com/uc?id=17JsW6h6TIh7-LkU2EvB_gnNrPcdBxt7X
To: /content/SE_checkpoint.pth.tar
100% 44.6M/44.6M [00:00<00:00, 205MB/s]


###Imports

In [9]:
from TTS.tts.utils.speakers import SpeakerManager
from pydub import AudioSegment
from google.colab import files
import librosa

###Load the Speaker encoder

In [10]:
SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH, encoder_config_path=CONFIG_SE_PATH, use_cuda=USE_CUDA)

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400


###Define helper function

In [11]:
def compute_spec(ref_file):
  y, sr = librosa.load(ref_file, sr=ap.sample_rate)
  spec = ap.spectrogram(y)
  spec = torch.FloatTensor(spec).unsqueeze(0)
  return spec

## TTS

###Upload, normalize and resample your reference wav files

Please upload wav files

In [None]:
print("Select speaker reference audios files:")
reference_files = files.upload()
reference_files = list(reference_files.keys())
for sample in reference_files:
    !ffmpeg-normalize $sample -nt rms -t=-27 -o $sample -ar 16000 -f

###Compute embedding

In [None]:
reference_emb = SE_speaker_manager.compute_d_vector_from_clip(reference_files)

In [None]:
reference_files

In [None]:
import os
import matplotlib
matplotlib.use('Agg') # No pictures displayed 
import pylab
import librosa
import librosa.display
import numpy as np
%matplotlib inline


sig, fs = librosa.load('1320_00027.wav')   
# make pictures name 
save_path = 'test.jpg'

pylab.axis('off') # no axis
pylab.axes([0., 0., 1., 1.], frameon=False, xticks=[], yticks=[]) # Remove the white edge
S = librosa.feature.melspectrogram(y=sig, sr=fs)
librosa.display.specshow(librosa.power_to_db(S, ref=np.max))
# pylab.savefig(save_path, bbox_inches=None, pad_inches=0)
# pylab.close()

In [None]:
sig.shape


In [None]:
np.array(reference_emb).shape

In [None]:
S = librosa.feature.melspectrogram(y=np.array(reference_emb), sr=fs)
librosa.display.specshow(librosa.power_to_db(S, ref=np.max))

###Define inference variables

In [None]:
model.length_scale = 1  # scaler for the duration predictor. The larger it is, the slower the speech.
model.inference_noise_scale = 0.3 # defines the noise variance applied to the random z vector at inference.
model.inference_noise_scale_dp = 0.3 # defines the noise variance applied to the duration predictor z vector at inference.
text = "It took me quite a long time to develop a voice and now that I have it I am not going to be silent."

###Chose language id

In [None]:
model.language_manager.language_id_mapping

In [None]:
language_id = 0

### Sythesis

In [None]:
text = 'Your statements are false because each student shows pity first, but does not work upon his promise and grows materialistic.'

print(" > text: {}".format(text))
wav, alignment, _, _ = synthesis(
                    model,
                    text,
                    C,
                    "cuda" in str(next(model.parameters()).device),
                    ap,
                    speaker_id=None,
                    d_vector=reference_emb,
                    style_wav=None,
                    language_id=language_id,
                    enable_eos_bos_chars=C.enable_eos_bos_chars,
                    use_griffin_lim=True,
                    do_trim_silence=False,
                ).values()
print("Generated Audio")
IPython.display.display(Audio(wav, rate=ap.sample_rate))
file_name = text.replace(" ", "_")
file_name = file_name.translate(str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
out_path = os.path.join(OUT_PATH, file_name)
print(" > Saving output to {}".format(out_path))
ap.save_wav(wav, out_path)

In [48]:
def get_reference_input(speaker):

    reference_output = {}

    print("Select speaker reference audios files:")
    reference_files = files.upload()
    reference_files = list(reference_files.keys())
    for sample in reference_files:
        !ffmpeg-normalize $sample -nt rms -t=-27 -o $sample -ar 16000 -f

    reference_output[speaker] = reference_files

    return reference_output


In [33]:
female_teen = get_reference_input()

Select speaker reference audios files:


Saving p240_00002.wav to p240_00002 (1).wav
Saving p240_00006.wav to p240_00006 (1).wav
Saving p240_00027.wav to p240_00027 (1).wav
Saving p240_00038.wav to p240_00038 (1).wav


In [34]:
female_teen.get('reference_files')

['p240_00002.wav', 'p240_00006.wav', 'p240_00027.wav', 'p240_00038.wav']

In [40]:
def get_output_audio(text,references,speaker):

    reference_files = references.get(speaker)

    #Compute embeddings
    reference_emb = SE_speaker_manager.compute_d_vector_from_clip(reference_files)

    # define variables for the generation
    model.length_scale = 1  # scaler for the duration predictor. The larger it is, the slower the speech.
    model.inference_noise_scale = 0.3 # defines the noise variance applied to the random z vector at inference.
    model.inference_noise_scale_dp = 0.3 # defines the noise variance applied to the duration predictor z vector at inference.

    # print('Audio Mapping : ',model.language_manager.language_id_mapping)
    
    #Select Language ID
    language_id = 0

    #text = 'Your statements are false because each student shows pity first, but does not work upon his promise and grows materialistic.'

    print(" > text: {}".format(text))
    wav, alignment, _, _ = synthesis(
                        model,
                        text,
                        C,
                        "cuda" in str(next(model.parameters()).device),
                        ap,
                        speaker_id=None,
                        d_vector=reference_emb,
                        style_wav=None,
                        language_id=language_id,
                        enable_eos_bos_chars=C.enable_eos_bos_chars,
                        use_griffin_lim=True,
                        do_trim_silence=False,
                    ).values()
    print("Generated Audio")
    IPython.display.display(Audio(wav, rate=ap.sample_rate))
    file_name = text.replace(" ", "_")[:20]
    file_name = file_name.translate(str.maketrans('', '', string.punctuation.replace('_', '')))+ '_'+ str(len(reference_files)) + '.wav'
    out_path = os.path.join(OUT_PATH, file_name)
    print(" > Saving output to {}".format(out_path))
    ap.save_wav(wav, out_path)


In [22]:
get_output_audio(' I think that several people die due to inadequate medical aid. They cannot afford substantial medical costs. I shall help them without exerting any charges.')

Select speaker reference audios files:


Saving 5_srk_freedom_to_be_yourself.wav to 5_srk_freedom_to_be_yourself.wav
Audio Mapping :  {'en': 0, 'fr-fr': 1, 'pt-br': 2}
 > text:  I think that several people die due to inadequate medical aid. They cannot afford substantial medical costs. I shall help them without exerting any charges.
Generated Audio


 > Saving output to out/_I_think_that_several_people_die_due_to_inadequate_medical_aid_They_cannot_afford_substantial_medical_costs_I_shall_help_them_without_exerting_any_charges.wav


In [23]:
get_output_audio(' I think that several people die due to inadequate medical aid. They cannot afford substantial medical costs. I shall help them without exerting any charges.')

Select speaker reference audios files:


Saving 4_srk_freedom_to_be_yourself.wav to 4_srk_freedom_to_be_yourself.wav
Saving 5_srk_freedom_to_be_yourself.wav to 5_srk_freedom_to_be_yourself (1).wav
Saving 6_srk_freedom_to_be_yourself.wav to 6_srk_freedom_to_be_yourself.wav
Saving 7_srk_freedom_to_be_yourself.wav to 7_srk_freedom_to_be_yourself.wav
Saving 8_srk_freedom_to_be_yourself.wav to 8_srk_freedom_to_be_yourself.wav
Saving 9_srk_freedom_to_be_yourself.wav to 9_srk_freedom_to_be_yourself.wav
Audio Mapping :  {'en': 0, 'fr-fr': 1, 'pt-br': 2}
 > text:  I think that several people die due to inadequate medical aid. They cannot afford substantial medical costs. I shall help them without exerting any charges.
Generated Audio


 > Saving output to out/_I_think_that_several_people_die_due_to_inadequate_medical_aid_They_cannot_afford_substantial_medical_costs_I_shall_help_them_without_exerting_any_charges.wav


In [None]:
filename = '/content/convo_1.txt'


def get_narrator_inputs(filename):


    references = {}

    with open(filename) as f:
        lines = f.readlines()
        lines  = [ line.strip('\n') for line in lines ]
        narrators = []

        for each in lines:
            
            # print(each.split(":"))
            line_item = each.split(":")
            narrator = line_item[0].lstrip().rstrip()
            text = line_item[1].lstrip().rstrip()
            
            narrators.append(narrator)
    
    narrators = list(set(narrators))
    print(narrators)

    for each in narrators :
        print(each, ' : ' , )
        ref_each = get_reference_input(each)
        references.update(ref_each)

    
    
    return references


references = get_narrator_inputs(filename)

['Title', 'Student', 'Teacher']
Title  : 
Select speaker reference audios files:


Saving 2_tale1.wav to 2_tale1 (1).wav
Student  : 
Select speaker reference audios files:


Saving p240_00002.wav to p240_00002 (3).wav
Saving p240_00006.wav to p240_00006 (3).wav
Saving p240_00027.wav to p240_00027 (3).wav
Saving p240_00038.wav to p240_00038 (3).wav
Teacher  : 
Select speaker reference audios files:


In [39]:
with open('/content/convo_1.txt') as f:
    lines = f.readlines()
    lines  = [ line.strip('\n') for line in lines ]
    
    for each in lines:
        narrators = []
        # print(each.split(":"))
        line_item = each.split(":")
        narrator = line_item[0].lstrip().rstrip()
        text = line_item[1].lstrip().rstrip()
        
        narrators = narrators.append(narrator)


        print(narrators)

        if narrator == 'Student':
            print(narrator,text)
            get_output_audio(text)

        

Student Sir, it would depend on what marks I get.
Audio Mapping :  {'en': 0, 'fr-fr': 1, 'pt-br': 2}
 > text: Sir, it would depend on what marks I get.
Generated Audio


 > Saving output to out/Sir_it_would_depend_on_what_marks_I_get_4.wav
Student I will haunt pre-medical groups in F.S.C. Otherwise, I shall join I.C.S.
Audio Mapping :  {'en': 0, 'fr-fr': 1, 'pt-br': 2}
 > text: I will haunt pre-medical groups in F.S.C. Otherwise, I shall join I.C.S.
Generated Audio


 > Saving output to out/I_will_haunt_premedical_groups_in_FSC_Otherwise_I_shall_join_ICS_4.wav
Student I think that several people die due to inadequate medical aid. They cannot afford substantial medical costs. I shall help them without exerting any charges.
Audio Mapping :  {'en': 0, 'fr-fr': 1, 'pt-br': 2}
 > text: I think that several people die due to inadequate medical aid. They cannot afford substantial medical costs. I shall help them without exerting any charges.
Generated Audio


 > Saving output to out/I_think_that_several_people_die_due_to_inadequate_medical_aid_They_cannot_afford_substantial_medical_costs_I_shall_help_them_without_exerting_any_charges_4.wav
Student I would not be in that evil group. My grandmother was very ill, and we could not get here appropriately treated because we were destitute. My purpose in life is to be a doctor, and I shall serve the people as a good citizen and help the needy free of cost.
Audio Mapping :  {'en': 0, 'fr-fr': 1, 'pt-br': 2}
 > text: I would not be in that evil group. My grandmother was very ill, and we could not get here appropriately treated because we were destitute. My purpose in life is to be a doctor, and I shall serve the people as a good citizen and help the needy free of cost.
Generated Audio


 > Saving output to out/I_would_not_be_in_that_evil_group_My_grandmother_was_very_ill_and_we_could_not_get_here_appropriately_treated_because_we_were_destitute_My_purpose_in_life_is_to_be_a_doctor_and_I_shall_serve_the_people_as_a_good_citizen_and_help_the_needy_free_of_cost_4.wav


OSError: ignored





T
i
t
l
e
 
:
 
C
o
n
v
e
r
s
a
t
i
o
n
 
B
e
t
w
e
e
n
 
S
t
u
d
e
n
t
 
a
n
d
 
T
e
a
c
h
e
r
 
A
b
o
u
t
 
F
u
t
u
r
e


T
e
a
c
h
e
r
:
 
W
h
a
t
 
a
r
e
 
y
o
u
 
p
l
a
n
n
i
n
g
 
t
o
 
d
o
 
a
f
t
e
r
 
p
a
s
s
i
n
g
 
y
o
u
r
 
m
a
t
r
i
c
?


S
t
u
d
e
n
t
:
 
S
i
r
,
 
i
t
 
w
o
u
l
d
 
d
e
p
e
n
d
 
o
n
 
w
h
a
t
 
m
a
r
k
s
 
I
 
g
e
t
.


T
e
a
c
h
e
r
:
 
O
k
,
 
s
o
 
w
h
a
t
 
h
a
v
e
 
y
o
u
 
p
l
a
n
n
e
d
 
i
f
 
y
o
u
 
s
e
c
u
r
e
d
 
g
o
o
d
 
m
a
r
k
s
 
i
n
 
m
a
t
r
i
c
?


S
t
u
d
e
n
t
:
 
I
 
w
i
l
l
 
h
a
u
n
t
 
p
r
e
-
m
e
d
i
c
a
l
 
g
r
o
u
p
s
 
i
n
 
F
.
S
.
C
.
 
O
t
h
e
r
w
i
s
e
,
 
I
 
s
h
a
l
l
 
j
o
i
n
 
I
.
C
.
S
.


T
e
a
c
h
e
r
:
 
W
h
y
 
d
i
d
 
y
o
u
 
d
e
e
m
 
m
e
d
i
c
a
l
 
g
r
o
u
p
s
?


S
t
u
d
e
n
t
:
 
I
 
t
h
i
n
k
 
t
h
a
t
 
s
e
v
e
r
a
l
 
p
e
o
p
l
e
 
d
i
e
 
d
u
e
 
t
o
 
i
n
a
d
e
q
u
a
t
e
 
m
e
d
i
c
a
l
 
a
i
d
.
 
T
h
e
y
 
c
a
n
n
o
t
 
a
f
f
o
r
d
 
s
u
b
s
t
a
n
t
i
a
l
 
m
e
d
i
c
a
l
 
c
o
s
t
s
.
 
I
 
s
h
