# Import

In [1]:
import string
import IPython
from IPython.display import Audio
import torch
import os

from TTS.tts.utils.synthesis import synthesis
try:
  from TTS.utils.audio import AudioProcessor
except:
  from TTS.utils.audio import AudioProcessor
from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.models.vits import *
from TTS.tts.utils.speakers import SpeakerManager


from TTS.bin.resample import resample_files
from TTS.utils.vad import get_vad_model_and_utils, remove_silence

# Define Parameter and Constant

In [2]:
OUT_PATH = 'output'
BASE_MODEL_PATH = './model'

# model vars 
MODEL_PATH = os.path.join(BASE_MODEL_PATH, 'best_model_160217.pth')
CONFIG_PATH = os.path.join(BASE_MODEL_PATH, 'config.json')
TTS_LANGUAGES = os.path.join(BASE_MODEL_PATH, 'language_ids.json')
USE_CUDA = torch.cuda.is_available()
REFERENCE_PATH = "./reference_wav/Mingall-th.wav"
SPEAKER_FOLDER = REFERENCE_PATH.split("/")[-1].split(".")[0]

model_name = MODEL_PATH.split(".")[0]

# Setup Model and Config

In [3]:
# load the config
C = load_config(CONFIG_PATH)

# load the audio processor
ap = AudioProcessor(**C.audio)

# override config
C["speakers_file"] = None
C["d_vector_file"] = []
C["language_ids_file"] = TTS_LANGUAGES

C["model_args"]["speakers_file"] = None
C["model_args"]["d_vector_file"] = []
C["model_args"]["language_ids_file"] = TTS_LANGUAGES

C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))

# remove speaker encoder
model_weights = cp['model'].copy()
for key in list(model_weights.keys()):
  if "speaker_encoder" in key:
    del model_weights[key]

model.load_state_dict(model_weights)

model.eval()

if USE_CUDA:
  model = model.cuda()

# synthesize voice
use_griffin_lim = False

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > 

  return torch.load(f, map_location=map_location, **kwargs)


 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:True
 | > db_level:-27.0
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400
 > initialization of language-embedding layers.


  cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))


# Setup Language

In [4]:
# Select language
language_id = 0
language_name_to_id = model.language_manager.name_to_id
language_id_to_name = {v: k for k, v in language_name_to_id.items()}
print(f"Language ID: {language_id}, Language Name: {language_id_to_name[language_id]}")

Language ID: 0, Language Name: th


# Setup duration predictor

In [5]:
model.length_scale = 1.25  # scaler for the duration predictor. The larger it is, the slower the speech.
model.inference_noise_scale = 0.2 # defines the noise variance applied to the random z vector at inference.
model.inference_noise_scale_dp = 0.2 # defines the noise variance applied to the duration predictor z vector at inference.

# Process reference audio file

In [6]:
# reamples the audio file to match the sample rate of the model
resample_files("./reference_wav/", C.audio['sample_rate'], file_ext="wav")

Resampling the audio files...
Found 5 files...


100%|██████████| 5/5 [00:01<00:00,  3.55it/s]

Done !





In [7]:
# trim silence at the beginning and end of the audio
model_and_utils = get_vad_model_and_utils(use_cuda=USE_CUDA, use_onnx=False)

output_path, is_speech = remove_silence(
  model_and_utils,
  REFERENCE_PATH,
  REFERENCE_PATH,
  trim_just_beginning_and_end=True,
  use_cuda=USE_CUDA
)

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /Users/titor/.cache/torch/hub/master.zip


In [8]:
# normalize the reference audio with rms to -27dB
!ffmpeg-normalize $REFERENCE_PATH -nt rms -t=-27 -o $REFERENCE_PATH -ar 16000 -f

In [9]:
SE_speaker_manager = SpeakerManager(encoder_model_path=C["model_args"]["speaker_encoder_model_path"], encoder_config_path=C["model_args"]["speaker_encoder_config_path"], use_cuda=USE_CUDA)
reference_emb = SE_speaker_manager.compute_embedding_from_clip(REFERENCE_PATH)

 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:True
 | > db_level:-27.0
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400


# Inference

In [10]:
text = "ผมประทับใจกับความเป็นมิตรของคนไทยมากครับ"
print(f" > text: {text} with sampling rate: {ap.sample_rate}")

wav, alignment, _, _ = synthesis(
                    model = model,
                    text = text,
                    CONFIG = C,
                    use_cuda = USE_CUDA,
                    d_vector = reference_emb,
                    style_wav = None,
                    language_id = language_id,
                    use_griffin_lim = True,
                    do_trim_silence = False,
                ).values()
print("Audio Generated")
IPython.display.display(Audio(wav, rate=ap.sample_rate))

 > text: ผมประทับใจกับความเป็นมิตรของคนไทยมากครับ with sampling rate: 16000
Audio Generated


# Save to Folder

In [11]:
file_name = text.replace(" ", "_")
file_name = model_name + '_' + file_name.translate(str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
out_path = os.path.join(OUT_PATH, SPEAKER_FOLDER, file_name,)

print(f" > Saving output to {out_path}")

os.makedirs(f"{OUT_PATH}/{SPEAKER_FOLDER}", exist_ok=True)
ap.save_wav(wav, out_path)

 > Saving output to output/Mingall-th/_ผมประทับใจกับความเป็นมิตรของคนไทยมากครับ.wav
