# Import

In [11]:
import os
import string
import subprocess

import IPython
from IPython.display import Audio
import torch

from TTS.tts.utils.synthesis import synthesis
try:
  from TTS.utils.audio import AudioProcessor
except:
  from TTS.utils.audio import AudioProcessor
from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.models.vits import *
from TTS.tts.utils.speakers import SpeakerManager


from TTS.bin.resample import resample_files
from TTS.utils.vad import get_vad_model_and_utils, remove_silence

from tqdm import tqdm
import numpy as np

# Define Parameters and Constants

In [2]:
# Folder that holds the checkpoint, its config and the language-id map
BASE_MODEL_PATH = '../model'

# model vars
MODEL_PATH = os.path.join(BASE_MODEL_PATH, 'best_model_160217.pth')
CONFIG_PATH = os.path.join(BASE_MODEL_PATH, 'config.json')
TTS_LANGUAGES = os.path.join(BASE_MODEL_PATH, 'language_ids.json')
USE_CUDA = torch.cuda.is_available()

# Folder with the reference recordings (processed by the cells below)
REFERENCE_PATH = './dataset/reference_wav/'

# Checkpoint path without its extension. os.path.splitext is used because splitting
# on the first "." yields an empty string for a path that starts with "../".
model_name = os.path.splitext(MODEL_PATH)[0]

# Setup Model and Config

In [3]:
# read the model configuration from disk
C = load_config(CONFIG_PATH)

# build the audio processor from the audio section of the config
ap = AudioProcessor(**C.audio)

# override config: the same three entries are set on the top-level config first
# and then on its model_args (a fresh empty list is created for each of them)
for config_section in (C, C["model_args"]):
  config_section["speakers_file"] = None
  config_section["d_vector_file"] = []
  config_section["language_ids_file"] = TTS_LANGUAGES

C["model_args"]["use_speaker_encoder_as_loss"] = False

model = setup_model(C)
# the checkpoint is always loaded onto the CPU; the model is moved to the GPU below if available
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))

# remove speaker encoder: work on a copy of the checkpoint weights and drop
# every entry whose key mentions the speaker encoder
model_weights = cp['model'].copy()
speaker_encoder_keys = [name for name in model_weights.keys() if "speaker_encoder" in name]
for name in speaker_encoder_keys:
  model_weights.pop(name)

model.load_state_dict(model_weights)

# switch to evaluation mode before inference
model.eval()

if USE_CUDA:
  model = model.cuda()

# synthesize voice
use_griffin_lim = False

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > 

# Setup Language

In [4]:
# Choose the language by its numeric id
language_id = 0

# name -> id mapping provided by the model's language manager, plus its inverse (id -> name)
language_name_to_id = model.language_manager.name_to_id
language_id_to_name = dict((lang_id, lang_name) for lang_name, lang_id in language_name_to_id.items())

print(f"Language ID: {language_id}, Language Name: {language_id_to_name[language_id]}")

Language ID: 0, Language Name: th


# Setup duration predictor

In [37]:
# Inference-time settings, named first and then applied to the model.
speech_length_scale = 1     # scaler for the duration predictor; the larger it is, the slower the speech
latent_noise_scale = 0.2    # noise variance applied to the random z vector at inference
duration_noise_scale = 0.2  # noise variance applied to the duration predictor z vector at inference

model.length_scale = speech_length_scale
model.inference_noise_scale = latent_noise_scale
model.inference_noise_scale_dp = duration_noise_scale

# Process reference audio file

In [38]:
# Resample the reference .wav files to the sample rate given in the model's audio config
target_sample_rate = C.audio['sample_rate']
resample_files(REFERENCE_PATH, target_sample_rate, file_ext="wav")

Resampling the audio files...
Found 28 files...


100%|██████████| 28/28 [00:04<00:00,  5.81it/s]

Done !





In [39]:
# Trim silence at the beginning and end of every reference clip.
# The VAD model is loaded once and reused for all files.
model_and_utils = get_vad_model_and_utils(use_cuda=USE_CUDA, use_onnx=False)

for wav_name in tqdm(os.listdir(REFERENCE_PATH)):
  if wav_name.endswith(".wav"):
    wav_path = os.path.join(REFERENCE_PATH, wav_name)
    # the same path is passed as both input and output
    output_path, is_speech = remove_silence(
      model_and_utils,
      wav_path,
      wav_path,
      trim_just_beginning_and_end=True,
      use_cuda=USE_CUDA,
    )

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\Ming/.cache\torch\hub\master.zip
100%|██████████| 28/28 [00:05<00:00,  4.68it/s]


In [40]:
# Normalize every reference clip with ffmpeg-normalize, writing the result back
# to the same path.
for file in os.listdir(REFERENCE_PATH):
    if not file.endswith(".wav"):
        continue
    REFERENCE_FILE = os.path.join(REFERENCE_PATH, file)
    # normalize the reference audio with rms to -27dB
    # The command is passed as an argument list (no shell), so file names that
    # contain spaces or shell metacharacters reach the tool unchanged.
    normalize_result = subprocess.run(
        [
            "ffmpeg-normalize", REFERENCE_FILE,
            "-nt", "rms",
            "-t=-27",
            "-o", REFERENCE_FILE,
            "-ar", "16000",
            "-f",
        ],
        capture_output=True,
        text=True,
        errors="replace",
    )
    # Keep the tool's output visible in the notebook, as the shell escape did
    for stream_text in (normalize_result.stdout, normalize_result.stderr):
        if stream_text.strip():
            print(stream_text.strip())
    # Report failures explicitly instead of continuing silently
    if normalize_result.returncode != 0:
        print(f"ffmpeg-normalize failed for {REFERENCE_FILE} (exit code {normalize_result.returncode})")

In [41]:
# Speaker manager built from the speaker-encoder model/config paths stored in the model args;
# used below to compute one embedding per reference clip.
SE_speaker_manager = SpeakerManager(
    encoder_model_path=C["model_args"]["speaker_encoder_model_path"],
    encoder_config_path=C["model_args"]["speaker_encoder_config_path"],
    use_cuda=USE_CUDA,
)

# Make sure the output folder exists before any file is written into it
os.makedirs("dataset/synthesized_wav", exist_ok=True)

for file in tqdm(os.listdir(REFERENCE_PATH)):
    if not file.endswith(".wav"):
        continue
    # Strip only the final extension, so a name such as "clip.01.wav" maps to
    # "clip.01" (transcript "clip.01.txt", output "clip.01.wav") instead of being
    # truncated at the first dot and colliding with other clips.
    filename = os.path.splitext(file)[0]
    REFERENCE_FILE = os.path.join(REFERENCE_PATH, file)
    reference_emb = SE_speaker_manager.compute_embedding_from_clip(REFERENCE_FILE)
    # Text to synthesize: first line of the transcript with every "|" removed and
    # surrounding whitespace stripped.
    # NOTE(review): the file is opened with the platform default encoding — confirm
    # it matches how the transcripts were saved.
    with open(f"dataset/transcripts/{filename}.txt", "r") as f:
        text = "".join(f.readline().split("|")).strip()
    # The values of the synthesis result are unpacked in order; only `wav` is used below.
    wav, alignment, _, _ = synthesis(
                    model = model,
                    text = text,
                    CONFIG = C,
                    use_cuda = USE_CUDA,
                    d_vector = reference_emb,
                    style_wav = None,
                    language_id = language_id,
                    use_griffin_lim = True,
                    do_trim_silence = False,
                ).values()
    # Save under the same base name as the reference clip
    ap.save_wav(wav, f"dataset/synthesized_wav/{filename}.wav")


 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:True
 | > db_level:-27.0
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400


100%|██████████| 28/28 [00:04<00:00,  6.38it/s]


# Compare Reference Vs Synthesized

In [42]:
def get_audio_lengths(audio_folder):
    """Return the durations of the ``.wav`` files in ``audio_folder``, ordered by file name.

    ``os.listdir`` returns entries in arbitrary order. The file names are sorted
    so that lists produced for two folders containing identically named files
    line up element by element.

    Args:
        audio_folder: Path of the folder to scan (not recursive).

    Returns:
        A list with one ``ap.get_duration(path)`` value per ``.wav`` file,
        in ascending file-name order. Files with other extensions are ignored.
    """
    wav_names = sorted(name for name in os.listdir(audio_folder) if name.endswith(".wav"))
    return [ap.get_duration(os.path.join(audio_folder, name)) for name in wav_names]

# Durations of the reference clips and of their synthesized counterparts
# (reference folder is evaluated first, then the synthesized folder)
reference_lengths, synthesized_lengths = map(
    get_audio_lengths, (REFERENCE_PATH, "dataset/synthesized_wav")
)

print(f"Reference Lengths: {reference_lengths}")
print(f"Synthesized Lengths: {synthesized_lengths}")

Reference Lengths: [3.194, 4.562, 7.378, 2.772, 9.33, 4.372, 2.454, 2.93, 5.81, 3.482, 5.59, 5.074, 4.764, 6.962, 7.634, 3.604, 3.922, 4.38, 6.78, 6.938, 3.986, 1.844, 3.026, 5.332, 5.366, 5.752, 8.466, 2.68]
Synthesized Lengths: [2.56, 4.128, 4.864, 2.336, 8.496, 3.792, 2.112, 2.336, 4.128, 2.928, 4.432, 4.016, 3.968, 5.952, 6.016, 2.864, 3.232, 3.392, 5.616, 5.776, 3.376, 1.552, 2.544, 4.272, 4.656, 5.024, 7.104, 2.192]


In [43]:
# Convert both duration lists to arrays so they can be divided element-wise
reference_lengths, synthesized_lengths = np.array(reference_lengths), np.array(synthesized_lengths)

# Per-clip ratio of synthesized duration to reference duration
ratios = synthesized_lengths / reference_lengths
print(f"Ratios: {ratios}")

# Harmonic mean of the ratios: number of clips divided by the sum of reciprocals
reciprocal_total = np.sum(1 / ratios)
harmonic_mean = len(ratios) / reciprocal_total
print(f"Harmonic Mean: {harmonic_mean}")

# The reciprocal of the harmonic mean is reported as the length_scale value
print(f"length_scale: {1/harmonic_mean}")

Ratios: [0.80150282 0.90486629 0.65925725 0.84271284 0.91061093 0.8673376
 0.8606357  0.79726962 0.71049914 0.84089604 0.79284436 0.79148601
 0.83291352 0.85492675 0.78805345 0.79467259 0.82406935 0.77442922
 0.82831858 0.83251658 0.84696438 0.84164859 0.84071381 0.8012003
 0.86768543 0.87343533 0.83912119 0.81791045]
Harmonic Mean: 0.8193357884680831
length_scale: 1.220500817948775
