# Import

In [1]:
import string
import IPython
from IPython.display import Audio
import torch
import os
import random

from TTS.tts.utils.synthesis import synthesis
try:
  from TTS.utils.audio import AudioProcessor
except:
  from TTS.utils.audio import AudioProcessor
from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.models.vits import *
from TTS.tts.utils.speakers import SpeakerManager

from TTS.bin.resample import resample_files
from TTS.utils.vad import get_vad_model_and_utils, remove_silence

# Define Parameter and Constant

In [2]:
OUT_PATH = 'output'
BASE_MODEL_PATH = './model'

# model vars 
MODEL_PATH = os.path.join(BASE_MODEL_PATH, 'best_model_160217.pth')
CONFIG_PATH = os.path.join(BASE_MODEL_PATH, 'config.json')
TTS_LANGUAGES = os.path.join(BASE_MODEL_PATH, 'language_ids.json')
USE_CUDA = torch.cuda.is_available()
SPEAKERS_FILE = os.path.join(BASE_MODEL_PATH, 'speakers.pth')
D_VECTOR_FILE = os.path.join(BASE_MODEL_PATH, 'dvector.pth')

model_name = MODEL_PATH.split(".")[0]

# Setup Model and Config

In [3]:
# load the config
C = load_config(CONFIG_PATH)

# load the audio processor
ap = AudioProcessor(**C.audio)

# override config
C["speakers_file"] = SPEAKERS_FILE
C["d_vector_file"] = D_VECTOR_FILE
C["language_ids_file"] = TTS_LANGUAGES

C["model_args"]["speakers_file"] = SPEAKERS_FILE
C["model_args"]["d_vector_file"] = D_VECTOR_FILE
C["model_args"]["language_ids_file"] = TTS_LANGUAGES

C["use_speaker_embedding"] = False
C["use_d_vector_file"] = True

C["model_args"]["use_speaker_embedding"] = False
C["model_args"]["use_d_vector_file"] = True

C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))

model.load_state_dict(cp['model'])

model.eval()

if USE_CUDA:
  model = model.cuda()

# synthesize voice
use_griffin_lim = False

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > 

  return torch.load(f, map_location="cpu")
  return torch.load(f, map_location=map_location, **kwargs)


 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:True
 | > db_level:-27.0
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400
 > initialization of language-embedding layers.


  cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))


# Setup Language

In [4]:
# Select language
language_id = 0
language_name_to_id = model.language_manager.name_to_id
language_id_to_name = {v: k for k, v in language_name_to_id.items()}
print(f"Language ID: {language_id}, Language Name: {language_id_to_name[language_id]}")

Language ID: 0, Language Name: th


# Setup duration predictor

In [5]:
model.length_scale = 1.25  # scaler for the duration predictor. The larger it is, the slower the speech.
model.inference_noise_scale = 0.2 # defines the noise variance applied to the random z vector at inference.
model.inference_noise_scale_dp = 0.2 # defines the noise variance applied to the duration predictor z vector at inference.

# Setup speaker

In [6]:
speaker_id = 0
speaker_name_to_id =  model.speaker_manager.name_to_id
speaker_id_to_name = {v: k for k, v in speaker_name_to_id.items()}
speaker_name = speaker_id_to_name[speaker_id]
print(f"Speaker ID: {speaker_id}, Speaker Name: {speaker_name}")

Speaker ID: 0, Speaker Name: VCTK_cv001


# Inference

In [7]:
text = "และนี่คือธาตุทองซาว"
print(f" > text: {text} with sampling rate: {ap.sample_rate}")

wav, alignment, _, _ = synthesis(
                    model = model,
                    text = text,
                    CONFIG = C,
                    use_cuda = USE_CUDA,
                    # speaker_id = 0,
                    d_vector = model.speaker_manager.get_mean_embedding(speaker_name),
                    # style_wav = None,
                    language_id = language_id,
                    use_griffin_lim = True,
                    do_trim_silence = False,
                ).values()
print("Audio Generated")
IPython.display.display(Audio(wav, rate=ap.sample_rate))

 > text: และนี่คือธาตุทองซาว with sampling rate: 16000
Audio Generated


# Save to Folder

In [8]:
file_name = text.replace(" ", "_")
file_name = model_name + '_' + file_name.translate(str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
out_path = os.path.join(OUT_PATH, speaker_name, file_name,)

print(f" > Saving output to {out_path}")

os.makedirs(f"{OUT_PATH}/{speaker_name}", exist_ok=True)
ap.save_wav(wav, out_path)

 > Saving output to output/VCTK_cv001/_และนี่คือธาตุทองซาว.wav


# Batch inference

In [10]:
# Setup speaker mapping
speaker_name_to_id = model.speaker_manager.name_to_id
speaker_id_to_name = {v: k for k, v in speaker_name_to_id.items()}
available_speaker_ids = list(speaker_id_to_name.keys())

# Create output directory
output_dir = "output_batch"
os.makedirs(output_dir, exist_ok=True)

# Read input texts
with open("input_texts.txt", 'r', encoding='utf-8') as f:
    texts = f.readlines()

# Open metadata file
with open(os.path.join(output_dir, "metadata.txt"), 'w', encoding='utf-8') as meta_file:
    # Process each text
    for text in texts:
        text = text.strip()
        if not text:  # Skip empty lines
            continue
            
        # Random speaker selection
        speaker_id = random.choice(available_speaker_ids)
        speaker_name = speaker_id_to_name[speaker_id]
        
        print(f"Processing: {text}")
        print(f"Selected speaker: {speaker_name} (ID: {speaker_id})")
        
        # Generate audio
        wav, alignment, _, _ = synthesis(
            model=model,
            text=text,
            CONFIG=C,
            use_cuda=USE_CUDA,
            d_vector=model.speaker_manager.get_mean_embedding(speaker_name),
            language_id=language_id,
            use_griffin_lim=True,
            do_trim_silence=False,
        ).values()
        
        # Save the audio file
        file_name = text.replace(" ", "_")
        file_name = "tts_" + file_name.translate(str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
        out_path = os.path.join(output_dir, file_name)
        
        print(f"Saving to: {out_path}")
        ap.save_wav(wav, out_path)
        
        # Write metadata
        meta_file.write(f"{file_name}|{speaker_name}\n")
        
        print("-" * 50)

Processing: ส่วนหน้าแข้งยาวเรียวและโค้งเหมาะที่จะใช้ในการจับเหยื่อ
Selected speaker: VCTK_cv037 (ID: 36)
Saving to: output_batch/tts_ส่วนหน้าแข้งยาวเรียวและโค้งเหมาะที่จะใช้ในการจับเหยื่อ.wav
--------------------------------------------------
Processing: ได้เลี้ยงตัวเหี้ยหรือตัวเงินตัวทองที่โผล่เข้ามาอยู่ในบ้าน
Selected speaker: VCTK_cv092 (ID: 91)
Saving to: output_batch/tts_ได้เลี้ยงตัวเหี้ยหรือตัวเงินตัวทองที่โผล่เข้ามาอยู่ในบ้าน.wav
--------------------------------------------------
Processing: ซึ่งจะมีเทคนิคแตกต่างกันทั้งการทำน้ำเคลือบและวิธีเคลือบทำให้ผลิตภัณฑ์สวยงามมีคุณค่า
Selected speaker: VCTK_cv001 (ID: 0)
Saving to: output_batch/tts_ซึ่งจะมีเทคนิคแตกต่างกันทั้งการทำน้ำเคลือบและวิธีเคลือบทำให้ผลิตภัณฑ์สวยงามมีคุณค่า.wav
--------------------------------------------------
Processing: เสียงพยัญชนะควบกล้ำในภาษาไทยมาตรฐานมีทั้งหมดสิบสองเสียงดังนี้
Selected speaker: VCTK_cv006 (ID: 5)
Saving to: output_batch/tts_เสียงพยัญชนะควบกล้ำในภาษาไทยมาตรฐานมีทั้งหมดสิบสองเสียงดังนี้.wav
---