Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multilingual tokenizer #2229

Merged
merged 6 commits into from
Jan 2, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions TTS/config/shared_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,9 @@ class BaseDatasetConfig(Coqpit):
language (str):
Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.

phonemizer (str):
Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`.

meta_file_val (str):
Name of the dataset meta file that defines the instances used at validation.

Expand All @@ -226,6 +229,7 @@ class BaseDatasetConfig(Coqpit):
meta_file_train: str = ""
ignored_speakers: List[str] = None
language: str = ""
phonemizer: str = ""
meta_file_val: str = ""
meta_file_attn_mask: str = ""

Expand Down
6 changes: 3 additions & 3 deletions TTS/tts/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,14 +569,14 @@ def __init__(

def __getitem__(self, index):
item = self.samples[index]
ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"])
ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"], item["language"])
ph_hat = self.tokenizer.ids_to_text(ids)
return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)}

def __len__(self):
return len(self.samples)

def compute_or_load(self, file_name, text):
def compute_or_load(self, file_name, text, language):
"""Compute phonemes for the given text.

If the phonemes are already cached, load them from cache.
Expand All @@ -586,7 +586,7 @@ def compute_or_load(self, file_name, text):
try:
ids = np.load(cache_path)
except FileNotFoundError:
ids = self.tokenizer.text_to_ids(text)
ids = self.tokenizer.text_to_ids(text, language=language)
np.save(cache_path, ids)
return ids

Expand Down
8 changes: 7 additions & 1 deletion TTS/tts/utils/synthesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,15 @@ def synthesis(
style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda)
style_mel = style_mel.transpose(1, 2) # [1, time, depth]

language_name = None
if language_id is not None:
language = [k for k, v in model.language_manager.name_to_id.items() if v == language_id]
assert len(language) == 1, "language_id must be a valid language"
language_name = language[0]

# convert text to sequence of token IDs
text_inputs = np.asarray(
model.tokenizer.text_to_ids(text, language=language_id),
model.tokenizer.text_to_ids(text, language=language_name),
dtype=np.int32,
)
# pass tensors to backend
Expand Down
2 changes: 1 addition & 1 deletion TTS/tts/utils/text/phonemizers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def _phonemize_postprocess(self, phonemized, punctuations) -> str:
return self._punctuator.restore(phonemized, punctuations)[0]
return phonemized[0]

def phonemize(self, text: str, separator="|") -> str:
def phonemize(self, text: str, separator="|", language: str = None) -> str: # pylint: disable=unused-argument
"""Returns the `text` phonemized for the given language

Args:
Expand Down
2 changes: 1 addition & 1 deletion TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def _phonemize(self, text: str, separator: str = "|") -> str:
return separator.join(ph)
return ph

def phonemize(self, text: str, separator="|") -> str:
def phonemize(self, text: str, separator="|", language=None) -> str:
"""Custom phonemize for JP_JA

Skip pre-post processing steps used by the other phonemizers.
Expand Down
2 changes: 1 addition & 1 deletion TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def _phonemize(self, text: str, separator: str = "", character: str = "hangeul")
return separator.join(ph)
return ph

def phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
def phonemize(self, text: str, separator: str = "", character: str = "hangeul", language=None) -> str:
return self._phonemize(text, separator, character)

@staticmethod
Expand Down
28 changes: 19 additions & 9 deletions TTS/tts/utils/text/phonemizers/multi_phonemizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,30 +14,40 @@ class MultiPhonemizer:
TODO: find a way to pass custom kwargs to the phonemizers
"""

lang_to_phonemizer_name = DEF_LANG_TO_PHONEMIZER
language = "multi-lingual"
lang_to_phonemizer = {}

def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None: # pylint: disable=dangerous-default-value
self.lang_to_phonemizer_name.update(custom_lang_to_phonemizer)
def __init__(self, lang_to_phonemizer_name: Dict = None) -> None:
    """Resolve the phonemizer name of every language and create the phonemizers.

    Args:
        lang_to_phonemizer_name (Dict): Mapping from language code to phonemizer name. An empty
            string selects the entry of `DEF_LANG_TO_PHONEMIZER` for that language. Defaults to
            None, which is treated as an empty mapping.

    Raises:
        ValueError: If a language has an empty phonemizer name and no entry in
            `DEF_LANG_TO_PHONEMIZER`.
    """
    # Build a fresh dict so that neither the caller's mapping nor a shared default
    # object is mutated or kept as instance state.
    resolved = {}
    for lang, phonemizer_name in (lang_to_phonemizer_name or {}).items():
        if phonemizer_name == "":
            if lang not in DEF_LANG_TO_PHONEMIZER:
                raise ValueError(f"Phonemizer wasn't set for language {lang} and doesn't have a default.")
            phonemizer_name = DEF_LANG_TO_PHONEMIZER[lang]
        resolved[lang] = phonemizer_name
    self.lang_to_phonemizer_name = resolved
    self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name)

@staticmethod
def init_phonemizers(lang_to_phonemizer_name: Dict) -> Dict:
lang_to_phonemizer = {}
for k, v in lang_to_phonemizer_name.items():
phonemizer = get_phonemizer_by_name(v, language=k)
lang_to_phonemizer[k] = phonemizer
lang_to_phonemizer[k] = get_phonemizer_by_name(v, language=k)
return lang_to_phonemizer

@staticmethod
def name():
return "multi-phonemizer"

def phonemize(self, text, language, separator="|"):
def phonemize(self, text, separator="|", language=""):
    """Phonemize `text` with the phonemizer registered for `language`.

    Args:
        text (str): Text to phonemize.
        separator (str): Separator forwarded to the selected phonemizer. Defaults to `"|"`.
        language (str): Language code selecting the phonemizer. It must be a key of
            `self.lang_to_phonemizer`.

    Returns:
        Whatever the selected phonemizer's `phonemize(text, separator)` returns.

    Raises:
        ValueError: If `language` is empty or None.
        KeyError: If no phonemizer is registered for `language`.
    """
    # The tokenizer forwards `language=None` when no language was given, so treat
    # None like "" instead of failing later with an opaque `KeyError: None`.
    if not language:
        raise ValueError("Language must be set for multi-phonemizer to phonemize.")
    return self.lang_to_phonemizer[language].phonemize(text, separator)

def supported_languages(self) -> List:
return list(self.lang_to_phonemizer_name.keys())
return list(self.lang_to_phonemizer.keys())

def print_logs(self, level: int = 0):
indent = "\t" * level
print(f"{indent}| > phoneme language: {self.supported_languages()}")
print(f"{indent}| > phoneme backend: {self.name()}")


# if __name__ == "__main__":
Expand All @@ -48,7 +58,7 @@ def supported_languages(self) -> List:
# "zh-cn": "这是中国的例子",
# }
# phonemes = {}
# ph = MultiPhonemizer()
# ph = MultiPhonemizer({"tr": "espeak", "en-us": "", "de": "gruut", "zh-cn": ""})
# for lang, text in texts.items():
# phoneme = ph.phonemize(text, language=lang)
# phonemes[lang] = phoneme
Expand Down
39 changes: 24 additions & 15 deletions TTS/tts/utils/text/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from TTS.tts.utils.text import cleaners
from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes
from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer
from TTS.utils.generic_utils import get_import_path, import_class


Expand Down Expand Up @@ -106,7 +107,7 @@ def text_to_ids(self, text: str, language: str = None) -> List[int]: # pylint:
if self.text_cleaner is not None:
text = self.text_cleaner(text)
if self.use_phonemes:
text = self.phonemizer.phonemize(text, separator="")
text = self.phonemizer.phonemize(text, separator="", language=language)
if self.add_blank:
text = self.intersperse_blank_char(text, True)
if self.use_eos_bos:
Expand Down Expand Up @@ -182,21 +183,29 @@ def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
# init phonemizer
phonemizer = None
if config.use_phonemes:
phonemizer_kwargs = {"language": config.phoneme_language}

if "phonemizer" in config and config.phonemizer:
phonemizer = get_phonemizer_by_name(config.phonemizer, **phonemizer_kwargs)
if "phonemizer" in config and config.phonemizer == "multi_phonemizer":
lang_to_phonemizer_name = {}
for dataset in config.datasets:
if dataset.language != "":
lang_to_phonemizer_name[dataset.language] = dataset.phonemizer
else:
raise ValueError("Multi phonemizer requires language to be set for each dataset.")
phonemizer = MultiPhonemizer(lang_to_phonemizer_name)
else:
try:
phonemizer = get_phonemizer_by_name(
DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
)
new_config.phonemizer = phonemizer.name()
except KeyError as e:
raise ValueError(
f"""No phonemizer found for language {config.phoneme_language}.
You may need to install a third party library for this language."""
) from e
phonemizer_kwargs = {"language": config.phoneme_language}
if "phonemizer" in config and config.phonemizer:
phonemizer = get_phonemizer_by_name(config.phonemizer, **phonemizer_kwargs)
else:
try:
phonemizer = get_phonemizer_by_name(
DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
)
new_config.phonemizer = phonemizer.name()
except KeyError as e:
raise ValueError(
f"""No phonemizer found for language {config.phoneme_language}.
You may need to install a third party library for this language."""
) from e

return (
TTSTokenizer(
Expand Down
126 changes: 126 additions & 0 deletions recipes/multilingual/vits_tts/train_vits_tts_phonemes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import os
from glob import glob

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, VitsArgs, VitsAudioConfig
from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

output_path = "/media/julian/Workdisk/train"

mailabs_path = "/home/julian/workspace/mailabs/**"
dataset_paths = glob(mailabs_path)
# One dataset per language folder found under the M-AILABS root.
dataset_config = [
    BaseDatasetConfig(
        formatter="mailabs",
        meta_file_train=None,
        path=path,
        # The language code is the folder name; normpath drops a trailing separator and
        # basename handles the platform's path separator.
        language=os.path.basename(os.path.normpath(path)),
    )
    for path in dataset_paths
    # The glob can also match plain files (e.g. archives or READMEs); only folders are datasets.
    if os.path.isdir(path)
]

audio_config = VitsAudioConfig(
sample_rate=16000,
win_length=1024,
hop_length=256,
num_mels=80,
mel_fmin=0,
mel_fmax=None,
)

vitsArgs = VitsArgs(
use_language_embedding=True,
embedded_language_dim=4,
use_speaker_embedding=True,
use_sdp=False,
)

config = VitsConfig(
model_args=vitsArgs,
audio=audio_config,
run_name="vits_vctk",
use_speaker_embedding=True,
batch_size=32,
eval_batch_size=16,
batch_group_size=0,
num_loader_workers=12,
num_eval_loader_workers=12,
precompute_num_workers=12,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
text_cleaner="multilingual_cleaners",
use_phonemes=True,
phoneme_language=None,
phonemizer="multi_phonemizer",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
compute_input_seq_cache=True,
print_step=25,
use_language_weighted_sampler=True,
print_eval=False,
mixed_precision=False,
min_audio_len=audio_config.sample_rate,
max_audio_len=audio_config.sample_rate * 10,
output_path=output_path,
datasets=dataset_config,
test_sentences=[
[
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"mary_ann",
None,
"en-us",
],
[
"Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
"ezwa",
None,
"fr-fr",
],
["Ich finde, dieses Startup ist wirklich unglaublich.", "eva_k", None, "de-de"],
["Я думаю, что этот стартап действительно удивительный.", "nikolaev", None, "ru"],
],
)

# force the conversion of the custom characters to a config attribute
config.from_dict(config.to_dict())

# init audio processor
ap = AudioProcessor(**config.audio.to_dict())

# load training samples
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)

# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
config.model_args.num_speakers = speaker_manager.num_speakers

language_manager = LanguageManager(config=config)
config.model_args.num_languages = language_manager.num_languages

# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# config is updated with the default characters if not defined in the config.
tokenizer, config = TTSTokenizer.init_from_config(config)

# init model
model = Vits(config, ap, tokenizer, speaker_manager, language_manager)

# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()
42 changes: 42 additions & 0 deletions tests/text_tests/test_phonemizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from distutils.version import LooseVersion

from TTS.tts.utils.text.phonemizers import ESpeak, Gruut, JA_JP_Phonemizer, ZH_CN_Phonemizer
from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer

EXAMPLE_TEXTs = [
"Recent research at Harvard has shown meditating",
Expand Down Expand Up @@ -226,3 +227,44 @@ def test_get_version(self):

def test_is_available(self):
self.assertTrue(self.phonemizer.is_available())


class TestMultiPhonemizer(unittest.TestCase):
def setUp(self):
self.phonemizer = MultiPhonemizer({"tr": "espeak", "en-us": "", "de": "gruut", "zh-cn": ""})

def test_phonemize(self):

# English espeak
text = "Be a voice, not an! echo?"
gt = "biː ɐ vˈɔɪs, nˈɑːt æn! ˈɛkoʊ?"
output = self.phonemizer.phonemize(text, separator="|", language="en-us")
output = output.replace("|", "")
self.assertEqual(output, gt)

# German gruut
text = "Hallo, das ist ein Deutches Beipiel!"
gt = "haloː, das ɪst aeːn dɔɔʏ̯tçəs bəʔiːpiːl!"
output = self.phonemizer.phonemize(text, separator="|", language="de")
output = output.replace("|", "")
self.assertEqual(output, gt)

def test_phonemizer_initialization(self):
# test with unsupported language
with self.assertRaises(ValueError):
MultiPhonemizer({"tr": "espeak", "xx": ""})

# test with unsupported phonemizer
with self.assertRaises(ValueError):
MultiPhonemizer({"tr": "espeak", "fr": "xx"})

def test_sub_phonemizers(self):
for lang in self.phonemizer.lang_to_phonemizer_name.keys():
self.assertEqual(lang, self.phonemizer.lang_to_phonemizer[lang].language)
self.assertEqual(self.phonemizer.lang_to_phonemizer_name[lang], self.phonemizer.lang_to_phonemizer[lang].name())

def test_name(self):
self.assertEqual(self.phonemizer.name(), "multi-phonemizer")

def test_get_supported_languages(self):
self.assertIsInstance(self.phonemizer.supported_languages(), list)