In [None]:
import os
import re
from TTS.tts.configs.shared_configs import CharactersConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.utils.text.tokenizer import TTSTokenizer
import pyopenjtalk

## PyOpenJTalk_Phonemizer

- `TTSTokenizer.text_to_ids()` の処理

In [None]:
text = "これは、音声合成のテストです。"
# g2p(kana=False) output is split on single spaces so that every
# space-separated symbol becomes its own list element.
ph = pyopenjtalk.g2p(text, kana=False).split(" ")
print(ph)

In [None]:
# Symbol inventory handed to CharactersConfig. Every list entry is a single
# token, including the multi-letter ones such as "pau", "cl" or "ky".
phoneme_symbols = [
    "pau",
    "I", "N", "U",
    "a", "b", "by", "ch", "cl", "d", "dy", "e", "f", "g", "gy",
    "h", "hy", "i", "j", "k", "ky", "m", "my", "n", "ny", "o",
    "p", "py", "r", "ry", "s", "sh", "t", "ts", "u", "v", "w", "y", "z",
]

characters = CharactersConfig(
    characters_class="TTS.tts.models.vits.VitsCharacters",
    pad="<PAD>",
    characters=phoneme_symbols,
    punctuations=".?!",
)

In [None]:
config = VitsConfig(
    run_name="vits_jsut",
    # Symbol table built in the CharactersConfig cell.
    characters=characters,
    add_blank=True,
    # Text front-end: cleaner plus phonemizer selection.
    text_cleaner="japanese_cleaners",
    use_phonemes=True,
    phonemizer="pyopenjtalk",
    phoneme_language="ja-jp",
)

In [None]:
# Build the tokenizer from the VITS config. init_from_config returns a pair;
# the second element is bound back to `config`, replacing the object created above.
tokenizer, config = TTSTokenizer.init_from_config(config)

In [None]:
# Sample sentence that the following cells pass through the tokenizer one step at a time.
text = "これは、音声合成のテストです。"

In [None]:
# Step 1: run the tokenizer's phonemizer on the sentence.
# NOTE(review): `text` is rebound to the phonemizer output, so re-running this
# cell without first re-running the cell that sets `text` feeds the already
# phonemized value back into the phonemizer.
text = tokenizer.phonemizer.phonemize(text, separator="")
print(text)

In [None]:
# Step 2: apply intersperse_blank_char to the phonemized sequence.
# NOTE(review): presumably this inserts the blank symbol between tokens (the
# config sets add_blank=True) and the second argument selects the blank
# character — confirm against TTSTokenizer.
text = tokenizer.intersperse_blank_char(text, True)
print(text)

In [None]:
# Step 3: encode the symbol sequence into token ids and show them.
token_ids = tokenizer.encode(text)
print(token_ids)

In [None]:
# Display the blank_id attribute of the tokenizer's character set as the cell output.
tokenizer.characters.blank_id

In [None]:
# Display the pad_id attribute of the tokenizer's character set as the cell output.
tokenizer.characters.pad_id

In [None]:
# Inspect the private _char_to_id attribute (presumably the symbol -> id
# mapping used by encode — confirm). Private API: may change between TTS versions.
tokenizer.characters._char_to_id

## PyOpenJTalk_Kana_Phonemizer

In [None]:
text = "これは、音声合成のテストです。"
# Same g2p call as the phoneme example, but with kana=True.
kanas = pyopenjtalk.g2p(text, kana=True)
# list() splits the result into its individual elements (single characters,
# assuming g2p returns a str here) so each one is visible separately.
print(list(kanas))

## PyOpenJTalk_Accent_Phonemizer

In [None]:
text = "これは、音声合成のテストです。"

# Capture groups, per full-context label:
#   1: the text between a "-" and the following "+"
#   2: the (possibly negative) number right after "/A:"
#   3: the number after the first "_" that follows "/F:"
accent_pattern = re.compile(r"\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9]+)")

phones = []
for label in pyopenjtalk.extract_fullcontext(text):
    hits = accent_pattern.findall(label)
    if len(hits) != 1:
        # Labels that do not match exactly once contribute nothing.
        continue
    phoneme, a_value, f_value = hits[0]
    # Emitted order is phoneme, /F: value, /A: value.
    phones.extend((phoneme, f_value, a_value))
print(phones)

## PyOpenJTalk_Accent_with_Pause_Phonemizer

- PyOpenJTalk_Accent_Phonemizerにpau音素が加わっただけ

In [None]:
text = "これは、音声合成のテストです。"

# Capture groups, per full-context label:
#   1: the text between a "-" and the following "+"
#   2: the (possibly negative) number right after "/A:"
#   3: the number after the first "_" that follows "/F:"
accent_pattern = re.compile(r"\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9]+)")

phones = []
for label in pyopenjtalk.extract_fullcontext(text):
    # Current phoneme: what sits between the first "-" and the next "+".
    current_phoneme = label.split("-")[1].split("+")[0]
    if current_phoneme == "pau":
        phones.append("pau")
        continue

    hits = accent_pattern.findall(label)
    if len(hits) != 1:
        # Labels that do not match exactly once contribute nothing.
        continue
    phoneme, a_value, f_value = hits[0]
    # Emitted order is phoneme, /F: value, /A: value.
    phones.extend((phoneme, f_value, a_value))
print(phones)

## PyOpenJTalk_Prosody_Phonemizer

```
Extract phoneme + prosody symbol sequence from input full-context labels.
The algorithm is based on `Prosodic features control by symbols as input of
sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.

Args:
    text (str): Input text.
    drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.
Returns:
    List[str]: List of phoneme + prosody symbols.
Examples:
    >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
    >>> pyopenjtalk_g2p_prosody("こんにちは。")
    ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
.. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
    modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
```

In [None]:
# Sample sentence ending in a question mark; presumably chosen so that the
# final label takes the question-form branch (e3 == 1) below — confirm with
# the pyopenjtalk output.
text = "これは、音声合成のテストですか？"

def _numeric_feature_by_regex(regex, s):
    match = re.search(regex, s)
    if match is None:
        return -50
    return int(match.group(1))


# When True, unvoiced vowels (upper-case A/E/I/O/U) are emitted as their
# lower-case counterparts.
drop_unvoiced_vowels = True
labels = pyopenjtalk.extract_fullcontext(text)
N = len(labels)

# Phoneme classes are checked by exact set membership rather than by a
# substring test on a string, so only whole symbols match: "cl" is accepted,
# while "", "c", "l" or "ou" are not.
UNVOICED_VOWELS = frozenset("AEIOU")
ACCENT_BORDER_PHONEMES = frozenset("aeiouAEIOU") | {"N", "cl"}

phones = []
for n, lab_curr in enumerate(labels):
    # current phoneme: the text between the first "-" and the next "+"
    p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)

    # deal unvoiced vowels as normal vowels
    if drop_unvoiced_vowels and p3 in UNVOICED_VOWELS:
        p3 = p3.lower()

    # deal with sil at the beginning and the end of text
    if p3 == "sil":
        assert n == 0 or n == N - 1
        if n == 0:
            phones.append("^")
        elif n == N - 1:
            # check question form or not
            e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
            if e3 == 0:
                phones.append("$")
            elif e3 == 1:
                phones.append("?")
        continue
    elif p3 == "pau":
        phones.append("_")
        continue
    else:
        phones.append(p3)

    # accent type and position info (forward or backward)
    a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
    a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
    a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)

    # number of mora in accent phrase
    f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)

    # Look ahead one label. If the sequence ends on a regular phoneme there is
    # no next label; fall back to -50, the value _numeric_feature_by_regex
    # returns for "not found", so none of the branches below fire.
    if n + 1 < N:
        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
    else:
        a2_next = -50

    # accent phrase border
    if a3 == 1 and a2_next == 1 and p3 in ACCENT_BORDER_PHONEMES:
        phones.append("#")
    # pitch falling
    elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
        phones.append("]")
    # pitch rising
    elif a2 == 1 and a2_next == 2:
        phones.append("[")

print(phones)