coqui-ai · erogol · Nov 21, 2023 · Nov 16, 2023 · Nov 16, 2023 · Nov 16, 2023
diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml
@@ -10,7 +10,7 @@ jobs:
   build-sdist:
     runs-on: ubuntu-20.04
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Verify tag matches version
         run: |
           set -ex
@@ -38,7 +38,7 @@ jobs:
       matrix:
         python-version: ["3.9", "3.10", "3.11"]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - uses: actions/setup-python@v2
         with:
           python-version: ${{ matrix.python-version }}

diff --git a/TTS/.models.json b/TTS/.models.json
@@ -3,14 +3,14 @@
         "multilingual": {
             "multi-dataset": {
                 "xtts_v2": {
-                    "description": "XTTS-v2 by Coqui with 16 languages.",
+                    "description": "XTTS-v2.0.2 by Coqui with 16 languages.",
                     "hf_url": [
                         "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
                         "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
                         "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
                         "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
                     ],
-                    "model_hash": "6a09d1ad43896f06041ed8195956c9698f13b6189dc80f1c74bdc2b8e8d15324",
+                    "model_hash": "5ce0502bfe3bc88dc8d9312b12a7558c",
                     "default_vocoder": null,
                     "commit": "480a6cdf7",
                     "license": "CPML",

diff --git a/TTS/VERSION b/TTS/VERSION
@@ -1 +1 @@
-0.20.5
+0.20.6
diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
@@ -15,6 +15,7 @@
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.numpy_transforms import quantize
 from TTS.utils.generic_utils import count_parameters
 
 use_cuda = torch.cuda.is_available()
@@ -159,7 +160,7 @@ def inference(
 
 
 def extract_spectrograms(
-    data_loader, model, ap, output_path, quantized_wav=False, save_audio=False, debug=False, metada_name="metada.txt"
+    data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
 ):
     model.eval()
     export_metadata = []
@@ -196,8 +197,8 @@ def extract_spectrograms(
             _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)
 
             # quantize and save wav
-            if quantized_wav:
-                wavq = ap.quantize(wav)
+            if quantize_bits > 0:
+                wavq = quantize(wav, quantize_bits)
                 np.save(wavq_path, wavq)
 
             # save TTS mel
@@ -263,7 +264,7 @@ def main(args):  # pylint: disable=redefined-outer-name
         model,
         ap,
         args.output_path,
-        quantized_wav=args.quantized,
+        quantize_bits=args.quantize_bits,
         save_audio=args.save_audio,
         debug=args.debug,
         metada_name="metada.txt",
@@ -277,7 +278,7 @@ def main(args):  # pylint: disable=redefined-outer-name
     parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
     parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
     parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
-    parser.add_argument("--quantized", action="store_true", help="Save quantized audio files")
+    parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
     parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
     args = parser.parse_args()
 

diff --git a/TTS/tts/layers/tortoise/diffusion.py b/TTS/tts/layers/tortoise/diffusion.py
@@ -13,12 +13,18 @@
 import numpy as np
 import torch
 import torch as th
-from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
 from tqdm import tqdm
 
 from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper
 
-K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
+try:
+    from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
+
+    K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
+except ImportError:
+    K_DIFFUSION_SAMPLERS = None
+
+
 SAMPLERS = ["dpm++2m", "p", "ddim"]
 
 
@@ -531,6 +537,8 @@ def sample_loop(self, *args, **kwargs):
             if self.conditioning_free is not True:
                 raise RuntimeError("cond_free must be true")
             with tqdm(total=self.num_timesteps) as pbar:
+                if K_DIFFUSION_SAMPLERS is None:
+                    raise ModuleNotFoundError("Install k_diffusion for using k_diffusion samplers")
                 return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs)
         else:
             raise RuntimeError("sampler not impl")

diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py
@@ -441,7 +441,9 @@ def forward(
         audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token)
 
         # Pad mel codes with stop_audio_token
-        audio_codes = self.set_mel_padding(audio_codes, code_lengths - 3) # -3 to get the real code lengths without consider start and stop tokens that was not added yet
+        audio_codes = self.set_mel_padding(
+            audio_codes, code_lengths - 3
+        )  # -3 to get the real code lengths without consider start and stop tokens that was not added yet
 
         # Build input and target tensors
         # Prepend start token to inputs and append stop token to targets

diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py
@@ -1,17 +1,73 @@
-import json
 import os
 import re
+import textwrap
 from functools import cached_property
 
 import pypinyin
 import torch
 from hangul_romanize import Transliter
 from hangul_romanize.rule import academic
 from num2words import num2words
+from spacy.lang.ar import Arabic
+from spacy.lang.en import English
+from spacy.lang.es import Spanish
+from spacy.lang.ja import Japanese
+from spacy.lang.zh import Chinese
 from tokenizers import Tokenizer
 
 from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
 
+
+def get_spacy_lang(lang):
+    if lang == "zh":
+        return Chinese()
+    elif lang == "ja":
+        return Japanese()
+    elif lang == "ar":
+        return Arabic()
+    elif lang == "es":
+        return Spanish()
+    else:
+        # For most languages, Enlish does the job
+        return English()
+
+
+def split_sentence(text, lang, text_split_length=250):
+    """Preprocess the input text"""
+    text_splits = []
+    if text_split_length is not None and len(text) >= text_split_length:
+        text_splits.append("")
+        nlp = get_spacy_lang(lang)
+        nlp.add_pipe("sentencizer")
+        doc = nlp(text)
+        for sentence in doc.sents:
+            if len(text_splits[-1]) + len(str(sentence)) <= text_split_length:
+                # if the last sentence + the current sentence is less than the text_split_length
+                # then add the current sentence to the last sentence
+                text_splits[-1] += " " + str(sentence)
+                text_splits[-1] = text_splits[-1].lstrip()
+            elif len(str(sentence)) > text_split_length:
+                # if the current sentence is greater than the text_split_length
+                for line in textwrap.wrap(
+                    str(sentence),
+                    width=text_split_length,
+                    drop_whitespace=True,
+                    break_on_hyphens=False,
+                    tabsize=1,
+                ):
+                    text_splits.append(str(line))
+            else:
+                text_splits.append(str(sentence))
+
+        if len(text_splits) > 1:
+            if text_splits[0] == "":
+                del text_splits[0]
+    else:
+        text_splits = [text.lstrip()]
+
+    return text_splits
+
+
 _whitespace_re = re.compile(r"\s+")
 
 # List of (regular expression, replacement) pairs for abbreviations:
@@ -115,7 +171,7 @@
             # There are not many common abbreviations in Arabic as in English.
         ]
     ],
-    "zh-cn": [
+    "zh": [
         (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
         for x in [
             # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
@@ -280,7 +336,7 @@ def expand_abbreviations_multilingual(text, lang="en"):
             ("°", " درجة "),
         ]
     ],
-    "zh-cn": [
+    "zh": [
         # Chinese
         (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
         for x in [
@@ -464,7 +520,7 @@ def _expand_number(m, lang="en"):
 
 
 def expand_numbers_multilingual(text, lang="en"):
-    if lang == "zh" or lang == "zh-cn":
+    if lang == "zh":
         text = zh_num2words()(text)
     else:
         if lang in ["en", "ru"]:
@@ -525,7 +581,7 @@ def japanese_cleaners(text, katsu):
     return text
 
 
-def korean_cleaners(text):
+def korean_transliterate(text):
     r = Transliter(academic)
     return r.translit(text)
 
@@ -546,7 +602,7 @@ def __init__(self, vocab_file=None):
             "it": 213,
             "pt": 203,
             "pl": 224,
-            "zh-cn": 82,
+            "zh": 82,
             "ar": 166,
             "cs": 186,
             "ru": 182,
@@ -564,28 +620,31 @@ def katsu(self):
         return cutlet.Cutlet()
 
     def check_input_length(self, txt, lang):
+        lang = lang.split("-")[0]  # remove the region
         limit = self.char_limits.get(lang, 250)
         if len(txt) > limit:
             print(
                 f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio."
             )
 
     def preprocess_text(self, txt, lang):
-        if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh-cn", "zh-cn"}:
+        if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
             txt = multilingual_cleaners(txt, lang)
-            if lang in {"zh", "zh-cn"}:
+            if lang == "zh":
                 txt = chinese_transliterate(txt)
+            if lang == "ko":
+                txt = korean_transliterate(txt)
         elif lang == "ja":
             txt = japanese_cleaners(txt, self.katsu)
-        elif lang == "ko":
-            txt = korean_cleaners(txt)
         else:
             raise NotImplementedError(f"Language '{lang}' is not supported.")
         return txt
 
     def encode(self, txt, lang):
+        lang = lang.split("-")[0]  # remove the region
         self.check_input_length(txt, lang)
         txt = self.preprocess_text(txt, lang)
+        lang = "zh-cn" if lang == "zh" else lang
         txt = f"[{lang}]{txt}"
         txt = txt.replace(" ", "[SPACE]")
         return self.tokenizer.encode(txt).ids
@@ -682,8 +741,8 @@ def test_expand_numbers_multilingual():
         ("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer.", "nl"),
         ("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer.", "nl"),
         # Chinese (Simplified)
-        ("在12.5秒内", "在十二点五秒内", "zh-cn"),
-        ("有50名士兵", "有五十名士兵", "zh-cn"),
+        ("在12.5秒内", "在十二点五秒内", "zh"),
+        ("有50名士兵", "有五十名士兵", "zh"),
         # ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work
         # ("那将是20€先生", '那将是二十欧元先生', 'zh'),
         # Turkish
@@ -764,7 +823,7 @@ def test_symbols_multilingual():
         ("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"),
         ("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"),
         ("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"),
-        ("我的电量为 14%", "我的电量为 14 百分之", "zh-cn"),
+        ("我的电量为 14%", "我的电量为 14 百分之", "zh"),
         ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
         ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"),
         ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"),

diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py
@@ -318,9 +318,10 @@ def eval_step(self, batch, criterion):
         batch["cond_idxs"] = None
         return self.train_step(batch, criterion)
 
-    def on_epoch_start(self, trainer):  # pylint: disable=W0613
-        # guarante that dvae will be in eval mode after .train() on evaluation end
-        self.dvae = self.dvae.eval()
+    def on_train_epoch_start(self, trainer):
+        trainer.model.eval() # the whole model to eval
+        # put gpt model in training mode
+        trainer.model.xtts.gpt.train()
 
     def on_init_end(self, trainer):  # pylint: disable=W0613
         # ignore similarities.pth on clearml save/upload