Bump up to v0.20.0 #3120

Merged (4 commits) · Nov 6, 2023
Changes from all commits
5 changes: 2 additions & 3 deletions .github/workflows/style_check.yml
@@ -42,6 +42,5 @@ jobs:
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
# - name: Lint check
# run: |
# make lint
- name: Style check
run: make style
1 change: 1 addition & 0 deletions .gitignore
@@ -169,3 +169,4 @@ wandb
depot/*
coqui_recipes/*
local_scripts/*
coqui_demos/*
3 changes: 3 additions & 0 deletions README.md
@@ -1,5 +1,8 @@

## 🐸Coqui.ai News
- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
- 📣 ⓍTTS can now stream with <200ms latency.
- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
14 changes: 14 additions & 0 deletions TTS/.models.json
@@ -2,6 +2,20 @@
"tts_models": {
"multilingual": {
"multi-dataset": {
"xtts_v2": {
"description": "XTTS-v2 by Coqui with 16 languages.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
],
"default_vocoder": null,
"commit": "480a6cdf7",
"license": "CPML",
"contact": "info@coqui.ai",
"tos_required": true
},
"xtts_v1": {
"description": "XTTS-v1 by Coqui with 13 languages and cross-language voice cloning.",
"hf_url": [
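The nesting above maps to the dotted model name "tts_models/multilingual/multi-dataset/xtts_v2", which is how the new entry is addressed from the high-level API. A minimal usage sketch (the reference clip and output paths are placeholders, not part of this PR); because the entry sets "tos_required": true, the first download should prompt for the CPML license agreement:

# Sketch: load the new XTTS-v2 entry by model name and synthesize with voice cloning.
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
tts.tts_to_file(
    text="Hello, world.",
    speaker_wav="speaker.wav",   # placeholder reference clip for cloning
    language="en",
    file_path="output.wav",      # placeholder output path
)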
2 changes: 1 addition & 1 deletion TTS/VERSION
@@ -1 +1 @@
0.19.0
0.19.1
6 changes: 3 additions & 3 deletions TTS/api.py
@@ -264,7 +264,7 @@ def tts_coqui_studio(
language: str = None,
emotion: str = None,
speed: float = 1.0,
pipe_out = None,
pipe_out=None,
file_path: str = None,
) -> Union[np.ndarray, str]:
"""Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
@@ -359,7 +359,7 @@ def tts_to_file(
speaker_wav: str = None,
emotion: str = None,
speed: float = 1.0,
pipe_out = None,
pipe_out=None,
file_path: str = "output.wav",
**kwargs,
):
@@ -460,7 +460,7 @@ def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None):
"""
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
# Lazy code... save it to a temp file to resample it while reading it for VC
self.tts_to_file(text=text, speaker=None, language=language, file_path=fp.name,speaker_wav=speaker_wav)
self.tts_to_file(text=text, speaker=None, language=language, file_path=fp.name, speaker_wav=speaker_wav)
if self.voice_converter is None:
self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
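For context on the tts_with_vc hunk above: the method synthesizes speech to a temporary wav, then loads the FreeVC voice-conversion model and converts the result toward the voice in speaker_wav. A minimal sketch, with an illustrative model name and placeholder reference clip:

# Sketch of the tts_with_vc flow touched in this file (model name and paths are illustrative).
from TTS.api import TTS

tts = TTS("tts_models/en/ljspeech/vits")
wav = tts.tts_with_vc(
    text="Synthesize, then convert toward the reference voice.",
    speaker_wav="target_speaker.wav",   # placeholder reference clip
)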
4 changes: 3 additions & 1 deletion TTS/bin/synthesize.py
@@ -427,7 +427,9 @@ def main():
tts_path = model_path
tts_config_path = config_path
if "default_vocoder" in model_item:
args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
args.vocoder_name = (
model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
)

# voice conversion model
if model_item["model_type"] == "voice_conversion_models":
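The reflowed assignment above is the CLI's vocoder fallback: when --vocoder_name is not given, it is filled from the model's default_vocoder field in .models.json (null for xtts_v2, so no separate vocoder is loaded). A standalone illustration of the same pattern, not the actual CLI code:

# Illustration only: how the default vocoder falls back when the flag is not set.
model_item = {"default_vocoder": None}  # e.g. the xtts_v2 entry added in this PR
vocoder_name = None                     # i.e. the user did not pass --vocoder_name

if "default_vocoder" in model_item:
    vocoder_name = model_item["default_vocoder"] if vocoder_name is None else vocoder_name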
34 changes: 33 additions & 1 deletion TTS/tts/configs/xtts_config.py
@@ -59,6 +59,16 @@ class XttsConfig(BaseTTSConfig):

decoder_sampler (str):
Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.

gpt_cond_len (int):
Secs audio to be used as conditioning for the autoregressive model. Defaults to `3`.

max_ref_len (int):
Maximum number of seconds of audio to be used as conditioning for the decoder. Defaults to `10`.

sound_norm_refs (bool):
Whether to normalize the conditioning audio. Defaults to `False`.

Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.

@@ -74,7 +84,24 @@
audio: XttsAudioConfig = field(default_factory=XttsAudioConfig)
model_dir: str = None
languages: List[str] = field(
default_factory=lambda: ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"]
default_factory=lambda: [
"en",
"es",
"fr",
"de",
"it",
"pt",
"pl",
"tr",
"ru",
"nl",
"cs",
"ar",
"zh-cn",
"hu",
"ko",
"ja",
]
)

# inference params
Expand All @@ -88,3 +115,8 @@ class XttsConfig(BaseTTSConfig):
num_gpt_outputs: int = 1
decoder_iterations: int = 30
decoder_sampler: str = "ddim"

# cloning
gpt_cond_len: int = 3
max_ref_len: int = 10
sound_norm_refs: bool = False
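
The three new cloning fields can be passed when building the config; a minimal sketch using the defaults from this diff:

# Sketch: the new XTTS cloning options added in this PR (values are the documented defaults).
from TTS.tts.configs.xtts_config import XttsConfig

config = XttsConfig(
    gpt_cond_len=3,         # seconds of reference audio conditioning the autoregressive model
    max_ref_len=10,         # cap, in seconds, on decoder conditioning audio
    sound_norm_refs=False,  # whether the conditioning audio is normalized
)
print(config.languages)     # the expanded list now ends with "hu", "ko", "ja"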
21 changes: 5 additions & 16 deletions TTS/tts/layers/tortoise/dpm_solver.py
@@ -562,21 +562,15 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type
if order == 3:
K = steps // 3 + 1
if steps % 3 == 0:
orders = [
3,
] * (
orders = [3,] * (
K - 2
) + [2, 1]
elif steps % 3 == 1:
orders = [
3,
] * (
orders = [3,] * (
K - 1
) + [1]
else:
orders = [
3,
] * (
orders = [3,] * (
K - 1
) + [2]
elif order == 2:
@@ -587,9 +581,7 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type
] * K
else:
K = steps // 2 + 1
orders = [
2,
] * (
orders = [2,] * (
K - 1
) + [1]
elif order == 1:
@@ -1448,10 +1440,7 @@ def sample(
model_prev_list[-1] = self.model_fn(x, t)
elif method in ["singlestep", "singlestep_fixed"]:
if method == "singlestep":
(
timesteps_outer,
orders,
) = self.get_orders_and_timesteps_for_singlestep_solver(
(timesteps_outer, orders,) = self.get_orders_and_timesteps_for_singlestep_solver(
steps=steps,
order=order,
skip_type=skip_type,
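The dpm_solver.py hunks only re-collapse list formatting that black had expanded; the order-splitting arithmetic is unchanged. For readers of the diff, a standalone sketch of the order == 3 branch shown above (a rewrite for illustration, not the library function):

# Illustration of the singlestep order split for order == 3, matching the logic above.
def split_orders_order3(steps: int) -> list:
    K = steps // 3 + 1
    if steps % 3 == 0:
        return [3] * (K - 2) + [2, 1]
    elif steps % 3 == 1:
        return [3] * (K - 1) + [1]
    return [3] * (K - 1) + [2]

assert split_orders_order3(12) == [3, 3, 3, 2, 1]  # orders sum back to 12 steps
assert split_orders_order3(10) == [3, 3, 3, 1]     # orders sum back to 10 steps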
65 changes: 52 additions & 13 deletions TTS/tts/layers/xtts/gpt.py
@@ -11,6 +11,7 @@

from TTS.tts.layers.xtts.gpt_inference import GPT2InferenceModel
from TTS.tts.layers.xtts.latent_encoder import ConditioningEncoder
from TTS.tts.layers.xtts.perceiver_encoder import PerceiverResampler


def null_position_embeddings(range, dim):
@@ -105,6 +106,8 @@ def __init__(
checkpointing=False,
average_conditioning_embeddings=False,
label_smoothing=0.0,
use_perceiver_resampler=False,
perceiver_cond_length_compression=256,
):
"""
Args:
@@ -132,13 +135,12 @@
self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=heads)
self.conditioning_dropout = nn.Dropout1d(0.1)
self.average_conditioning_embeddings = average_conditioning_embeddings
self.use_perceiver_resampler = use_perceiver_resampler
self.perceiver_cond_length_compression = perceiver_cond_length_compression

self.text_embedding = nn.Embedding(self.number_text_tokens, model_dim)
self.mel_embedding = nn.Embedding(self.num_audio_tokens, model_dim)

self.prompt_embedding = nn.Embedding(self.num_audio_tokens, model_dim)
self.prompt_pos_embedding = LearnedPositionEmbeddings(24 * 9, model_dim)

(
self.gpt,
self.mel_pos_embedding,
@@ -165,9 +167,29 @@ def __init__(
self.text_head = nn.Linear(model_dim, self.number_text_tokens)
self.mel_head = nn.Linear(model_dim, self.num_audio_tokens)

if self.use_perceiver_resampler:
# XTTS v2
self.conditioning_perceiver = PerceiverResampler(
dim=model_dim,
depth=2,
dim_context=model_dim,
num_latents=32,
dim_head=64,
heads=8,
ff_mult=4,
use_flash_attn=False,
)
else:
# XTTS v1
self.prompt_embedding = nn.Embedding(self.num_audio_tokens, model_dim)
self.prompt_pos_embedding = LearnedPositionEmbeddings(24 * 9, model_dim)

def get_grad_norm_parameter_groups(self):
return {
"conditioning_encoder": list(self.conditioning_encoder.parameters()),
"conditioning_perceiver": list(self.conditioning_perceiver.parameters())
if self.use_perceiver_resampler
else None,
"gpt": list(self.gpt.parameters()),
"heads": list(self.text_head.parameters()) + list(self.mel_head.parameters()),
}
@@ -250,11 +272,8 @@ def get_logits(
if attn_mask_text is not None:
attn_mask = torch.cat([attn_mask_text, attn_mask_mel], dim=1)
if prompt is not None:
if attn_mask_cond is not None:
attn_mask = torch.cat([attn_mask_cond, attn_mask], dim=1)
else:
attn_mask_cond = torch.ones(prompt.shape[0], offset, dtype=torch.bool, device=emb.device)
attn_mask = torch.cat([attn_mask_cond, attn_mask], dim=1)
attn_mask_cond = torch.ones(prompt.shape[0], offset, dtype=torch.bool, device=emb.device)
attn_mask = torch.cat([attn_mask_cond, attn_mask], dim=1)

gpt_out = self.gpt(
inputs_embeds=emb,
@@ -318,7 +337,6 @@ def get_prompts(self, prompt_codes):
prompt_len = 3
prompt_len = prompt_len * 24 # in frames
if prompt_codes.shape[-1] >= prompt_len:
new_prompt = []
for i in range(prompt_codes.shape[0]):
if lengths[i] < prompt_len:
start = 0
@@ -340,7 +358,9 @@ def get_style_emb(self, cond_input, return_latent=False):
if not return_latent:
if cond_input.ndim == 4:
cond_input = cond_input.squeeze(1)
conds = self.conditioning_encoder(cond_input)
conds = self.conditioning_encoder(cond_input) # (b, d, s)
if self.use_perceiver_resampler:
conds = self.conditioning_perceiver(conds.permute(0, 2, 1)).transpose(1, 2) # (b, d, 32)
else:
# already computed
conds = cond_input.unsqueeze(1)
@@ -354,6 +374,7 @@ def forward(
wav_lengths,
cond_mels=None,
cond_idxs=None,
cond_lens=None,
cond_latents=None,
return_attentions=False,
return_latent=False,
@@ -379,10 +400,24 @@
max_text_len = text_lengths.max()
code_lengths = torch.ceil(wav_lengths / self.code_stride_len).long() + 3

if cond_lens is not None:
if self.use_perceiver_resampler:
cond_lens = cond_lens // self.perceiver_cond_length_compression
else:
cond_lens = cond_lens // self.code_stride_len

if cond_idxs is not None:
# recompute cond idxs for mel lengths
for idx, l in enumerate(code_lengths):
cond_idxs[idx] = cond_idxs[idx] / self.code_stride_len
for idx in range(cond_idxs.size(0)):
if self.use_perceiver_resampler:
cond_idxs[idx] = cond_idxs[idx] // self.perceiver_cond_length_compression
else:
cond_idxs[idx] = cond_idxs[idx] // self.code_stride_len

# ensure that the cond_mel does not have padding
# if cond_lens is not None and cond_idxs is None:
# min_cond_len = torch.min(cond_lens)
# cond_mels = cond_mels[:, :, :, :min_cond_len]

# If len(codes) + 3 is larger than maximum allowed length, we truncate the codes.
max_mel_len = code_lengths.max()
@@ -450,9 +485,13 @@ def forward(
)

if cond_idxs is not None:
# use masking approach
for idx, r in enumerate(cond_idxs):
l = r[1] - r[0]
attn_mask_cond[idx, l:] = 0.0
elif cond_lens is not None:
for idx, l in enumerate(cond_lens):
attn_mask_cond[idx, l:] = 0.0

for idx, l in enumerate(text_lengths):
attn_mask_text[idx, l + 1 :] = 0.0
@@ -523,7 +562,7 @@ def forward(

def inference(self, cond_latents, text_inputs, **hf_generate_kwargs):
self.compute_embeddings(cond_latents, text_inputs)
return self.generate(cond_latents, text_inputs, input_tokens=None, **hf_generate_kwargs)
return self.generate(cond_latents, text_inputs, **hf_generate_kwargs)

def compute_embeddings(
self,
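The main functional change in gpt.py is the optional PerceiverResampler in front of the GPT: with use_perceiver_resampler=True (XTTS v2), the conditioning-encoder output of shape (batch, dim, time) is compressed to 32 latent frames, replacing the v1 prompt embeddings. A shape-level sketch using the constructor arguments from the diff (model_dim and the input length are illustrative):

# Shape sketch of the new XTTS v2 conditioning path; not a training or inference setup.
import torch

from TTS.tts.layers.xtts.perceiver_encoder import PerceiverResampler

model_dim = 1024  # illustrative; the real value comes from the GPT config
perceiver = PerceiverResampler(
    dim=model_dim, depth=2, dim_context=model_dim,
    num_latents=32, dim_head=64, heads=8, ff_mult=4, use_flash_attn=False,
)
conds = torch.randn(2, model_dim, 250)                     # (b, d, s) from the conditioning encoder
conds = perceiver(conds.permute(0, 2, 1)).transpose(1, 2)  # -> (b, d, 32), as in get_style_emb
print(conds.shape)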