Bump up to v0.20.0 #3120

Merged (4 commits) · Nov 6, 2023
Changes from all commits
5 changes: 2 additions & 3 deletions .github/workflows/style_check.yml
@@ -42,6 +42,5 @@ jobs:
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
# - name: Lint check
# run: |
# make lint
- name: Style check
run: make style
1 change: 1 addition & 0 deletions .gitignore
@@ -169,3 +169,4 @@ wandb
depot/*
coqui_recipes/*
local_scripts/*
coqui_demos/*
3 changes: 3 additions & 0 deletions README.md
@@ -1,5 +1,8 @@

## 🐸Coqui.ai News
- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
- 📣 ⓍTTS can now stream with <200ms latency.
- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
14 changes: 14 additions & 0 deletions TTS/.models.json
@@ -2,6 +2,20 @@
"tts_models": {
"multilingual": {
"multi-dataset": {
"xtts_v2": {
"description": "XTTS-v2 by Coqui with 16 languages.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
],
"default_vocoder": null,
"commit": "480a6cdf7",
"license": "CPML",
"contact": "info@coqui.ai",
"tos_required": true
},
"xtts_v1": {
"description": "XTTS-v1 by Coqui with 13 languages and cross-language voice cloning.",
"hf_url": [
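The nesting above maps to the dotted model name "tts_models/multilingual/multi-dataset/xtts_v2", which is how the new entry is addressed from the high-level API. A minimal usage sketch (the reference clip and output paths are placeholders, not part of this PR); because the entry sets "tos_required": true, the first download should prompt for the CPML license agreement:

# Sketch: load the new XTTS-v2 entry by model name and synthesize with voice cloning.
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
tts.tts_to_file(
    text="Hello, world.",
    speaker_wav="speaker.wav",   # placeholder reference clip for cloning
    language="en",
    file_path="output.wav",      # placeholder output path
)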
2 changes: 1 addition & 1 deletion TTS/VERSION
@@ -1 +1 @@
0.19.0
0.19.1
6 changes: 3 additions & 3 deletions TTS/api.py
@@ -264,7 +264,7 @@ def tts_coqui_studio(
language: str = None,
emotion: str = None,
speed: float = 1.0,
pipe_out = None,
pipe_out=None,
file_path: str = None,
) -> Union[np.ndarray, str]:
"""Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
@@ -359,7 +359,7 @@ def tts_to_file(
speaker_wav: str = None,
emotion: str = None,
speed: float = 1.0,
pipe_out = None,
pipe_out=None,
file_path: str = "output.wav",
**kwargs,
):
@@ -460,7 +460,7 @@ def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None):
"""
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
# Lazy code... save it to a temp file to resample it while reading it for VC
self.tts_to_file(text=text, speaker=None, language=language, file_path=fp.name,speaker_wav=speaker_wav)
self.tts_to_file(text=text, speaker=None, language=language, file_path=fp.name, speaker_wav=speaker_wav)
if self.voice_converter is None:
self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
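For context on the tts_with_vc hunk above: the method synthesizes speech to a temporary wav, then loads the FreeVC voice-conversion model and converts the result toward the voice in speaker_wav. A minimal sketch, with an illustrative model name and placeholder reference clip:

# Sketch of the tts_with_vc flow touched in this file (model name and paths are illustrative).
from TTS.api import TTS

tts = TTS("tts_models/en/ljspeech/vits")
wav = tts.tts_with_vc(
    text="Synthesize, then convert toward the reference voice.",
    speaker_wav="target_speaker.wav",   # placeholder reference clip
)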
4 changes: 3 additions & 1 deletion TTS/bin/synthesize.py
@@ -427,7 +427,9 @@ def main():
tts_path = model_path
tts_config_path = config_path
if "default_vocoder" in model_item:
args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
args.vocoder_name = (
model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
)

# voice conversion model
if model_item["model_type"] == "voice_conversion_models":
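The reflowed assignment above is the CLI's vocoder fallback: when --vocoder_name is not given, it is filled from the model's default_vocoder field in .models.json (null for xtts_v2, so no separate vocoder is loaded). A standalone illustration of the same pattern, not the actual CLI code:

# Illustration only: how the default vocoder falls back when the flag is not set.
model_item = {"default_vocoder": None}  # e.g. the xtts_v2 entry added in this PR
vocoder_name = None                     # i.e. the user did not pass --vocoder_name

if "default_vocoder" in model_item:
    vocoder_name = model_item["default_vocoder"] if vocoder_name is None else vocoder_name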
34 changes: 33 additions & 1 deletion TTS/tts/configs/xtts_config.py
@@ -59,6 +59,16 @@ class XttsConfig(BaseTTSConfig):

decoder_sampler (str):
Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.

gpt_cond_len (int):
Secs audio to be used as conditioning for the autoregressive model. Defaults to `3`.

max_ref_len (int):
Maximum number of seconds of audio to be used as conditioning for the decoder. Defaults to `10`.

sound_norm_refs (bool):
Whether to normalize the conditioning audio. Defaults to `False`.

Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.

@@ -74,7 +84,24 @@
audio: XttsAudioConfig = field(default_factory=XttsAudioConfig)
model_dir: str = None
languages: List[str] = field(
default_factory=lambda: ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"]
default_factory=lambda: [
"en",
"es",
"fr",
"de",
"it",
"pt",
"pl",
"tr",
"ru",
"nl",
"cs",
"ar",
"zh-cn",
"hu",
"ko",
"ja",
]
)

# inference params
Expand All @@ -88,3 +115,8 @@ class XttsConfig(BaseTTSConfig):
num_gpt_outputs: int = 1
decoder_iterations: int = 30
decoder_sampler: str = "ddim"

# cloning
gpt_cond_len: int = 3
max_ref_len: int = 10
sound_norm_refs: bool = False
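
The three new cloning fields can be passed when building the config; a minimal sketch using the defaults from this diff:

# Sketch: the new XTTS cloning options added in this PR (values are the documented defaults).
from TTS.tts.configs.xtts_config import XttsConfig

config = XttsConfig(
    gpt_cond_len=3,         # seconds of reference audio conditioning the autoregressive model
    max_ref_len=10,         # cap, in seconds, on decoder conditioning audio
    sound_norm_refs=False,  # whether the conditioning audio is normalized
)
print(config.languages)     # the expanded list now ends with "hu", "ko", "ja"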
21 changes: 5 additions & 16 deletions TTS/tts/layers/tortoise/dpm_solver.py
@@ -562,21 +562,15 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type
if order == 3:
K = steps // 3 + 1
if steps % 3 == 0:
orders = [
3,
] * (
orders = [3,] * (
K - 2
) + [2, 1]
elif steps % 3 == 1:
orders = [
3,
] * (
orders = [3,] * (
K - 1
) + [1]
else:
orders = [
3,
] * (
orders = [3,] * (
K - 1
) + [2]
elif order == 2:
@@ -587,9 +581,7 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type
] * K
else:
K = steps // 2 + 1
orders = [
2,
] * (
orders = [2,] * (
K - 1
) + [1]
elif order == 1:
@@ -1448,10 +1440,7 @@ def sample(
model_prev_list[-1] = self.model_fn(x, t)
elif method in ["singlestep", "singlestep_fixed"]:
if method == "singlestep":
(
timesteps_outer,
orders,
) = self.get_orders_and_timesteps_for_singlestep_solver(
(timesteps_outer, orders,) = self.get_orders_and_timesteps_for_singlestep_solver(
steps=steps,
order=order,
skip_type=skip_type,
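The dpm_solver.py hunks only re-collapse list formatting that black had expanded; the order-splitting arithmetic is unchanged. For readers of the diff, a standalone sketch of the order == 3 branch shown above (a rewrite for illustration, not the library function):

# Illustration of the singlestep order split for order == 3, matching the logic above.
def split_orders_order3(steps: int) -> list:
    K = steps // 3 + 1
    if steps % 3 == 0:
        return [3] * (K - 2) + [2, 1]
    elif steps % 3 == 1:
        return [3] * (K - 1) + [1]
    return [3] * (K - 1) + [2]

assert split_orders_order3(12) == [3, 3, 3, 2, 1]  # orders sum back to 12 steps
assert split_orders_order3(10) == [3, 3, 3, 1]     # orders sum back to 10 steps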
65 changes: 52 additions & 13 deletions TTS/tts/layers/xtts/gpt.py
@@ -11,6 +11,7 @@

from TTS.tts.layers.xtts.gpt_inference import GPT2InferenceModel
from TTS.tts.layers.xtts.latent_encoder import ConditioningEncoder
from TTS.tts.layers.xtts.perceiver_encoder import PerceiverResampler


def null_position_embeddings(range, dim):
@@ -105,6 +106,8 @@ def __init__(
checkpointing=False,
average_conditioning_embeddings=False,
label_smoothing=0.0,
use_perceiver_resampler=False,
perceiver_cond_length_compression=256,
):
"""
Args:
@@ -132,13 +135,12 @@
self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=heads)
self.conditioning_dropout = nn.Dropout1d(0.1)
self.average_conditioning_embeddings = average_conditioning_embeddings
self.use_perceiver_resampler = use_perceiver_resampler
self.perceiver_cond_length_compression = perceiver_cond_length_compression

self.text_embedding = nn.Embedding(self.number_text_tokens, model_dim)
self.mel_embedding = nn.Embedding(self.num_audio_tokens, model_dim)

self.prompt_embedding = nn.Embedding(self.num_audio_tokens, model_dim)
self.prompt_pos_embedding = LearnedPositionEmbeddings(24 * 9, model_dim)

(
self.gpt,
self.mel_pos_embedding,
@@ -165,9 +167,29 @@ def __init__(
self.text_head = nn.Linear(model_dim, self.number_text_tokens)
self.mel_head = nn.Linear(model_dim, self.num_audio_tokens)

if self.use_perceiver_resampler:
# XTTS v2
self.conditioning_perceiver = PerceiverResampler(
dim=model_dim,
depth=2,
dim_context=model_dim,
num_latents=32,
dim_head=64,
heads=8,
ff_mult=4,
use_flash_attn=False,
)
else:
# XTTS v1
self.prompt_embedding = nn.Embedding(self.num_audio_tokens, model_dim)
self.prompt_pos_embedding = LearnedPositionEmbeddings(24 * 9, model_dim)

def get_grad_norm_parameter_groups(self):
return {
"conditioning_encoder": list(self.conditioning_encoder.parameters()),
"conditioning_perceiver": list(self.conditioning_perceiver.parameters())
if self.use_perceiver_resampler
else None,
"gpt": list(self.gpt.parameters()),
"heads": list(self.text_head.parameters()) + list(self.mel_head.parameters()),
}
@@ -250,11 +272,8 @@ def get_logits(
if attn_mask_text is not None:
attn_mask = torch.cat([attn_mask_text, attn_mask_mel], dim=1)
if prompt is not None:
if attn_mask_cond is not None:
attn_mask = torch.cat([attn_mask_cond, attn_mask], dim=1)
else:
attn_mask_cond = torch.ones(prompt.shape[0], offset, dtype=torch.bool, device=emb.device)
attn_mask = torch.cat([attn_mask_cond, attn_mask], dim=1)
attn_mask_cond = torch.ones(prompt.shape[0], offset, dtype=torch.bool, device=emb.device)
attn_mask = torch.cat([attn_mask_cond, attn_mask], dim=1)

gpt_out = self.gpt(
inputs_embeds=emb,
@@ -318,7 +337,6 @@ def get_prompts(self, prompt_codes):
prompt_len = 3
prompt_len = prompt_len * 24 # in frames
if prompt_codes.shape[-1] >= prompt_len:
new_prompt = []
for i in range(prompt_codes.shape[0]):
if lengths[i] < prompt_len:
start = 0
@@ -340,7 +358,9 @@ def get_style_emb(self, cond_input, return_latent=False):
if not return_latent:
if cond_input.ndim == 4:
cond_input = cond_input.squeeze(1)
conds = self.conditioning_encoder(cond_input)
conds = self.conditioning_encoder(cond_input) # (b, d, s)
if self.use_perceiver_resampler:
conds = self.conditioning_perceiver(conds.permute(0, 2, 1)).transpose(1, 2) # (b, d, 32)
else:
# already computed
conds = cond_input.unsqueeze(1)
@@ -354,6 +374,7 @@ def forward(
wav_lengths,
cond_mels=None,
cond_idxs=None,
cond_lens=None,
cond_latents=None,
return_attentions=False,
return_latent=False,
@@ -379,10 +400,24 @@
max_text_len = text_lengths.max()
code_lengths = torch.ceil(wav_lengths / self.code_stride_len).long() + 3

if cond_lens is not None:
if self.use_perceiver_resampler:
cond_lens = cond_lens // self.perceiver_cond_length_compression
else:
cond_lens = cond_lens // self.code_stride_len

if cond_idxs is not None:
# recompute cond idxs for mel lengths
for idx, l in enumerate(code_lengths):
cond_idxs[idx] = cond_idxs[idx] / self.code_stride_len
for idx in range(cond_idxs.size(0)):
if self.use_perceiver_resampler:
cond_idxs[idx] = cond_idxs[idx] // self.perceiver_cond_length_compression
else:
cond_idxs[idx] = cond_idxs[idx] // self.code_stride_len

# ensure that the cond_mel does not have padding
# if cond_lens is not None and cond_idxs is None:
# min_cond_len = torch.min(cond_lens)
# cond_mels = cond_mels[:, :, :, :min_cond_len]

# If len(codes) + 3 is larger than maximum allowed length, we truncate the codes.
max_mel_len = code_lengths.max()
@@ -450,9 +485,13 @@ def forward(
)

if cond_idxs is not None:
# use masking approach
for idx, r in enumerate(cond_idxs):
l = r[1] - r[0]
attn_mask_cond[idx, l:] = 0.0
elif cond_lens is not None:
for idx, l in enumerate(cond_lens):
attn_mask_cond[idx, l:] = 0.0

for idx, l in enumerate(text_lengths):
attn_mask_text[idx, l + 1 :] = 0.0
@@ -523,7 +562,7 @@ def forward(

def inference(self, cond_latents, text_inputs, **hf_generate_kwargs):
self.compute_embeddings(cond_latents, text_inputs)
return self.generate(cond_latents, text_inputs, input_tokens=None, **hf_generate_kwargs)
return self.generate(cond_latents, text_inputs, **hf_generate_kwargs)

def compute_embeddings(
self,
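The main functional change in gpt.py is the optional PerceiverResampler in front of the GPT: with use_perceiver_resampler=True (XTTS v2), the conditioning-encoder output of shape (batch, dim, time) is compressed to 32 latent frames, replacing the v1 prompt embeddings. A shape-level sketch using the constructor arguments from the diff (model_dim and the input length are illustrative):

# Shape sketch of the new XTTS v2 conditioning path; not a training or inference setup.
import torch

from TTS.tts.layers.xtts.perceiver_encoder import PerceiverResampler

model_dim = 1024  # illustrative; the real value comes from the GPT config
perceiver = PerceiverResampler(
    dim=model_dim, depth=2, dim_context=model_dim,
    num_latents=32, dim_head=64, heads=8, ff_mult=4, use_flash_attn=False,
)
conds = torch.randn(2, model_dim, 250)                     # (b, d, s) from the conditioning encoder
conds = perceiver(conds.permute(0, 2, 1)).transpose(1, 2)  # -> (b, d, 32), as in get_style_emb
print(conds.shape)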