In [1]:
# 模型下载
from modelscope import snapshot_download
from ChatTTS import Chat
import os
import torch

# Download the ChatTTS files through ModelScope; the returned value is passed
# to chat.load() below as `custom_path`.
model_dir = snapshot_download("mirror013/ChatTTS")

# Pick the compute device: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Load the model from the downloaded directory
chat = Chat()
chat.load(source="custom", custom_path=model_dir, device=device, compile=False)

SEED = 1397
torch.manual_seed(SEED)  # seed used for the voice (timbre) sampling below

# Sample a fresh speaker embedding unless one has been saved next to the notebook.
if not os.path.exists("spk_emb.npy"):
    spk_emb = chat.sample_random_speaker()
    print("use random speaker embedding")
else:
    # NOTE(review): the file carries a .npy extension but is read with
    # torch.load, so it presumably was written with torch.save - confirm.
    # torch.load is pickle-based; only load files you created yourself.
    spk_emb = torch.load("spk_emb.npy", map_location="cpu")
    print("use local speaker embedding")

# Sampling settings for the code-inference stage
params_infer_code = Chat.InferCodeParams(
    spk_emb=spk_emb,  # speaker embedding chosen above
    temperature=0.3,  # custom sampling temperature
    top_P=0.7,  # top-P decoding
    top_K=20,  # top-K decoding
)

# Control-token prompt passed to the text-refinement stage
params_refine_text = Chat.RefineTextParams(prompt="[oral_9][laugh_2][break_7]")

# Text preprocessing
new_texts = []  # NOTE(review): not read by any cell visible in this notebook


def filter_punctuation(text):
    """Strip every character that is not explicitly kept.

    A character is kept when ``str.isalnum()`` is true for it, or when it is
    a space or one of ``. , ! ?`` or their Chinese counterparts ``。，！？``.
    Everything else (other punctuation, symbols, newlines, tabs) is dropped.

    Args:
        text: the string to filter.

    Returns:
        A new string containing only the kept characters, in original order.
    """
    keep = {" ", ".", ",", "!", "?", "。", "，", "！", "？"}
    return "".join(ch for ch in text if ch in keep or ch.isalnum())

2024-07-02 19:05:03,407 - modelscope - INFO - PyTorch version 2.1.0 Found.
2024-07-02 19:05:03,408 - modelscope - INFO - Loading ast index from /Users/charslee/.cache/modelscope/ast_indexer
2024-07-02 19:05:03,498 - modelscope - INFO - Loading done! Current index file version is 1.13.3, with md5 8e4efa69aee288a831cd8dd27b421a93 and a total number of 972 components indexed
  return self.fget.__get__(instance, owner)()


use random speaker embedding


In [2]:
## Try speaking with emotion
import IPython.display
from IPython.display import Audio

text = "早安！带可莉出去玩吧！我们一起来冒险！"

# Reset the torch RNG to SEED before running the refinement pass.
torch.manual_seed(SEED)

# First pass: convert the filtered input into the reference text only
# (refine_text_only=True); the result is printed below and reused by the
# synthesis cell.
reference_text = chat.infer(
    text=[filter_punctuation(text)],
    refine_text_only=True,
    do_text_normalization=False,
    params_refine_text=params_refine_text,
    params_infer_code=params_infer_code,
)
print(f"参考文本: {reference_text}")

found invalid characters: {'！'}
  return torch._weight_norm(weight_v, weight_g, self.dim)
  sorted_logits, sorted_indices = torch.sort(scores, descending=False)
  cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
  indices_to_remove = scores < torch.topk(scores, top_k)[0][..., -1, None]
text:   8%|▊         | 30/384(max) [00:02, 14.43it/s]

参考文本: ['早 安 [laugh] 。 然 后 [uv_break] 嗯 [lbreak] 带 可 莉 出 去 玩 吧 [uv_break] 。 然 后 我 们 就 一 起 来 冒 险 [laugh] 。']





In [3]:
def _infer_wav(**infer_overrides):
    """Call chat.infer on the already-refined `reference_text`.

    The settings common to both runs in this cell are fixed here; any keyword
    arguments given are forwarded to chat.infer unchanged.

    Returns:
        Whatever chat.infer returns for these arguments.
    """
    return chat.infer(
        text=reference_text,
        skip_refine_text=True,
        params_refine_text=params_refine_text,
        params_infer_code=params_infer_code,
        do_homophone_replacement=False,
        **infer_overrides,
    )


# With the decoder (use_decoder is not passed, so chat.infer's default applies)
wav = _infer_wav()
display(Audio(wav[0], rate=24000))

# Without the decoder
wav = _infer_wav(use_decoder=False)
display(Audio(wav[0], rate=24000))

code:  12%|█▏        | 246/2048(max) [00:11, 21.61it/s]


code:  12%|█▏        | 253/2048(max) [00:12, 20.27it/s]
