## 测评经过RVC数据训练的DVAEDecoder模型

In [1]:
import torch
from modules.dvae import DVAEDecoder  # 假设这是你定义的DVAEDecoder模型

# Path of the trained checkpoint to evaluate (must exist locally).
checkpoint_path = "./checkpoints/model-55-0.0000.ckpt"

# Build the model architecture. load_state_dict below is strict by default,
# so these hyper-parameters have to match the ones the checkpoint was trained with.
IDIM = 384
ODIM = 100
model = DVAEDecoder(idim=IDIM, odim=ODIM, bn_dim=64, hidden=512, n_layer=8)

# Load the trained weights onto the CPU first; mmap=True avoids reading the
# whole file into memory up front.
# NOTE(review): torch.load unpickles the file - only open checkpoints from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location="cpu", mmap=True)
state_dict = checkpoint['state_dict']

# Strip the 'model.' wrapper prefix from the keys, if present.
# Only a *leading* prefix is removed: str.replace would also rewrite any
# 'model.' that happens to occur in the middle of a parameter name.
key_prefix = "model."
new_state_dict = {
    (k[len(key_prefix):] if k.startswith(key_prefix) else k): v
    for k, v in state_dict.items()
}

# Load the renamed weights into the model.
model.load_state_dict(new_state_dict)

# Switch to evaluation mode.
model.eval()

# Prefer CUDA, then Apple MPS, and fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
model.to(device)

  from .autonotebook import tqdm as notebook_tqdm


DVAEDecoder(
  (conv_in): Sequential(
    (0): Conv1d(384, 64, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): GELU(approximate='none')
    (2): Conv1d(64, 512, kernel_size=(3,), stride=(1,), padding=(1,))
  )
  (decoder_block): ModuleList(
    (0-7): 8 x ConvNeXtBlock(
      (dwconv): Conv1d(512, 512, kernel_size=(7,), stride=(1,), padding=(6,), dilation=(2,), groups=512)
      (norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
      (pwconv1): Linear(in_features=512, out_features=2048, bias=True)
      (act): GELU(approximate='none')
      (pwconv2): Linear(in_features=2048, out_features=512, bias=True)
    )
  )
  (conv_out): Conv1d(512, 100, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
)

In [2]:
import numpy as np
import os
import random
import torch

def preprocess_features(features_tensor):
    """
    Rearrange a feature tensor into the layout passed to the decoder.

    Dims 1 and 2 are swapped, the result is split into two chunks along
    dim 1, the chunks are stacked on a new trailing axis, and that axis is
    folded into dim 2. The two chunks therefore end up interleaved along
    the last dimension of the returned tensor.
    """
    swapped = features_tensor.transpose(1, 2)
    halves = torch.chunk(swapped, 2, dim=1)
    paired = torch.stack(halves, dim=-1)
    # Keep the two leading dims and merge everything after them.
    lead_a, lead_b = paired.shape[:2]
    return paired.reshape(lead_a, lead_b, -1)

def infer_from_features(model, features_tensor, device):
    """
    Run ``model`` on preprocessed features and hand back the result on the CPU.

    The features go through ``preprocess_features``, are moved to ``device``,
    and the forward pass runs with gradient tracking disabled.
    """
    decoder_input = preprocess_features(features_tensor).to(device)
    with torch.no_grad():
        decoded = model(decoder_input)
    return decoded.cpu()

def load_and_forward_npz(model, npz_path, device):
    """
    Load the ``'hidden'`` array from a .npz file and run it through the model.

    Args:
        model: callable passed on to ``infer_from_features``.
        npz_path: path of a .npz archive containing a ``'hidden'`` entry.
        device: device passed on to ``infer_from_features``.

    Returns:
        Whatever ``infer_from_features`` returns for the loaded features.

    Raises:
        KeyError: if the archive has no ``'hidden'`` entry.
    """
    # np.load on a .npz returns an NpzFile that keeps the archive open until
    # it is closed; the context manager releases the file handle right after
    # the array has been read into memory.
    with np.load(npz_path) as data:
        features = data['hidden']

    # Cast to float32 and add a leading batch dimension.
    features_tensor = torch.from_numpy(features).float()[None]
    return infer_from_features(model, features_tensor, device)

# Collect the paths of all .npz files in the training directory.
# os.listdir returns entries in arbitrary order, so the list is sorted to get
# the same ordering on every run and every machine.
npz_files_dir = 'train_rvc/'
npz_files = sorted(
    os.path.join(npz_files_dir, f)
    for f in os.listdir(npz_files_dir)
    if f.endswith('.npz')
)

In [3]:
# Download the model snapshot and keep the returned local directory.
from modelscope import snapshot_download
from ChatTTS import Chat
model_dir = snapshot_download('mirror013/ChatTTS')

# Load the ChatTTS models from the downloaded directory (CPU, no compilation).
chat = Chat()
chat.load_models(
    source="local",
    local_path=model_dir,
    device='cpu',
    compile=False,
)

SEED = 1397
torch.manual_seed(SEED) # seed torch's RNG (original note: voice-timbre seed)
# Load the speaker embedding from a local file if it exists, otherwise sample one.
# NOTE(review): the file is read with torch.load despite its .npy extension, so it
# is presumably a torch.save artifact - confirm. torch.load unpickles its input,
# so only load files from a trusted source.
if os.path.exists('spk_emb.npy'):
    spk_emb = torch.load('spk_emb.npy',map_location='cpu')
    print("use local speaker embedding")
else:
    spk_emb = chat.sample_random_speaker()
    print("use random speaker embedding")

# Sampling parameters handed to the chat.infer / chat.infer_debug calls in later cells.
params_infer_code = {
    'spk_emb': spk_emb,
    'temperature': 0.1,
    'top_P': 0.7,
    'top_K': 20,
}

# Disabled alternative: params_refine_text = {}
params_refine_text = {'prompt': '[oral_9][laugh_2][break_7]'}


# Text preprocessing.
# NOTE(review): new_texts is not referenced by any cell visible here - confirm it is still needed.
new_texts = []


def filter_punctuation(text):
    """
    Return ``text`` without the characters that are neither alphanumeric
    (per ``str.isalnum``) nor one of the whitelisted punctuation marks / space.
    """
    keep = frozenset((".", ",", "!", "?", "，", "。", "！", "？", " "))
    return "".join(ch for ch in text if ch.isalnum() or ch in keep)

2024-06-24 21:07:51,814 - modelscope - INFO - PyTorch version 2.1.0 Found.
2024-06-24 21:07:51,815 - modelscope - INFO - Loading ast index from /Users/charslee/.cache/modelscope/ast_indexer
2024-06-24 21:07:51,844 - modelscope - INFO - Loading done! Current index file version is 1.13.3, with md5 8e4efa69aee288a831cd8dd27b421a93 and a total number of 972 components indexed
INFO:ChatTTS.core:Load from local: /Users/charslee/.cache/modelscope/hub/mirror013/ChatTTS
INFO:ChatTTS.core:vocos loaded.
INFO:ChatTTS.core:dvae loaded.
INFO:ChatTTS.core:gpt loaded.
INFO:ChatTTS.core:decoder loaded.
INFO:ChatTTS.core:tokenizer loaded.
INFO:ChatTTS.core:All initialized.


use local speaker embedding


In [4]:
from IPython.display import Audio
# Pick one .npz file at random to synthesize audio from.
selected_file = random.choice(npz_files)
print(f"Selected file: {selected_file}")

# Load the chosen .npz file and run it through the model's forward pass.
mel_rebuild = load_and_forward_npz(model, selected_file,device=device)
print("Model output shape:", mel_rebuild.shape)

# Decode the model output with the vocos model held by chat, without tracking gradients.
with torch.no_grad():
    mel_rebuild = mel_rebuild.to('cpu')
    audio = chat.pretrain_models['vocos'].decode(mel_rebuild)
# Last expression of the cell: renders an audio player (rate=24000).
Audio(audio.cpu().numpy(), rate=24000)

Selected file: train_rvc/149f2d267bd261fb29329332dde893e3.npz
Model output shape: torch.Size([1, 100, 486])


## 尝试从 ChatTTS 中生成特征，并使用特制的 DVAEDecoder 进行解码

In [10]:
import IPython

text = "人类很有趣，所以我才如此喜欢人类。明明生命短暂，却能散发映昼光辉。他们之中，有些人的光芒，即使隔开久远时间长河都不曾黯淡。阅读这样的人生，是一大乐事啊。"

# Try plain text first (no refine step). The seed is reset right before inference.
torch.manual_seed(SEED)
result = chat.infer_debug(text=filter_punctuation(text),params_infer_code=params_infer_code)
# First entry of the returned 'hiddens'.
hidden = result['hiddens'][0]
print(f"hidden shape: {hidden.shape}")

# Decode with the custom DVAEDecoder; hidden[None] adds a leading batch dimension.
mel_rebuild = infer_from_features(model, hidden[None], device)

# Then decode the result with vocos.
with torch.no_grad():
    mel_rebuild = mel_rebuild.to('cpu')
    audio = chat.pretrain_models['vocos'].decode(mel_rebuild)
IPython.display.display(Audio(audio.cpu().numpy(), rate=24000))

# Play the audio returned by infer_debug itself, for comparison.
IPython.display.display(Audio(result['wav'][0],rate=24000))

INFO:ChatTTS.core:All initialized.
 29%|██▊       | 584/2048 [00:24<01:00, 24.09it/s]


hidden shape: torch.Size([584, 768])


In [12]:
## Try expressive (emotional) speech

import IPython.display
torch.manual_seed(SEED)
# First produce the refined reference text (refine_text_only=True).
reference_text = chat.infer(text=[filter_punctuation(text)],
                    params_refine_text=params_refine_text,
                    params_infer_code=params_infer_code,
                    do_text_normalization=False,
                    refine_text_only=True)
print(f"参考文本: {reference_text}")


# Reset the seed, then run inference on the refined text (refine step skipped)
# and keep the first entry of the returned 'hiddens'.
torch.manual_seed(SEED)
hidden = chat.infer(text=reference_text,
                    params_refine_text=params_refine_text,
                    params_infer_code=params_infer_code,
                    do_text_normalization=False,
                    skip_refine_text=True,
                    return_infer_token=True)['hiddens'][0]

# Decode with the custom DVAEDecoder; hidden[None] adds a leading batch dimension.
mel_rebuild = infer_from_features(model, hidden[None], device)

# Then decode the result with vocos.
with torch.no_grad():
    mel_rebuild = mel_rebuild.to('cpu')
    audio = chat.pretrain_models['vocos'].decode(mel_rebuild)
IPython.display.display(Audio(audio.cpu().numpy(), rate=24000))

# Reset the seed again and run the regular chat.infer call on the same text for comparison.
torch.manual_seed(SEED)
all_wavs = chat.infer(text=reference_text,
                    skip_refine_text=True,
                    params_infer_code=params_infer_code,
                    do_text_normalization=False)

# Assumes every array in all_wavs has shape (1, N); they are joined along axis 1.
combined_wavs = np.concatenate(all_wavs, axis=1)
IPython.display.display(Audio(combined_wavs, rate=24000))


INFO:ChatTTS.core:All initialized.
 23%|██▎       | 87/384 [00:04<00:14, 20.71it/s]
INFO:ChatTTS.core:All initialized.


参考文本: ['人 类 真 的 很 有 趣 ， 所 以 我 才 如 此 喜 欢 [uv_break] 人 类 。 明 明 生 命 短 暂 ， 却 能 散 发 映 昼 光 辉 [uv_break] 。 他 们 之 中 啊 [laugh] ， 有 些 人 的 光 芒 ， 即 使 隔 开 久 远 [uv_break] 时 间 长 河 都 不 曾 黯 淡 。 阅 读 这 样 的 人 生 ， 那 真 的 是 一 大 乐 [uv_break] 事 啊 。']


 37%|███▋      | 753/2048 [00:32<00:55, 23.53it/s]


INFO:ChatTTS.core:All initialized.
 37%|███▋      | 753/2048 [00:34<00:58, 21.97it/s]


mel shape is torch.Size([1, 100, 1506])


In [14]:
# Compare the shape of the vocos-decoded waveform (audio) with the concatenated
# chat.infer output (combined_wavs).
print(audio.shape)
print(combined_wavs.shape)

torch.Size([1, 385280])
(1, 385280)
