In [1]:
import argparse
import os
import random
import sys
import wave

import IPython.display as ipd
import numpy as np
import soundfile as sf
import torch
from scipy.io import wavfile

import utils
from text import cleaned_text_to_sequence
from text.symbols import symbols
from vits_pinyin import VITS_PinYin

In [2]:
# Use the first CUDA device when one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Text front end, constructed from the "./bert" directory on the chosen device.
tts_front = VITS_PinYin("./bert", device)

# Hyper-parameters read from the JSON config file.
hps = utils.get_hparams_from_file("configs/bert_vits.json")

# The config names the class to instantiate (hps.train.eval_class); resolve it
# first, then derive the two size arguments from the data/train settings.
generator_cls = utils.load_class(hps.train.eval_class)
filter_bins = hps.data.filter_length // 2 + 1
segment_hops = hps.train.segment_size // hps.data.hop_length
net_g = generator_cls(
    len(symbols),
    filter_bins,
    segment_hops,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)

# Hand the checkpoint path and the network to utils.load_model, then switch to
# eval mode and move the network to `device`.
utils.load_model("AISHELL3_G.pth", net_g)
net_g.eval()
# Left as the last expression on purpose: the notebook displays the module repr.
net_g.to(device)



SynthesizerTrn(
  (enc_p): TextEncoder(
    (emb): Embedding(219, 192)
    (emb_bert): Linear(in_features=256, out_features=192, bias=True)
    (encoder): Encoder(
      (drop): Dropout(p=0.1, inplace=False)
      (attn_layers): ModuleList(
        (0-5): 6 x MultiHeadAttention(
          (conv_q): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_k): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_v): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_o): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
      (norm_layers_1): ModuleList(
        (0-5): 6 x LayerNorm()
      )
      (ffn_layers): ModuleList(
        (0-5): 6 x FFN(
          (conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,))
          (conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,))
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
      (norm_layers_2): ModuleList(
        (0-5): 

In [4]:
# Sentence to run through the front end.
item = "武术始终被看作我国的国粹"

# chinese_to_phonemes returns two values: the phoneme string and `char_embeds`
# (printed further down with shape [L, 256]).
frontend_out = tts_front.chinese_to_phonemes(item)
phonemes, char_embeds = frontend_out

# Last expression of the cell: display the phoneme string.
phonemes

'sil ^ u3 sh u4 sh iii3 zh ong1 b ei4 k an4 z uo4 ^ uo3 g uo2 d e5 g uo2 c uei4 sp sil'

In [8]:
# Synthesize audio for the current `phonemes` / `char_embeds` pair.
#
# Earlier experiments that lived here as commented-out code (removed as dead code):
#   * overriding `phonemes` with a hand-written string and duplicating one row of
#     `char_embeds` four times via np.repeat / np.insert;
#   * calling net_g.infer_prolong(...) with the same arguments instead of net_g.infer(...).

# Map the phoneme string to the integer id sequence fed to the network.
input_ids = cleaned_text_to_sequence(phonemes)
print("phonemes:" + str(phonemes))
print("char_embeds: " + str(char_embeds.shape)) # [L, 256]

with torch.no_grad():
    # Build every input directly on `device` with an explicit dtype instead of
    # going through the legacy typed constructors followed by .to(device).
    sid = torch.tensor([0], dtype=torch.long, device=device)  # presumably speaker id 0 -- confirm
    # unsqueeze(0) adds the leading batch dimension: [len(input_ids)] -> [1, len(input_ids)].
    x_tst = torch.tensor(input_ids, dtype=torch.long, device=device).unsqueeze(0)
    print("x_tst: " + str(x_tst))
    x_tst_lengths = torch.tensor([len(input_ids)], dtype=torch.long, device=device)
    x_tst_prosody = torch.tensor(char_embeds, dtype=torch.float32, device=device).unsqueeze(0)
    print("x_tst_prosody: " + str(x_tst_prosody))
    output = net_g.infer(x_tst, x_tst_lengths, x_tst_prosody, sid=sid, noise_scale=0.5, length_scale=1)
    # First element of the result, item [0, 0], as a float32 NumPy array on the CPU.
    # .detach() replaces the discouraged .data attribute.
    audio = output[0][0, 0].detach().cpu().float().numpy()
    # Last element of whatever infer() returns; its meaning is defined by the model code.
    index = output[-1]
    print(index)

# normalize=False plays the samples as-is instead of rescaling them.
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

phonemes:sil j in1 t ian1 t ian1 q i4 b u4 c uo4 h a1 h a1 h a1 sp sil
char_embeds: (21, 256)
x_tst: tensor([[  0,  15, 119,  25,  89,  25,  89,  21,  82,   8, 157,   9, 197,  14,
          29,  14,  29,  14,  29,   2,   0]], device='cuda:7')
x_tst_prosody: tensor([[[-4.4807e-01, -6.3070e-01, -4.4401e-02,  ..., -7.8962e-02,
           1.0006e+00,  9.2337e-01],
         [-2.3832e-01, -6.0560e-01, -2.6822e-01,  ...,  9.7482e-01,
           3.2449e-01,  8.1633e-01],
         [-2.3832e-01, -6.0560e-01, -2.6822e-01,  ...,  9.7482e-01,
           3.2449e-01,  8.1633e-01],
         ...,
         [ 9.5450e-01, -1.6293e+00,  3.2199e-01,  ..., -5.1992e-01,
           2.6638e-01,  3.7271e-01],
         [-3.1436e-02,  3.2551e-01, -5.2022e-01,  ..., -7.6089e-01,
          -2.5881e-04, -3.9230e-01],
         [-4.3577e-01,  1.2030e-01, -4.5207e-01,  ..., -2.3099e-01,
           6.0163e-01,  4.2468e-01]]], device='cuda:7')
x torch.Size([1, 21])
x_length torch.Size([1])
bert torch.Size([1, 21, 256])
te

In [7]:
# `random` is imported in the notebook's top import cell.
# Pick a random even position: randrange(2, n - 2, 2) yields 2, 4, ... strictly
# below n - 2, where n = x_tst.size(1) is the length of the id sequence.
# NOTE(review): no seed is set, so the value differs from run to run.
index = random.randrange(2, x_tst.size(1) - 2, 2)
index

6

In [11]:
# Report the duration and sample rate of the saved WAV file.
# `wave` is imported in the notebook's top import cell. The file is opened once:
# wave.open() is itself a context manager, so contextlib.closing and the second
# open (which only re-read the frame rate) are not needed.
# NOTE(review): this reads the same path that a later cell writes with sf.write;
# on a fresh kernel that cell has to run first -- confirm the intended cell order.
with wave.open('dys_output/sample2.wav', 'rb') as f:
    frames = f.getnframes()
    rate = f.getframerate()

# Duration in seconds = number of frames / frames per second.
duration = frames / float(rate)
print(f"duration: {duration:.5f}s")
print(f"sr: {rate} Hz")

duration: 2.24000s
sr: 16000 Hz


In [10]:
# Save the synthesized waveform to disk at the configured sampling rate.
# `os` and `soundfile` (as `sf`) are imported in the notebook's top import cell.
audio_file_path = 'dys_output/sample2.wav'

# Create the output directory first so the write does not fail when it is missing.
out_dir = os.path.dirname(audio_file_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

sf.write(audio_file_path, audio, hps.data.sampling_rate)

In [None]:
# Notes:
# - 1 frame corresponds to 0.016 s
# - 174 speakers