# VITS Model Inference on CPU and Compilation for Neuron
* Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech
    * Git: https://github.com/jaywalnut310/vits.git



In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write


def get_text(text, hps):
    """Encode a text string as a 1-D ``torch.LongTensor`` of symbol ids.

    Args:
        text: The sentence to encode.
        hps: Hyperparameter object; ``hps.data.text_cleaners`` and
            ``hps.data.add_blank`` are read.

    Returns:
        ``torch.LongTensor`` built from the id sequence returned by
        ``text_to_sequence`` (after ``commons.intersperse(..., 0)`` when
        ``hps.data.add_blank`` is truthy).
    """
    symbol_ids = text_to_sequence(text, hps.data.text_cleaners)
    if not hps.data.add_blank:
        return torch.LongTensor(symbol_ids)
    # add_blank is set: pass the ids through commons.intersperse with the value 0.
    return torch.LongTensor(commons.intersperse(symbol_ids, 0))

## LJ Speech

In [18]:
# Load the hyperparameters from the LJ Speech base configuration JSON file.
hps = utils.get_hparams_from_file("./configs/ljs_base.json")

['english_cleaners2']

In [3]:
# Build the synthesizer from the loaded hyperparameters.
net_g = SynthesizerTrn(
    len(symbols),  # number of entries in the text symbol list
    hps.data.filter_length // 2 + 1,  # presumably the number of spectrogram bins — TODO confirm
    hps.train.segment_size // hps.data.hop_length,  # presumably the segment length in frames — TODO confirm
    **hps.model)
# Put the network in evaluation mode; binding the result to `_` keeps the
# cell from echoing the module repr.
_ = net_g.eval()

# Load the checkpoint file into net_g. None is passed as the third argument
# (presumably the optimizer slot — TODO confirm against utils.load_checkpoint).
_ = utils.load_checkpoint("models/pretrained_ljs.pth", net_g, None)

In [6]:

def play_tts_cpu(stn_tst, net_g):
    """Synthesize audio for one encoded sentence and play it inline.

    Args:
        stn_tst: 1-D tensor of symbol ids (as returned by ``get_text``).
        net_g: The synthesizer; it is called directly with the batched ids,
            their length, and fixed noise/length scale keyword arguments.

    Returns:
        The waveform as a float NumPy array, taken from ``[0][0, 0]`` of the
        network output.

    Note:
        Reads the sampling rate from the notebook-level ``hps`` object.
        An earlier variant of this cell called ``net_g.infer(...)`` with the
        same arguments instead of ``net_g(...)``.
    """
    with torch.no_grad():
        # Add a batch dimension and record the sequence length.
        batch_ids = stn_tst.unsqueeze(0)
        batch_lengths = torch.LongTensor([stn_tst.size(0)])
        outputs = net_g(batch_ids, batch_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)
        # First output, first batch item, first channel -> NumPy waveform.
        audio = outputs[0][0, 0].data.cpu().float().numpy()
        player = ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False)
        ipd.display(player)
    return audio

# Encode a short test sentence, then synthesize and play it on CPU.
stn_tst = get_text("VITS is Awesome!", hps)    
audio = play_tts_cpu(stn_tst, net_g)

In [7]:
# Repeat the CPU synthesis with a longer sentence.
stn_tst = get_text("VITS is Awesome!, How are you doing today?", hps)    
audio = play_tts_cpu(stn_tst, net_g)

## Neuron Compile Test

In [8]:
import torch
import torch_neuron
from torch_neuron import analyze_model

In [21]:
# Wrapper that exposes the model's infer() through forward().
class VITSWrapper(torch.nn.Module):
    """Module adapter whose ``forward`` delegates to ``model.infer``.

    The three scale arguments arrive as tensors; each is converted to a
    Python number with ``.item()`` before being forwarded as a keyword
    argument.

    NOTE(review): presumably a tracer records the ``.item()`` results as
    constants, so the scales passed at trace time would be fixed in the
    traced graph — confirm.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x, x_lengths, noise_scale, noise_scale_w, length_scale):
        """Call ``self.model.infer`` and return its result unchanged."""
        scale_kwargs = dict(
            noise_scale=noise_scale.item(),
            noise_scale_w=noise_scale_w.item(),
            length_scale=length_scale.item(),
        )
        return self.model.infer(x, x_lengths, **scale_kwargs)

# Wrap the loaded synthesizer so its infer() is reachable through forward().
wrapped_model = VITSWrapper(net_g)

In [22]:
# create data
# Example input: random symbol ids are used here instead of real encoded text.
# The generator is not seeded, so the ids differ from run to run.
# stn_tst = get_text("VITS is Awesome!", hps)
# print("stn_tst: ", stn_tst)
stn_tst = torch.randint(low=1, high=10, size=(33,), dtype=torch.int64)  # 33 ids drawn from [1, 10)
x_tst = stn_tst.unsqueeze(0)  # add a leading batch dimension -> shape (1, 33)
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])  # length of the single sequence

# Convert the scalar values to tensors
noise_scale = torch.tensor(0.667)
noise_scale_w = torch.tensor(0.8)
length_scale = torch.tensor(1.0)

print("x_tst: ", x_tst)
print("x_tst shape: ", x_tst.shape)
print("x_tst_length shape: ", x_tst_lengths.shape)
print("x_tst_length: ", x_tst_lengths)
    

x_tst:  tensor([[7, 5, 9, 8, 9, 8, 5, 8, 4, 4, 5, 2, 8, 3, 1, 1, 9, 1, 7, 4, 2, 9, 9, 4,
         3, 1, 9, 3, 5, 7, 5, 3, 7]])
x_tst shape:  torch.Size([1, 33])
x_tst_length shape:  torch.Size([1])
x_tst_length:  tensor([33])


In [23]:
# Trace the wrapped model with torch_neuron.trace, using the example inputs.
# NOTE(review): the recorded output of this cell shows trace-check mismatches
# next to an aten::randn_like node, so the model presumably draws random noise
# on each call — confirm before treating those mismatches as a real problem.
traced_model = torch_neuron.trace(wrapped_model, 
                                  (x_tst, x_tst_lengths, 
                                   noise_scale, noise_scale_w, length_scale))

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  assert t_s == t_t, "Relative attention is only available for self-attention."
  pad_length = max(length - (self.window_size + 1), 0)
  slice_start_position = max((self.window_size + 1) - length, 0)
  if pad_length > 0:
  if torch.min(inputs) < left or torch.max(inputs) > right:
  if min_bin_width * num_bins > 1.0:
  if min_bin_height * num_bins > 1.0:
  assert (discriminant >= 0).all()
	%7880 : Float(1, 192, 46, strides=[8832, 1, 192], requires_grad=0, device=cpu) = aten::randn_like(%m_p, %7875, %7876, %7877, %7878, %7879) # /home/ubuntu/vits/models.py:547:0
This may cause errors in trace checking. To disable trace checking, pass check_trace=False to torch.jit.trace()
  _module_class,
With rtol=1e-05 and atol=1e-05, found 11763 element(s) (out of 11776) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest 

In [7]:
# Save the traced model
torch.jit.save(traced_model, "traced_vits_model_neuron.pt")

## 모델 추론

In [11]:
# Prepare input
# NOTE(review): the model was traced with a 33-id example; this sentence may
# encode to a different length — confirm the traced graph accepts it.
# stn_tst = get_text("VITS is Awesome!", hps)
stn_tst = get_text("Melon is a great fruit in summer for everybody", hps)

x_tst = stn_tst.unsqueeze(0)  # add a leading batch dimension
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])  # length of the single sequence
# Convert scalar value to tensor
noise_scale = torch.tensor(0.667)
noise_scale_w = torch.tensor(0.8)
length_scale = torch.tensor(1.0)

In [12]:
# inference
# Run the traced model on the prepared inputs.
traced_output = traced_model(x_tst, x_tst_lengths, noise_scale, noise_scale_w, length_scale)
print("output shape:", traced_output[0].shape)

# ipd.Audio needs a waveform array, not the whole output object. Take the
# first output and index [0, 0], the same extraction play_tts_cpu uses for
# the CPU model.
audio = traced_output[0][0, 0].detach().cpu().float().numpy()
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))


: 

## 모델 로딩후 추론 테스트

In [1]:
import torch
import torch_neuron
import commons
import utils
from text import text_to_sequence

hps = utils.get_hparams_from_file("./configs/ljs_base.json")

def get_text(text, hps):
    """Encode a text string as a 1-D ``torch.LongTensor`` of symbol ids.

    Args:
        text: The sentence to encode.
        hps: Hyperparameter object; ``hps.data.text_cleaners`` and
            ``hps.data.add_blank`` are read.

    Returns:
        ``torch.LongTensor`` built from the id sequence returned by
        ``text_to_sequence`` (after ``commons.intersperse(..., 0)`` when
        ``hps.data.add_blank`` is truthy).
    """
    symbol_ids = text_to_sequence(text, hps.data.text_cleaners)
    if not hps.data.add_blank:
        return torch.LongTensor(symbol_ids)
    # add_blank is set: pass the ids through commons.intersperse with the value 0.
    return torch.LongTensor(commons.intersperse(symbol_ids, 0))

def load_neuron_model(model_path):
    """Load a saved TorchScript model and wrap it in ``torch_neuron.DataParallel``.

    Args:
        model_path: Path of the file written by ``torch.jit.save``.

    Returns:
        The ``torch_neuron.DataParallel`` wrapper around the loaded module.

    NOTE(review): the notebook's recorded run indexes the result of calling
    this wrapper and shows ``IndexError: tuple index out of range``; whether
    the DataParallel wrapping is the cause is not visible here — confirm.
    """
    # Load onto the CPU first.
    scripted = torch.jit.load(model_path, map_location=torch.device("cpu"))
    parallel_model = torch_neuron.DataParallel(scripted)
    return parallel_model


# Prepare input
# Encode the test sentence with the hyperparameters loaded above.
stn_tst = get_text("VITS is Awesome!", hps)

x_tst = stn_tst.unsqueeze(0)  # add a leading batch dimension
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])  # length of the single sequence
# Convert scalar value to tensor
noise_scale = torch.tensor(0.667)
noise_scale_w = torch.tensor(0.8)
length_scale = torch.tensor(1.0)

In [2]:
# Usage example: load the traced model that was saved to disk.
model_path = "traced_vits_model_neuron.pt"
loaded_neuron_model = load_neuron_model(model_path)

In [3]:
# inference
# Run the reloaded model on the prepared inputs and report the first output's shape.
# NOTE(review): the recorded run of this cell ends with
# "IndexError: tuple index out of range", i.e. the returned tuple was
# presumably empty — confirm what the loaded model returns before indexing.
traced_output = loaded_neuron_model(x_tst, x_tst_lengths, noise_scale, noise_scale_w, length_scale)
print("output shape:", traced_output[0].shape)


IndexError: tuple index out of range

## Neuron Analyzer

In [11]:
import torch
import torch_neuron
from torch_neuron import analyze_model

# Run torch_neuron's analyze_model on the wrapped model with the current example
# inputs; the recorded output lists the operations torch-neuron supports for it.
analyzer = analyze_model(wrapped_model, (x_tst, x_tst_lengths, noise_scale, noise_scale_w, length_scale))

  
  from ipykernel import kernelapp as app
  app.launch_new_instance()
	%7880 : Float(1, 192, 100, strides=[19200, 1, 192], requires_grad=0, device=cpu) = aten::randn_like(%m_p, %7875, %7876, %7877, %7878, %7879) # /home/ubuntu/vits/models.py:547:0
This may cause errors in trace checking. To disable trace checking, pass check_trace=False to torch.jit.trace()
  _module_class,
expected tensor shape torch.Size([1, 1, 24576]) doesn't match with actual tensor shape torch.Size([1, 1, 25600])!
  _module_class,
expected tensor shape torch.Size([1, 1, 96, 33]) doesn't match with actual tensor shape torch.Size([1, 1, 100, 33])!
  _module_class,
expected tensor shape torch.Size([1, 1, 96]) doesn't match with actual tensor shape torch.Size([1, 1, 100])!
  _module_class,
INFO:Neuron:The following operations are currently supported in torch-neuron for this model:
[INFO] The following operations are currently supported in torch-neuron for this model:
INFO:Neuron:aten::ScalarImplicit
[INFO] aten::Sca