## MB-iSTFT-VITS2 inference

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd
import librosa

import os
import json
import math

import requests
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import langdetect

from scipy.io.wavfile import write
import re

In [3]:
#- device setting
# Run on the first CUDA device when PyTorch reports CUDA as available; otherwise use the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [4]:
def get_text(text, hps):
    """Turn raw text into a ``torch.LongTensor`` of symbol ids.

    Args:
        text: The string to convert.
        hps: Hyper-parameter object; ``hps.data.text_cleaners`` and
            ``hps.data.add_blank`` are read.

    Returns:
        A ``torch.LongTensor`` built from the id sequence produced by
        ``text_to_sequence``. When ``hps.data.add_blank`` is truthy the
        sequence is first passed through ``commons.intersperse(ids, 0)``
        (presumably inserting id 0 between symbols -- confirm in ``commons``).

    Side effects:
        Prints the cleaner list and the id sequence (before interspersing).
    """
    cleaners = hps.data.text_cleaners
    symbol_ids = text_to_sequence(text, cleaners)
    print(cleaners, symbol_ids)
    symbol_ids = commons.intersperse(symbol_ids, 0) if hps.data.add_blank else symbol_ids
    return torch.LongTensor(symbol_ids)


def langdetector(text):  # from PolyLangVITS
    """Wrap ``text`` in a language tag chosen from ``langdetect.detect``.

    The detected code is lower-cased and mapped as ``ko`` -> ``[KO]``,
    ``ja`` -> ``[JA]``, ``en`` -> ``[EN]``, ``zh-cn`` -> ``[ZH]``; the tag is
    placed both before and after the text. Any other detected code, or any
    exception raised during detection, returns ``text`` unchanged.
    """
    tag_by_code = {'ko': 'KO', 'ja': 'JA', 'en': 'EN', 'zh-cn': 'ZH'}
    try:
        tag = tag_by_code.get(langdetect.detect(text).lower())
        if tag is None:
            return text
        return f'[{tag}]{text}[{tag}]'
    except Exception:
        # Deliberately best-effort: detection failures fall back to untagged text.
        return text


def vcss(inputstr,sid=0): # single
    """Synthesize ``inputstr`` with the global ``net_g`` and display an audio player.

    Args:
        inputstr: Text to speak. The characters ``[ ] ( ) { }`` are removed
            before the text is passed to ``get_text``.
        sid: Speaker id forwarded to ``net_g.infer``. Defaults to 0.
            (Previously this argument was overwritten with 0 inside the
            function, so a caller-supplied id was silently ignored.)

    Relies on the notebook globals ``hps``, ``net_g`` and ``device``.
    Returns ``None``; the result is shown via ``IPython.display``.
    """
    fltstr = re.sub(r"[\[\]\(\)\{\}]", "", inputstr)
    #fltstr = langdetector(fltstr) #- optional for cjke/cjks type cleaners
    stn_tst = get_text(fltstr, hps)

    speed = 1  # length_scale passed to infer is 1 / speed
    with torch.no_grad():
        x_tst = stn_tst.to(device).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        # Use the caller's speaker id; do not reset it to 0 here.
        sid_tensor = torch.LongTensor([sid]).to(device)
        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid_tensor, noise_scale=.667, noise_scale_w=0.8, length_scale=1 / speed)[0][
                0, 0].data.cpu().float().numpy()

    ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=True))


def vcms(inputstr, sid): # multi
    """Synthesize ``inputstr`` for speaker ``sid`` and display an audio player.

    Args:
        inputstr: Text to speak. The characters ``[ ] ( ) { }`` are removed
            before the text is passed to ``get_text``.
        sid: Speaker id; wrapped in a one-element ``LongTensor`` and passed to
            ``net_g.infer`` as ``sid``.

    Relies on the notebook globals ``hps``, ``net_g`` and ``device``.
    Returns ``None``; the result is shown via ``IPython.display``.
    """
    cleaned = re.sub(r"[\[\]\(\)\{\}]", "", inputstr)
    #cleaned = langdetector(cleaned) #- optional for cjke/cjks type cleaners
    token_ids = get_text(cleaned, hps)
    speed = 1  # length_scale passed to infer is 1 / speed
    with torch.no_grad():
        tokens = token_ids.to(device).unsqueeze(0)
        token_lengths = torch.LongTensor([token_ids.size(0)]).to(device)
        speaker = torch.LongTensor([sid]).to(device)
        outputs = net_g.infer(
            tokens,
            token_lengths,
            sid=speaker,
            noise_scale=.667,
            noise_scale_w=0.8,
            length_scale=1 / speed,
        )
        # First element of the infer result, entry [0, 0], moved to CPU as a float numpy array.
        audio = outputs[0][0, 0].data.cpu().float().numpy()

    player = ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=True)
    ipd.display(player)

In [5]:
# - paths
# The defaults below are the original machine-specific locations. Set the
# MB_ISTFT_VITS2_CONFIG / MB_ISTFT_VITS2_MODEL environment variables before
# starting the kernel to point at a different config or checkpoint without
# editing this cell.
path_to_config = os.environ.get(
    "MB_ISTFT_VITS2_CONFIG",
    "/rhome/eingerman/Projects/DeepLearning/TTS/MB-iSTFT-VITS2/configs/mb_istft_vits2_libritts_text.json",
)  # path to .json
path_to_model = os.environ.get(
    "MB_ISTFT_VITS2_MODEL",
    "/rhome/eingerman/Projects/DeepLearning/TTS/MB-iSTFT-VITS2/logs/models/mb_istft_vits2_libritts_from_text/G_390000.pth",
)  # path to G_xxxx.pth

In [6]:
# Load hyper-parameters from the JSON config.
hps = utils.get_hparams_from_file(path_to_config)

# The posterior-encoder input width depends on whether the config enables the
# mel posterior encoder; the same decision is mirrored onto hps.data.
use_mel_posterior = bool(
    "use_mel_posterior_encoder" in hps.model.keys()
    and hps.model.use_mel_posterior_encoder == True
)
if not use_mel_posterior:
    print("Using lin posterior encoder for VITS1")
    posterior_channels = hps.data.filter_length // 2 + 1
else:
    print("Using mel posterior encoder for VITS2")
    posterior_channels = 80  # vits2
hps.data.use_mel_posterior_encoder = use_mel_posterior

# Build the generator on the selected device and switch it to eval mode
# (Module.eval() returns the module itself, so chaining keeps the same object).
net_g = SynthesizerTrn(
    len(symbols),
    posterior_channels,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,  #- for multi speaker
    **hps.model,
).to(device).eval()

# Assign to `_` so the return value is not echoed as the cell output.
_ = utils.load_checkpoint(path_to_model, net_g, None)

Using mel posterior encoder for VITS2
512 2
Multi-band iSTFT VITS2


In [7]:
# - text input
# Stored as `tts_text` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session. Alternative sentences are kept
# commented out instead of being assigned and immediately overwritten.
# tts_text = "I try to get the waiter's attention by blinking in morse code"
tts_text = 'He wound it around the wound, saying "I read it was $10 to read."'
vcss(tts_text,sid=2)

clean_text: he wound it around the wound, saying "i read it was ten dollars to read."

['english_cleaners4'] [50, 47, 16, 65, 57, 63, 56, 46, 16, 51, 62, 16, 43, 60, 57, 63, 56, 46, 16, 62, 50, 47, 16, 65, 57, 63, 56, 46, 3, 16, 61, 43, 67, 51, 56, 49, 16, 11, 51, 16, 60, 47, 43, 46, 16, 51, 62, 16, 65, 43, 61, 16, 62, 47, 56, 16, 46, 57, 54, 54, 43, 60, 61, 16, 62, 57, 16, 60, 47, 43, 46, 4, 11]


In [11]:
# - text input
# Stored as `tts_text` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session. Uncomment one of the
# alternatives below to synthesize a different sentence.
tts_text = "When these rays reach the observer direct, he sees the lamps or luminiferous bodies themselves, but when he is out of their direct sight, the brightness of their illumination only becomes apparent, through the rays being collected and reflected by some appropriate substance."
# tts_text = "even in mariposa some of the people must have thought so."
# tts_text = "the most hateful combinations are surpassed by the infernal munificence of the unforeseen."
# tts_text = 'He wound it around the wound, saying "I read it was $10 to read."'
# tts_text = "Vice-President Kamala Harris delivered a speech in Guatemala on Monday, warning potential migrants against traveling to the United States."
vcms(tts_text,sid=1)

clean_text: when these rays reach the observer direct, he sees the lamps or luminiferous bodies themselves, but when he is out of their direct sight, the brightness of their illumination only becomes apparent, through the rays being collected and reflected by some appropriate substance.

['english_cleaners4'] [65, 50, 47, 56, 16, 62, 50, 47, 61, 47, 16, 60, 43, 67, 61, 16, 60, 47, 43, 45, 50, 16, 62, 50, 47, 16, 57, 44, 61, 47, 60, 64, 47, 60, 16, 46, 51, 60, 47, 45, 62, 3, 16, 50, 47, 16, 61, 47, 47, 61, 16, 62, 50, 47, 16, 54, 43, 55, 58, 61, 16, 57, 60, 16, 54, 63, 55, 51, 56, 51, 48, 47, 60, 57, 63, 61, 16, 44, 57, 46, 51, 47, 61, 16, 62, 50, 47, 55, 61, 47, 54, 64, 47, 61, 3, 16, 44, 63, 62, 16, 65, 50, 47, 56, 16, 50, 47, 16, 51, 61, 16, 57, 63, 62, 16, 57, 48, 16, 62, 50, 47, 51, 60, 16, 46, 51, 60, 47, 45, 62, 16, 61, 51, 49, 50, 62, 3, 16, 62, 50, 47, 16, 44, 60, 51, 49, 50, 62, 56, 47, 61, 61, 16, 57, 48, 16, 62, 50, 47, 51, 60, 16, 51, 54, 54, 63, 55, 51, 56, 43, 62, 51, 57,