## Tacotron 2 inference code 
Edit the variables **checkpoint_path** and **text** to match yours and run the entire code to generate plots of mel outputs, alignments and audio synthesis from the generated mel-spectrogram using Griffin-Lim.

#### Import libraries and setup matplotlib

In [1]:
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

import IPython.display as ipd

import sys
sys.path.append('waveglow/')
import numpy as np
import torch
import torchaudio

import pyopenjtalk

from hparams_v2 import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
from denoiser import Denoiser

  from .autonotebook import tqdm as notebook_tqdm
2023-12-29 01:20:28.319568: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9360] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-29 01:20:28.319619: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-29 01:20:28.319640: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1537] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-29 01:20:28.325255: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate co

In [2]:
def plot_data(data, figsize=(16, 4)):
    fig, axes = plt.subplots(1, len(data), figsize=figsize)
    for i in range(len(data)):
        axes[i].imshow(data[i], aspect='auto', origin='lower', 
                       interpolation='none')

#### Setup hparams

In [3]:
hparams = create_hparams()
hparams.sampling_rate = 22050

#### Load model from checkpoint

In [4]:
#checkpoint_path = "tacotron2_statedict.pt"
checkpoint_path = "outdir3/checkpoint_5000"
model = load_model(hparams)
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
_ = model.cuda().eval().float()

#### Load WaveGlow for mel2audio synthesis and denoiser

In [5]:
waveglow_path = 'waveglow_256channels_universal_v5.pt'
waveglow = torch.load(waveglow_path)['model']
print(waveglow)
for k in dir(waveglow):
    print(k)
waveglow.cuda().eval().float()
for k in waveglow.convinv:
    k.float()
denoiser = Denoiser(waveglow)



WaveGlow(
  (upsample): ConvTranspose1d(80, 80, kernel_size=(1024,), stride=(256,))
  (WN): ModuleList(
    (0-3): 4 x WN(
      (in_layers): ModuleList(
        (0): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0-6): 7 x Conv1d(256, 512, kernel_size=(1,), stride=(1,))
        (7): Conv1d

  audio = torch.cuda.FloatTensor(spect.size(0),


tensor([[[-0.1270, -0.0726, -0.1395,  ..., -0.0738, -0.1452, -0.1357],
         [-0.1565, -0.2134, -0.1718,  ..., -0.0198, -0.0041,  0.0362],
         [ 1.0151,  1.0696,  1.0802,  ...,  1.1608,  1.1813,  1.1610],
         [ 0.7881,  0.9475,  1.1765,  ...,  1.3461,  1.4748,  1.3000]]],
       device='cuda:0')
s,b
tensor([[[1.0151, 1.0696, 1.0802,  ..., 1.1608, 1.1813, 1.1610],
         [0.7881, 0.9475, 1.1765,  ..., 1.3461, 1.4748, 1.3000]]],
       device='cuda:0')
tensor([[[-0.1270, -0.0726, -0.1395,  ..., -0.0738, -0.1452, -0.1357],
         [-0.1565, -0.2134, -0.1718,  ..., -0.0198, -0.0041,  0.0362]]],
       device='cuda:0')
tensor([[[ 0.0000,  0.0000, -0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [ 0.0000, -0.0000, -0.0000,  ...,  0.0000,  0.0000, -0.0000],
         [ 0.0460,  0.0249,  0.0473,  ...,  0.0231,  0.0446,  0.0425],
         [ 0.0712,  0.0827,  0.0530,  ...,  0.0051,  0.0009, -0.0099]]],
       device='cuda:0')
tensor([[[ 3.8946e-02,  2.7945e-02,  3.6929e-02,  ..

tensor([[[-1.1797, -0.8312, -1.7381,  ..., -0.1375, -0.1908, -0.2932],
         [-4.8741, -2.5705, -2.5505,  ..., -0.1662, -0.5492, -0.5853],
         [ 1.5688,  0.9355,  0.6347,  ...,  0.1692,  0.5766,  0.4906],
         ...,
         [ 0.5647,  0.7013,  0.6803,  ...,  0.5769,  0.8829,  0.7066],
         [-0.4925, -0.5091, -0.5391,  ..., -0.3949, -0.3811, -0.3953],
         [-0.5088, -0.5284, -0.5486,  ..., -0.4227, -0.4016, -0.4253]]],
       device='cuda:0')
s,b
tensor([[[-0.4151, -0.2014, -0.2202,  ...,  0.0188,  0.2483,  0.0049],
         [ 0.5647,  0.7013,  0.6803,  ...,  0.5769,  0.8829,  0.7066],
         [-0.4925, -0.5091, -0.5391,  ..., -0.3949, -0.3811, -0.3953],
         [-0.5088, -0.5284, -0.5486,  ..., -0.4227, -0.4016, -0.4253]]],
       device='cuda:0')
tensor([[[-1.1797, -0.8312, -1.7381,  ..., -0.1375, -0.1908, -0.2932],
         [-4.8741, -2.5705, -2.5505,  ..., -0.1662, -0.5492, -0.5853],
         [ 1.5688,  0.9355,  0.6347,  ...,  0.1692,  0.5766,  0.4906],
       

#### Prepare text input

In [6]:
#text = "Waveglow is really awesome!"
text = "こんな感じで音声クローニングできます"
#print("入力してね>>>")
#text = input()
phones = pyopenjtalk.g2p(text, kana=False)
phones = phones.replace('pau',',')
phones = phones.replace(' ','')
phones = phones + '.'
print(phones)
sequence = np.array(text_to_sequence(phones, ['basic_cleaners']))[None, :]
#sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
sequence = torch.autograd.Variable(
    torch.from_numpy(sequence)).cuda().long()
print(sequence)

ahaaN,daisUkinaNdanaaN,chuchuchu,chuchuchunochuudayo.
tensor([[38, 45, 38, 38, 51,  6, 41, 38, 46, 56, 58, 48, 46, 51, 38, 51, 41, 38,
         51, 38, 38, 51,  6, 40, 45, 58, 40, 45, 58, 40, 45, 58,  6, 40, 45, 58,
         40, 45, 58, 40, 45, 58, 51, 52, 40, 45, 58, 58, 41, 38, 62, 52,  7]],
       device='cuda:0')


#### Decode text input and plot results

In [7]:
mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
#plot_data((mel_outputs.float().data.cpu().numpy()[0],
#           mel_outputs_postnet.float().data.cpu().numpy()[0],
#           alignments.float().data.cpu().numpy()[0].T))

mel_outputs = mel_outputs.to(torch.float32)
mel_outputs_postnet = mel_outputs_postnet.to(torch.float32)
_ = _.to(torch.float32)
alignments = alignments.to(torch.float32)
print(mel_outputs_postnet.type())
data = [mel_outputs.float().data.cpu().numpy()[0], mel_outputs_postnet.float().data.cpu().numpy()[0], alignments.float().data.cpu().numpy()[0].T]
#print('data=>')
#print(data)
#print('data_elements=>')
#print(mel_outputs.float().data.cpu().numpy()[0])
#print(mel_outputs_postnet.float().data.cpu().numpy()[0])
#print(alignments.float().data.cpu().numpy()[0].T)

figsize=(16, 4)
fig, axes = plt.subplots(1, len(data), figsize=figsize)
for i in range(len(data)):
    axes[i].imshow(data[i], aspect='auto', origin='lower', interpolation='none')
fig.savefig('graph.png')
plt.show()

torch.cuda.FloatTensor


#### Synthesize audio from spectrogram using WaveGlow

In [8]:
with torch.no_grad():
    print(mel_outputs_postnet)
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)

#print(audio[0].data.cpu().numpy())
ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)

tensor([[[ -8.0996,  -7.9605,  -7.8604,  ...,  -7.2568,  -7.7487,  -8.6511],
         [ -8.4654,  -8.2547,  -8.2030,  ...,  -7.7795,  -8.2981,  -9.0811],
         [ -8.7140,  -8.2800,  -8.3013,  ...,  -7.9934,  -8.3760,  -9.0939],
         ...,
         [-10.8625, -10.8922, -10.9245,  ..., -10.9013, -11.2591, -11.3975],
         [-10.9606, -10.8914, -10.8039,  ..., -11.0719, -11.3989, -11.4997],
         [-11.1348, -10.9809, -10.8704,  ..., -10.8894, -11.2981, -11.4719]]],
       device='cuda:0', grad_fn=<AddBackward0>)
start infer
tensor([[[ -8.0996,  -7.9605,  -7.8604,  ...,  -7.2568,  -7.7487,  -8.6511],
         [ -8.4654,  -8.2547,  -8.2030,  ...,  -7.7795,  -8.2981,  -9.0811],
         [ -8.7140,  -8.2800,  -8.3013,  ...,  -7.9934,  -8.3760,  -9.0939],
         ...,
         [-10.8625, -10.8922, -10.9245,  ..., -10.9013, -11.2591, -11.3975],
         [-10.9606, -10.8914, -10.8039,  ..., -11.0719, -11.3989, -11.4997],
         [-11.1348, -10.9809, -10.8704,  ..., -10.8894, -11.298

tensor([[[ 0.0200,  0.2787,  0.1328,  ..., -0.0073,  0.1050,  0.0062],
         [ 0.1771,  0.1499,  0.1640,  ..., -0.2821, -0.2226, -0.1402],
         [ 0.2106,  0.0842,  0.3155,  ...,  0.0034,  0.0970,  0.0636],
         [ 1.0119,  1.0865,  1.1020,  ...,  1.0523,  1.0331,  1.0324],
         [ 0.7156,  0.9012,  0.9949,  ...,  1.0771,  1.0630,  1.0441],
         [ 1.4375,  1.5646,  1.6562,  ...,  1.6245,  1.6367,  1.5980]]],
       device='cuda:0')
s,b
tensor([[[1.0119, 1.0865, 1.1020,  ..., 1.0523, 1.0331, 1.0324],
         [0.7156, 0.9012, 0.9949,  ..., 1.0771, 1.0630, 1.0441],
         [1.4375, 1.5646, 1.6562,  ..., 1.6245, 1.6367, 1.5980]]],
       device='cuda:0')
tensor([[[ 0.0200,  0.2787,  0.1328,  ..., -0.0073,  0.1050,  0.0062],
         [ 0.1771,  0.1499,  0.1640,  ..., -0.2821, -0.2226, -0.1402],
         [ 0.2106,  0.0842,  0.3155,  ...,  0.0034,  0.0970,  0.0636]]],
       device='cuda:0')
tensor([[[-0.0201, -0.0581, -0.0357,  ...,  0.0325,  0.0237,  0.0070],
         [-0.

tensor([[[ 9.7749e-02, -2.5613e-02,  6.7110e-02,  ..., -2.9521e-02,
          -4.0661e-02, -6.9169e-02],
         [-1.4588e-01,  3.2283e-02, -8.1388e-02,  ...,  5.8045e-03,
           3.3008e-02, -1.4918e-03],
         [-4.3847e-02,  4.3323e-02, -4.3409e-02,  ...,  1.2769e-02,
           1.9905e-02,  1.1167e-01],
         ...,
         [ 3.7617e+00,  3.6913e+00,  3.6855e+00,  ...,  3.9835e+00,
           3.9772e+00,  3.9749e+00],
         [ 3.9764e+00,  3.9153e+00,  3.9179e+00,  ...,  4.2582e+00,
           4.2505e+00,  4.2412e+00],
         [ 3.7034e+00,  3.6541e+00,  3.6527e+00,  ...,  3.9939e+00,
           3.9851e+00,  3.9780e+00]]], device='cuda:0')
s,b
tensor([[[4.6267, 4.6070, 4.6156,  ..., 4.9562, 4.9492, 4.9419],
         [3.7617, 3.6913, 3.6855,  ..., 3.9835, 3.9772, 3.9749],
         [3.9764, 3.9153, 3.9179,  ..., 4.2582, 4.2505, 4.2412],
         [3.7034, 3.6541, 3.6527,  ..., 3.9939, 3.9851, 3.9780]]],
       device='cuda:0')
tensor([[[ 0.0977, -0.0256,  0.0671,  ..., -0.0

#### (Optional) Remove WaveGlow bias

In [9]:
audio_denoised = denoiser(audio, strength=0.01)[:, 0]
ipd.Audio(audio_denoised.cpu().numpy(), rate=hparams.sampling_rate) 

  win_sq = librosa_util.pad_center(win_sq, n_fft)


#### Audio Download

In [10]:
audio_tensor = torch.from_numpy(audio[0].data.cpu().numpy())
audio_tensor = audio_tensor.unsqueeze(0)  # 2次元のテンソルに変換
audio_tensor = audio_tensor.to(torch.float32)
torchaudio.save(uri='result.wav', src=audio_tensor, sample_rate=hparams.sampling_rate, format='wav')