## Tacotron 2 inference code 
Edit the variables **checkpoint_path** and **text** to match yours and run the entire code to generate plots of mel outputs, alignments and audio synthesis from the generated mel-spectrogram using Griffin-Lim.

#### Import libraries and setup matplotlib

In [13]:
'''import matplotlib
%matplotlib inline
import matplotlib.pylab as plt'''

import IPython.display as ipd

import sys
sys.path.append('waveglow/')
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
from denoiser import Denoiser

In [2]:
'''def plot_data(data, figsize=(16, 4)):
    fig, axes = plt.subplots(1, len(data), figsize=figsize)
    for i in range(len(data)):
        axes[i].imshow(data[i], aspect='auto', origin='bottom', 
                       interpolation='none')'''

#### Setup hparams

In [11]:
hparams = create_hparams()
hparams.sampling_rate = 22050

#### Load model from checkpoint

In [14]:
checkpoint_path = "waveglow/checkpoint_7000"
model = load_model(hparams)
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
_ = model.cuda().eval().half()

In [16]:
import re
from unicodedata import normalize

def tts(text):    
    text = normalize("NFC", text).lower()
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
    audio_denoised = denoiser(audio, strength=0.005)[:, 0].cpu().numpy()

    return audio_denoised

def pts(para):
    audio = np.zeros((1,0))
    sentence_ls = para.split(".")

    for sen in sentence_ls:
        sub_stn_ls = re.split(",|;|-|:", sen)
        for sub_stn in sub_stn_ls:
            audio = np.append(audio, tts(sub_stn), axis=1)
            audio = np.append(audio, np.zeros((1, int(hparams.sampling_rate/8)), dtype=np.uint8) , axis=1)
        audio = np.append(audio, np.zeros((1, int(hparams.sampling_rate/4)), dtype=np.uint8) , axis=1)
    return audio

In [17]:
pts("Xin chào các cháu, các cháu nhớ cho ông một like để ủng hộ ông nha. Ông đang rất là vui, hôm nay ông sẽ hướng dẫn các cháu làm món bánh siêu to khổng lồ nhé")

AttributeError: 'WN' object has no attribute 'cond_layer'

#### Load WaveGlow for mel2audio synthesis and denoiser

In [25]:
waveglow_path = 'waveglow/waveglow_256channels.pt'
waveglow = torch.load(waveglow_path)['model']
waveglow.cuda().eval().half()
for k in waveglow.convinv:
    k.float()
denoiser=Denoiser(waveglow)

AttributeError: 'WN' object has no attribute 'cond_layer'

#### Prepare text input

In [18]:
text = "Waveglow is really awesome!"
sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
sequence = torch.autograd.Variable(
    torch.from_numpy(sequence)).cuda().long()

#### Decode text input and plot results

In [22]:
mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
'''plot_data((mel_outputs.float().data.cpu().numpy()[0],
           mel_outputs_postnet.float().data.cpu().numpy()[0],
           alignments.float().data.cpu().numpy()[0].T))'''

'plot_data((mel_outputs.float().data.cpu().numpy()[0],\n           mel_outputs_postnet.float().data.cpu().numpy()[0],\n           alignments.float().data.cpu().numpy()[0].T))'

#### Synthesize audio from spectrogram using WaveGlow

In [23]:
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)

AttributeError: 'WN' object has no attribute 'cond_layer'

#### (Optional) Remove WaveGlow bias

In [9]:
audio_denoised = denoiser(audio, strength=0.01)[:, 0]
ipd.Audio(audio_denoised.cpu().numpy(), rate=hparams.sampling_rate) 