## Tacotron 2 inference code 
Edit the variables **checkpoint_path**, **waveglow_path** and **text** to match yours, then run all cells to generate plots of the mel outputs and alignments and to synthesize audio from the generated mel-spectrogram using WaveGlow.

#### Import libraries and setup matplotlib

In [18]:
import matplotlib
%matplotlib inline
import matplotlib.pylab as plt

import IPython.display as ipd

import sys
sys.path.append('waveglow/')
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
from denoiser import Denoiser

In [19]:
def plot_data(data, figsize=(16, 4)):
    """Show every image in ``data`` side by side in a single row.

    Args:
        data: Sequence of image arrays accepted by ``Axes.imshow``; one
            subplot is created per entry.
        figsize: Figure size forwarded to ``plt.subplots``.
    """
    # squeeze=False keeps ``axes`` a 2-D array of shape (1, len(data)).
    # With the default squeezing, a single panel comes back as a bare Axes
    # object, which cannot be indexed.
    _, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for ax, image in zip(axes[0], data):
        ax.imshow(image, aspect='auto', origin='lower',
                  interpolation='none')
    plt.show()

#### Setup hparams

In [20]:
# Build the hyper-parameter object from the project defaults.
hparams = create_hparams()
# Set the sampling rate explicitly; the same entry is read back later as the
# playback rate handed to ipd.Audio.
hparams['sampling_rate'] = 22050

#### Load model from checkpoint

In [21]:
#checkpoint_path = "tacotron2_statedict.pt"
checkpoint_path = "checkpoints/checkpoint_20000"
model = load_model(hparams)
# Deserialize onto the CPU first: without map_location, torch.load tries to
# restore every tensor on the device it was saved from, which fails when that
# GPU index does not exist on this machine. load_state_dict then copies the
# weights into the model's own parameters.
# NOTE: torch.load unpickles the file -- only open checkpoints you trust.
model.load_state_dict(
    torch.load(checkpoint_path, map_location='cpu')['state_dict'])
# Move to GPU, switch to eval mode and cast to half precision; the result is
# bound to `_` so the notebook does not print the module.
_ = model.cuda().eval().half()

#### Load WaveGlow for mel2audio synthesis and denoiser

In [22]:
#waveglow_path = 'waveglow_256channels.pt'
waveglow_path = "checkpoints/waveglow_256channels_universal_v5.pt"
# SECURITY: torch.load unpickles the checkpoint, and its 'model' entry is used
# directly as a model object -- only load files from a trusted source.
# NOTE(review): presumably the sys.path.append('waveglow/') in the import cell
# is what lets the pickled class be resolved -- confirm.
waveglow = torch.load(waveglow_path)['model']
# Move to GPU, switch to eval mode and cast to half precision.
waveglow.cuda().eval().half()
# Cast every entry of waveglow.convinv back to float32 after the half() cast
# above. NOTE(review): presumably for numerical precision -- confirm.
for k in waveglow.convinv:
    k.float()
# Denoiser built from the loaded vocoder; used by the optional bias-removal
# cell.
denoiser = Denoiser(waveglow)

#### Prepare text input

In [23]:
text = "Good morning!"
# Convert the text with the 'english_cleaners' pipeline and add a leading
# axis of size 1 so the model receives a batch of one sequence.
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
# torch.autograd.Variable is deprecated and simply returns the tensor it is
# given, so the tensor is moved to the GPU and cast to int64 directly.
sequence = torch.from_numpy(sequence).cuda().long()

In [24]:
# ID handed to model.inference, held as a one-element int32 tensor on the GPU.
# (A column-shaped variant would be speaker_id.unsqueeze(1).)
speaker_id = torch.tensor([0], dtype=torch.int32).cuda()
speaker_id

tensor([0], device='cuda:0', dtype=torch.int32)

In [25]:
# Device name passed to model.inference. Note that the other cells call
# .cuda() unconditionally, so this CPU fallback alone does not make the
# notebook runnable without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

#### Decode text input and plot results

In [28]:
# Run inference with autograd disabled: no gradients are needed here, and
# leaving it on records a graph for every decoder step (the outputs would
# carry a grad_fn) and keeps those activations alive in GPU memory.
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
        sequence, speaker_id, device)
# Upcast from half precision, copy to host memory and take the first batch
# item of each output; the alignment matrix is transposed for display.
plot_data((mel_outputs.float().cpu().numpy()[0],
           mel_outputs_postnet.float().cpu().numpy()[0],
           alignments.float().cpu().numpy()[0].T))

tensor([0], device='cuda:0', dtype=torch.int32)


In [29]:
# Display the post-net mel output tensor as the cell result.
mel_outputs_postnet

tensor([[[-11.5312, -11.5547, -11.5859,  ..., -10.5234, -10.2969,  -9.8203],
         [-11.4609, -11.5234, -11.5547,  ..., -10.5234, -10.2969,  -9.8203],
         [-11.4375, -11.5625, -11.5781,  ..., -10.5703, -10.3438,  -9.8828],
         ...,
         [-11.3594, -11.3984, -11.3359,  ..., -10.5000, -10.2422,  -9.7031],
         [-11.3594, -11.3828, -11.3047,  ..., -10.6250, -10.3438,  -9.8125],
         [-11.4141, -11.4375, -11.3281,  ..., -10.5938, -10.3125,  -9.7891]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<AddBackward0>)

#### Synthesize audio from spectrogram using WaveGlow

In [31]:
# Vocoder inference under no_grad, so no autograd graph is recorded.
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
# Take the first item of the output, copy it to host memory as a NumPy array
# and play it back at the configured sampling rate.
ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams['sampling_rate'])

#### (Optional) Remove WaveGlow bias

In [30]:
# Run the denoiser built in the WaveGlow loading cell over the synthesized
# audio; [:, 0] keeps index 0 along the second axis of its output.
audio_denoised = denoiser(audio, strength=0.01)[:, 0]
# Copy to host memory as a NumPy array and play at the configured sampling
# rate.
ipd.Audio(audio_denoised.cpu().numpy(), rate=hparams['sampling_rate']) 