# 訓練済みHifiGANモデルによる推論

In [None]:
import torch
from TTS.utils.audio import AudioProcessor
from TTS.vocoder.configs import HifiganConfig
from TTS.vocoder.models.gan import GAN

## モデルのロード

In [None]:
# HiFi-GAN configuration object. The cells below hand it to AudioProcessor
# (through config.audio), to the GAN constructor and to load_checkpoint().
# NOTE(review): these values presumably mirror the training recipe that
# produced the checkpoint -- confirm against the config saved with the run.
config = HifiganConfig(
    # batch sizes and data-loader workers
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    # evaluation / test settings
    run_eval=True,
    eval_split_size=10,
    test_delay_epochs=5,
    # training length and sample handling
    epochs=1000,
    seq_len=8192,
    pad_short=2000,
    use_noise_augment=True,
    mixed_precision=False,
    # learning rates (lr_gen / lr_disc)
    lr_gen=1e-4,
    lr_disc=1e-4,
    # console output
    print_step=25,
    print_eval=False,
)

In [None]:
# Create the audio processor from the audio section of the config:
# to_dict() output is expanded into AudioProcessor keyword arguments.
audio_settings = config.audio.to_dict()
ap = AudioProcessor(**audio_settings)

In [None]:
# List the files in the training run directory; the checkpoint loaded in
# the next cell lives in this same directory.
%ls ../recipes/jsut/hifigan/run-November-17-2022_02+48PM-367d940b/

In [None]:
# Use the first CUDA GPU when one is available and fall back to the CPU
# otherwise, so the notebook also runs on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Build the GAN vocoder from the config and audio processor, then move it
# to the selected device.
model = GAN(config, ap).to(device)

# Restore the weights stored in checkpoint_50000.pth of the training run.
# eval=True is forwarded to load_checkpoint (presumably it puts the model
# into inference mode -- confirm in the TTS implementation).
checkpoint_path = "../recipes/jsut/hifigan/run-November-17-2022_02+48PM-367d940b/checkpoint_50000.pth"
model.load_checkpoint(config, checkpoint_path, eval=True)

## ボコーダーの推論

In [None]:
import matplotlib.pyplot as plt
from IPython.display import display, Audio
%matplotlib inline

In [None]:
# Path of the WAV recording (under the jsut_ver1.1 directory) that the
# following cells load, convert to a mel spectrogram and feed to the vocoder.
wavpath = "../recipes/jsut/jsut_ver1.1/voiceactress100/wav/VOICEACTRESS100_001.wav"

In [None]:
# Load the recording through the audio processor, passing the sample rate
# from the config as `sr`.
sample_rate = config.audio.sample_rate
audio = ap.load_wav(wavpath, sr=sample_rate)
# Last expression of the cell, so the notebook shows the waveform's shape.
audio.shape

In [None]:
# Audio player widget for the original recording, for listening comparison.
original_player = Audio(audio, rate=config.audio.sample_rate)
display(original_player)

In [None]:
# Mel spectrogram of the recording, wrapped as a torch tensor, given an
# extra leading axis of size 1 (presumably the batch axis -- confirm) and
# moved to the same device as the model.
mel = torch.from_numpy(ap.melspectrogram(audio)).unsqueeze(0).to(device)
# Last expression of the cell, so the notebook shows the tensor's shape.
mel.shape

In [None]:
# Run the vocoder on the mel spectrogram to get a waveform back.
generated = model.inference(mel)
# Drop size-1 axes, copy to host memory and convert to a NumPy array.
recon_audio = generated.squeeze().cpu().numpy()
# Last expression of the cell, so the notebook shows the waveform's shape.
recon_audio.shape

In [None]:
# Plot the reconstructed waveform. The trailing semicolon keeps the notebook
# from echoing plot()'s return value underneath the figure.
plt.plot(recon_audio);

In [None]:
# Audio player widget for the vocoder's reconstruction, at the config's sample rate.
recon_player = Audio(recon_audio, rate=config.audio.sample_rate)
display(recon_player)