In [None]:
# Mount Google Drive at /content/drive; force_remount=True remounts even if a mount already exists.
from google.colab import drive

drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [None]:
%cd drive/MyDrive/tts_hifiGAN/hifi-gan

/content/drive/MyDrive/tts_hifiGAN/hifi-gan


In [None]:
%cd drive/MyDrive/tts_hifiGAN/hifi-gan
!pip install -r requirements.txt
!pip install -q unidecode tensorboardX
!pip install numba==0.48

### library

In [None]:
# Disabled one-time setup, kept as a bare string literal so none of it runs:
# it would clone the hifi-gan repository into Drive and install its requirements.
"""import os
# os.chdir('/content')
%cd drive/MyDrive/tts_hifiGAN
!git clone https://github.com/jik876/hifi-gan.git

!pip install -q unidecode tensorboardX
%cd hifi-gan
!pip install -r requirements.txt 
!pip install numba==0.48"""

In [None]:
# Install kora and start its console helper (the original note calls this a
# "terminal implement"); presumably it opens an interactive shell — TODO confirm.

!pip install kora
from kora import console
console.start()

In [None]:
import easydict
import itertools
import os
import time
import argparse
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DistributedSampler, DataLoader
import torch.multiprocessing as mp
from torch.distributed import init_process_group
from torch.nn.parallel import DistributedDataParallel
from env import AttrDict, build_env
from meldataset import MelDataset, mel_spectrogram, get_dataset_filelist
from models import Generator, MultiPeriodDiscriminator, MultiScaleDiscriminator, feature_loss, generator_loss,\
    discriminator_loss
from utils import plot_spectrogram, scan_checkpoint, load_checkpoint, save_checkpoint, init_weights, get_padding

### Training

In [None]:
# Run options for training, exposed with attribute access (a.checkpoint_path, ...).
a = easydict.EasyDict(dict(
    group_name=None,
    input_wavs_dir="/content/drive/MyDrive/tts_hifiGAN/dataset/wav_data",
    # The three entries below are marked "not used" by the original author.
    input_mels_dir="/content/drive/MyDrive/tts_hifiGAN/dataset/ft_dataset",
    input_training_file="/content/drive/MyDrive/tts_hifiGAN/dataset/jss_text.tsv",
    input_validation_file="/content/drive/MyDrive/tts_hifiGAN/dataset/jss_text.tsv",
    checkpoint_path="cp_hifigan",
    config="config_v1.json",
    training_epochs=3000,
    stdout_interval=5,
    checkpoint_interval=4000,
    summary_interval=100,
    validation_interval=1000,
    fine_tuning=False,
))

# Keep the raw JSON text of the config file in `data`; a later cell parses it.
with open(a.config) as f:
    data = f.read()

In [None]:
# Files are named "<index>.wav": indices 0-9499 are used for training and
# 9500-9999 for validation.
training_files = [
    os.path.join(a.input_wavs_dir, '{}.wav'.format(i))
    for i in range(0, 9500)
]
validation_files = [
    os.path.join(a.input_wavs_dir, '{}.wav'.format(i))
    for i in range(9500, 10000)
]

training_filelist = training_files
validation_filelist = validation_files

In [None]:
# Parse the config text read earlier and expose it with attribute access.
json_config = json.loads(data)
h = AttrDict(json_config)
build_env(a.config, 'config.json', a.checkpoint_path)

# Seed the CPU generator always; the CUDA generator only when a GPU is present.
torch.manual_seed(h.seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(h.seed)
    # Split the configured batch size evenly across the visible GPUs.
    h.num_gpus = torch.cuda.device_count()
    h.batch_size = int(h.batch_size / h.num_gpus)
    print('Batch size per GPU :', h.batch_size)

Batch size per GPU : 16


In [None]:
# Re-bind the file lists built in the earlier cell.
training_filelist = training_files
validation_filelist = validation_files

# Dataset instance on the CPU (device='cpu'); shuffling is disabled only when
# more than one GPU is configured.
trainset = MelDataset(
    training_filelist, h.segment_size, h.n_fft, h.num_mels,
    h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax,
    n_cache_reuse=0,
    shuffle=not (h.num_gpus > 1),
    fmax_loss=h.fmax_for_loss,
    device='cpu',
    fine_tuning=a.fine_tuning,
    base_mels_path=a.input_mels_dir,
)

In [None]:
# Switch on the cuDNN benchmark flag for this session, before train() is defined and run.
torch.backends.cudnn.benchmark = True


def _run_validation(generator, validation_loader, sw, h, device, steps):
    """Run one validation pass with ``generator`` and log the results.

    The generator is put in eval mode for the pass and back in train mode
    afterwards. For every validation item the L1 distance between the reference
    mel (``y_mel``) and the mel of the generated audio is accumulated; for the
    first five items the generated audio and its spectrogram are written to the
    TensorBoard writer ``sw`` (plus the ground truth when ``steps == 0``). The
    mean error is logged to wandb, stdout and TensorBoard.
    """
    generator.eval()
    torch.cuda.empty_cache()
    val_err_tot = 0
    num_val_batches = 0
    with torch.no_grad():
        # val_* names keep the validation tensors apart from the training batch.
        for j, val_batch in enumerate(validation_loader):
            val_x, val_y, _, val_y_mel = val_batch

            val_x = val_x.to(device, non_blocking=True)
            val_y = val_y.to(device, non_blocking=True)
            val_y_mel = val_y_mel.to(device, non_blocking=True)

            val_y_g_hat = generator(val_x)
            val_y_g_hat_mel = mel_spectrogram(val_y_g_hat.squeeze(1), h.n_fft, h.num_mels, h.sampling_rate,
                                              h.hop_size, h.win_size, h.fmin, h.fmax_for_loss)

            val_err_tot += F.l1_loss(val_y_mel, val_y_g_hat_mel).item()
            num_val_batches += 1

            if j <= 4:
                if steps == 0:
                    # Ground truth only needs to be written once.
                    sw.add_audio('gt/y_{}'.format(j), val_y[0], steps, h.sampling_rate)
                    sw.add_figure('gt/y_spec_{}'.format(j), plot_spectrogram(val_x.cpu().numpy()[0]), steps)

                sw.add_audio('generated/y_hat_{}'.format(j), val_y_g_hat[0], steps, h.sampling_rate)

                y_hat_spec = mel_spectrogram(val_y_g_hat.squeeze(1), h.n_fft, h.num_mels,
                                             h.sampling_rate, h.hop_size, h.win_size,
                                             h.fmin, h.fmax)

                sw.add_figure('generated/y_hat_spec_{}'.format(j),
                              plot_spectrogram(y_hat_spec.squeeze(0).cpu().numpy()), steps)

        if num_val_batches == 0:
            # An empty loader previously failed with a NameError on the loop index.
            print('validation skipped: validation loader produced no batches')
        else:
            val_err = val_err_tot / num_val_batches
            wandb.log({'val_err':val_err})
            print('val err tot: ',val_err)
            sw.add_scalar("validation/mel_spec_error", val_err, steps)

    generator.train()


def train(rank, a, h):
    """Train the generator against the multi-period and multi-scale discriminators.

    Args:
        rank: Index of this process. It selects the device ``cuda:<rank>``;
            rank 0 additionally prints progress, writes checkpoints, runs
            validation and writes TensorBoard summaries.
        a: Run options with attribute access. Read here: ``checkpoint_path``,
            ``training_epochs``, ``stdout_interval``, ``checkpoint_interval``,
            ``summary_interval``, ``validation_interval``, ``fine_tuning`` and
            ``input_mels_dir``.
        h: Hyper-parameters with attribute access (model, STFT, optimiser and
            distributed settings).

    Notes:
        Depends on notebook globals that must exist when the function is
        called: ``wandb`` (imported and initialised in a separate cell),
        ``json_config``, ``training_files`` and ``validation_files``.
        Training resumes from the latest ``g_*`` / ``do_*`` pair found in
        ``a.checkpoint_path`` when both are present.
    """
    wandb.config.update(json_config)
    if h.num_gpus > 1:
        init_process_group(backend=h.dist_config['dist_backend'], init_method=h.dist_config['dist_url'],
                           world_size=h.dist_config['world_size'] * h.num_gpus, rank=rank)

    torch.cuda.manual_seed(h.seed)
    device = torch.device('cuda:{:d}'.format(rank))

    generator = Generator(h).to(device)
    mpd = MultiPeriodDiscriminator().to(device)
    msd = MultiScaleDiscriminator().to(device)

    wandb.watch(generator)
    wandb.watch(mpd)
    wandb.watch(msd)

    if rank == 0:
        print(generator)
        os.makedirs(a.checkpoint_path, exist_ok=True)
        print("checkpoints directory : ", a.checkpoint_path)

    # Start from "no checkpoint" so both names are bound even when the directory
    # does not exist (only rank 0 creates it above).
    cp_g = cp_do = None
    if os.path.isdir(a.checkpoint_path):
        cp_g = scan_checkpoint(a.checkpoint_path, 'g_')
        cp_do = scan_checkpoint(a.checkpoint_path, 'do_')

    steps = 0
    # Falsy test on purpose: the scan_checkpoint redefined in the inference cells
    # of this notebook returns '' instead of None when nothing is found.
    if not cp_g or not cp_do:
        state_dict_do = None
        last_epoch = -1
    else:
        state_dict_g = load_checkpoint(cp_g, device)
        state_dict_do = load_checkpoint(cp_do, device)
        generator.load_state_dict(state_dict_g['generator'])
        mpd.load_state_dict(state_dict_do['mpd'])
        msd.load_state_dict(state_dict_do['msd'])
        steps = state_dict_do['steps'] + 1
        last_epoch = state_dict_do['epoch']

    if h.num_gpus > 1:
        generator = DistributedDataParallel(generator, device_ids=[rank]).to(device)
        mpd = DistributedDataParallel(mpd, device_ids=[rank]).to(device)
        msd = DistributedDataParallel(msd, device_ids=[rank]).to(device)

    optim_g = torch.optim.AdamW(generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2])
    optim_d = torch.optim.AdamW(itertools.chain(msd.parameters(), mpd.parameters()),
                                h.learning_rate, betas=[h.adam_b1, h.adam_b2])

    if state_dict_do is not None:
        optim_g.load_state_dict(state_dict_do['optim_g'])
        optim_d.load_state_dict(state_dict_do['optim_d'])

    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=h.lr_decay, last_epoch=last_epoch)
    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=h.lr_decay, last_epoch=last_epoch)

    # The file lists come from the notebook globals instead of get_dataset_filelist(a).
    training_filelist, validation_filelist = training_files, validation_files

    trainset = MelDataset(training_filelist, h.segment_size, h.n_fft, h.num_mels,
                          h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, n_cache_reuse=0,
                          shuffle=False if h.num_gpus > 1 else True, fmax_loss=h.fmax_for_loss, device=device,
                          fine_tuning=a.fine_tuning, base_mels_path=a.input_mels_dir)

    train_sampler = DistributedSampler(trainset) if h.num_gpus > 1 else None

    train_loader = DataLoader(trainset, num_workers=h.num_workers, shuffle=False,
                              sampler=train_sampler,
                              batch_size=h.batch_size,
                              pin_memory=True,
                              drop_last=True)

    if rank == 0:
        validset = MelDataset(validation_filelist, h.segment_size, h.n_fft, h.num_mels,
                              h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, False, False, n_cache_reuse=0,
                              fmax_loss=h.fmax_for_loss, device=device, fine_tuning=a.fine_tuning,
                              base_mels_path=a.input_mels_dir)

        validation_loader = DataLoader(validset, num_workers=1, shuffle=False,
                                       sampler=None,
                                       batch_size=1,
                                       pin_memory=True,
                                       drop_last=True)

        sw = SummaryWriter(os.path.join(a.checkpoint_path, 'logs'))

    generator.train()
    mpd.train()
    msd.train()
    for epoch in range(max(0, last_epoch), a.training_epochs):
        if rank == 0:
            start = time.time()
            print("Epoch: {}".format(epoch+1))

        if h.num_gpus > 1:
            train_sampler.set_epoch(epoch)

        for i, batch in enumerate(train_loader):
            if rank == 0:
                start_b = time.time()
            x, y, _, y_mel = batch
            # Plain tensors are enough here; the Variable wrapper is deprecated.
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)
            y_mel = y_mel.to(device, non_blocking=True)
            y = y.unsqueeze(1)

            y_g_hat = generator(x)
            y_g_hat_mel = mel_spectrogram(y_g_hat.squeeze(1), h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size,
                                          h.fmin, h.fmax_for_loss)

            # ---- Discriminator step: the generated audio is detached so that
            # this backward pass does not reach the generator.
            optim_d.zero_grad()

            # MPD
            y_df_hat_r, y_df_hat_g, _, _ = mpd(y, y_g_hat.detach())
            loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss(y_df_hat_r, y_df_hat_g)

            # MSD
            y_ds_hat_r, y_ds_hat_g, _, _ = msd(y, y_g_hat.detach())
            loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss(y_ds_hat_r, y_ds_hat_g)

            loss_disc_all = loss_disc_s + loss_disc_f

            loss_disc_all.backward()
            optim_d.step()

            # ---- Generator step
            optim_g.zero_grad()

            # L1 mel-spectrogram loss, weighted by 45 relative to the other terms
            loss_mel = F.l1_loss(y_mel, y_g_hat_mel) * 45

            y_df_hat_r, y_df_hat_g, fmap_f_r, fmap_f_g = mpd(y, y_g_hat)
            y_ds_hat_r, y_ds_hat_g, fmap_s_r, fmap_s_g = msd(y, y_g_hat)
            loss_fm_f = feature_loss(fmap_f_r, fmap_f_g)
            loss_fm_s = feature_loss(fmap_s_r, fmap_s_g)
            loss_gen_f, losses_gen_f = generator_loss(y_df_hat_g)
            loss_gen_s, losses_gen_s = generator_loss(y_ds_hat_g)
            loss_gen_all = loss_gen_s + loss_gen_f + loss_fm_s + loss_fm_f + loss_mel

            loss_gen_all.backward()
            optim_g.step()

            if rank == 0:
                log_stdout = steps % a.stdout_interval == 0
                log_summary = steps % a.summary_interval == 0

                # Both logging branches read these values, so compute them whenever
                # either is due; previously mel_error existed only on stdout steps
                # and the summary branch could see a stale or undefined value.
                if log_stdout or log_summary:
                    with torch.no_grad():
                        mel_error = F.l1_loss(y_mel, y_g_hat_mel).item()
                    gen_loss_total = loss_gen_all.item()

                # STDOUT logging
                if log_stdout:
                    print('Steps : {:d}, Gen Loss Total : {:4.3f}, Mel-Spec. Error : {:4.3f}, s/b : {:4.3f}'.
                          format(steps, gen_loss_total, mel_error, time.time() - start_b))

                    wandb.log({'loss_gen_all':gen_loss_total,
                               'mel_error':mel_error})

                # checkpointing
                if steps % a.checkpoint_interval == 0 and steps != 0:
                    checkpoint_path = "{}/g_{:08d}".format(a.checkpoint_path, steps)
                    save_checkpoint(checkpoint_path,
                                    {'generator': (generator.module if h.num_gpus > 1 else generator).state_dict()})
                    checkpoint_path = "{}/do_{:08d}".format(a.checkpoint_path, steps)
                    save_checkpoint(checkpoint_path,
                                    {'mpd': (mpd.module if h.num_gpus > 1
                                                         else mpd).state_dict(),
                                     'msd': (msd.module if h.num_gpus > 1
                                                         else msd).state_dict(),
                                     'optim_g': optim_g.state_dict(), 'optim_d': optim_d.state_dict(), 'steps': steps,
                                     'epoch': epoch})

                # Tensorboard summary logging
                if log_summary:
                    sw.add_scalar("training/gen_loss_total", gen_loss_total, steps)
                    sw.add_scalar("training/mel_spec_error", mel_error, steps)

                # Validation; step 0 is included (the `steps != 0` guard is disabled)
                if steps % a.validation_interval == 0:
                    _run_validation(generator, validation_loader, sw, h, device, steps)

            steps += 1

        scheduler_g.step()
        scheduler_d.step()
        if rank == 0:
            print('Time taken for epoch {} is {} sec\n'.format(epoch + 1, int(time.time() - start)))


In [None]:
# Weights & Biases setup. train() uses the global name `wandb`
# (wandb.config, wandb.watch, wandb.log), so this cell has to run before train() is called.
!pip install --upgrade wandb -qq

import wandb
wandb.login()
wandb.init(project="TTS-vocoder-pretrain", name='hifi-gan')

[K     |████████████████████████████████| 1.8 MB 30.4 MB/s 
[K     |████████████████████████████████| 146 kB 72.9 MB/s 
[K     |████████████████████████████████| 181 kB 22.9 MB/s 
[K     |████████████████████████████████| 63 kB 2.0 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkeonwookim[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Single-process run: rank 0 trains on cuda:0 and also does the logging, checkpointing and validation.
train(0, a, h)

Generator(
  (conv_pre): Conv1d(80, 512, kernel_size=(7,), stride=(1,), padding=(3,))
  (ups): ModuleList(
    (0): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,))
    (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,))
    (2): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,))
    (3): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,))
  )
  (resblocks): ModuleList(
    (0): ResBlock1(
      (convs1): ModuleList(
        (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
        (2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
      )
      (convs2): ModuleList(
        (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
        (2): Conv1d(256, 256, kernel_size=(3,), stride

KeyboardInterrupt: ignored

### Test Infer (vocoder itself)

In [None]:
import glob
import os
import argparse
import json
import torch
from scipy.io.wavfile import write
from env import AttrDict
from meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav
from models import Generator

# Placeholders read by get_mel() and inference(); the cell that loads the config
# assigns the real hyper-parameters and torch device before inference(a) is called.
h = None
device = None

def load_checkpoint(filepath, device):
    """Read a checkpoint from disk with ``torch.load``.

    Note: defining this in the notebook replaces the ``load_checkpoint``
    imported from ``utils`` in the training section.

    Args:
        filepath: Path of an existing checkpoint file.
        device: Forwarded to ``torch.load`` as ``map_location``.

    Returns:
        Whatever object was stored in the checkpoint file.

    Raises:
        AssertionError: If ``filepath`` is not an existing file.
    """
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    loaded = torch.load(filepath, map_location=device)
    print("Complete.")
    return loaded


def get_mel(x):
    """Compute the mel spectrogram of ``x`` with the settings held in the global ``h``.

    ``h`` is ``None`` right after this cell runs, so the config has to be
    loaded into it before this function is called.
    """
    stft_settings = (h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size)
    band_limits = (h.fmin, h.fmax)
    return mel_spectrogram(x, *stft_settings, *band_limits)


def scan_checkpoint(cp_dir, prefix):
    """Return the lexicographically last path in ``cp_dir`` starting with ``prefix``.

    Returns the empty string when nothing matches. Note that train() in this
    notebook originally compares the result of ``scan_checkpoint`` with ``None``,
    and that this definition replaces the one imported from ``utils``.
    """
    matches = glob.glob(os.path.join(cp_dir, prefix + '*'))
    matches.sort()
    return matches[-1] if matches else ''


def inference(a):
    """Re-synthesise every WAV file in ``a.input_wavs_dir`` through the generator.

    For each input file the waveform is loaded, scaled by ``MAX_WAV_VALUE``,
    converted to a mel spectrogram with ``get_mel`` and passed through the
    generator restored from ``a.checkpoint_file``. The result is written to
    ``a.output_dir`` as ``<input stem>_generated.wav`` at ``h.sampling_rate``.

    Args:
        a: Options with attribute access; ``checkpoint_file``,
            ``input_wavs_dir`` and ``output_dir`` are read.

    Uses the globals ``h`` (hyper-parameters) and ``device``.
    """
    generator = Generator(h).to(device)

    state_dict_g = load_checkpoint(a.checkpoint_file, device)
    generator.load_state_dict(state_dict_g['generator'])

    # os.listdir returns entries in arbitrary order and may include things that
    # are not WAV files; keep only *.wav and process them in sorted order.
    filelist = sorted(name for name in os.listdir(a.input_wavs_dir)
                      if name.lower().endswith('.wav'))

    os.makedirs(a.output_dir, exist_ok=True)

    int16_range = torch.iinfo(torch.int16)

    generator.eval()
    generator.remove_weight_norm()
    with torch.no_grad():
        for filname in filelist:
            wav, sr = load_wav(os.path.join(a.input_wavs_dir, filname))
            if sr != h.sampling_rate:
                # The mel settings and the output file both assume h.sampling_rate.
                print('Warning: {} has sampling rate {} but the config expects {}'.format(
                    filname, sr, h.sampling_rate))
            wav = wav / MAX_WAV_VALUE
            wav = torch.FloatTensor(wav).to(device)
            x = get_mel(wav.unsqueeze(0))

            y_g_hat = generator(x)
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            # Clip to the int16 range first so that an out-of-range sample
            # saturates instead of wrapping around in the cast below.
            audio = audio.clamp(min=int16_range.min, max=int16_range.max)
            audio = audio.cpu().numpy().astype('int16')

            output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + '_generated.wav')
            write(output_file, h.sampling_rate, audio)
            print(output_file)

In [None]:
print('Initializing Inference Process..')

# Options for the WAV -> mel -> WAV round trip through the trained generator.
a = easydict.EasyDict(dict(
    group_name=None,
    input_wavs_dir="/content/drive/MyDrive/tts_hifiGAN/dataset/wav_data",
    output_dir="/content/drive/MyDrive/tts_hifiGAN/output_result",
    checkpoint_file="/content/drive/MyDrive/tts_hifiGAN/hifi-gan/cp_hifigan/g_00162000",
    # Alternative checkpoint:
    # "/content/drive/MyDrive/tts_hifiGAN/hifi-gan/cp_hifigan/univ/g_02511000"
))

# Read the raw JSON text of the config stored next to the checkpoints.
config_file = "/content/drive/MyDrive/tts_hifiGAN/hifi-gan/cp_hifigan/config.json"

with open(config_file) as f:
    data = f.read()

In [None]:
# Parse the config text into the module-level `h` that get_mel()/inference() read.
json_config = json.loads(data)
h = AttrDict(json_config)

torch.manual_seed(h.seed)

# Default to the CPU and switch to CUDA (seeding it as well) when a GPU is available.
device = torch.device('cpu')
if torch.cuda.is_available():
    torch.cuda.manual_seed(h.seed)
    device = torch.device('cuda')

inference(a)

In [None]:
from scipy.io import wavfile
import IPython.display as ipd

path = "/content/drive/MyDrive/tts_hifiGAN/output_result/9005_generated.wav"

# Use a dedicated name for the samples: `data` holds the config JSON text that the
# json.loads(data) cells parse, and overwriting it breaks re-running those cells.
# Play back at the rate stored in the file rather than a hard-coded one.
fs, wav_samples = wavfile.read(path)
ipd.Audio(wav_samples, rate=fs)

### Test Infer (mel-spectrogram from Tacotron2)

In [None]:
import glob
import os
import easydict
import numpy as np
import argparse
import json
import torch
from scipy.io.wavfile import write
from env import AttrDict
from meldataset import MAX_WAV_VALUE
from models import Generator

# Placeholders read by inference(); the cell that loads the config assigns the
# real hyper-parameters and torch device before inference(a) is called.
h = None
device = None


def load_checkpoint(filepath, device):
    """Read a checkpoint from disk with ``torch.load``.

    Args:
        filepath: Path of an existing checkpoint file.
        device: Forwarded to ``torch.load`` as ``map_location``.

    Returns:
        Whatever object was stored in the checkpoint file.

    Raises:
        AssertionError: If ``filepath`` is not an existing file.
    """
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    loaded = torch.load(filepath, map_location=device)
    print("Complete.")
    return loaded


def scan_checkpoint(cp_dir, prefix):
    """Return the lexicographically last path in ``cp_dir`` starting with ``prefix``.

    Returns the empty string when nothing matches.
    """
    matches = glob.glob(os.path.join(cp_dir, prefix + '*'))
    matches.sort()
    return matches[-1] if matches else ''


def inference(a):
    """Vocode every saved mel array (``*.npy``) in ``a.input_mels_dir``.

    Each array is loaded with ``np.load``, given a leading batch axis when it is
    two-dimensional, and passed through the generator restored from
    ``a.checkpoint_file``. The audio is written to ``a.output_dir`` as
    ``<input stem>_generated_e2e.wav`` at ``h.sampling_rate``.

    Args:
        a: Options with attribute access; ``checkpoint_file``,
            ``input_mels_dir`` and ``output_dir`` are read.

    Uses the globals ``h`` (hyper-parameters) and ``device``.
    """
    generator = Generator(h).to(device)

    state_dict_g = load_checkpoint(a.checkpoint_file, device)
    generator.load_state_dict(state_dict_g['generator'])

    # os.listdir returns entries in arbitrary order and may include things that
    # are not mel arrays; keep only *.npy and process them in sorted order.
    filelist = sorted(name for name in os.listdir(a.input_mels_dir)
                      if name.lower().endswith('.npy'))

    os.makedirs(a.output_dir, exist_ok=True)

    int16_range = torch.iinfo(torch.int16)

    generator.eval()
    generator.remove_weight_norm()
    with torch.no_grad():
        for filname in filelist:
            x = np.load(os.path.join(a.input_mels_dir, filname))
            x = torch.FloatTensor(x).to(device)

            # A 2-D array gets a leading axis before it is fed to the generator.
            if x.dim() == 2:
                x = x.unsqueeze(0)
                print('축 변환 완료 ')
                print('\n')

            y_g_hat = generator(x)
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            # Clip to the int16 range first so that an out-of-range sample
            # saturates instead of wrapping around in the cast below.
            audio = audio.clamp(min=int16_range.min, max=int16_range.max)
            audio = audio.cpu().numpy().astype('int16')

            output_file = os.path.join(a.output_dir, os.path.splitext(filname)[0] + '_generated_e2e.wav')
            write(output_file, h.sampling_rate, audio)
            print(output_file)


  _resample_loop_p(x, t_out, interp_win, interp_delta, num_table, scale, y)


In [None]:
print('Initializing Inference Process..')

# Options for vocoding saved mel arrays with the trained generator.
a = easydict.EasyDict(dict(
    input_mels_dir="/content/drive/MyDrive/tts_hifiGAN/dataset/mel_data/bye",
    # Alternative input: "/content/drive/MyDrive/tts_hifiGAN/dataset/mel_data/second"
    output_dir="/content/drive/MyDrive/tts_hifiGAN/USING_output/bye",
    checkpoint_file="/content/drive/MyDrive/tts_hifiGAN/hifi-gan/cp_hifigan/g_00200000",
    # Alternative checkpoint:
    # "/content/drive/MyDrive/tts_hifiGAN/hifi-gan/cp_hifigan/univ/g_02500000"
))

# Read the raw JSON text of the config stored next to the checkpoints.
config_file = "/content/drive/MyDrive/tts_hifiGAN/hifi-gan/cp_hifigan/config.json"

with open(config_file) as f:
    data = f.read()

Initializing Inference Process..


In [None]:
# Parse the config text into the module-level `h` that inference() reads.
json_config = json.loads(data)
h = AttrDict(json_config)

torch.manual_seed(h.seed)

# Default to the CPU and switch to CUDA (seeding it as well) when a GPU is available.
device = torch.device('cpu')
if torch.cuda.is_available():
    torch.cuda.manual_seed(h.seed)
    device = torch.device('cuda')

inference(a)

Loading '/content/drive/MyDrive/tts_hifiGAN/hifi-gan/cp_hifigan/g_00200000'
Complete.
Removing weight norm...
/content/drive/MyDrive/tts_hifiGAN/USING_output/bye/byebye-final-meloutput-new_generated_e2e.wav
/content/drive/MyDrive/tts_hifiGAN/USING_output/bye/byebye-final2-meloutput-new_generated_e2e.wav
/content/drive/MyDrive/tts_hifiGAN/USING_output/bye/byebye-final3-meloutput-new_generated_e2e.wav
/content/drive/MyDrive/tts_hifiGAN/USING_output/bye/byebye-final4-meloutput-new_generated_e2e.wav
/content/drive/MyDrive/tts_hifiGAN/USING_output/bye/ex1-mj-meloutput_generated_e2e.wav
/content/drive/MyDrive/tts_hifiGAN/USING_output/bye/ex2-mj-meloutput_generated_e2e.wav
/content/drive/MyDrive/tts_hifiGAN/USING_output/bye/ex-mj-meloutput_generated_e2e.wav


In [None]:
from scipy.io import wavfile
import IPython.display as ipd

path = "/content/drive/MyDrive/tts_hifiGAN/USING_output/0-meloutput-new_generated_e2e.wav"

# Use a dedicated name for the samples: `data` holds the config JSON text that the
# json.loads(data) cells parse, and overwriting it breaks re-running those cells.
# Play back at the rate stored in the file rather than a hard-coded one.
fs, wav_samples = wavfile.read(path)
ipd.Audio(wav_samples, rate=fs)

PyTorch 공식 튜토리얼 --> mel-spec (En) 버전 test 가능
https://pytorch.org/hub/nvidia_deeplearningexamples_tacotron2/
https://colab.research.google.com/drive/1auDqb-MzrIexPW87v1fEqVnnzAGFvwOG?usp=sharing (적용)

한국어 pretrained FastSpeech checkpoint 있음
https://github.com/HGU-DLLAB/Korean-FastSpeech2-Pytorch




한국어 End-to-End pretrained checkpoint 있음 (huggingface). Tacotron2 + MelGAN
https://github.com/TensorSpeech/TensorFlowTTS/issues/183
https://colab.research.google.com/drive/1ybWwOS5tipgPFttNulp77P6DAB5MtiuN?usp=sharing#scrollTo=0wzU9yQElSDK

jss tacotron2 한국어 학습. 결과가 잘 나온 건지는 모르겠음
https://colab.research.google.com/drive/18iRei4P9x0kiO2ZQ1V6KXPJ3MSurGCC6?hl=ko#scrollTo=eJYtqvGRwDzb


In [None]:
import librosa
import soundfile
from scipy.io import wavfile
import IPython.display as ipd

# Baseline that bypasses the vocoder: invert one saved mel array with librosa,
# write it to tmp.wav and listen to it.
# NOTE(review): mel_to_audio presumably expects a linear-power mel spectrogram; if
# this .npy stores log-compressed values the result will be distorted — confirm.
mel = np.load("/content/drive/MyDrive/tts_hifiGAN/dataset/mel_data/0-meloutput-new (1).npy")
mel = mel.squeeze(0)
wav_data = librosa.feature.inverse.mel_to_audio(mel, sr=22050)

soundfile.write(file="tmp.wav",data=wav_data, samplerate=22050)

# Use a dedicated name for the samples: `data` holds the config JSON text that the
# json.loads(data) cells parse, and overwriting it breaks re-running those cells.
fs, wav_samples = wavfile.read("tmp.wav")
ipd.Audio(wav_samples, rate=fs)