In [1]:
import sys
import os
import torch
from torch import nn, optim
from torch.nn import functional as F
import numpy as np
sys.path.append('logger')
from logger import Logger
from logger_utils import prepare_directories_and_logger

sys.path.append('utils')
from save_and_load import load_checkpoint
import random
from optim_step import *
# Expose only the GPU with index 3 to this process.
# NOTE(review): this is set after `import torch`; presumably it still takes effect
# because no CUDA call has been made yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = '3'
# Load the MelGAN vocoder through torch.hub; later cells call `vocoder.inverse(...)`
# on mel tensors and play the result at 22050 Hz.
vocoder = torch.hub.load('descriptinc/melgan-neurips', 'load_melgan')

Using cache found in /home/ericwudayi/.cache/torch/hub/descriptinc_melgan-neurips_master


In [2]:
import os
import librosa
from librosa.filters import mel as librosa_mel_fn
class Audio2Mel(nn.Module):
    """Turn raw waveforms into log10 mel spectrograms.

    The Hann window and the mel filterbank are registered as buffers, so they
    follow the module across devices.

    Args:
        n_fft: FFT size; also used to size the mel filterbank.
        hop_length: Hop between successive STFT frames, in samples.
        win_length: Length of the Hann window, in samples.
        sampling_rate: Sampling rate the mel filterbank is built for.
        n_mel_channels: Number of mel bands in the filterbank.
        mel_fmin: Lowest frequency covered by the filterbank.
        mel_fmax: Highest frequency covered by the filterbank (None lets
            librosa choose its default).
    """

    def __init__(
        self,
        n_fft=1024,
        hop_length=256,
        win_length=1024,
        sampling_rate=22050,
        n_mel_channels=240,
        mel_fmin=0.0,
        mel_fmax=None,
    ):
        super().__init__()
        ##############################################
        # FFT Parameters                              #
        ##############################################
        window = torch.hann_window(win_length).float()
        # Pass everything by keyword: librosa >= 0.10 rejects positional
        # arguments here, while the keyword names are accepted by older
        # releases as well.
        mel_basis = librosa_mel_fn(
            sr=sampling_rate,
            n_fft=n_fft,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)
        self.register_buffer("window", window)
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels

    def forward(self, audio):
        """Compute the log10 mel spectrogram of ``audio``.

        Args:
            audio: Waveform tensor; axis 1 is squeezed away after padding, so
                the callers in this file pass a (batch, 1, samples) tensor.

        Returns:
            Tensor holding ``log10(clamp(mel_basis @ |STFT|, min=1e-5))``.
        """
        # Reflect-pad both ends manually because the STFT runs with center=False.
        p = (self.n_fft - self.hop_length) // 2
        audio = F.pad(audio, (p, p), "reflect").squeeze(1)
        stft_kwargs = dict(
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=self.window,
            center=False,
        )
        try:
            # Current PyTorch requires return_complex for real inputs; view the
            # complex result as a trailing (real, imag) pair so the magnitude
            # computation below is unchanged.
            fft = torch.view_as_real(
                torch.stft(audio, return_complex=True, **stft_kwargs)
            )
        except TypeError:
            # Old PyTorch releases do not know `return_complex` and already
            # return the trailing (real, imag) layout.
            fft = torch.stft(audio, **stft_kwargs)
        real_part, imag_part = fft.unbind(-1)
        magnitude = torch.sqrt(real_part ** 2 + imag_part ** 2)
        mel_output = torch.matmul(self.mel_basis, magnitude)
        # Clamp before the log so silent bins map to -5 instead of -inf.
        log_mel_spec = torch.log10(torch.clamp(mel_output, min=1e-5))

        return log_mel_spec
# Settings for the shared mel extractor used by convert_file().
sampling_rate = 22050
n_fft = 1024
win_length = 1024
hop_length = 256
n_mel_channels = 80  # replaces the Audio2Mel default of 240
extract_func = Audio2Mel(
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=win_length,
    sampling_rate=sampling_rate,
    n_mel_channels=n_mel_channels,
)


def convert_file(path, trim=False):
    """Load an audio file and return its log-mel spectrogram.

    Args:
        path: Audio file to read. It is resampled to the sampling rate the
            shared ``extract_func`` was configured with.
        trim: When true, trim leading and trailing silence with
            ``librosa.effects.trim(top_db=20)`` before extraction.

    Returns:
        float32 ``np.ndarray``: the output of ``extract_func`` with its
        leading batch axis removed.
    """
    # Use the extractor's own rate so the waveform always matches the mel
    # filterbank, instead of repeating the literal 22050 here.
    y, _ = librosa.load(path, sr=extract_func.sampling_rate)
    if trim:
        y, _ = librosa.effects.trim(y, top_db=20)
    # No peak normalisation is applied to the waveform.

    # Add batch and channel axes; no autograd graph is needed for features.
    with torch.no_grad():
        mel = extract_func(torch.from_numpy(y)[None, None])

    return mel[0].numpy().astype(np.float32)

In [37]:
import importlib

# Look up the VC_MODEL class inside the project's model package and build it on the GPU.
vq_module = importlib.import_module('model.vqvc+_multi_codebook_lj.vq_model')
VCModel = getattr(vq_module, 'VC_MODEL')
model = VCModel(in_channel=80, channel=512, n_embed=32).cuda()

# load_checkpoint() is handed an optimizer, so an Adam over all parameters is built for the restore.
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
checkpoint_path = 'checkpoint/vqvc+_multi_codebook_lj_n32_ch512_train_ljvae/gen'
model, opt, iteration = load_checkpoint(checkpoint_path, model, opt)

# The restored optimizer is then replaced by a fresh Adam that only updates the decoder.
opt = torch.optim.Adam(model.dec.parameters(), lr=1e-3)
# Encoder / codebook freezing, currently disabled:
#model.enc.eval()
#for param in model.enc.parameters(): 
#    param.requires_grad = False
#for param in model.quantize.parameters(): 
#    param.requires_grad = False

Loading checkpoint 'checkpoint/vqvc+_multi_codebook_lj_n32_ch512_train_ljvae/gen'
Loaded checkpoint 'checkpoint/vqvc+_multi_codebook_lj_n32_ch512_train_ljvae/gen' from iteration 5792


In [243]:
# Pick one entry of the mel_all directory at random and keep its full path in `file`.
npy_path = '/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_meta/mel_all'
file = os.path.join(npy_path, random.choice(os.listdir(npy_path)))

from utils.dataloader import AudioMetaLoader, VCTK_META_collate, AudioNpyLoader, VCTK_collate
from torch.utils.data import DataLoader
def make_inf_iterator(data_iterator):
    """Cycle over ``data_iterator`` forever, restarting it whenever it runs out.

    Args:
        data_iterator: A re-iterable object such as a ``DataLoader``; a new
            pass is started with a fresh ``for`` loop each time.

    Yields:
        The items of ``data_iterator``, pass after pass.

    Raises:
        ValueError: If a complete pass produces no item. Without this check an
            empty iterable, or a one-shot iterator that is already exhausted,
            would make ``next()`` spin forever without ever returning.
    """
    while True:
        produced_any = False
        for data in data_iterator:
            produced_any = True
            yield data
        if not produced_any:
            raise ValueError(
                "data_iterator produced no items; it is empty or cannot be restarted"
            )

# Dataset over the randomly chosen file, wrapped in a shuffling loader that is cycled endlessly.
enc_dataset = AudioNpyLoader(file)
enc_loader = DataLoader(
    enc_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=8,
    collate_fn=VCTK_collate,
)
inf_enc_loader = make_inf_iterator(enc_loader)

# L1 distance is the reconstruction loss used by the training cell.
criterion = nn.L1Loss()

In [95]:
# NOTE(review): depends on an `audio` object left in the kernel by some earlier run; nothing
# above this cell (in execution-count order) defines it, and every line rebinds `audio`,
# so the cell is not idempotent.
audio = torch.tensor(audio).cuda()
# Same affine rescaling, (x*25 + 50) / 50, that the training cell applies to its batches.
audio = (audio*25 + 50) / 50
# Insert a leading axis of size 1.
audio = audio.unsqueeze(0)


  """Entry point for launching an IPython kernel.


In [251]:
# Fine-tuning loop: up to 5000 optimisation steps on randomly time-stretched batches.
# `opt` only holds the decoder parameters (see the checkpoint cell).
for i in range(5000):
    # Next batch from the endless loader, moved to the GPU and rescaled with (x*25 + 50) / 50.
    audio = next(inf_enc_loader)
    audio = audio.cuda()
    audio = (audio*25 + 50) / 50
    time_step = audio.size(2)
    # Axis 2 is cut into `time_step // factor` segments of `factor` frames each;
    # frames past the last full segment are not used.
    factor = 32
    # The comprehension's `i` is local to the comprehension and does not touch the loop counter.
    audio_shuffle = [[] for i in range (time_step//factor)]
    nums = [x for x in range(time_step//factor)]
    random.shuffle(nums)

    # Each segment is resized along its last axis by a random factor in [0.8, 1.2]
    # (nearest-neighbour). Results are stored back at the segment's own index `n`, so the
    # shuffle changes the order in which segments are processed, not their order in the output.
    for i_n, n in enumerate(nums):
        sf = random.uniform(0.8, 1.2)
        audio_shuffle[n] = F.interpolate(audio[...,factor*n : factor*(n+1)], scale_factor=sf, mode='nearest')
    
    # NOTE(review): torch.cat receives an empty list when time_step < factor — confirm batches are always long enough.
    audio_shuffle = torch.cat(audio_shuffle,dim=2) 
    
    # Crop the last axis to a multiple of 16.
    audio_tmp = audio_shuffle[...,:audio_shuffle.size(2)//16*16]
    # Middle-resolution target: last axis resized by 1/2, then only the first half of axis 1 is kept.
    audio_middile =  F.interpolate(audio_tmp, scale_factor= 1/2)
    audio_middile = audio_middile[:, :audio_middile.size(1)//2, :]

    # Low-resolution target: the same reduction applied once more.
    audio_low = F.interpolate(audio_middile, scale_factor= 1/2)
    audio_low = audio_low[:, :audio_low.size(1)//2, :]
    # Targets ordered low, middle, full to line up with out[0], out[1], out[2] below.
    audio_list = [audio_low, audio_middile, audio_tmp]
    
    out, enc_b, latent_loss= model(audio_tmp)

    inner_recon_loss = 0

    # Sum of the L1 losses between the three model outputs and their targets.
    for num in range(3):
        inner_recon_loss += criterion(out[num], 
            audio_list[num])
    # OptimStep comes from the project's `optim_step` module (star import).
    # NOTE(review): the meaning of the `False` flag and the trailing `3` is not visible here — confirm.
    OptimStep([(model, opt,  inner_recon_loss+latent_loss.mean()*0.01 , False)], 3)
    # Every 200 steps print the reconstruction loss tensor (without the latent term).
    if i%200==0:
        print (inner_recon_loss)

tensor(0.8791, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5463, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5376, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5205, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5340, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5207, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5071, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5099, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5190, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5104, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5055, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5186, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5105, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5143, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5093, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5105, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5064, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.5070, device='cuda:0', grad_fn=<AddBack

KeyboardInterrupt: 

In [252]:
# Undo the (x*25 + 50) / 50 rescaling on the last model output, then vocode it to a waveform.
a = vocoder.inverse((out[-1] * 50 - 50) / 25)

In [6]:
import IPython.display as ipd
# Play the first waveform in `a` at 22050 Hz.
# NOTE(review): needs `a` from the vocoding cell; the recorded output is a NameError,
# so this cell was last run in a kernel where `a` did not exist.
ipd.Audio(a[0].cpu().numpy(),autoplay=False, rate=22050)

NameError: name 'a' is not defined

In [47]:
# Load one randomly chosen array from the mel.melgan directory as the test utterance.
npy_path = '/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel3/mel.melgan'
file = os.path.join(npy_path, random.choice(os.listdir(npy_path)))
# Disabled variant that treats `file` as a directory and picks a file inside it:
#audio_file_test = os.listdir(file)
#audio_file_test = random.choice(audio_file_test)
#audio_file_test = os.path.join(file,audio_file_test)
audio_test = np.load(file)

In [48]:
# Add a leading axis of size 1 (the following cell prints the size as [1, 80, 200]).
# NOTE(review): `audio_test` is rebound, so re-running this cell alone adds another axis each time.
audio_test = torch.tensor(audio_test).unsqueeze(0)
# Vocode the loaded mel directly, without passing it through the model.
test = vocoder.inverse(audio_test)
ipd.Audio(test[0].cpu().numpy(),autoplay=False, rate=22050)

In [49]:
print (audio_test.size())
# Crop the last axis to a multiple of 16 (200 -> 192 in the recorded output).
audio_test = audio_test[...,:audio_test.size(2)//16*16]
print (audio_test.size())
# Apply the (x*25 + 50) / 50 rescaling used in training, run the model, and keep only
# the first of its three return values.
out_conversion,_ ,_ = model(((audio_test*25 + 50) / 50).cuda())
# Take the last element of that output and undo the rescaling before vocoding.
out_conversion = (out_conversion[-1]*50 -50)/25
print (out_conversion.size())
conversion = vocoder.inverse(out_conversion)
ipd.Audio(conversion[0].cpu().numpy(),autoplay=False, rate=22050)

torch.Size([1, 80, 200])
torch.Size([1, 80, 192])
torch.Size([1, 80, 192])


In [35]:

# NOTE(review): `pretrained_model` is not defined anywhere in the visible notebook; the
# recorded output of this cell is a NameError. Its encode() is unpacked into five values,
# whereas `model.encode` is unpacked into four in a later cell.
enc_b, sp_embedding_block, std_block, diff_total, index_list = pretrained_model.encode(((audio_test*25 + 50) / 50).cuda())
# `audio` is passed without the (x*25 + 50) / 50 rescaling.
# NOTE(review): presumably it was already rescaled by an earlier cell — confirm.
_, sp_embedding_block_tg, std_block_tg, diff_total, index_list = pretrained_model.encode((audio).cuda())
# Decode the first input's `enc_b` together with the second input's embedding/std blocks.
out_conversion = pretrained_model.decode(enc_b, sp_embedding_block_tg, std_block_tg)

# Last output element, rescaling undone, then vocoded and played at 22050 Hz.
out_conversion = (out_conversion[-1]*50 -50)/25
print (out_conversion.size())
conversion = vocoder.inverse(out_conversion)
ipd.Audio(conversion[0].cpu().numpy(),autoplay=False, rate=22050)

NameError: name 'pretrained_model' is not defined

In [385]:
# Turn `source` and `target` into GPU tensors with a leading axis of size 1.
# NOTE(review): neither name is defined in the visible notebook (the recorded output is a
# NameError), and both are rebound in place, so the cell is not idempotent.
source = torch.tensor(source).unsqueeze(0).cuda()
target = torch.tensor(target).unsqueeze(0).cuda()

NameError: name 'source' is not defined

In [56]:
# Crop both spectrograms along axis 2 to the largest multiple of 8 frames, then report their sizes.
source, target = (
    mel[:, :, : mel.size(2) // 8 * 8]
    for mel in (source, target)
)
print(source.size(), target.size())

torch.Size([1, 80, 320]) torch.Size([1, 80, 320])


In [57]:
# Encode both utterances; the fourth return value of encode() is discarded.
# NOTE(review): unlike the other inference cells, no (x*25 + 50) / 50 rescaling is applied
# here — confirm `source` / `target` are already in the model's input range.
q_after_block, sp_embedding_block, std_block, _ = model.encode(source)
q_after_block_tg, sp_embedding_block_tg, std_block_tg, _ = model.encode(target)

In [58]:
# Decode the source's `q_after_block` together with the target's embedding/std blocks.
dec = model.decode(q_after_block, sp_embedding_block_tg, std_block_tg)

In [59]:
# Stack source, target and decoded result into one batch (indices 0, 1, 2) and vocode all three.
# NOTE(review): other cells take the model output with [-1] and undo the rescaling via
# (x*50 - 50) / 25 before vocoding; here dec[0] is used and nothing is rescaled — confirm intended.
a = torch.stack([source[0], target[0], dec[0]], dim = 0)
#print (a.size())
a = vocoder.inverse(a)

In [60]:
# Detach from autograd, copy to host memory and convert to a NumPy array.
# Rebinding `a` means a second run of this cell fails (ndarray has no .detach()).
a = a.detach().cpu().numpy()

In [61]:
import IPython.display as ipd
# Index 2 of the stacked batch is the decoded (converted) utterance; 0 and 1 are source and target.
ipd.Audio(a[2],autoplay=False, rate=22050)

In [4]:
import os
audio_dir = '/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_meta/mel.melgan'
audio = os.listdir(audio_dir)
# Collect each distinct prefix (the text before the first '_') once.
# Entries without '_' are skipped: their "prefix" is the entry's own name, and makedirs()
# on the path of an existing file raises FileExistsError even with exist_ok=True.
# Non-file entries are skipped so sub-directories from an earlier run are ignored.
speaker_ids = {
    entry.split('_')[0]
    for entry in audio
    if '_' in entry and os.path.isfile(os.path.join(audio_dir, entry))
}
for speaker_id in sorted(speaker_ids):
    os.makedirs(f'{audio_dir}/{speaker_id}', exist_ok=True)

In [8]:
import shutil
# Move every listed file into the sub-directory named after its prefix (text before the first '_').
for name in audio:
    src = f'{audio_dir}/{name}'
    dirr = name.split('_')[0]
    destination = f'{audio_dir}/{dirr}'
    # Skip entries that are not regular files: on a re-run the listing contains the speaker
    # directories themselves, and shutil.move() refuses to move a directory into itself.
    if not os.path.isfile(src):
        continue
    # Only move into an existing directory; otherwise shutil.move() would rename the file
    # to `destination` instead of placing it inside it.
    if not os.path.isdir(destination):
        continue
    print (destination)
    shutil.move(src, destination)

/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_meta/mel.melgan/p295
/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_meta/mel.melgan/p281
/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_meta/mel.melgan/p275
/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_meta/mel.melgan/p247
/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_meta/mel.melgan/p225
/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_meta/mel.melgan/p323
/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_meta/mel.melgan/p239
/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_meta/mel.melgan/p258
/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_meta/mel.melgan/p318
/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_meta/mel.melgan/p268
/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_meta/mel.melgan/p243
/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_meta/mel.melgan/p302
/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_meta/me