In [3269]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import os
from os import listdir
import sys
import argparse
from librosa.filters import mel as librosa_mel_fn
import librosa.core as core
import numpy as np

from vqvae import VQVAE, NetD
from utils.dataloader import AudioNpyLoader, VCTK_collate
import utils.fileio as fio
from trainer import train
from torch.nn import functional as F
sys.path.append('logger')
from logger import Logger
from logger_utils import prepare_directories_and_logger
from plotting_utils import  plot_spectrogram_to_numpy

sys.path.append('loss')
sys.path.append('utils')
from began_loss import BEGANRecorder, BEGANLoss
from optim_step import *
from save_and_load import save_checkpoint, load_checkpoint
from tqdm import tqdm
import random
os.environ["CUDA_VISIBLE_DEVICES"] = '3'
vocoder = torch.hub.load('descriptinc/melgan-neurips', 'load_melgan')
sys.path.append('../ai_singing/vocoder/melgan-neurips')
from mel2wav.interface import MelVocoder
from mel2wav.interface import MelVocoder
#vocoder = MelVocoder(path = "vocoder/melgan-neurips/scripts/logs/VCTK")

Using cache found in /home/ericwudayi/.cache/torch/hub/descriptinc_melgan-neurips_master


In [4122]:
class Audio2Mel(nn.Module):
    def __init__(
        self,
        n_fft=1024,
        hop_length=256,
        win_length=1024,
        sampling_rate=22050,
        n_mel_channels=240,
        mel_fmin=0.0,
        mel_fmax=None,
    ):
        super().__init__()
        ##############################################
        # FFT Parameters                              #
        ##############################################
        window = torch.hann_window(win_length).float()
        mel_basis = librosa_mel_fn(
            sampling_rate, n_fft, n_mel_channels, mel_fmin, mel_fmax
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)
        self.register_buffer("window", window)
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels

    def forward(self, audio):
        p = (self.n_fft - self.hop_length) // 2
        audio = F.pad(audio, (p, p), "reflect").squeeze(1)
        fft = torch.stft(
            audio,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=self.window,
            center=False,
        )
        real_part, imag_part = fft.unbind(-1)
        magnitude = torch.sqrt(real_part ** 2 + imag_part ** 2)
        mel_output = torch.matmul(self.mel_basis, magnitude)
        log_mel_spec = torch.log10(torch.clamp(mel_output, min=1e-5))
        
        return log_mel_spec
n_fft = 1024
hop_length = 256
win_length = 1024
sampling_rate = 22050
n_mel_channels = 80
extract_func = Audio2Mel(n_fft, hop_length, win_length, sampling_rate, n_mel_channels)


def convert_file(path, trim= False):
    y, _ = librosa.load(path, sr=22050)
    if trim:
        y, index = librosa.effects.trim(y,top_db=16)
    #peak = np.abs(y).max()
    #y /= peak
    
    y = torch.from_numpy(y)

    y = y[None, None]
    mel = extract_func(y)
    mel = mel.numpy()
    mel = mel[0]

    return mel.astype(np.float32)

In [4222]:
model = (VQVAE(in_channel=80,channel=512,n_embed=64,n_res_block=2,n_res_channel=64,embed_dim=80//8)).cuda()
opt = optim.Adam(model.parameters())
model, opt, iteration = load_checkpoint(f'checkpoint/n_embed_64_true/gen', model, opt)

Loading checkpoint 'checkpoint/n_embed_64_true/gen'
Loaded checkpoint 'checkpoint/n_embed_64_true/gen' from iteration 104349


In [4124]:
#model, opt, iteration = load_checkpoint(f'checkpoint/n_embed_1024_de_ad/gen', model, opt)

In [4125]:
sample_num = 3

In [4126]:
import os
import librosa
audio_path =  "/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/wav48"
choiced = ['p330','p345','p376', 'p333','p334', 'p335', 'p336','p339','p340', 'p341','p343'
           ,'p345','p347','p351','p360','p361','p362','p363','p364']
#choiced = os.listdir(audio_path)
#choiced = [n for n in choiced if n not in not_choiced]
source = f'{audio_path}/{random.choice(choiced)}'
target = f'{audio_path}/{random.choice(choiced)}'
target = 'p347'
source= 'p330'
source = os.path.join(audio_path,source)
target = os.path.join(audio_path,target)
#source = f'{source}/p347'
#target = f'{target}/p351'
source = f'{source}/{random.choice(os.listdir(source))}'
target = f'{target}/{random.choice(os.listdir(target))}'
source_name = source.split('/')[-2]
target_name = target.split('/')[-2]
print (source.split('/')[-2])
print (target.split('/')[-2])

p330
p347


In [4127]:
import IPython.display as ipd
#os.makedirs(f'subject/{source_name}_{target_name}_{sample_num}',exist_ok = True)
source_audio, index = librosa.effects.trim(librosa.load(source, sr=48000)[0] ,top_db=16)
#librosa.output.write_wav(f'subject/{source_name}_{target_name}_{sample_num}/source.wav', source_audio, 48000)

In [4128]:
ipd.Audio(source_audio,autoplay=False, rate=48000)

In [4129]:
target_audio, index = librosa.effects.trim(librosa.load(target, sr=48000)[0] ,top_db=16)
#librosa.output.write_wav(f'subject/{source_name}_{target_name}_{sample_num}/target.wav', target_audio, 48000)

In [4130]:
ipd.Audio(target_audio,autoplay=False, rate=48000)

In [4223]:
audio_path = "./subject"
subject = os.listdir(audio_path)
source = f'./subject/{subject[0]}/source.wav'
target = f'./subject/{subject[0]}/target.wav'
source_audio = librosa.load(source,sr=48000)[0]
target_audio = librosa.load(target,sr=48000)[0]
ipd.Audio(source,autoplay=False, rate=48000)

In [4224]:
ipd.Audio(target_audio, autoplay=False, rate=48000)

In [4225]:
source = convert_file(source,False)
target = convert_file(target, False)
source = torch.tensor(source).unsqueeze(0).cuda()
target = torch.tensor(target).unsqueeze(0).cuda()
source = source[:,:,:source.size(2)//8 * 8]
target = target[:,:,:target.size(2)//8 * 8]
q_after_block, sp_embedding_block, std_block, _ = model.encode(source)
q_after_block_tg, sp_embedding_block_tg, std_block_tg, _ = model.encode(target)
dec = model.decode(q_after_block, sp_embedding_block_tg, std_block_tg)
a = vocoder.inverse(dec)
source = vocoder.inverse(source)[0].cpu().numpy()
target = vocoder.inverse(target)[0].cpu().numpy()
#librosa.output.write_wav(f'subject/{source_name}_{target_name}_{sample_num}/conversion.wav', a[0].detach().cpu().numpy(),22050)

In [4226]:
ipd.Audio(a[0].cpu().numpy(),autoplay=False, rate=22050)

In [4003]:
sample_num += 1

In [4004]:
ipd.Audio(source,autoplay=False, rate=22050)

In [3969]:
ipd.Audio(target,autoplay=False, rate=22050)

In [167]:
import librosa
audio_dir =  "/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_clip_sil/mel.test"

class AudioNpyLoader(torch.utils.data.Dataset):
    """
        1) loads audio
    """
    def __init__(self, audio_path):
        self.audio_path = audio_path
        self.audios = os.listdir(self.audio_path)
        
        random.seed(1234)
        random.shuffle(self.audios)

    def __getitem__(self, index):
        item = f'{self.audio_path}/{self.audios[index]}'
        item = np.load(item)
        
        return item, self.audios[index].split('_')[0]

    def __len__(self):
        return len(self.audios)
    
def VCTK_collate(batch):
    maxn = 256
    audio = []
    name = []
    for item in batch:
        
        item_len = int(item[0].shape[1])
        if item_len>maxn:
            rand = np.random.randint(item_len-maxn)
            item_128 = item[0][:,rand:rand+maxn]
        else:
            item_128 = item[0]
        audio += [item_128]
        name += [item[1]]
    for i in range(len(audio)):
        a = audio[i]
        a = np.pad(a,((0,0),(0,maxn-a.shape[1])),'reflect')
        audio[i] = a
        
    return torch.tensor((np.array(audio))), name
dataset_test = AudioNpyLoader(audio_dir)
test_loader = DataLoader(dataset_test, batch_size=8, shuffle=True, num_workers=4,collate_fn=VCTK_collate)

sp_id = []
sp = []
for i, (audio, name) in enumerate(test_loader):
    sp += name
    #print (name)
    source = audio[:,:,:audio.size(2)//8 * 8].cuda()
    #target = target[:,:,:target.size(2)//8 * 8]
    q_after_block, sp_embedding_block, std_block, _ = model.encode(source)

    #q_after_block_tg, sp_embedding_block_tg, std_block_tg, _ = model.encode(target)
    dec = model.decode(q_after_block, sp_embedding_block, std_block)
    sp += [sp_embedding_block[0][0,:,0]]
    #source_audio =vocoder.inverse(source_audio).detach().cpu().numpy()
    #target_audio = vocoder.inverse(target_audio).detach().cpu().numpy()
    

KeyboardInterrupt: 

In [75]:
ipd.Audio(s,autoplay=False, rate=22050)

In [54]:
import IPython.display as ipd
ipd.Audio(target_audio[0],autoplay=False, rate=22050)

In [55]:
source = torch.tensor(source).unsqueeze(0).cuda()
target = torch.tensor(target).unsqueeze(0).cuda()

In [56]:
source = source[:,:,:source.size(2)//8 * 8]
target = target[:,:,:target.size(2)//8 * 8]
print (source.size(), target.size())

torch.Size([1, 80, 320]) torch.Size([1, 80, 320])


In [57]:
q_after_block, sp_embedding_block, std_block, _ = model.encode(source)
q_after_block_tg, sp_embedding_block_tg, std_block_tg, _ = model.encode(target)

In [58]:
dec = model.decode(q_after_block, sp_embedding_block_tg, std_block_tg)

In [59]:
a = torch.stack([source[0], target[0], dec[0]], dim = 0)
#print (a.size())
a = vocoder.inverse(a)

In [60]:
a = a.detach().cpu().numpy()

In [61]:
import IPython.display as ipd
ipd.Audio(a[2],autoplay=False, rate=22050)

In [598]:
from model import *
def VCTK_collate(batch):
    maxn = 256
    audio = []
    name = []
    for item in batch:
        item, name_ = item
        item_len = int(item.shape[1])
        if item_len>maxn:
            rand = np.random.randint(item_len-maxn)
            item_128 = item[:,rand:rand+maxn]
        else:
            item_128 = item
        audio += [item_128]
        name += [name_]
    for i in range(len(audio)):
        a = audio[i]
        a = np.pad(a,((0,0),(0,maxn-a.shape[1])),'reflect')
        audio[i] = a
        
    return torch.tensor((np.array(audio))), torch.tensor(np.array(name))
    
class AudioNpyLoader(torch.utils.data.Dataset):
    """
        1) loads audio
    """
    def __init__(self, audio_path):
        self.audio_path = audio_path
        self.audios = os.listdir(self.audio_path)
        
        random.seed(1234)
        random.shuffle(self.audios)

    def __getitem__(self, index):
        item = f'{self.audio_path}/{self.audios[index]}'
        item = np.load(item)
        
        return item, int(self.audios[index].split('_')[0][1:])
    def __len__(self):
        return len(self.audios)

spk_cls = LatentClassifier(nc = 400, dim_deck = 10).cuda()
opt_spk = optim.Adam(spk_cls.parameters())
model.eval()


VQVAE(
  (enc): ModuleList(
    (0): Sequential(
      (0): Conv1d(80, 512, kernel_size=(4,), stride=(2,), padding=(1,))
      (1): LeakyReLU(negative_slope=0.01)
      (2): Conv1d(512, 40, kernel_size=(3,), stride=(1,), padding=(1,))
    )
    (1): Sequential(
      (0): Conv1d(40, 512, kernel_size=(4,), stride=(2,), padding=(1,))
      (1): LeakyReLU(negative_slope=0.01)
      (2): Conv1d(512, 20, kernel_size=(3,), stride=(1,), padding=(1,))
    )
    (2): Sequential(
      (0): Conv1d(20, 512, kernel_size=(4,), stride=(2,), padding=(1,))
      (1): LeakyReLU(negative_slope=0.01)
      (2): Conv1d(512, 10, kernel_size=(3,), stride=(1,), padding=(1,))
    )
  )
  (quantize): ModuleList(
    (0): Quantize(
      (embedding): Embedding(256, 40)
      (inorm): InstanceNorm1d(40, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
    )
    (1): Quantize(
      (embedding): Embedding(256, 20)
      (inorm): InstanceNorm1d(20, eps=1e-05, momentum=0.1, affine=False, track_runn

In [8]:
audio_dir = "/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel_clip_sil2/mel.melgan"
dataset = AudioNpyLoader(audio_dir)
#valdataset = VCTKDataSet(audio_dir)

loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=2,collate_fn=VCTK_collate)
criterion = nn.NLLLoss()
accu = 0
it = 0
for epoch in range(20):
    for i, (audio,name) in enumerate(loader):
        cluster_size = audio.size(1)
        audio = audio.cuda()
        name = name.long().cuda()
        q_after_block, sp_embedding_block, std_block, _ = model.encode(audio)
        n = spk_cls(q_after_block[2].detach())
        #print (n.size())
        #name = name.unsqueeze(1)
        loss = criterion(n, name)
        OptimStep([(spk_cls, opt_spk,  loss , False)], 3)
        if i %100 == 0:
            it += 1
            values, n = torch.max(n, 1)
            print (loss.item())
            accu += (name == n).sum()/float(n.size(0))
            print (accu / it)

5.975026607513428
tensor(0., device='cuda:0')
4.809162139892578
tensor(0.0312, device='cuda:0')
4.749805927276611
tensor(0.0208, device='cuda:0')
4.713329315185547
tensor(0.0156, device='cuda:0')
4.652519226074219
tensor(0.0125, device='cuda:0')
4.719099998474121
tensor(0.0156, device='cuda:0')
4.697415828704834
tensor(0.0179, device='cuda:0')
4.765286922454834
tensor(0.0156, device='cuda:0')
4.680025577545166
tensor(0.0139, device='cuda:0')
4.658003807067871
tensor(0.0156, device='cuda:0')
4.682055473327637
tensor(0.0170, device='cuda:0')
4.709247589111328
tensor(0.0182, device='cuda:0')
4.684818267822266
tensor(0.0168, device='cuda:0')
4.715783596038818
tensor(0.0156, device='cuda:0')
4.698464393615723
tensor(0.0146, device='cuda:0')
4.743445873260498
tensor(0.0137, device='cuda:0')
4.684892177581787
tensor(0.0129, device='cuda:0')
4.705193042755127
tensor(0.0122, device='cuda:0')
4.679933547973633
tensor(0.0115, device='cuda:0')
4.691586494445801
tensor(0.0109, device='cuda:0')
4.71

In [None]:
# n_embed 32 : q0 --> 19.5 %, q1 -->11.8 % , q2 --> 6.8% 0.23
# n_ebmed 64 : q0 --> 23.2 %, q1 -- 16.6 % , q2 -->7.0% 0.20
# n_embed 128 : q0 --> 33.3%, q1 --> 17.0%, q3 --> 10.25% 0.19
# n_embed 256 : q0 --> 35.8%, q1 --> 18.1%, q3 --> 12.50% 0.17
# IN only : q0 --> 71.7%, q2 --> 36.8% , q3 --> 5% 0.08

In [2]:
import os
li = []
for root, dirs, files in os.walk("/home/ericwudayi/nas189/homes/ericwudayi/DSD/10seconds_wav/"):
    #print (root)
    for file in files:
        #print (os.path.join(root, file))
        if file[-3:] == 'wav':
            li += [(os.path.join(root, file))]

print (li[:10])
for root, dirs, files in os.walk("/home/ericwudayi/nas189/homes/ericwudayi/MUSDB18_WAV/10seconds_wav/"):
    #print (root)
    for file in files:
        #print (os.path.join(root, file))
        if file[-3:] == 'wav':
            li += [(os.path.join(root, file))]
li_ = li[-10:]
li = li[:-10]
print (li[-10:])

with open("/home/ericwudayi/nas189/homes/ericwudayi/DSD/train_files.txt", 'w') as f:
    for item in li:
        f.write("%s\n" % item)
with open("/home/ericwudayi/nas189/homes/ericwudayi/DSD/test_files.txt", 'w') as f:
    for item in li_:
        f.write("%s\n" % item)


['/home/ericwudayi/nas189/homes/ericwudayi/DSD/10seconds_wav/Patrick Talbot - A Reason To L/chunk33.wav', '/home/ericwudayi/nas189/homes/ericwudayi/DSD/10seconds_wav/Patrick Talbot - A Reason To L/chunk26.wav', '/home/ericwudayi/nas189/homes/ericwudayi/DSD/10seconds_wav/Patrick Talbot - A Reason To L/chunk9.wav', '/home/ericwudayi/nas189/homes/ericwudayi/DSD/10seconds_wav/Patrick Talbot - A Reason To L/chunk19.wav', '/home/ericwudayi/nas189/homes/ericwudayi/DSD/10seconds_wav/Patrick Talbot - A Reason To L/chunk29.wav', '/home/ericwudayi/nas189/homes/ericwudayi/DSD/10seconds_wav/Patrick Talbot - A Reason To L/chunk27.wav', '/home/ericwudayi/nas189/homes/ericwudayi/DSD/10seconds_wav/Patrick Talbot - A Reason To L/chunk35.wav', '/home/ericwudayi/nas189/homes/ericwudayi/DSD/10seconds_wav/Patrick Talbot - A Reason To L/chunk6.wav', '/home/ericwudayi/nas189/homes/ericwudayi/DSD/10seconds_wav/Patrick Talbot - A Reason To L/chunk8.wav', '/home/ericwudayi/nas189/homes/ericwudayi/DSD/10seconds_w

In [9]:
import numpy as np
a = np.load('/home/ericwudayi/nas189/homes/ericwudayi/NUS/train/ADIZ/sing/01.npy')

In [10]:
print (a.shape)

(512, 3978)
