In [35]:
import sys
import os
import torch
from torch import nn, optim
from torch.nn import functional as F
import numpy as np
sys.path.append('logger')
from logger import Logger
from logger_utils import prepare_directories_and_logger

sys.path.append('utils')
from save_and_load import load_checkpoint
import random
# Expose only GPU index 3 to this process. This is set after `import torch`
# above, so it relies on CUDA not having been initialised yet — keep this line
# ahead of any `.cuda()` call.
os.environ["CUDA_VISIBLE_DEVICES"] = '3'
# Load the `load_melgan` entry point from the descriptinc/melgan-neurips hub
# repo; the resulting object is used later via `vocoder.inverse(mel)`.
vocoder = torch.hub.load('descriptinc/melgan-neurips', 'load_melgan')

Using cache found in /home/ericwudayi/.cache/torch/hub/descriptinc_melgan-neurips_master


In [154]:
import os
import librosa
from librosa.filters import mel as librosa_mel_fn
class Audio2Mel(nn.Module):
    """Convert raw waveforms into log10 mel spectrograms.

    The Hann window and the mel filterbank are built once in ``__init__`` and
    registered as buffers, so they follow the module across ``.to()`` /
    ``.cuda()`` calls.

    Args:
        n_fft: FFT size used by the STFT.
        hop_length: Hop (in samples) between successive STFT frames.
        win_length: Length of the Hann analysis window.
        sampling_rate: Sample rate used to build the mel filterbank.
        n_mel_channels: Number of mel bands in the filterbank.
        mel_fmin: Lowest frequency of the mel filterbank.
        mel_fmax: Highest frequency of the mel filterbank (``None`` lets
            librosa choose its default).
    """

    def __init__(
        self,
        n_fft=1024,
        hop_length=256,
        win_length=1024,
        sampling_rate=22050,
        n_mel_channels=240,
        mel_fmin=0.0,
        mel_fmax=None,
    ):
        super().__init__()
        window = torch.hann_window(win_length).float()
        # Keyword arguments: recent librosa releases reject positional
        # arguments here, while older releases accept the same keywords.
        mel_basis = librosa_mel_fn(
            sr=sampling_rate,
            n_fft=n_fft,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)
        self.register_buffer("window", window)
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels

    def forward(self, audio):
        """Compute the log10 mel spectrogram of ``audio``.

        Args:
            audio: Waveform tensor of shape ``(batch, 1, samples)``; the
                singleton channel axis is squeezed after padding.

        Returns:
            Tensor of shape ``(batch, n_mels, frames)`` holding
            ``log10(max(mel, 1e-5))``.
        """
        # Reflect-pad manually and disable centering in the STFT itself.
        p = (self.n_fft - self.hop_length) // 2
        audio = F.pad(audio, (p, p), "reflect").squeeze(1)
        # return_complex must be given explicitly for real inputs on current
        # PyTorch; the magnitude is then the absolute value of the complex bins.
        spectrum = torch.stft(
            audio,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=self.window,
            center=False,
            return_complex=True,
        )
        magnitude = spectrum.abs()
        mel_output = torch.matmul(self.mel_basis, magnitude)
        # Clamp before the log so silent frames map to -5 instead of -inf.
        log_mel_spec = torch.log10(torch.clamp(mel_output, min=1e-5))

        return log_mel_spec
# Feature-extraction settings shared by the notebook, and the module-level
# mel extractor built from them.
sampling_rate = 22050
n_fft = 1024
win_length = 1024
hop_length = 256
n_mel_channels = 80
extract_func = Audio2Mel(
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=win_length,
    sampling_rate=sampling_rate,
    n_mel_channels=n_mel_channels,
)


def convert_file(path, trim=False, top_db=25):
    """Load an audio file and return its log-mel spectrogram.

    Args:
        path: Audio file to load with librosa.
        trim: When true, strip leading and trailing silence before
            extracting features.
        top_db: Threshold passed to ``librosa.effects.trim`` when ``trim``
            is set.

    Returns:
        ``np.float32`` array of shape ``(n_mels, frames)`` produced by the
        module-level ``extract_func``.
    """
    # Resample to the rate the extractor was configured with, so the mel
    # filterbank and the waveform cannot silently disagree.
    y, _ = librosa.load(path, sr=extract_func.sampling_rate)
    if trim:
        y, _ = librosa.effects.trim(y, top_db=top_db)

    # The extractor expects a (batch, 1, samples) tensor.
    y = torch.from_numpy(y)[None, None]
    mel = extract_func(y)
    mel = mel.numpy()[0]

    return mel.astype(np.float32)

In [264]:
import importlib

# Multi-scale VQVC+ test: resolve the model class by dotted module name,
# since "vqvc+_multi_simple" is not a valid identifier for a plain import.
vq_module = importlib.import_module('model.vqvc+_multi_simple.vq_model')
model = vq_module.VC_MODEL(in_channel=80, channel=512, n_embed=128).cuda()
opt = optim.Adam(model.parameters())

# Author's note: this checkpoint was pretrained on LibriTTS-clean 100.
# load_checkpoint returns the restored model, optimizer and iteration count.
checkpoint_path = 'checkpoint/vqvc+_multi_simple_n128_ch512_train_simple_normalize_multi/gen'
model, opt, iteration = load_checkpoint(checkpoint_path, model, opt)

Loading checkpoint 'checkpoint/vqvc+_multi_simple_n128_ch512_train_simple_normalize_multi/gen'
Loaded checkpoint 'checkpoint/vqvc+_multi_simple_n128_ch512_train_simple_normalize_multi/gen' from iteration 1505518


In [259]:
# Root directory to sample from. Every entry directly below it must itself be
# a directory of files (e.g. one sub-directory per speaker).
# Alternative root for raw VCTK wav files:
#   "/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/wav48"
audio_path = "/home/ericwudayi/nas189/homes/ericwudayi/LibriTTS/mel"

# Pick one sub-directory for the source and one for the target ...
choiced = os.listdir(audio_path)
source_dir = f'{audio_path}/{random.choice(choiced)}'
target_dir = f'{audio_path}/{random.choice(choiced)}'

# ... then one file inside each. The picks are not seeded, so every run of
# this cell selects a different pair.
source = f'{source_dir}/{random.choice(os.listdir(source_dir))}'
target = f'{target_dir}/{random.choice(os.listdir(target_dir))}'
print(source)
#source_name = source.split('/')[-2]
#target_name = target.split('/')[-2]
#print (source.split('/')[-2])
#print (target.split('/')[-2])

/home/ericwudayi/nas189/homes/ericwudayi/LibriTTS/mel/mel.melgan/8419_286667_8419_286667_000013_000004.npy


In [265]:
import IPython.display as ipd
#os.makedirs(f'subject/{source_name}_{target_name}_{sample_num}',exist_ok = True)
# Decode the source file at 48 kHz, then trim leading/trailing silence with a
# 16 dB threshold.
# NOTE(review): `source` must be a path to an audio file here. This cell cannot
# decode the .npy mel files selected when audio_path points at the mel
# directory, nor the tensor `source` becomes after the conversion cell runs.
source_wave_48k, _source_sr = librosa.load(source, sr=48000)
source_audio, index = librosa.effects.trim(source_wave_48k, top_db=16)
#librosa.output.write_wav(f'subject/{source_name}_{target_name}_{sample_num}/source.wav', source_audio, 48000)

RuntimeError: Error opening tensor([[-0.0118,  0.0300, -0.0005,  ...,  0.0669, -0.0203, -0.0321]],
       device='cuda:0'): System error.

In [221]:
# Inline player for the trimmed 48 kHz source recording.
ipd.Audio(source_audio, rate=48000, autoplay=False)

In [222]:
# Decode the target file at 48 kHz, then trim leading/trailing silence with a
# 16 dB threshold.
# NOTE(review): `target` must be a path to an audio file here, not a .npy mel
# file or a tensor.
target_wave_48k, _target_sr = librosa.load(target, sr=48000)
target_audio, index = librosa.effects.trim(target_wave_48k, top_db=16)
#librosa.output.write_wav(f'subject/{source_name}_{target_name}_{sample_num}/target.wav', target_audio, 48000)



NoBackendError: 

In [None]:
# Inline player for the trimmed 48 kHz target recording.
ipd.Audio(target_audio, rate=48000, autoplay=False)

In [260]:
# Voice conversion: encode both utterances, decode the source content with the
# target's speaker embedding/statistics, then vocode all three mels to audio.
#
# Alternative input: compute the mels from wav files with
# `convert_file(path, True)` instead of loading precomputed .npy files.
#
# NOTE: this cell rebinds `source` and `target` from file paths to waveform
# tensors, so the path-selection cell must be re-run before running it again.


def _load_mel(npy_path):
    """Load a saved mel array as a (1, n_mels, frames) CUDA tensor.

    The frame axis is truncated to a multiple of 16 (presumably to match the
    model's temporal downsampling — confirm against the model definition).
    """
    if isinstance(npy_path, torch.Tensor):
        raise TypeError(
            "expected a .npy path but got a tensor; re-run the path-selection "
            "cell before re-running this one"
        )
    mel = torch.tensor(np.load(npy_path)).unsqueeze(0).cuda()
    usable_frames = mel.size(2) // 16 * 16
    return mel[:, :, :usable_frames]


def _normalize_mel(mel):
    """Rescale a vocoder-domain mel before it is passed to the VC model."""
    return (mel * 25 + 50) / 50


def _denormalize_mel(mel):
    """Invert `_normalize_mel`, returning to the scale given to the vocoder."""
    return (mel * 50 - 50) / 25


# Pure inference: skip autograd bookkeeping for the whole pipeline.
with torch.no_grad():
    source = _normalize_mel(_load_mel(source))
    target = _normalize_mel(_load_mel(target))

    q_after_block, sp_embedding_block, std_block, _, _ = model.encode(source)
    q_after_block_tg, sp_embedding_block_tg, std_block_tg, _, _ = model.encode(target)

    # Source content codes combined with the target's speaker information;
    # the last element of the decoder output is used as the converted mel.
    dec = model.decode(q_after_block, sp_embedding_block_tg, std_block_tg)[-1]

    dec = _denormalize_mel(dec)
    source = _denormalize_mel(source)
    target = _denormalize_mel(target)

    a = vocoder.inverse(dec)
    source = vocoder.inverse(source)
    target = vocoder.inverse(target)
#librosa.output.write_wav(f'subject/{source_name}_{target_name}_{sample_num}/conversion.wav', a[0].detach().cpu().numpy(),22050)

In [261]:
# Inline player for the converted utterance (vocoder output for the decoded mel).
ipd.Audio(a[0].cpu().numpy(), rate=22050, autoplay=False)

In [262]:
# Inline player for the source utterance as resynthesised by the vocoder.
ipd.Audio(source[0].cpu().numpy(), rate=22050, autoplay=False)

In [263]:
# Inline player for the target utterance as resynthesised by the vocoder.
ipd.Audio(target[0].cpu().numpy(), rate=22050, autoplay=False)