In [1]:
import os
from collections import OrderedDict
from argparse import Namespace
from math import ceil

import torch
import numpy as np
import soundfile as sf
from librosa.filters import mel
from numpy.random import RandomState
from scipy import signal
from scipy.signal import get_window

from model_bl import D_VECTOR
from model_vc import Generator
from synthesis import build_model, wavegen

In [2]:
# Input recordings for the conversion demo: one file from the DementiaBank
# "cn" folder and one from the "dm" folder.
# NOTE(review): these are absolute, machine-specific paths — adjust them
# before running the notebook on another machine.
cn_wav = '/home/cheul/projects/clones/autovc_mod/wavs/dementiabank/cn/002-0.wav'
dm_wav = '/home/cheul/projects/clones/autovc_mod/wavs/dementiabank/dm/001-0.wav'


def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: Cutoff frequency, in the same units as ``fs``.
        fs: Sampling rate of the signal that will be filtered.
        order: Filter order handed to ``scipy.signal.butter``.

    Returns:
        Tuple ``(b, a)`` holding the numerator and denominator coefficients.
    """
    # scipy expects the cutoff expressed as a fraction of the Nyquist rate.
    nyquist = 0.5 * fs
    numerator, denominator = signal.butter(
        order, cutoff / nyquist, btype="high", analog=False
    )
    return numerator, denominator


def pySTFT(x, fft_length=1024, hop_length=256):
    """Magnitude short-time Fourier transform using a Hann window.

    The signal is reflect-padded by ``fft_length // 2`` samples, cut into
    overlapping frames of ``fft_length`` samples spaced ``hop_length`` apart,
    windowed, and passed through a real FFT.

    Args:
        x: Input signal as a NumPy array (1-D at the call site in this file).
        fft_length: Frame / FFT size in samples.
        hop_length: Step between consecutive frames in samples.

    Returns:
        Array of spectral magnitudes. Because of the final transpose the
        ``fft_length // 2 + 1`` frequency bins come first and frames last.
    """
    half_window = int(fft_length // 2)
    padded = np.pad(x, half_window, mode="reflect")

    # Build a strided view in which each row along the second-to-last axis
    # is one analysis frame; no sample data is copied at this point.
    overlap = fft_length - hop_length
    n_frames = (padded.shape[-1] - overlap) // hop_length
    sample_stride = padded.strides[-1]
    frames = np.lib.stride_tricks.as_strided(
        padded,
        shape=padded.shape[:-1] + (n_frames, fft_length),
        strides=padded.strides[:-1] + (hop_length * sample_stride, sample_stride),
    )

    hann = get_window("hann", fft_length, fftbins=True)
    spectrum = np.fft.rfft(hann * frames, n=fft_length)

    return np.abs(spectrum.T)


def get_melspec(wav_path, seed=0):
    """Compute a normalised 80-band log-mel spectrogram for one audio file.

    Pipeline: read the file, remove low-frequency drift with a 30 Hz
    high-pass filter, add a tiny amount of seeded noise, take the magnitude
    STFT, project onto a mel filterbank, convert to dB and rescale to [0, 1].

    Args:
        wav_path: Path of the audio file to read with ``soundfile``.
        seed: Seed for the ``RandomState`` that generates the dither noise,
            so the result is reproducible.

    Returns:
        ``float32`` array of shape ``(n_frames, 80)`` with values clipped
        to the range [0, 1].
    """
    import warnings

    sample_rate = 16000  # rate assumed by the filterbank and the high-pass filter

    # Pass sr/n_fft by keyword: recent librosa releases made them
    # keyword-only, while older releases accept keywords as well.
    mel_basis = mel(sr=sample_rate, n_fft=1024, fmin=80, fmax=7600, n_mels=80).T
    min_level = np.exp(-100 / 20 * np.log(10))
    b, a = butter_highpass(30, sample_rate, order=5)

    # Read audio file
    x, fs = sf.read(wav_path)
    if fs != sample_rate:
        # The analysis constants are fixed to 16 kHz; other rates still run
        # but yield a spectrogram on the wrong frequency scale.
        warnings.warn(
            "%s has sample rate %d Hz, expected %d Hz; "
            "the mel-spectrogram will be inaccurate" % (wav_path, fs, sample_rate)
        )
    if x.ndim > 1:
        # soundfile returns (frames, channels) for multi-channel files;
        # downmix so filtering and framing run along the time axis.
        x = x.mean(axis=1)

    # Remove drifting noise
    y = signal.filtfilt(b, a, x)

    # Add a little random noise for model robustness
    noise = (RandomState(seed).rand(y.shape[0]) - 0.5) * 1e-06
    wav = y * 0.96 + noise

    # Compute spect: transpose to (frames, frequency bins)
    D = pySTFT(wav).T

    # Convert to mel and normalize
    D_mel = np.dot(D, mel_basis)
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
    S = np.clip((D_db + 100) / 100, 0, 1)

    return S.astype(np.float32)


In [3]:
# Mel-spectrograms of both recordings (default dither seed).
S_cn = get_melspec(cn_wav)
S_dm = get_melspec(dm_wav)
print(S_cn.shape, S_dm.shape)

(3922, 80) (3450, 80)


In [4]:
# Load the pretrained speaker encoder: LSTM layers of size 768 followed by a
# bottleneck fully-connected layer of size 256.
E_S = D_VECTOR(dim_input=80, dim_cell=768, dim_emb=256).eval().cuda()

# map_location makes loading independent of the device the checkpoint was
# saved from; load_state_dict then copies the tensors onto the model's device.
ckpt = torch.load("assets/3000000-BL.ckpt", map_location="cpu")

# The checkpoint keys carry a 7-character prefix (presumably the "module."
# added by nn.DataParallel — confirm against the checkpoint). Strip it only
# when present so un-prefixed keys are not mangled.
prefix = "module."
new_state_dict = OrderedDict()
for key, val in ckpt["model_b"].items():
    new_key = key[len(prefix):] if key.startswith(prefix) else key
    new_state_dict[new_key] = val
E_S.load_state_dict(new_state_dict)

print(E_S)

D_VECTOR(
  (lstm): LSTM(80, 768, num_layers=3, batch_first=True)
  (embedding): Linear(in_features=768, out_features=256, bias=True)
)


In [5]:
# Get speaker embeddings from the mel-spectrograms.
# unsqueeze(0) adds the leading batch axis.
S_cn_tensor = torch.from_numpy(S_cn).unsqueeze(0)
S_dm_tensor = torch.from_numpy(S_dm).unsqueeze(0)
print(S_cn_tensor.shape, S_dm_tensor.shape)

# Inference only: no_grad avoids building an autograd graph across the whole
# sequence, so the outputs need no separate detach().
with torch.no_grad():
    I_cn = E_S(S_cn_tensor.cuda()).cpu()
    I_dm = E_S(S_dm_tensor.cuda()).cpu()
print(I_cn.shape, I_dm.shape)

torch.Size([1, 3922, 80]) torch.Size([1, 3450, 80])
torch.Size([1, 256]) torch.Size([1, 256])


In [6]:
# Hyper-parameters collected in one namespace; only dim_neck, dim_emb,
# dim_pre and freq are consumed below when the generator is built.
config = Namespace(
    batch_size=2, data_dir='./spmel',
    dim_neck=32, dim_emb=256, dim_pre=512, freq=32,
    lambda_cd=1, len_crop=128,
    log_step=10, num_iters=1000000,
)

device = 'cuda'

# Build the generator, switch it to eval mode on the target device, then
# restore the pretrained weights stored under the checkpoint's 'model' key.
G = Generator(config.dim_neck, config.dim_emb, config.dim_pre, config.freq)
G = G.eval().to(device)
G.load_state_dict(
    torch.load('assets/autovc.ckpt', map_location=device)['model']
)
print(G)

Generator(
  (encoder): Encoder(
    (convolutions): ModuleList(
      (0): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(336, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (2): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (lstm): LSTM(512, 32, num_layers=2, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (lstm1): LSTM(320, 512, batch_first=True)
    (convolutions): ModuleList(
      (0): Sequential(
        (0

In [7]:
def pad_seq(x, base=32):
    """Zero-pad a 2-D array along axis 0 up to the next multiple of ``base``.

    Args:
        x: Array of shape ``(length, features)``.
        base: The padded length is the smallest multiple of ``base`` that is
            at least ``length``.

    Returns:
        Tuple ``(padded, len_pad)``: the padded array and the number of rows
        appended at the end.
    """
    n_rows = x.shape[0]
    n_blocks = ceil(float(n_rows) / base)
    target_len = int(base * n_blocks)
    len_pad = target_len - n_rows
    assert len_pad >= 0
    # Rows are appended only at the end of axis 0; axis 1 is left untouched.
    padded = np.pad(x, ((0, len_pad), (0, 0)), 'constant')
    return padded, len_pad


# Pad both spectrograms so their lengths are multiples of pad_seq's default
# base (32), keeping the pad amounts for trimming the outputs later.
S_cn_padded, len_pad_cn = pad_seq(S_cn)
S_dm_padded, len_pad_dm = pad_seq(S_dm)
print(S_cn_padded.shape, len_pad_cn)
print(S_dm_padded.shape, len_pad_dm)

# Add the batch axis and move spectrograms and embeddings onto the device.
S_cn_cuda, S_dm_cuda = (
    torch.from_numpy(padded).unsqueeze(0).to(device)
    for padded in (S_cn_padded, S_dm_padded)
)
I_cn_cuda, I_dm_cuda = I_cn.to(device), I_dm.to(device)

print(S_cn_cuda.shape, S_dm_cuda.shape, I_cn_cuda.shape, I_dm_cuda.shape)

(3936, 80) 14
(3456, 80) 6
torch.Size([1, 3936, 80]) torch.Size([1, 3456, 80]) torch.Size([1, 256]) torch.Size([1, 256])


In [8]:
# Run the generator in both directions (source spectrogram, source embedding,
# target embedding) and keep only the middle element of each returned triple.
with torch.no_grad():
    (_, X_cn_to_dm, _), (_, X_dm_to_cn, _) = (
        G(S_cn_cuda, I_cn_cuda, I_dm_cuda),
        G(S_dm_cuda, I_dm_cuda, I_cn_cuda),
    )

print(X_cn_to_dm.shape, X_dm_to_cn.shape)

torch.Size([1, 1, 3936, 80]) torch.Size([1, 1, 3456, 80])


In [9]:
# Drop the batch/channel axes and trim the frames that pad_seq appended.
# An explicit end index is used instead of `:-len_pad`, because a negative
# slice with len_pad == 0 (length already a multiple of the pad base) would
# select nothing and return an empty array.
X_cn_to_dm = X_cn_to_dm[0, 0, : X_cn_to_dm.shape[2] - len_pad_cn, :].cpu().numpy()
X_dm_to_cn = X_dm_to_cn[0, 0, : X_dm_to_cn.shape[2] - len_pad_dm, :].cpu().numpy()

print(X_cn_to_dm.shape, X_dm_to_cn.shape)

(3922, 80) (3450, 80)


In [10]:
# Persist the converted mel-spectrograms. Create the output directory first
# so the cell also works on a fresh checkout where "test/" does not exist.
os.makedirs("test", exist_ok=True)
np.save("test/cn2dm.npy", X_cn_to_dm)
np.save("test/dm2cn.npy", X_dm_to_cn)

In [13]:
from parallel_wavegan.utils import download_pretrained_model

# Fetch the "arctic_slt_parallel_wavegan.v1" pretrained model into the local
# model directory; the call's return value is shown as the cell output.
# NOTE(review): no visible cell loads this download — the synthesis cells use
# the WaveNet checkpoint under assets/ — confirm whether it is still needed.
# NOTE(review): the target directory is an absolute, machine-specific path.
download_pretrained_model("arctic_slt_parallel_wavegan.v1", "/home/cheul/data/models/parallel_wavegan")

Downloading...
From: https://drive.google.com/uc?id=1_MXePg40-7DTjD0CDVzyduwQuW_O9aA1
To: /home/cheul/data/models/parallel_wavegan/arctic_slt_parallel_wavegan.v1.tar.gz
100%|██████████| 15.7M/15.7M [00:00<00:00, 17.5MB/s]


'/home/cheul/data/models/parallel_wavegan/arctic_slt_parallel_wavegan.v1/checkpoint-400000steps.pkl'

In [12]:
# Build the WaveNet vocoder and restore its weights from the checkpoint's
# "state_dict" entry. map_location mirrors the generator checkpoint load, so
# the tensors are remapped to `device` regardless of where they were saved.
vocoder = build_model().to(device)
vocoder_ckpt = torch.load(
    "assets/checkpoint_step001000000_ema.pth", map_location=device
)
vocoder.load_state_dict(vocoder_ckpt["state_dict"])
print(vocoder)

WaveNet(
  (first_conv): Conv1d(1, 512, kernel_size=(1,), stride=(1,))
  (conv_layers): ModuleList(
    (0): ResidualConv1dGLU(
      (conv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(2,))
      (conv1x1c): Conv1d(80, 512, kernel_size=(1,), stride=(1,))
      (conv1x1_out): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
      (conv1x1_skip): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
    )
    (1): ResidualConv1dGLU(
      (conv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(2,))
      (conv1x1c): Conv1d(80, 512, kernel_size=(1,), stride=(1,))
      (conv1x1_out): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
      (conv1x1_skip): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
    )
    (2): ResidualConv1dGLU(
      (conv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(4,))
      (conv1x1c): Conv1d(80, 512, kernel_size=(1,), stride=(1,))
      (conv1x1_out): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
    

In [14]:
# Vocoding is very slow (the recorded run took several hours), so make sure
# the output directory exists *before* synthesising, and write each waveform
# as soon as it is ready so a later failure cannot discard finished work.
os.makedirs("test/outputs/wavenet", exist_ok=True)

wav_cn_to_dm = wavegen(vocoder, c=X_cn_to_dm)
sf.write("test/outputs/wavenet/cn2dm.wav", wav_cn_to_dm, samplerate=16000)

wav_dm_to_cn = wavegen(vocoder, c=X_dm_to_cn)
sf.write("test/outputs/wavenet/dm2cn.wav", wav_dm_to_cn, samplerate=16000)

100%|██████████| 1004032/1004032 [3:43:50<00:00, 74.76it/s] 
