<a href="https://colab.research.google.com/github/eccheng/knn-vc/blob/master/knnvc_interpolation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Voice interpolation with kNN-VC

Original work: https://bshall.github.io/knn-vc/

This notebook is modified from: https://colab.research.google.com/github/bshall/knn-vc/blob/master/knnvc_demo.ipynb

In [1]:
from pathlib import Path
import torch, torchaudio
from IPython.display import Audio, display

# Fetch the kNN-VC model from the `eccheng/knn-vc` hub repo.
# Prefer the GPU, but fall back to the CPU so this cell still runs on a
# runtime without CUDA instead of failing on the hard-coded device.
knn_vc = torch.hub.load(
    'eccheng/knn-vc', 'knn_vc',
    trust_repo=True, prematched=True, pretrained=True,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

Downloading: "https://github.com/eccheng/knn-vc/zipball/master" to /root/.cache/torch/hub/master.zip
Downloading: "https://github.com/bshall/knn-vc/releases/download/v0.1/prematch_g_02500000.pt" to /root/.cache/torch/hub/checkpoints/prematch_g_02500000.pt
100%|██████████| 63.1M/63.1M [00:00<00:00, 135MB/s]
Downloading: "https://github.com/bshall/knn-vc/releases/download/v0.1/WavLM-Large.pt" to /root/.cache/torch/hub/checkpoints/WavLM-Large.pt


Removing weight norm...
[HiFiGAN] Generator loaded with 16,523,393 parameters.


100%|██████████| 1.18G/1.18G [00:12<00:00, 101MB/s] 


WavLM-Large loaded with 315,453,120 parameters.


### Load data

In [4]:
# Download LibriSpeech to use its speakers (optional if uploading own reference audio)
# `exist_ok=True` keeps this cell re-runnable: a bare `mkdir` errors once the
# directory already exists, and the pure-Python call does not depend on a shell.
Path('./librispeech').mkdir(parents=True, exist_ok=True)
torchaudio.datasets.LIBRISPEECH('./librispeech', 'dev-clean', download=True)

100%|██████████| 322M/322M [00:17<00:00, 18.9MB/s]


<torchaudio.datasets.librispeech.LIBRISPEECH at 0x7f7bcc189c90>

In [5]:
# Load target speakers

def load_wav(fpath, target_srate=16000, channel=None, normalize=True):
    """Load an audio file as a 1-D waveform tensor at `target_srate`.

    Args:
        fpath: Path of the audio file, passed to `torchaudio.load`.
        target_srate: Sample rate of the returned waveform; the audio is
            resampled when the file's own rate differs.
        channel: Index of the channel to keep. When None, the leading
            dimension is squeezed away, which only yields a 1-D waveform
            for single-channel audio.
        normalize: If True, subtract the mean and scale the waveform so its
            peak absolute amplitude is 1.

    Returns:
        A 1-D tensor of samples.

    Raises:
        AssertionError: If the selected audio is not 1-D, e.g. a
            multi-channel file loaded with `channel=None`.
    """
    wav, srate = torchaudio.load(fpath)
    wav = wav.squeeze(0) if channel is None else wav[channel]
    assert wav.ndim == 1, (
        f'expected a single channel but got shape {tuple(wav.shape)}; '
        'pass `channel` to pick one channel of a multi-channel file'
    )

    if srate != target_srate:
        wav = torchaudio.functional.resample(wav, srate, target_srate)

    if normalize:
        wav = wav - wav.mean()
        peak = wav.abs().max()
        # Silent (or constant) audio has a zero peak once the mean is removed;
        # dividing by it would turn the whole waveform into NaNs, so skip scaling.
        if peak > 0:
            wav = wav / peak

    return wav

def build_matching_set(fpaths):
    """Build a kNN-VC matching set from the audio files in `fpaths`.

    Every file is read with `load_wav` using its default settings, and the
    resulting list of waveforms is handed to `knn_vc.get_matching_set`.
    """
    waveforms = []
    for wav_path in fpaths:
        waveforms.append(load_wav(wav_path))
    return knn_vc.get_matching_set(waveforms)

def get_librispeech_paths(spk_id, root='./librispeech/LibriSpeech/dev-clean'):
    """List every `.flac` file found anywhere under a speaker's directory.

    Args:
        spk_id: Name of the speaker's sub-directory inside `root`.
        root: Directory that holds the speaker sub-directories. Defaults to
            the `dev-clean` location used by this notebook.

    Returns:
        A list of POSIX-style path strings, sorted so the result does not
        depend on the order in which the filesystem lists the files. The
        list is empty when no `.flac` file is found.
    """
    spk_dir = Path(f'{root}/{spk_id}')
    return sorted(p.as_posix() for p in spk_dir.glob('**/*.flac'))

# Reference recordings for each target speaker, keyed by speaker label.
ref_wav_paths = {
    # REMOVE THESE IF NOT USING LIBRISPEECH
    **{spk_id: get_librispeech_paths(spk_id) for spk_id in ('84', '174', '652', '1919')},

    #'YOUR SPEAKER HERE': ['YOUR_AUDIO_PATHS_HERE', ...]
}

# One matching set per speaker, built from that speaker's reference recordings.
matching_sets = dict(zip(ref_wav_paths, map(build_matching_set, ref_wav_paths.values())))

In [6]:
# Load source audio
src_wav_path = './librispeech/LibriSpeech/dev-clean/3000/15664/3000-15664-0001.flac'
src_wav_channel = 0

# Extract the source utterance's features; the cells below match them
# against the target speakers' matching sets.
src_wav = load_wav(src_wav_path, channel=src_wav_channel)
query_seq = knn_vc.get_features(src_wav)

### VC Experiments

In [7]:
# Standard voice conversion to each target
for spk_id, target_set in matching_sets.items():
    print(f'Speaker {spk_id}:')
    # Convert the source features using only this speaker's matching set.
    converted_wav = knn_vc.match(query_seq, target_set, topk=4)
    display(Audio(converted_wav, rate=16000))

Speaker 84:


Speaker 174:


Speaker 652:


Speaker 1919:


In [8]:
# Static interpolation
interp_speakers = ['84', '174']
interp_weights = [0.5, 0.5]

# The matching sets are passed in the order the speakers are listed;
# presumably each weight applies to the speaker at the same position.
out_wav = knn_vc.match_interpolated(
    query_seq,
    list(map(matching_sets.__getitem__, interp_speakers)),
    torch.Tensor(interp_weights),
)
Audio(out_wav, rate=16000)

In [9]:
# Time-varying interpolation
interp_speakers = ['652', '1919']

# One weight row per element of `query_seq`: column 0 falls linearly from 1 to 0
# while column 1 rises from 0 to 1.
ramp = torch.linspace(0., 1., steps=len(query_seq))
dynamic_weights = torch.stack([1. - ramp, ramp], dim=1)

out_wav = knn_vc.match_interpolated(
    query_seq,
    list(map(matching_sets.__getitem__, interp_speakers)),
    dynamic_weights,
)
Audio(out_wav, rate=16000)

### Save audio

In [None]:
# Write the most recently generated `out_wav` to disk at 16 kHz.
# `unsqueeze(0)` adds a leading dimension so the 1-D waveform is saved as a 2-D tensor.
torchaudio.save('knnvc1_out.wav', out_wav.unsqueeze(0), 16000)

<audio name="abstract-reader" controls preload src="https://github.com/bshall/knn-vc/releases/download/v0.1/david-attenborough.wav"></audio>