In [None]:
# Fetch the project sources; git creates a `voice-filter/` directory in the current working directory.
!git clone https://github.com/nguyenvulebinh/voice-filter.git

Cloning into 'voice-filter'...
remote: Enumerating objects: 159, done.[K
remote: Counting objects: 100% (159/159), done.[K
remote: Compressing objects: 100% (133/133), done.[K
remote: Total 159 (delta 36), reused 128 (delta 21), pack-reused 0[K
Receiving objects: 100% (159/159), 3.27 MiB | 14.20 MiB/s, done.
Resolving deltas: 100% (36/36), done.


In [None]:
# Work from the repository root so that `requirements.txt` and the `src` package resolve via relative paths.
%cd voice-filter

/content/voice-filter


In [None]:
!pip install -r requirements.txt

In [None]:
from src.model.modeling_enh import VoiceFilter
import torch
from huggingface_hub import hf_hub_download
import os
import glob
import csv
from tqdm import tqdm
import librosa
import numpy as np
import soundfile as sf

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


In [None]:
# Ask for the GPU by default (set to False to force CPU) ...
use_gpu = True
# ... but fall back to CPU when no CUDA device is available.
use_gpu = use_gpu and torch.cuda.is_available()

In [None]:
def cal_xvector_sincnet_embedding(xvector_model, ref_wav, max_length=5, sr=16000):
    """Average the embeddings of fixed-length chunks of a reference waveform.

    ``ref_wav`` is cut into consecutive windows of ``max_length * sr`` samples
    (the last window is zero-padded to full length). The windows are stacked
    into a float32 batch of shape ``(n_chunks, 1, max_length * sr)``, passed
    through ``xvector_model``, and the outputs are averaged over the chunk
    axis.

    Args:
        xvector_model: Callable that maps the batch to one embedding per
            chunk. If it exposes ``parameters()`` (as ``torch.nn.Module``
            does), the batch is moved to the device of its first parameter,
            so the function no longer depends on a notebook-level GPU flag.
            Otherwise the batch stays on the CPU.
        ref_wav: 1-D sequence of audio samples.
        max_length: Chunk length, multiplied by ``sr`` to get samples per
            chunk (i.e. seconds when ``sr`` is in samples per second).
        sr: Sample rate used to convert ``max_length`` to a sample count.

    Returns:
        A detached CPU tensor: the mean of the per-chunk model outputs along
        dimension 0.

    Raises:
        ValueError: If ``ref_wav`` contains no samples.
    """
    ref_wav = np.asarray(ref_wav, dtype=np.float32)
    chunk_len = max_length * sr
    if len(ref_wav) == 0:
        raise ValueError("ref_wav is empty; cannot compute a speaker embedding")

    # Split into equally sized windows, zero-padding the trailing one.
    chunks = []
    for start in range(0, len(ref_wav), chunk_len):
        chunk = ref_wav[start:start + chunk_len]
        chunks.append(np.pad(chunk, (0, chunk_len - len(chunk))))
    batch = torch.from_numpy(np.stack(chunks))

    # Follow the model onto whatever device it lives on.
    parameters = getattr(xvector_model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    if first_param is not None:
        batch = batch.to(first_param.device)

    # Inference only: no need to record an autograd graph.
    with torch.no_grad():
        embed = xvector_model(batch.unsqueeze(1).float())
    return torch.mean(embed, dim=0).detach().cpu()

In [None]:
# Fetch the pretrained enhancement model (cached under ./cache) and place it
# on the GPU when one is in use.
repo_id = 'nguyenvulebinh/voice-filter'
enh_model = VoiceFilter.from_pretrained(repo_id, cache_dir='./cache')
enh_model = enh_model.cuda() if use_gpu else enh_model

Downloading:   0%|          | 0.00/2.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/217M [00:00<?, ?B/s]



# Voice filter with reference audio

In [None]:
# Download the demo clips from the model repo (cached under ./cache)
output_wav_path = "output.wav"
mix_wav_path = hf_hub_download(
    repo_id=repo_id,
    filename="binh_linh_newspaper_music_noise.wav",
    cache_dir='./cache',
)
# Alternative reference clip: use filename="binh_ref_long.wav" instead.
ref_wav_path = hf_hub_download(
    repo_id=repo_id,
    filename="linh_ref_long.wav",
    cache_dir='./cache',
)
# Decode both clips at 16 kHz; the returned sample rate is not needed.
mixed_wav, _ = librosa.load(mix_wav_path, sr=16000)
ref_wav, _ = librosa.load(ref_wav_path, sr=16000)

Downloading:   0%|          | 0.00/1.18M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

In [None]:
# Calculate target speaker embedding
xvector = cal_xvector_sincnet_embedding(enh_model.xvector_model, ref_wav)
# Speech enhancing: peak-normalize the mixture first. A silent (all-zero)
# clip has a peak of 0, so leave it unscaled instead of dividing by zero
# (which would fill the signal with inf/NaN).
max_amp = np.abs(mixed_wav).max()
mix_scaling = 1 / max_amp if max_amp > 0 else 1.0
mixed_wav = mix_scaling * mixed_wav
mixed_wav_tf = torch.from_numpy(mixed_wav)
if use_gpu:
    mixed_wav_tf = mixed_wav_tf.cuda()
    xvector = xvector.cuda()
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    est_wav = enh_model.do_enh(mixed_wav_tf, xvector).cpu().detach().numpy()
# Normalize estimated wav (same zero-peak guard, so an all-zero estimate is
# written as silence rather than NaN)
max_amp = np.abs(est_wav).max()
mix_scaling = 1 / max_amp if max_amp > 0 else 1.0
est_wav = mix_scaling * est_wav
# write output file
sf.write(output_wav_path, est_wav, 16000)

  wavs = istft(


### Play audio

In [None]:
import IPython

Noisy audio

In [None]:
# Play back the peak-normalized input mixture
IPython.display.Audio(mixed_wav, rate=16000)

Reference audio

In [None]:
# Play back the reference clip used for the speaker embedding
IPython.display.Audio(ref_wav, rate=16000)

Voice filter output

In [None]:
# Play back the peak-normalized model output
IPython.display.Audio(est_wav, rate=16000)

# Voice filter without reference audio
This mode only works when the audio contains a single dominant voice.

In [17]:
# Download the single-speaker demo clip (cached under ./cache) and decode it at 16 kHz
output_wav_path = "output.wav"
mix_wav_path = hf_hub_download(
    repo_id=repo_id,
    filename="binh_noise.wav",
    cache_dir='./cache',
)
mixed_wav, _ = librosa.load(mix_wav_path, sr=16000)

In [19]:
# Peak-normalize the mixture. A silent (all-zero) clip has a peak of 0, so
# leave it unscaled instead of dividing by zero (which would yield inf/NaN).
max_amp = np.abs(mixed_wav).max()
mix_scaling = 1 / max_amp if max_amp > 0 else 1.0
mixed_wav = mix_scaling * mixed_wav
mixed_wav_tf = torch.from_numpy(mixed_wav)
# No reference clip in this mode: an all-zero 512-dim vector is passed as the
# speaker embedding.
xvector = torch.zeros(512)
if use_gpu:
    mixed_wav_tf = mixed_wav_tf.cuda()
    xvector = xvector.cuda()

In [20]:
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    est_wav = enh_model.do_enh(mixed_wav_tf, xvector).cpu().detach().numpy()
# Normalize estimated wav. Guard the zero-peak case so an all-zero estimate is
# written as silence rather than NaN from a division by zero.
max_amp = np.abs(est_wav).max()
mix_scaling = 1 / max_amp if max_amp > 0 else 1.0
est_wav = mix_scaling * est_wav
# write output file
sf.write(output_wav_path, est_wav, 16000)

In [22]:
# Play back the peak-normalized input mixture
IPython.display.Audio(mixed_wav, rate=16000)

In [21]:
# Play back the peak-normalized model output
IPython.display.Audio(est_wav, rate=16000)