# Pipeline to clean audio for RVC

The idea is to create a fully automated method to collect and process data for RVC models.
Right now, this notebook takes in one YouTube link, removes the background music, and clips out silences. <br>
It also works decently well with single-singer songs. <br>

### Examples
[Source File Link](https://dl.sndup.net/2w7z/srk-raw.mp3) || [Cleaned File Link](https://dl.sndup.net/qhp6/srk-cleaned.mp3)


### Next Up
* Visualisation + Controls to clip silences / bgm
* Elevenlabs Integration to generate phonetically balanced datasets


In [None]:
# @title Necessary Functions (Run this once)
!pip install yt-dlp librosa noisereduce soundfile pydub pyannote.audio
!python3 -m pip install -U git+https://github.com/facebookresearch/demucs#egg=demucs

import os
import subprocess
import io
from pathlib import Path
import select
from shutil import rmtree
import subprocess as sp
import sys
from typing import Dict, Tuple, Optional, IO
from pyannote.audio import Pipeline
import torch

from google.colab import files
from IPython.display import Audio

from pydub import AudioSegment
from pydub.silence import split_on_silence

def setup_project(yt_link):
    """Create the project folders for a video and download its audio as MP3.

    The project lives in ``/content/<video_id>`` with ``input`` and ``output``
    sub-folders; the extracted audio is written to ``input/file.mp3``.

    Args:
        yt_link: YouTube link of the form ``...watch?v=<video_id>``. The video
            id is taken to be the text between the first and second ``=``.
    """
    video_id = yt_link.split("=")[1]
    project_dir = f"/content/{video_id}"

    # os.makedirs instead of shelling out to `mkdir`: no shell is involved and
    # re-running the cell for the same video is not an error.
    for sub_dir in ("output", "input"):
        os.makedirs(f"{project_dir}/{sub_dir}", exist_ok=True)

    out_file = f"{project_dir}/input/file.mp3"
    # Argument list rather than a shell string: characters such as `&` or `;`
    # in the link reach yt-dlp verbatim instead of being interpreted by a shell.
    cmd = ["yt-dlp", "-x", "--audio-format", "mp3", "-o", out_file, yt_link]
    try:
        result = subprocess.run(cmd)
    except FileNotFoundError:
        print("yt-dlp was not found. Install it before running this step.")
        return
    if result.returncode != 0:
        print(f"yt-dlp exited with code {result.returncode}; the audio may not have been downloaded.")


def find_files(in_path, exts=None):
    """Return the audio files located directly inside ``in_path``.

    Args:
        in_path: Directory to scan (not recursive).
        exts: Accepted file extensions, lower-case and without the leading
            dot (e.g. ``["mp3", "wav"]``). When omitted, the notebook-level
            ``extensions`` list is used.

    Returns:
        A sorted list of ``pathlib.Path`` objects. Only regular files are
        returned, so a directory that happens to be named ``foo.mp3`` is
        ignored. The extension comparison is case-insensitive.
    """
    accepted = set(extensions if exts is None else exts)
    # Sorted so the processing order does not depend on the filesystem.
    return sorted(
        entry
        for entry in Path(in_path).iterdir()
        if entry.is_file() and entry.suffix.lower().lstrip(".") in accepted
    )

def copy_process_streams(process: sp.Popen):
    """Forward a child process's stdout and stderr to this process's streams.

    Output is copied as it becomes available until both pipes reach
    end-of-file. The function does not wait for the process itself; callers
    should still call ``process.wait()`` afterwards.

    Args:
        process: A process started with ``stdout=PIPE`` and ``stderr=PIPE``.

    Raises:
        ValueError: If either pipe is missing on ``process``.
    """
    import codecs  # only needed here; kept local so the block is self-contained

    def raw(stream: Optional[IO[bytes]]) -> IO[bytes]:
        if stream is None:
            raise ValueError(
                "process must be created with stdout=PIPE and stderr=PIPE"
            )
        # Read from the unbuffered layer so that, after `select` reports the
        # descriptor as ready, `read` returns what is available instead of
        # blocking until the whole buffer is filled.
        if isinstance(stream, io.BufferedIOBase):
            stream = stream.raw
        return stream

    p_stdout, p_stderr = raw(process.stdout), raw(process.stderr)

    # One incremental decoder per pipe: a multi-byte UTF-8 character (such as
    # a progress-bar block) may be split across two reads, and decoding each
    # chunk on its own would raise UnicodeDecodeError. Undecodable bytes are
    # replaced rather than aborting the copy loop.
    new_decoder = codecs.getincrementaldecoder("utf-8")
    stream_by_fd: Dict[int, Tuple[IO[bytes], IO[str], codecs.IncrementalDecoder]] = {
        p_stdout.fileno(): (p_stdout, sys.stdout, new_decoder(errors="replace")),
        p_stderr.fileno(): (p_stderr, sys.stderr, new_decoder(errors="replace")),
    }
    fds = list(stream_by_fd)

    while fds:
        # `select` syscall will wait until one of the file descriptors has content.
        ready, _, _ = select.select(fds, [], [])
        for fd in ready:
            p_stream, std, decoder = stream_by_fd[fd]
            raw_buf = p_stream.read(2 ** 16)
            at_eof = not raw_buf
            # final=True at EOF flushes any incomplete trailing sequence.
            text = decoder.decode(raw_buf or b"", final=at_eof)
            if text:
                std.write(text)
                std.flush()
            if at_eof:
                fds.remove(fd)

def separate(inp=None, outp=None):
    """Run Demucs source separation on every audio file in a folder.

    The Demucs options are read from notebook-level settings: ``model``,
    ``mp3``, ``mp3_rate``, ``float32``, ``int24`` and ``two_stems``.

    Args:
        inp: Folder containing the input audio. Defaults to the notebook-level
            ``in_path``.
        outp: Folder Demucs writes its results to. Defaults to the
            notebook-level ``out_path``.

    Returns:
        None. Progress and failures are reported on stdout/stderr.
    """
    inp = inp or in_path
    outp = outp or out_path
    cmd = ["python3", "-m", "demucs.separate", "-o", str(outp), "-n", model]
    if mp3:
        cmd += ["--mp3", f"--mp3-bitrate={mp3_rate}"]
    if float32:
        cmd += ["--float32"]
    if int24:
        cmd += ["--int24"]
    if two_stems is not None:
        cmd += [f"--two-stems={two_stems}"]

    # Named `audio_files` so the imported `files` module is not shadowed.
    audio_files = [str(f) for f in find_files(inp)]
    if not audio_files:
        # Report the folder that was actually searched, not the default one.
        print(f"No valid audio files in {inp}")
        return

    p = sp.Popen(cmd + audio_files, stdout=sp.PIPE, stderr=sp.PIPE)
    # Stream Demucs' progress output live while it runs.
    copy_process_streams(p)
    p.wait()
    if p.returncode != 0:
        print("Command failed, something went wrong.")


def remove_silences(yt_link, *, min_silence_len=100, silence_thresh=-45, keep_silence=50):
    """Strip silent stretches from the separated vocals of a project.

    Reads ``/content/<video_id>/output/htdemucs/file/vocals.mp3``, splits it
    on silence, joins the remaining chunks back together, and writes the
    result to ``/content/<video_id>/trimmed.mp3``.

    Args:
        yt_link: YouTube link of the form ``...watch?v=<video_id>``; the
            project folder is derived from the text after the first ``=``.
        min_silence_len: Passed to pydub's ``split_on_silence`` (minimum
            length of a silence, in milliseconds).
        silence_thresh: Passed to ``split_on_silence`` (level, in dBFS, below
            which audio counts as silence).
        keep_silence: Passed to ``split_on_silence`` (amount of silence, in
            milliseconds, kept at the edges of each chunk).
    """
    project_dir = "/content/" + yt_link.split("=")[1]
    file_path = project_dir + "/output/htdemucs/file/vocals.mp3"
    file_name = "trimmed.mp3"
    audio_format = "mp3"

    sound = AudioSegment.from_file(file_path, format=audio_format)
    audio_chunks = split_on_silence(
        sound,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence,
    )

    # Concatenate the non-silent chunks in their original order.
    combined = AudioSegment.empty()
    for chunk in audio_chunks:
        combined += chunk
    combined.export(f'{project_dir}/{file_name}', format=audio_format)

def diarize(hf_token, yt_link):
    """Split a project's trimmed audio into one MP3 file per detected speaker.

    Runs the ``pyannote/speaker-diarization-3.0`` pipeline on
    ``/content/<video_id>/trimmed.mp3`` and, for every speaker label it
    reports, concatenates that speaker's turns and exports them to
    ``/content/<video_id>/<label>.mp3``.

    Args:
        hf_token: Hugging Face access token used to load the pipeline.
        yt_link: YouTube link of the form ``...watch?v=<video_id>``; the
            project folder is derived from the text after the first ``=``.

    Returns:
        List of speaker labels, in order of first appearance.

    Raises:
        RuntimeError: If the diarization pipeline could not be loaded.
    """
    project_dir = "/content/" + yt_link.split("=")[1]
    file_path = project_dir + "/trimmed.mp3"

    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.0", use_auth_token=hf_token)
    # NOTE(review): pyannote is understood to return None (after printing a
    # hint) when the model cannot be downloaded, e.g. with an empty or invalid
    # token. Fail here with a clear message instead of an AttributeError below.
    if pipeline is None:
        raise RuntimeError(
            "Could not load 'pyannote/speaker-diarization-3.0'. Check the "
            "Hugging Face token and that the model's terms have been accepted."
        )

    # Use the GPU when there is one, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pipeline.to(device)
    diarization = pipeline(file_path)
    audio = AudioSegment.from_mp3(file_path)

    # Single pass over the turns: dict insertion order doubles as the
    # first-appearance order of the speakers.
    buffer = {}
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        # Turn boundaries are in seconds; AudioSegment slices in milliseconds.
        start = int(turn.start * 1000)
        end = int(turn.end * 1000)
        buffer[speaker] = buffer.get(speaker, AudioSegment.empty()) + audio[start:end]

    for speaker, speaker_audio in buffer.items():
        speaker_audio.export(project_dir + "/" + speaker + ".mp3")

    return list(buffer)



In [None]:
# @title Isolating vocals and removing silences
# @markdown This should give good enough results. If your output does not come out well, move to the next step. Make sure you run this cell either way.

from urllib.parse import parse_qs, urlparse

from IPython.display import display

yt_link = 'https://www.youtube.com/watch?v=817P8W8-mGE'  # @param {type: "string"}
single_speaker_file = False # @param {type:"boolean"}
huggingface_token = ''  # @param {type: "string"}

# Reduce the link to https://youtube.com/watch?v=<id>: the helper functions
# derive the project folder from yt_link.split("=")[1], so extra query
# parameters (e.g. "&t=10s", "?feature=share") would end up in the folder name.
parsed_link = urlparse(yt_link)
if "shorts" in yt_link:
    # e.g. https://www.youtube.com/shorts/6ZOldMPhOoA
    video_id = parsed_link.path.rstrip("/").split("/")[-1]
else:
    video_id = parse_qs(parsed_link.query).get("v", [""])[0]
if video_id:
    yt_link = "https://youtube.com/watch?v=" + video_id

setup_project(yt_link)

project_id = yt_link.split("=")[1]
project_folder = f"/content/{project_id}"

# Demucs settings, read as globals by separate() and find_files().
model = "htdemucs"
extensions = ["mp3", "wav", "ogg", "flac"]
two_stems = "vocals"
mp3 = True
mp3_rate = 320
float32 = False
int24 = False

in_path = project_folder + "/input"
out_path = project_folder + "/output"

separate()
remove_silences(yt_link)

#files.download(project_folder + "/trimmed.mp3")

if single_speaker_file:
    # Only the last top-level expression of a cell is shown automatically, so
    # a player created inside an `if` block must be displayed explicitly.
    display(Audio(project_folder + "/trimmed.mp3"))
else:
    speakers = diarize(huggingface_token, yt_link)
    for s in speakers:
        files.download(project_folder + "/" + s + ".mp3")



Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /content/817P8W8-mGE/output/htdemucs
Separating track /content/817P8W8-mGE/input/file.mp3


100%|██████████████████████████████████████████████████████████████████████| 263.25/263.25 [00:11<00:00, 22.37seconds/s]
