# [Retrieval-based-Voice-Conversion-WebUI](https://github.com/giannifiore/Retrieval-based-Voice-Conversion-WebUI) Training notebook

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/giannifiore/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI.ipynb)

In [None]:
# @title Check GPU
# Print the NVIDIA driver / GPU status table for the attached runtime so the
# user can confirm a GPU is available before installing anything.
!nvidia-smi

In [None]:
# @title Mount Google Drive
# Mounts the user's Drive at /content/drive. Later cells read the dataset
# archive from, and copy checkpoints to, /content/drive/MyDrive.

from google.colab import drive

drive.mount("/content/drive")

In [None]:
# @title Install dependencies
!apt-get -y install build-essential python3-dev ffmpeg
!pip3 install --upgrade setuptools wheel av
!pip3 install 'pip<24.1'
import sys
py_ver = sys.version_info
if py_ver >= (3, 12):
    numpy_spec = "numpy==1.26.4"
    numba_spec = "numba==0.59.1"
    llvmlite_spec = "llvmlite==0.42.0"
elif py_ver >= (3, 11):
    numpy_spec = "numpy==1.26.4"
    numba_spec = "numba==0.58.1"
    llvmlite_spec = "llvmlite==0.41.1"
else:
    numpy_spec = "numpy==1.23.5"
    numba_spec = "numba==0.56.4"
    llvmlite_spec = "llvmlite==0.39.0"
COMMON_PACKAGES = f"faiss-cpu==1.8.0 gradio==3.14.0 ffmpeg ffmpeg-python praat-parselmouth pyworld {numpy_spec} {numba_spec} {llvmlite_spec} librosa==0.9.2 torchcrepe==0.0.24"
!pip3 install {COMMON_PACKAGES}
FAIRSEQ_PKG = 'fairseq==0.12.2' if sys.version_info < (3, 11) else 'fairseq@git+https://github.com/One-sixth/fairseq.git'
!pip3 install {FAIRSEQ_PKG}


In [None]:
# @title Clone repository

!git clone --depth=1 -b main https://github.com/giannifiore/Retrieval-based-Voice-Conversion-WebUI
%cd /content/Retrieval-based-Voice-Conversion-WebUI
!mkdir -p pretrained uvr5_weights

In [None]:
# @title Update repository (usually unnecessary)
# Runs in the current working directory, so the kernel must already be inside
# the repository checkout (the clone cell does the %cd).
!git pull

In [None]:
# @title Install aria2
# aria2c is the downloader used by all model-download cells below.
!apt -y install -qq aria2

In [None]:
# @title Download base models
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/pretrained -o D32k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/pretrained -o D40k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/pretrained -o D48k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/pretrained -o G32k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/pretrained -o G40k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/pretrained -o G48k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/pretrained -o f0D32k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/pretrained -o f0D40k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/pretrained -o f0D48k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/pretrained -o f0G32k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/pretrained -o f0G40k.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/pretrained -o f0G48k.pth

In [None]:
# @title Download vocal separation models
# Fetch the two UVR5 weight files (HP2 and HP5) into assets/uvr5_weights,
# keeping their upstream file names. -c lets an interrupted download resume.
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/uvr5_weights -o HP2-人声vocals+非人声instrumentals.pth
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/uvr5_weights -o HP5-主旋律人声vocals+其他instrumentals.pth

In [None]:
# @title Download hubert_base
# Fetch hubert_base.pt into assets/hubert. -c lets an interrupted download resume.
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/hubert -o hubert_base.pt

In [None]:
# @title #Download RMVPE model
# Create assets/rmvpe and fetch both the PyTorch (.pt) and ONNX (.onnx) builds
# of the RMVPE model into it.
!mkdir -p /content/Retrieval-based-Voice-Conversion-WebUI/assets/rmvpe
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/rmvpe -o rmvpe.pt
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.onnx -d /content/Retrieval-based-Voice-Conversion-WebUI/assets/rmvpe -o rmvpe.onnx

In [None]:
# @title Load packaged dataset from Google Drive to /content/dataset

# @markdown Dataset path
DATASET = (
    "/content/drive/MyDrive/RVC_Voice_Data-20251122T130953Z-1-001.zip"  # @param {type:"string"}
)

!mkdir -p /content/dataset
!unzip -d /content/dataset -B {DATASET}

In [None]:
# @title Rename duplicate files in the dataset
# List the extracted files, then rewrite backup-style names of the form
# "<name>.<ext>~<digits>" to "<name>_<digits>.<ext>" so the extension is last
# again (e.g. "a.wav~1" -> "a_1.wav"). The glob only matches files directly
# inside /content/dataset, not in subdirectories.
!ls -a /content/dataset/
!rename 's/(\w+)\.(\w+)~(\d*)/$1_$3.$2/' /content/dataset/*.*~*

In [None]:
# @title Launch web UI
# Enter the repository root and start infer-web.py; the cell keeps running for
# as long as the web UI process is alive.
%cd /content/Retrieval-based-Voice-Conversion-WebUI
# Optional: uncomment the two lines below to load TensorBoard on the logs directory.
# %load_ext tensorboard
# %tensorboard --logdir /content/Retrieval-based-Voice-Conversion-WebUI/logs
# NOTE(review): --colab / --pycmd semantics are defined by infer-web.py, not visible here.
!python3 infer-web.py --colab --pycmd python3

In [None]:
# @title Manually back up trained models to Google Drive
# @markdown Check the filenames under logs manually and update the command accordingly

# @markdown Model name
MODELNAME = "lulu"  # @param {type:"string"}
# @markdown Model epoch
MODELEPOCH = 9600  # @param {type:"integer"}

# NOTE(review): the Drive names are inverted relative to the source files:
# G_<epoch>.pth is stored as <model>_D_<epoch>.pth and D_<epoch>.pth as
# <model>_G_<epoch>.pth. The restore cell applies the same inverted mapping,
# so round-trips work; change both cells together (and rename any existing
# backups) if this is ever corrected.
!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_D_{MODELEPOCH}.pth
!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_G_{MODELEPOCH}.pth
# Index / npy files keep their original names and land in the Drive root.
!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/added_*.index /content/drive/MyDrive/
!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/total_*.npy /content/drive/MyDrive/

# The file in assets/weights is saved with the epoch appended to its name.
!cp /content/Retrieval-based-Voice-Conversion-WebUI/assets/weights/{MODELNAME}.pth /content/drive/MyDrive/{MODELNAME}{MODELEPOCH}.pth

In [None]:
# @title Restore pth from Google Drive
# @markdown Check the filenames under logs manually and update the command accordingly

# @markdown Model name
MODELNAME = "lulu"  # @param {type:"string"}
# @markdown Model epoch
MODELEPOCH = 7500  # @param {type:"integer"}

!mkdir -p /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}

# NOTE(review): mirrors the inverted naming used by the backup cell: the Drive
# file <model>_D_<epoch>.pth is restored as G_<epoch>.pth and <model>_G_<epoch>.pth
# as D_<epoch>.pth. Keep both cells in sync if this mapping is ever changed.
!cp /content/drive/MyDrive/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth
!cp /content/drive/MyDrive/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth
# NOTE(review): every *.index / *.npy in the Drive root is copied to /content/,
# not back into logs/<model>/ where the backup cell took them from.
!cp /content/drive/MyDrive/*.index /content/
!cp /content/drive/MyDrive/*.npy /content/
# Strips the epoch suffix again when restoring into assets/weights.
!cp /content/drive/MyDrive/{MODELNAME}{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/assets/weights/{MODELNAME}.pth

In [None]:
# @title Manual preprocessing (not recommended)
# @markdown Model name
MODELNAME = "lulu"  # @param {type:"string"}
# @markdown Sample rate
BITRATE = 48000  # @param {type:"integer"}
# @markdown Number of processes
THREADCOUNT = 8  # @param {type:"integer"}

# Script path and logs/<model> are relative, so the kernel must be in the
# repository root. Arguments passed, in order: dataset directory, BITRATE,
# THREADCOUNT, logs/<model>, and a literal True.
# NOTE(review): the meaning of the trailing True is defined by the script — confirm there.
!python3 trainset_preprocess_pipeline_print.py /content/dataset {BITRATE} {THREADCOUNT} logs/{MODELNAME} True

In [None]:
# @title Manual feature extraction (not recommended)
# @markdown Model name
MODELNAME = "lulu"  # @param {type:"string"}
# @markdown Number of processes
THREADCOUNT = 8  # @param {type:"integer"}
# @markdown Pitch extraction algorithm
ALGO = "harvest"  # @param {type:"string"}

# Both scripts are invoked with relative paths, so the kernel must be in the
# repository root. First pass: pitch extraction over logs/<model> using
# THREADCOUNT and ALGO.
!python3 extract_f0_print.py logs/{MODELNAME} {THREADCOUNT} {ALGO}

# Second pass: feature extraction, invoked with the literal arguments
# "cpu 1 0 0" followed by logs/<model> and True.
# NOTE(review): positional meanings are defined by the script — confirm there.
!python3 extract_feature_print.py cpu 1 0 0 logs/{MODELNAME} True

In [None]:
# @title Manual training (not recommended)
# @markdown Model name
MODELNAME = "lulu"  # @param {type:"string"}
# @markdown GPU to use
USEGPU = "0"  # @param {type:"string"}
# @markdown Batch size
BATCHSIZE = 32  # @param {type:"integer"}
# @markdown Stop at epoch
MODELEPOCH = 3200  # @param {type:"integer"}
# @markdown Checkpoint save interval
EPOCHSAVE = 100  # @param {type:"integer"}
# @markdown Sample rate
MODELSAMPLE = "48k"  # @param {type:"string"}
# @markdown Cache dataset in GPU memory
CACHEDATA = 1  # @param {type:"integer"}
# @markdown Only keep the latest ckpt file
ONLYLATEST = 0  # @param {type:"integer"}

!python3 train_nsf_sim_cache_sid_load_pretrain.py -e lulu -sr {MODELSAMPLE} -f0 1 -bs {BATCHSIZE} -g {USEGPU} -te {MODELEPOCH} -se {EPOCHSAVE} -pg pretrained/f0G{MODELSAMPLE}.pth -pd pretrained/f0D{MODELSAMPLE}.pth -l {ONLYLATEST} -c {CACHEDATA}

In [None]:
# @title Delete other pth files, keep the selected one (handle with care)
# @markdown Model name
MODELNAME = "lulu"  # @param {type:"string"}
# @markdown Selected model epoch
MODELEPOCH = 9600  # @param {type:"integer"}

!echo "Backing up the selected model..."
!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_D_{MODELEPOCH}.pth
!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth

!echo "Deleting..."
!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}
!rm /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/*.pth

!echo "Restoring the selected model..."
!mv /content/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth
!mv /content/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth

!echo "Deletion complete"
!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}

In [None]:
# @title Clean all project files, keep the selected model only (handle with care)
# @markdown Model name
MODELNAME = "lulu"  # @param {type:"string"}
# @markdown Selected model epoch
MODELEPOCH = 9600  # @param {type:"integer"}

!echo "Backing up the selected model..."
!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_D_{MODELEPOCH}.pth
!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth

!echo "Deleting..."
!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}
!rm -rf /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/*

!echo "Restoring the selected model..."
!mv /content/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth
!mv /content/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth

!echo "Deletion complete"
!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}