Commit 193d6e7: Merge branch 'dev'

ddPn08 committed Apr 22, 2023
2 parents 6f3389e + a5b47bb
Showing 8 changed files with 248 additions and 88 deletions.
66 changes: 40 additions & 26 deletions lib/rvc/pipeline.py
@@ -1,16 +1,18 @@
-from typing import *
 import os
 import traceback
+from typing import *

 import faiss
-# from faiss.swigfaiss_avx2 import IndexIVFFlat # cause crash on windows' faiss-cpu installed from pip
-from fairseq.models.hubert import HubertModel
 import numpy as np
 import parselmouth
 import pyworld
 import scipy.signal as signal
 import torch
 import torch.nn.functional as F
+# from faiss.swigfaiss_avx2 import IndexIVFFlat # cause crash on windows' faiss-cpu installed from pip
+from fairseq.models.hubert import HubertModel
+from transformers import HubertModel as TrHubertModel
+from transformers import Wav2Vec2FeatureExtractor

 from .models import SynthesizerTrnMs256NSFSid

@@ -98,13 +100,13 @@ def get_f0(

     def _convert(
         self,
-        model: HubertModel,
+        model: Union[HubertModel, Tuple[Wav2Vec2FeatureExtractor, TrHubertModel]],
         net_g: SynthesizerTrnMs256NSFSid,
         sid: int,
         audio: np.ndarray,
         pitch: np.ndarray,
         pitchf: np.ndarray,
-        index,
+        index: faiss.IndexIVFFlat,
         big_npy: np.ndarray,
         index_rate: float,
     ):
@@ -120,27 +122,39 @@ def _convert(
         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)

         is_feats_dim_768 = net_g.emb_channels == 768

-        inputs = (
-            {
-                "source": feats.to(self.device),
-                "padding_mask": padding_mask,
-                "output_layer": 9,  # layer 9
-            }
-            if not is_feats_dim_768
-            else {
-                "source": feats.to(self.device),
-                "padding_mask": padding_mask,
-                # no pass "output_layer"
-            }
-        )
-
-        with torch.no_grad():
-            logits = model.extract_features(**inputs)
-            if is_feats_dim_768:
-                feats = logits[0]
-            else:
-                feats = model.final_proj(logits[0])
+        if isinstance(model, tuple):
+            feats = model[0](feats.squeeze(0).squeeze(0).to(self.device), return_tensors="pt", sampling_rate=16000)
+            if self.is_half:
+                feats = feats.input_values.to(self.device).half()
+            else:
+                feats = feats.input_values.to(self.device)
+            with torch.no_grad():
+                if is_feats_dim_768:
+                    feats = model[1](feats).last_hidden_state
+                else:
+                    feats = model[1](feats).extract_features
+        else:
+            inputs = (
+                {
+                    "source": feats.to(self.device),
+                    "padding_mask": padding_mask,
+                    "output_layer": 9,  # layer 9
+                }
+                if not is_feats_dim_768
+                else {
+                    "source": feats.to(self.device),
+                    "padding_mask": padding_mask,
+                    # no pass "output_layer"
+                }
+            )
+
+            with torch.no_grad():
+                logits = model.extract_features(**inputs)
+                if is_feats_dim_768:
+                    feats = logits[0]
+                else:
+                    feats = model.final_proj(logits[0])

         if (
             isinstance(index, type(None)) == False
@@ -192,7 +206,7 @@ def _convert(

     def __call__(
         self,
-        model: HubertModel,
+        model: Union[HubertModel, Tuple[Wav2Vec2FeatureExtractor, TrHubertModel]],
         net_g: SynthesizerTrnMs256NSFSid,
         sid: int,
         audio: np.ndarray,
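Net effect of this file's changes: _convert and __call__ now accept either a fairseq HubertModel or a (Wav2Vec2FeatureExtractor, TrHubertModel) pair from transformers, and pick the feature-extraction path with isinstance(model, tuple). A minimal sketch of how a caller might build each variant; the checkpoint path and repo id below are illustrative assumptions, not taken from this commit:

import torch
from fairseq import checkpoint_utils
from transformers import HubertModel as TrHubertModel
from transformers import Wav2Vec2FeatureExtractor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Variant 1: fairseq checkpoint (the pre-existing path).
models, _cfg, _ = checkpoint_utils.load_model_ensemble_and_task(
    ["models/hubert_base.pt"]  # hypothetical checkpoint path
)
fairseq_model = models[0].to(device).eval()

# Variant 2: transformers pair; isinstance(model, tuple) selects this branch.
repo = "facebook/hubert-base-ls960"  # hypothetical repo id
tr_model = (
    Wav2Vec2FeatureExtractor.from_pretrained(repo),
    TrHubertModel.from_pretrained(repo).to(device).eval(),
)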
137 changes: 100 additions & 37 deletions lib/rvc/preprocessing/extract_feature.py
@@ -1,5 +1,6 @@
 import os
 import traceback
+from concurrent import futures
 from concurrent.futures import ProcessPoolExecutor
 from typing import *

@@ -9,26 +10,65 @@
 import torch.nn.functional as F
 from fairseq import checkpoint_utils
 from tqdm import tqdm
+from transformers import HubertModel as TrHubertModel
+from transformers import Wav2Vec2FeatureExtractor
+
+from modules import shared


 def load_embedder(embedder_path: str, device):
-    models, cfg, _ = checkpoint_utils.load_model_ensemble_and_task(
-        [embedder_path],
-        suffix="",
-    )
-    embedder_model = models[0]
-    embedder_model = embedder_model.to(device)
-    if device != "cpu":
-        embedder_model = embedder_model.half()
-    else:
-        embedder_model = embedder_model.float()
-    embedder_model.eval()
+    try:
+        models, cfg, _ = checkpoint_utils.load_model_ensemble_and_task(
+            [embedder_path],
+            suffix="",
+        )
+        embedder_model = models[0]
+        embedder_model = embedder_model.to(device)
+        if device != "cpu":
+            embedder_model = embedder_model.half()
+        else:
+            embedder_model = embedder_model.float()
+        embedder_model.eval()
+    except Exception as e:
+        print(f"Error: {e} {embedder_path}")
+        traceback.print_exc()

     return embedder_model, cfg


+def load_transformers_hubert(repo_name: str, device):
+    try:
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(repo_name)
+        embedder_model = TrHubertModel.from_pretrained(repo_name).to(device)
+        if device != "cpu":
+            embedder_model = embedder_model.half()
+        else:
+            embedder_model = embedder_model.float()
+        embedder_model.eval()
+    except Exception as e:
+        print(f"Error: {e} {repo_name}")
+        traceback.print_exc()
+
+    return (feature_extractor, embedder_model), None
+
+
+def load_transformers_hubert_local(embedder_path: str, device):
+    try:
+        embedder_path = os.path.join(shared.ROOT_DIR, embedder_path)
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(embedder_path, local_files_only=True)
+        embedder_model = TrHubertModel.from_pretrained(embedder_path, local_files_only=True).to(device)
+        if device != "cpu":
+            embedder_model = embedder_model.half()
+        else:
+            embedder_model = embedder_model.float()
+        embedder_model.eval()
+    except Exception as e:
+        print(f"Error: {e} {embedder_path}")
+        traceback.print_exc()
+
+    return (feature_extractor, embedder_model), None
+
+
 # wave must be 16k, hop_size=320
 def readwave(wav_path, normalize=False):
     wav, sr = sf.read(wav_path)
@@ -48,14 +88,21 @@ def processor(
     todo: List[str],
     device: torch.device,
     embedder_path: str,
+    embedder_load_from: str,
     is_feats_dim_768: bool,
     wav_dir: str,
     out_dir: str,
     process_id: int,
 ):
-    if not os.path.exists(embedder_path):
+    if embedder_load_from == "local" and not os.path.exists(embedder_path):
         return f"Embedder not found: {embedder_path}"
-    model, cfg = load_embedder(embedder_path, device)
+
+    if embedder_load_from == "hf":
+        model, cfg = load_transformers_hubert(embedder_path, device)
+    elif embedder_load_from == "tr-local":
+        model, cfg = load_transformers_hubert_local(embedder_path, device)
+    else:
+        model, cfg = load_embedder(embedder_path, device)

     for file in tqdm(todo, position=1 + process_id):
         try:
@@ -67,32 +114,45 @@ def processor(
                 continue

             os.makedirs(os.path.dirname(out_filepath), exist_ok=True)

-            feats = readwave(wav_filepath, normalize=cfg.task.normalize)
-
+            is_normalize = False if cfg is None else cfg.task.normalize
+            feats = readwave(wav_filepath, normalize=is_normalize)
             padding_mask = torch.BoolTensor(feats.shape).fill_(False)
-            inputs = (
-                {
-                    "source": feats.half().to(device)
-                    if device != "cpu"
-                    else feats.to(device),
-                    "padding_mask": padding_mask.to(device),
-                    "output_layer": 9,  # layer 9
-                }
-                if not is_feats_dim_768
-                else {
-                    "source": feats.half().to(device)
-                    if device != "cpu"
-                    else feats.to(device),
-                    "padding_mask": padding_mask.to(device),
-                    # no pass "output_layer"
-                }
-            )
-            with torch.no_grad():
-                logits = model.extract_features(**inputs)
-                if is_feats_dim_768:
-                    feats = logits[0]
-                else:
-                    feats = model.final_proj(logits[0])
+            if isinstance(model, tuple):
+                feats = model[0](feats.squeeze(0).squeeze(0).to(device), return_tensors="pt", sampling_rate=16000)
+                if device != "cpu":
+                    feats = feats.input_values.to(device).half()
+                else:
+                    feats = feats.input_values.to(device)
+                with torch.no_grad():
+                    if is_feats_dim_768:
+                        feats = model[1](feats).last_hidden_state
+                    else:
+                        feats = model[1](feats).extract_features
+            else:
+                inputs = (
+                    {
+                        "source": feats.half().to(device)
+                        if device != "cpu"
+                        else feats.to(device),
+                        "padding_mask": padding_mask.to(device),
+                        "output_layer": 9,  # layer 9
+                    }
+                    if not is_feats_dim_768
+                    else {
+                        "source": feats.half().to(device)
+                        if device != "cpu"
+                        else feats.to(device),
+                        "padding_mask": padding_mask.to(device),
+                        # no pass "output_layer"
+                    }
+                )
+                with torch.no_grad():
+                    logits = model.extract_features(**inputs)
+                    if is_feats_dim_768:
+                        feats = logits[0]
+                    else:
+                        feats = model.final_proj(logits[0])

             feats = feats.squeeze(0).float().cpu().numpy()
             if np.isnan(feats).sum() == 0:
@@ -107,6 +167,7 @@ def processor(
 def run(
     training_dir: str,
     embedder_path: str,
+    embedder_load_from: str,
     is_feats_dim_768: bool,
     gpu_ids: List[int],
     device: Optional[Union[torch.device, str]] = None,
@@ -143,6 +204,7 @@ def run(
             todo,
             device,
             embedder_path,
+            embedder_load_from,
             is_feats_dim_768,
             wav_dir,
             out_dir,
@@ -156,6 +218,7 @@ def run(
                 todo[i::num_gpus],
                 torch.device(f"cuda:{id}"),
                 embedder_path,
+                embedder_load_from,
                 is_feats_dim_768,
                 wav_dir,
                 out_dir,
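The embedder_load_from switch threaded through processor() and run() selects among the three loaders above: "local" (fairseq checkpoint on disk, existence-checked), "hf" (transformers checkpoint fetched from the Hugging Face Hub by repo id), and "tr-local" (transformers checkpoint under shared.ROOT_DIR). For context, a self-contained sketch of what the transformers branch computes; the model id and the synthetic one-second clip are assumptions for illustration, not from this commit:

import numpy as np
import torch
from transformers import HubertModel, Wav2Vec2FeatureExtractor

repo = "facebook/hubert-base-ls960"  # illustrative model id
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(repo)
hubert = HubertModel.from_pretrained(repo).eval()

wav = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
inputs = feature_extractor(wav, return_tensors="pt", sampling_rate=16000)
with torch.no_grad():
    feats = hubert(inputs.input_values).last_hidden_state  # (1, frames, 768)
# The diff's 256-dim path reads .extract_features from the model output instead.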
8 changes: 8 additions & 0 deletions lib/rvc/preprocessing/split.py
@@ -37,6 +37,14 @@ def norm_write(
         tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (max * alpha)) + (
             1 - alpha
         ) * tmp_audio
+    else:
+        # clip level to max (cause sometimes when floating point decoding)
+        audio_min = np.min(tmp_audio)
+        if audio_min < -max:
+            tmp_audio = tmp_audio / -audio_min * max
+        audio_max = np.max(tmp_audio)
+        if audio_max > max:
+            tmp_audio = tmp_audio / audio_max * max

     wavfile.write(
         os.path.join(outdir, f"{speaker_id:05}", f"{idx0}_{idx1}.wav"),
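The new else branch in norm_write() covers the unnormalized path: floating-point decoding can overshoot slightly past the target level, so audio whose trough or peak exceeds ±max is rescaled back into range rather than hard-clipped. A standalone sketch with made-up sample values; the variable names mirror the diff, where max shadows the builtin as well:

import numpy as np

max = 0.9  # target peak level, per the diff's variable of the same name
tmp_audio = np.array([-1.2, 0.5, 1.05], dtype=np.float32)

audio_min = np.min(tmp_audio)
if audio_min < -max:
    tmp_audio = tmp_audio / -audio_min * max  # scale so the trough sits at -max
audio_max = np.max(tmp_audio)
if audio_max > max:
    tmp_audio = tmp_audio / audio_max * max  # then scale so the peak sits at +max

print(tmp_audio)  # every sample now within [-0.9, 0.9]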
17 changes: 5 additions & 12 deletions lib/rvc/train.py
@@ -22,20 +22,13 @@
 from . import commons, utils
 from .checkpoints import save
 from .config import DatasetMetadata, TrainConfig
-from .data_utils import (
-    DistributedBucketSampler,
-    TextAudioCollate,
-    TextAudioCollateMultiNSFsid,
-    TextAudioLoader,
-    TextAudioLoaderMultiNSFsid,
-)
+from .data_utils import (DistributedBucketSampler, TextAudioCollate,
+                         TextAudioCollateMultiNSFsid, TextAudioLoader,
+                         TextAudioLoaderMultiNSFsid)
 from .losses import discriminator_loss, feature_loss, generator_loss, kl_loss
 from .mel_processing import mel_spectrogram_torch, spec_to_mel_torch
-from .models import (
-    MultiPeriodDiscriminator,
-    SynthesizerTrnMs256NSFSid,
-    SynthesizerTrnMs256NSFSidNono,
-)
+from .models import (MultiPeriodDiscriminator, SynthesizerTrnMs256NSFSid,
+                     SynthesizerTrnMs256NSFSidNono)


 def glob_dataset(glob_str: str, speaker_id: int):
