# 1. Load models and setup

In [9]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from svc_helper.svc.rvc import RVCModel
from svc_helper.sfeatures.models import RVCHubertModel
from huggingface_hub import hf_hub_download
import torch

device = 'cuda'

rvc_model = RVCModel()
test_model_path = hf_hub_download(repo_id='therealvul/RVCv2', 
    filename='RarityS1/Rarity.pth')
test_index_path = hf_hub_download(repo_id='therealvul/RVCv2', 
    filename='RarityS1/added_IVF1866_Flat_nprobe_1_Rarity_v2.index')
rvc_model.load_model(model_path = test_model_path,
    index_path = test_index_path)

sfeatures_model = RVCHubertModel(device = device, is_half=True)
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-base.en")
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-base.en")

if (device == 'cuda'):
    model = model.cuda()

import os
import yaml
from glob import glob

def load_config_and_latest_checkpoint(log_dir):
    """
    Loads the config data from a YAML file and retrieves the latest checkpoint file 
    from the corresponding checkpoints folder.

    Args:
        log_dir (str): The path to the lightning_logs/version_0 directory.

    Returns:
        config (dict): The loaded configuration data from the YAML file.
        latest_checkpoint (str): The path to the latest checkpoint file.
    """
    # Path to config YAML file
    config_path = os.path.join(log_dir, "hparams.yaml")
    
    # Load the config data from the YAML file
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Config YAML file not found: {config_path}")
    
    with open(config_path, 'r') as file:
        config = yaml.safe_load(file)
    
    # Path to the checkpoints folder
    checkpoints_dir = os.path.join(log_dir, "checkpoints")
    
    if not os.path.exists(checkpoints_dir):
        raise FileNotFoundError(f"Checkpoints directory not found: {checkpoints_dir}")
    
    # Find all checkpoint files in the folder (assuming they have .ckpt extension)
    checkpoint_files = glob(os.path.join(checkpoints_dir, "*.ckpt"))
    
    if not checkpoint_files:
        raise FileNotFoundError(f"No checkpoint files found in: {checkpoints_dir}")
    
    # Get the latest checkpoint file (based on the latest modification time)
    latest_checkpoint = max(checkpoint_files, key=os.path.getmtime)
    
    return config, latest_checkpoint

2024-08-27 21:57:05 | INFO | svc_helper.svc.rvc.modules.vc.modules | Get sid: D:\hf_cache\hub\models--therealvul--RVCv2\snapshots\87778762d011892db45370f0dd963be836d55a08\RarityS1\Rarity.pth
2024-08-27 21:57:05 | INFO | svc_helper.svc.rvc.modules.vc.modules | Loading: D:\hf_cache\hub\models--therealvul--RVCv2\snapshots\87778762d011892db45370f0dd963be836d55a08\RarityS1\Rarity.pth
  self.cpt = torch.load(person, map_location="cpu")
  WeightNorm.apply(module, name, dim)
  state = torch.load(f, map_location=torch.device("cpu"))
2024-08-27 21:57:06 | INFO | fairseq.tasks.hubert_pretraining | current directory is d:\Code\raraai\5_aligner
2024-08-27 21:57:06 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': 'metadata', 'fine_tuning': False, 'labels': ['km'], 'label_dir': 'label', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size

# 2. Load model

In [10]:
from train_module import LitModel
checkpoint_folder = './lightning_logs/version_0'

config, latest_checkpoint = load_config_and_latest_checkpoint(
    checkpoint_folder)
lmodel = LitModel.load_from_checkpoint(
    latest_checkpoint, config).to(device)

  return torch.load(f, map_location=map_location)  # type: ignore[arg-type]


# 3. Inference

In [11]:
import IPython.display as ipd
import torch.nn.functional as F
from einops import rearrange
import soundfile as sf
in_file = './srcaudio4.flac'
#in_file = '../3_basicdenoiser/ood5_male.wav'
transpose=0
index_rate=1.0

def feature_override(aud):
    sf.write('from_rvc.wav', aud, 16000)
    # ContentVec
    feats = sfeatures_model.extract_features(aud)

    # Whisper decoder features
    input_features = processor(
        aud, sampling_rate=16000, return_tensors='pt'
    ).input_features.to(device)
    output = model.generate(input_features, 
        output_hidden_states=True, 
        return_dict_in_generate=True)
    decoder_states = torch.cat(
        [t[0] for t in output.decoder_hidden_states], dim=1)


    hubert_len = torch.tensor([feats.shape[1]]).to(device)
    whisper_len = torch.tensor([decoder_states.shape[1]]).to(device)

    print(decoder_states.shape)
    print(whisper_len)
    print(feats.shape)
    print(hubert_len)

    with torch.no_grad():
        (align_m, align_logs, latent_feats,
            true_decoded, pred_decoded) = lmodel.model(
                x = decoder_states,
                x_lens = whisper_len,
                z = feats,
                z_lens = hubert_len
            )

    #pred_decoded = rearrange(pred_decoded, 'b n c -> n b c')
    print(pred_decoded.shape)
    print(feats.shape)

    print('mse loss: ',F.mse_loss(pred_decoded, feats))
    print('mse loss: ',F.mse_loss(true_decoded, feats))
    #return pred_decoded
    return true_decoded

print('Unmodified')
wav_opt = rvc_model.infer_file(in_file, index_rate=index_rate,
    transpose=transpose)
ipd.display(ipd.Audio(wav_opt, rate=rvc_model.output_sample_rate()))
print('Transform')
wav_opt = rvc_model.infer_file(in_file, index_rate=0.0,
    transpose=transpose, extra_hooks={
        'feature_override': feature_override})
ipd.display(ipd.Audio(wav_opt, rate=rvc_model.output_sample_rate()))

Unmodified


2024-08-27 21:57:14 | INFO | fairseq.tasks.hubert_pretraining | current directory is d:\Code\raraai\5_aligner
2024-08-27 21:57:14 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': 'metadata', 'fine_tuning': False, 'labels': ['km'], 'label_dir': 'label', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2024-08-27 21:57:14 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.1, 'dropout_features': 0.1, 'fina

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transform
torch.Size([1, 3, 512])
tensor([3], device='cuda:0')
torch.Size([1, 505, 768])
tensor([505], device='cuda:0')


  File "c:\Users\vul\AppData\Local\Programs\Python\Python310\lib\site-packages\svc_helper\svc\rvc\modules\vc\modules.py", line 270, in vc_single
    audio_opt = self.pipeline.pipeline(
  File "c:\Users\vul\AppData\Local\Programs\Python\Python310\lib\site-packages\svc_helper\svc\rvc\modules\vc\pipeline.py", line 520, in pipeline
    self.vc(
  File "c:\Users\vul\AppData\Local\Programs\Python\Python310\lib\site-packages\svc_helper\svc\rvc\modules\vc\pipeline.py", line 241, in vc
    feats = feature_override(feats).to(self.device) # Pass in the padded audio
  File "C:\Users\vul\AppData\Local\Temp\ipykernel_25940\619532462.py", line 36, in feature_override
    true_decoded, pred_decoded) = lmodel.model(
  File "c:\Users\vul\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "c:\Users\vul\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py"

ValueError: No audio data found. Expecting filename, url, or data.

In [14]:
from dataset import SpeechFeatureDataset
import librosa
import soundfile as sf
import numpy as np

data = SpeechFeatureDataset(
    data_path='dataset_unconditional_50.parquet', split='train')
print(data.dataset[0]['source'])
print(data.dataset[0]['transcription'])
hubert, hubert_len, whisper, whisper_len = data[0]
hubert = hubert.unsqueeze(0).to(device)
whisper = whisper.unsqueeze(0).to(device)
hubert_len = torch.Tensor([hubert_len]).to(device)
whisper_len = torch.Tensor([whisper_len]).to(device)

audio = data.dataset[0]['audio']['array']
audio_resamp = librosa.resample(audio, orig_sr=48000, target_sr=16000)
audio_max = np.abs(audio_resamp).max() / 0.95
if audio_max > 1:
    audio_resamp /= audio_max
print(len(audio_resamp))
print(audio_resamp[0])
print(audio_resamp[-1])
audio_array = sfeatures_model.pad_audio(audio_resamp)
print(len(audio_array))
sf.write('from_us.wav', audio_array, 16000)

feats = sfeatures_model.extract_features(
    torch.from_numpy(audio_array).to(device)
)
print(feats.shape)
print(hubert.shape)


#print('hubert:',torch.sum(hubert))
#print('whisper:',torch.sum(whisper))
#with torch.no_grad():
#    (align_m, align_logs, latent_feats,
#        true_decoded, pred_decoded) = lmodel.model(
#            x = whisper,
#            x_lens = whisper_len,
#            z = hubert,
#            z_lens = hubert_len
#        )
#    print(F.mse_loss(true_decoded, hubert))
#    print(F.mse_loss(pred_decoded, hubert))
#    print(F.mse_loss(true_decoded, pred_decoded))

fim:s4e13
Let's dispense with the charade, shall we?
40304
3.0481805879389867e-07
0.00012390565825626254
136304


NameError: name 'audio_padded' is not defined