# 1. Load models and setup

In [1]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from svc_helper.svc.rvc import RVCModel
from svc_helper.sfeatures.models import RVCHubertModel
from huggingface_hub import hf_hub_download
from svc_helper.pitch.rmvpe import RMVPEModel
import torch


# Device string shared by every model created in this notebook.
device = 'cuda'

# Pitch extractor and RVC model; the voice weights and the matching index
# file are downloaded from the Hugging Face Hub.
pitch_model = RMVPEModel()
rvc_model = RVCModel()
test_model_path = hf_hub_download(
    repo_id='therealvul/RVCv2',
    filename='RarityS1/Rarity.pth')
test_index_path = hf_hub_download(
    repo_id='therealvul/RVCv2',
    filename='RarityS1/added_IVF1866_Flat_nprobe_1_Rarity_v2.index')
rvc_model.load_model(
    model_path=test_model_path,
    index_path=test_index_path)

# Speech-feature model (half precision) plus the Whisper processor/model pair.
sfeatures_model = RVCHubertModel(device=device, is_half=True)
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-base.en")
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-base.en")

# Move Whisper to whatever `device` names. The previous check only matched the
# exact string 'cuda', so e.g. 'cuda:0' silently left the model on the CPU
# while its inputs were sent to the GPU.
model = model.to(device)

import os
import yaml
from glob import glob

def load_config_and_latest_checkpoint(log_dir):
    """
    Loads the config data from a YAML file and retrieves the latest checkpoint file 
    from the corresponding checkpoints folder.

    The config is read from ``<log_dir>/hparams.yaml`` and checkpoints are
    searched for as ``<log_dir>/checkpoints/*.ckpt``. "Latest" means the file
    with the most recent modification time, not the highest step/epoch in
    its name.

    Args:
        log_dir (str): The path to the lightning_logs/version_0 directory.

    Returns:
        config (dict): The loaded configuration data from the YAML file
            (whatever ``yaml.safe_load`` returns for that file).
        latest_checkpoint (str): The path to the latest checkpoint file.

    Raises:
        FileNotFoundError: If ``hparams.yaml`` is not a regular file, if the
            ``checkpoints`` directory is missing, or if it contains no
            ``*.ckpt`` files.
    """
    # Path to config YAML file
    config_path = os.path.join(log_dir, "hparams.yaml")

    # isfile() rather than exists(): a directory that happens to be called
    # hparams.yaml should be reported as a missing config, not blow up in open().
    if not os.path.isfile(config_path):
        raise FileNotFoundError(f"Config YAML file not found: {config_path}")

    # Explicit encoding so parsing does not depend on the platform locale.
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)

    # Path to the checkpoints folder
    checkpoints_dir = os.path.join(log_dir, "checkpoints")

    # isdir() so that a stray regular file with this name is rejected here.
    if not os.path.isdir(checkpoints_dir):
        raise FileNotFoundError(f"Checkpoints directory not found: {checkpoints_dir}")

    # Find all checkpoint files in the folder (assuming they have .ckpt extension)
    checkpoint_files = glob(os.path.join(checkpoints_dir, "*.ckpt"))

    if not checkpoint_files:
        raise FileNotFoundError(f"No checkpoint files found in: {checkpoints_dir}")

    # Get the latest checkpoint file (based on the latest modification time)
    latest_checkpoint = max(checkpoint_files, key=os.path.getmtime)

    return config, latest_checkpoint

  return torch.cuda.amp.custom_fwd(orig_func)  # type: ignore
  return torch.cuda.amp.custom_bwd(orig_func)  # type: ignore
    PyTorch 2.1.0+cu118 with CUDA 1108 (you have 2.4.0+cu121)
    Python  3.10.11 (you have 3.10.7)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
Error caught was: No module named 'triton'
  def forward(cls, ctx, x, w1, b1, w2, b2, w3, b3):
  def backward(cls, ctx, dx5):


RVC_RMVPE_PATH: D:\hf_cache\hub\models--therealvul--svc_helper\snapshots\92e54a9bf752ac35f4e3453c232c6da3b82ce4ae\rvc_rmvpe.pt


  ckpt = torch.load(model_path, map_location="cpu")
2024-08-28 10:07:45 | INFO | svc_helper.svc.rvc.configs.config | Found GPU NVIDIA GeForce RTX 3080 Ti Laptop GPU
2024-08-28 10:07:46 | INFO | svc_helper.svc.rvc.modules.vc.modules | Get sid: D:\hf_cache\hub\models--therealvul--RVCv2\snapshots\87778762d011892db45370f0dd963be836d55a08\RarityS1\Rarity.pth


is_half:True, device:cuda:0


2024-08-28 10:07:46 | INFO | svc_helper.svc.rvc.modules.vc.modules | Loading: D:\hf_cache\hub\models--therealvul--RVCv2\snapshots\87778762d011892db45370f0dd963be836d55a08\RarityS1\Rarity.pth
  self.cpt = torch.load(person, map_location="cpu")
  WeightNorm.apply(module, name, dim)
  state = torch.load(f, map_location=torch.device("cpu"))
2024-08-28 10:07:46 | INFO | fairseq.tasks.hubert_pretraining | current directory is d:\Code\raraai\5_aligner
2024-08-28 10:07:46 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': 'metadata', 'fine_tuning': False, 'labels': ['km'], 'label_dir': 'label', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2024-08-28 10:07:46 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50

# 2. Load the trained LitModel from its latest checkpoint

In [2]:
from train_module import LitModel
# Lightning run directory holding hparams.yaml and the checkpoints/ folder.
checkpoint_folder = './lightning_logs/version_0'

config, latest_checkpoint = load_config_and_latest_checkpoint(
    checkpoint_folder)

# NOTE(review): `config` is passed positionally. In stock Lightning the second
# positional parameter of load_from_checkpoint is `map_location`, so unless
# LitModel overrides the method the config may not reach the constructor —
# confirm against train_module.LitModel and pass it by keyword if needed.
lmodel = LitModel.load_from_checkpoint(
    latest_checkpoint, config).to(device)

# This notebook only runs inference, so switch off training-time behaviour
# (dropout, batch-norm statistic updates) before the model is used.
lmodel.eval()

  return torch.load(f, map_location=map_location)  # type: ignore[arg-type]


# 3. Inference

In [18]:
import IPython.display as ipd
import torch.nn.functional as F
from einops import rearrange
import soundfile as sf
import librosa
in_file = './srcaudio6.flac'

# Load the clip at the sample rate the speech-feature model expects, then
# extract the speech features and the pitch track from the same waveform.
data, rate = librosa.load(in_file,
    sr=RVCHubertModel.expected_sample_rate)
feats = sfeatures_model.extract_features(
    torch.from_numpy(data))
pitch = pitch_model.extract_pitch(data)
# Second argument is 0 — presumably "no transposition"; TODO confirm against
# RVCModel.f0_transform.
pitch, pitchf = rvc_model.f0_transform(pitch, 0)

# Whisper decoder features
input_features = processor(
    data, sampling_rate=16000, return_tensors='pt'
).input_features.to(device)
output = model.generate(input_features,
    output_hidden_states=True,
    return_dict_in_generate=True)
# decoder_hidden_states has one entry per generation step; element [0] of each
# entry is taken and the steps are concatenated along dim 1.
# NOTE(review): [0] is the first item of the per-step tuple — confirm this is
# the same layer the aligner was trained on.
decoder_states = torch.cat(
    [t[0] for t in output.decoder_hidden_states], dim=1)
hubert_len = torch.tensor([feats.shape[1]]).to(device)
whisper_len = torch.tensor([decoder_states.shape[1]]).to(device)

with torch.no_grad():
    (align_m, align_logs, latent_feats,
        true_decoded, pred_decoded) = lmodel.model(
            x = decoder_states,
            x_lens = whisper_len,
            z = feats.float(),
            z_lens = hubert_len
        )

print('Autoencoder mse loss: ',F.mse_loss(true_decoded, feats))
print('Pred mse loss: ',F.mse_loss(pred_decoded, feats))

# Synthesize and play each feature variant through the same RVC voice.
# The "Predicted" entry previously re-used `true_decoded`, so the predicted
# features were never actually auditioned; it now uses `pred_decoded`.
for label, rvc_input in (
        ('True', feats),
        ('Through autoencoder', true_decoded),
        ('Predicted', pred_decoded)):
    print(label)
    wav_opt = rvc_model.raw_infer(rvc_input, pitch, pitchf)
    ipd.display(ipd.Audio(wav_opt, rate=rvc_model.output_sample_rate()))

Autoencoder mse loss:  tensor(0.0994, device='cuda:0')
Pred mse loss:  tensor(0.1120, device='cuda:0')
True


Through autoencoder


Predicted


# 4. Is our network actually doing anything?

In [13]:
# Baseline: MSE between the speech features of two unrelated clips, as a
# reference point for the reconstruction losses reported above.
in_file, in_file2 = './srcaudio5.flac', './srcaudio.flac'

target_sr = RVCHubertModel.expected_sample_rate

data, rate = librosa.load(in_file, sr=target_sr)
feats = sfeatures_model.extract_features(torch.from_numpy(data))

data, rate = librosa.load(in_file2, sr=target_sr)
feats2 = sfeatures_model.extract_features(torch.from_numpy(data))

# Truncate both feature sequences to the shorter one along dim 1 so that the
# element-wise loss is defined.
minlen = min(feats.shape[1], feats2.shape[1])
feats, feats2 = feats[:, :minlen, :], feats2[:, :minlen, :]

unrelated_mse = F.mse_loss(feats, feats2).item()
print('mse loss between two unrelated items: ', unrelated_mse)

# Observed value is around 0.16 to 0.199

mse loss between two unrelated items:  0.199951171875


# S.1. Split items

In [17]:
from dataset import SpeechFeatureDataset
# Instantiate the dataset and show a few metadata fields of its first item.
data = SpeechFeatureDataset()
for field in ('transcription', 'source', 'speaker'):
    print(data.dataset[0][field])

that's it. Keep them closed. Don't look.
fim:s1e14
Rarity
