# 1. Download audio dataset, set up models

In [1]:
from datasets import load_dataset, Audio
dataset = load_dataset("synthbot/pony-speech")
train_data = dataset['train']

from svc_helper.sfeatures.models import RVCHubertModel
import torch
sfeatures_model = RVCHubertModel(device = torch.device('cuda'))


import librosa
def add_speech_features(example):
    audio = example['audio']['array']
    audio_resamp = librosa.resample(audio,
        orig_sr=example['audio']['sampling_rate'],
        target_sr=RVCHubertModel.expected_sample_rate)
    audio_padded = sfeatures_model.pad_audio(audio_resamp)
    feats = sfeatures_model.extract_features(audio=
        torch.from_numpy(audio_padded))
    example['rvc_features'] = feats.cpu().numpy()
    return example

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

    PyTorch 2.1.0+cu118 with CUDA 1108 (you have 2.3.1+cu121)
    Python  3.10.11 (you have 3.10.7)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
Error caught was: No module named 'triton'
2024-07-27 00:03:28 | INFO | fairseq.tasks.hubert_pretraining | current directory is d:\Code\raraai\2_featureexpl
2024-07-27 00:03:28 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': 'metadata', 'fine_tuning': False, 'labels': ['km'], 'label_dir': 'label', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2024-07-27 00:03:28 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name'

normalize: False


# 2. Select speakers and extract speech features

In [13]:
speakers = ['Rarity', 'Pinkie Pie']
speakers_data = {}
n_data = 50
for speaker in speakers:
    speakers_data[speaker] = train_data.filter(lambda ex, speaker=speaker:
        ex['speaker']==speaker, num_proc=16).shuffle().select(range(n_data)).map(
            add_speech_features
        )



Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Filter (num_proc=16):   0%|          | 0/64659 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

# 3. Calculate mean over each feature, over speaker

In [14]:
import numpy as np
import matplotlib.pyplot as plt
def get_features_stats(row):
    data = row['rvc_features']
    row['rvc_features_mean'] = np.mean(data, axis=1)
    return row

def get_stats(subset):
    summary = subset.map(get_features_stats)
    mean = np.array(summary['rvc_features_mean']).squeeze().mean(axis=0)
    return mean

speaker_stats = {}
for speaker in speakers_data.keys():
    stats = get_stats(speakers_data[speaker])
    speaker_stats[speaker] = stats

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

# 4. Prepare RVC

In [16]:
import numpy as np
from huggingface_hub import hf_hub_download
from svc_helper.svc.rvc import RVCModel
import torch

rvc_model = RVCModel()
# test_model_path = hf_hub_download(repo_id='therealvul/RVCv2', 
#     filename='RarityS1/Rarity.pth')
# test_index_path = hf_hub_download(repo_id='therealvul/RVCv2', 
#     filename='RarityS1/added_IVF1866_Flat_nprobe_1_Rarity_v2.index')
test_model_path = hf_hub_download(repo_id='therealvul/RVCv2', 
    filename='PinkiePieS1/PinkieS1.pth')
test_index_path = hf_hub_download(repo_id='therealvul/RVCv2', 
    filename='PinkiePieS1/added_IVF1260_Flat_nprobe_1_PinkieS1_v2.index')
rvc_model.load_model(model_path = test_model_path,
    index_path = test_index_path)

PinkieS1.pth:   0%|          | 0.00/57.6M [00:00<?, ?B/s]

(…)_IVF1260_Flat_nprobe_1_PinkieS1_v2.index:   0%|          | 0.00/155M [00:00<?, ?B/s]

2024-07-27 00:14:39 | INFO | svc_helper.svc.rvc.modules.vc.modules | Get sid: D:\hf_cache\hub\models--therealvul--RVCv2\snapshots\87778762d011892db45370f0dd963be836d55a08\PinkiePieS1\PinkieS1.pth
2024-07-27 00:14:39 | INFO | svc_helper.svc.rvc.modules.vc.modules | Loading: D:\hf_cache\hub\models--therealvul--RVCv2\snapshots\87778762d011892db45370f0dd963be836d55a08\PinkiePieS1\PinkieS1.pth


In [26]:
import IPython.display as ipd
import soundfile as sf
input_path = 'test_speech_2.wav'
def force_to_mean(features, speaker='Pinkie Pie', a=1.0):
    input_features_mean = torch.mean(features, dim=1)
    target_mean = torch.from_numpy(speaker_stats[speaker]).to(features.device)
    delta_mean = target_mean - input_features_mean
    #print(torch.sum(delta_mean*a))
    print(torch.mean(delta_mean*a))
    print(torch.mean(target_mean))
    print(torch.mean(features + delta_mean*a))
    #print(delta_mean.shape)
    return features + delta_mean*a

wav_opt = rvc_model.infer_file(input_path, transpose=14,
    feature_transform=lambda feat: force_to_mean(feat), index_rate=0.75)
ipd.Audio(wav_opt, rate=rvc_model.output_sample_rate())
sf.write('pinkie_a5_ir0.75.wav', data=wav_opt, samplerate=48000)

tensor(-0.0006, device='cuda:0', dtype=torch.float64)
tensor(-0.0083, device='cuda:0', dtype=torch.float64)
tensor(-0.0083, device='cuda:0', dtype=torch.float64)
