# 1. Download dataset, set up models

In [6]:
# All imports are grouped at the top of the cell so that a fresh kernel
# surfaces every dependency before any download or model load starts.
from datasets import load_dataset, Audio
from svc_helper.sfeatures.models import RVCHubertModel
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
import numpy as np

# Speech corpus; only the 'train' split is used by the cells below.
dataset = load_dataset('synthbot/pony-speech')
train_data = dataset['train']

# Prefer the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Half precision is only requested on CUDA, which matches the previous
# behaviour on GPU machines.
# NOTE(review): assumes RVCHubertModel accepts is_half=False on CPU - confirm.
sfeatures_model = RVCHubertModel(device=device, is_half=(device == 'cuda'))

# Whisper (English-only, base size): the processor turns raw audio into the
# model's input features, the model itself is used for generation.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-base.en")
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-base.en")
# .to(device) covers both the CUDA and the CPU case without a special branch.
model.to(device)

def add_speech_features(example, verbose=True):
    """Attach ContentVec and Whisper decoder features to one dataset example.

    The example's audio is resampled to ``RVCHubertModel.expected_sample_rate``,
    peak-limited to 0.95, and then fed to both the module-level
    ``sfeatures_model`` and the module-level Whisper ``processor``/``model``.

    Args:
        example: Mapping with an ``'audio'`` entry that provides ``'array'``
            (the waveform) and ``'sampling_rate'``.
        verbose: When true (the default, matching the earlier behaviour),
            print the shape of both feature tensors. Pass
            ``fn_kwargs={'verbose': False}`` to ``Dataset.map`` to silence
            the per-example output.

    Returns:
        The same ``example`` mapping with two NumPy arrays added:
        ``'rvc_features'`` and ``'whisper_decoder_features'``.
    """
    target_sr = RVCHubertModel.expected_sample_rate
    audio_resamp = librosa.resample(
        example['audio']['array'],
        orig_sr=example['audio']['sampling_rate'],
        target_sr=target_sr)

    # Scale down only when the peak exceeds 0.95. The division is done out of
    # place: librosa.resample hands back its input array untouched when the
    # two sample rates already match, and an in-place divide would then
    # silently modify the caller's example['audio']['array'].
    audio_max = np.abs(audio_resamp).max() / 0.95
    if audio_max > 1:
        audio_resamp = audio_resamp / audio_max

    # ContentVec
    feats = sfeatures_model.extract_features(
        audio=torch.from_numpy(audio_resamp).to(device))

    # Whisper decoder features. The processor is told the rate the audio was
    # actually resampled to, so a mismatch with what Whisper expects is
    # reported by the processor instead of being masked by a literal.
    input_features = processor(
        audio_resamp, sampling_rate=target_sr, return_tensors='pt'
    ).input_features.to(device)
    output = model.generate(
        input_features,
        output_hidden_states=True,
        return_dict_in_generate=True)
    # decoder_hidden_states holds one tuple per generation step; t[0] picks
    # the first entry of each and the steps are joined along dim 1.
    # NOTE(review): in Hugging Face generate outputs index 0 appears to be
    # the embedding output rather than the last decoder layer - confirm this
    # is the layer that is wanted.
    decoder_states = torch.cat(
        [t[0] for t in output.decoder_hidden_states], dim=1)

    example['rvc_features'] = feats.cpu().numpy()
    example['whisper_decoder_features'] = decoder_states.cpu().numpy()
    if verbose:
        print('f',feats.shape)
        print('d',decoder_states.shape)
    return example

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

# 2. Select speakers and extract features (unconditional dataset)

In [5]:
speakers = {'Rarity'}
speakers_data = {}
n_data = 50
shuffle_seed = 0  # fixed so that the same subset is drawn on every run

# Keep only the requested speakers. The predicate receives just the
# 'speaker' column (input_columns), so it does not need the audio column
# to decide whether a row is kept. `speakers` is bound as a default argument
# so the lambda carries its own copy into the worker processes.
speaker_subset = train_data.filter(
    lambda speaker, speakers=speakers: speaker in speakers,
    input_columns=['speaker'], num_proc=16)

# Fail with a readable message instead of an index error from select().
if len(speaker_subset) < n_data:
    raise ValueError(
        f'only {len(speaker_subset)} examples match {sorted(speakers)}; '
        f'n_data={n_data} requested')

# Draw n_data random examples (reproducibly) and compute their features.
filtered_dataset = (
    speaker_subset
    .shuffle(seed=shuffle_seed)
    .select(range(n_data))
    .map(add_speech_features))
filtered_dataset.to_parquet(f'dataset_unconditional_{n_data}.parquet')

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

f torch.Size([1, 560, 768])
d torch.Size([1, 32, 512])
f torch.Size([1, 372, 768])
d torch.Size([1, 44, 512])
f torch.Size([1, 355, 768])
d torch.Size([1, 5, 512])
f torch.Size([1, 489, 768])
d torch.Size([1, 30, 512])
f torch.Size([1, 443, 768])
d torch.Size([1, 25, 512])
f torch.Size([1, 333, 768])
d torch.Size([1, 447, 512])
f torch.Size([1, 363, 768])
d torch.Size([1, 9, 512])
f torch.Size([1, 575, 768])
d torch.Size([1, 20, 512])
f torch.Size([1, 364, 768])
d torch.Size([1, 3, 512])
f torch.Size([1, 412, 768])
d torch.Size([1, 5, 512])
f torch.Size([1, 486, 768])
d torch.Size([1, 39, 512])
f torch.Size([1, 441, 768])
d torch.Size([1, 447, 512])
f torch.Size([1, 406, 768])
d torch.Size([1, 3, 512])
f torch.Size([1, 345, 768])
d torch.Size([1, 3, 512])
f torch.Size([1, 420, 768])
d torch.Size([1, 11, 512])
f torch.Size([1, 362, 768])
d torch.Size([1, 29, 512])
f torch.Size([1, 486, 768])
d torch.Size([1, 15, 512])
f torch.Size([1, 564, 768])
d torch.Size([1, 27, 512])
f torch.Size([

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

1044288812