# 1. Download dataset, set up models

In [7]:
from datasets import load_dataset, Audio
dataset = load_dataset('synthbot/pony-speech')
train_data = dataset['train']

from svc_helper.sfeatures.models import RVCHubertModel
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
import numpy as np

device = 'cuda'
sfeatures_model = RVCHubertModel(device = device, is_half=True)
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-base.en")
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-base.en")
if (device == 'cuda'):
    model.cuda()

def add_speech_features(example):
    audio = example['audio']['array']
    audio_resamp = librosa.resample(audio,
        orig_sr=example['audio']['sampling_rate'],
        target_sr=RVCHubertModel.expected_sample_rate)
    audio_max = np.abs(audio_resamp).max() / 0.95
    if audio_max > 1:
        audio_resamp /= audio_max

    # ContentVec
    feats = sfeatures_model.extract_features(audio=
        torch.from_numpy(audio_resamp).to(device))

    # Whisper decoder features
    input_features = processor(
        audio_resamp, sampling_rate=16000, return_tensors='pt'
    ).input_features.to(device)
    output = model.generate(input_features, 
        output_hidden_states=True, 
        return_dict_in_generate=True)
    decoder_states = torch.cat(
        [t[0] for t in output.decoder_hidden_states], dim=1)

    example['rvc_features'] = feats.cpu().numpy()
    example['whisper_decoder_features'] = decoder_states.cpu().numpy()
    print('f',feats.shape)
    print('d',decoder_states.shape)
    return example

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

2024-08-27 23:41:01 | INFO | fairseq.tasks.hubert_pretraining | current directory is d:\Code\raraai\5_aligner
2024-08-27 23:41:01 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': 'metadata', 'fine_tuning': False, 'labels': ['km'], 'label_dir': 'label', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2024-08-27 23:41:01 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.1, 'dropout_features': 0.1, 'fina

# 2. Select speakers and extract features (unconditional dataset)

In [9]:
speakers = {'Rarity'}
speakers_data = {}
n_data = 1000

filtered_dataset = train_data.filter(lambda ex, speakers=speakers:
    ex['speaker'] in speakers, num_proc=16).shuffle().select(range(n_data)).map(
        add_speech_features)
filtered_dataset.to_parquet(f'dataset_unconditional_{n_data}.parquet')

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

f torch.Size([1, 128, 768])
d torch.Size([1, 12, 512])
f torch.Size([1, 68, 768])
d torch.Size([1, 6, 512])
f torch.Size([1, 220, 768])
d torch.Size([1, 19, 512])
f torch.Size([1, 407, 768])
d torch.Size([1, 27, 512])
f torch.Size([1, 274, 768])
d torch.Size([1, 25, 512])
f torch.Size([1, 61, 768])
d torch.Size([1, 7, 512])
f torch.Size([1, 204, 768])
d torch.Size([1, 23, 512])
f torch.Size([1, 115, 768])
d torch.Size([1, 7, 512])
f torch.Size([1, 150, 768])
d torch.Size([1, 10, 512])
f torch.Size([1, 142, 768])
d torch.Size([1, 9, 512])
f torch.Size([1, 136, 768])
d torch.Size([1, 11, 512])
f torch.Size([1, 126, 768])
d torch.Size([1, 14, 512])
f torch.Size([1, 111, 768])
d torch.Size([1, 12, 512])
f torch.Size([1, 91, 768])
d torch.Size([1, 9, 512])
f torch.Size([1, 167, 768])
d torch.Size([1, 19, 512])
f torch.Size([1, 85, 768])
d torch.Size([1, 8, 512])
f torch.Size([1, 164, 768])
d torch.Size([1, 15, 512])
f torch.Size([1, 149, 768])
d torch.Size([1, 13, 512])
f torch.Size([1, 81,

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

499802781