In [83]:
# For setting dataset cache dir if default is not desired
import os
os.environ['HF_HOME'] = r'D:\hf_cache'

In [84]:
# 1. Download the audio dataset
from datasets import load_dataset, Audio
dataset = load_dataset("synthbot/pony-speech")

Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [85]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['speaker', 'source', 'start', 'end', 'style', 'noise', 'transcription', 'audio', 'duration'],
        num_rows: 64735
    })
})


In [117]:
# 2. Extract speech features
from svc_helper.sfeatures.models import RVCHubertModel
sfeatures_model = RVCHubertModel(device=torch.device('cuda'))

2024-07-09 21:40:54 | INFO | fairseq.tasks.hubert_pretraining | current directory is D:\Code\raragan\1_speechdiscrim
2024-07-09 21:40:54 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': 'metadata', 'fine_tuning': False, 'labels': ['km'], 'label_dir': 'label', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2024-07-09 21:40:54 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.1, 'dropout_features': 0.1

In [114]:
import tqdm
import librosa
import torch
from datasets import Array3D, Features

def add_speech_features(example):
    audio = example['audio']['array']
    audio_resamp = librosa.resample(audio,
        orig_sr=example['audio']['sampling_rate'],
        target_sr=RVCHubertModel.expected_sample_rate)
    feats = sfeatures_model.extract_features(audio=
        torch.from_numpy(audio_resamp))
    example['rvc_features'] = feats.cpu().numpy()
    return example

train_data = dataset['train']
features = train_data.features.copy()
features['rvc_features'] = Array3D(shape=(1, None, 768), dtype='float32')

character_durations = {}
character_counts = {}
threshold_duration = 300

def aggregate_durations(example):
    if not example['speaker'] in character_durations:
        character_durations[example['speaker']] = 0.0
        character_counts[example['speaker']] = 0
    character_durations[example['speaker']] += (example['end'] - example['start'])
    character_counts[example['speaker']] += 1

train_data.map(aggregate_durations)
qualified_speakers = {speaker for speaker, duration in character_durations.items() if duration > threshold_duration}
#dataset_with_features = train_data.shuffle(seed=42).select(range(10)).map(add_speech_features)

# Unfortunately, HF dataset's audio decoding does not account for soundfile throwing exceptions
import io
import soundfile as sf
def soundfile_validate_filter(example):
    try:
        b = io.BytesIO(example['audio']['bytes'])
        array, sr = sf.read(b)
        return True
    except sf.LibsndfileError as e:
        return False

theidx = 0
def test_map(example, idx):
    global theidx
    theidx = idx
    return example['speaker'] in qualified_speakers
total_examples = len(train_data)
reversed_indices = list(range(total_examples - 1, -1, -1))

filtered_dataset = train_data.select(reversed_indices).cast_column(
    'audio', Audio(decode=False)).filter(
    soundfile_validate_filter).filter(
    test_map, with_indices=True, num_proc=1).cast_column(
    'audio', Audio(decode=True))


Map:   0%|          | 0/64735 [00:00<?, ? examples/s]

Filter:   0%|          | 0/64735 [00:00<?, ? examples/s]

Filter:   0%|          | 0/64734 [00:00<?, ? examples/s]

In [120]:
filtered_dataset

dataset_with_features = filtered_dataset.map(add_speech_features)
dataset_with_features = dataset_with_features.remove_columns(['audio'])

dataset_with_features.to_parquet('dataset_over5min.parquet')

Map:   0%|          | 0/59603 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/597 [00:00<?, ?ba/s]

39293853130

In [1]:
from datasets import load_dataset
dataset_with_features = load_dataset('parquet', data_files=['dataset_over5min.parquet'])['train']
dataset_with_features = dataset_with_features.remove_columns(['audio'])
dataset_with_features.to_parquet('dataset_over5min_noaudio.parquet')

Loading dataset shards:   0%|          | 0/75 [00:00<?, ?it/s]

AttributeError: 'DatasetDict' object has no attribute 'to_parquet'