In [None]:
from datasets import load_dataset, concatenate_datasets, Features, Audio, Value, DatasetDict
import librosa
import numpy as np

# Load the Datasets

In [2]:
# Fetch the two source corpora from the Hub by repository id.
openslr_ds = load_dataset(path="chuuhtetnaing/myanmar-speech-dataset-openslr-80")
fleurs_ds = load_dataset(path="chuuhtetnaing/myanmar-speech-dataset-google-fleurs")

In [3]:
# Drop the listed metadata columns together with the existing `transcription`
# column, then expose `raw_transcription` under the name `transcription`.
# NOTE: this cell is not re-runnable on its own output, because the columns
# are gone after the first run.
fleurs_ds = (
    fleurs_ds
    .remove_columns([
        "id",
        "num_samples",
        "path",
        "gender",
        "lang_id",
        "language",
        "lang_group_id",
        "transcription",
    ])
    .rename_column("raw_transcription", "transcription")
)

In [4]:
# Inspect the FLEURS train-split schema after the column clean-up.
fleurs_ds['train'].features

{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'transcription': Value(dtype='string', id=None)}

In [5]:
# Inspect the OpenSLR train-split schema for comparison; the recorded output
# shows the audio feature with sampling_rate=None (no fixed rate declared).
openslr_ds['train'].features

{'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None),
 'transcription': Value(dtype='string', id=None)}

# Downsample the higher-sampling-rate data to 16 kHz

In [6]:
# Common schema both datasets are cast to: 16 kHz audio, the transcript text,
# and a string naming the dataset each row came from.
target_features = Features(
    audio=Audio(sampling_rate=16000),
    transcription=Value(dtype='string'),
    source=Value(dtype='string'),
)

# Resample one example's audio to a common rate and tag it with its source dataset
def process_example(example, dataset_name, target_sr=16000):
    """Build a row with audio at ``target_sr`` and a ``source`` label.

    Args:
        example: Mapping with an ``'audio'`` entry (itself holding ``'array'``
            and ``'sampling_rate'``) and a ``'transcription'`` entry.
        dataset_name: Value stored under ``'source'`` in the returned row.
        target_sr: Sampling rate, in Hz, that the returned audio is labelled
            with and resampled to when it differs. Defaults to 16000, the
            value that was previously hard-coded.

    Returns:
        A new dict with keys ``'audio'`` (``{'array', 'sampling_rate'}``),
        ``'transcription'`` and ``'source'``. The input mapping is not modified.
    """
    audio = example['audio']
    audio_array = audio['array']
    current_sr = audio['sampling_rate']

    # Only touch the samples when the rate actually differs; audio already at
    # the target rate is passed through unchanged.
    if current_sr != target_sr:
        audio_array = librosa.resample(
            y=np.array(audio_array, dtype=np.float32),
            orig_sr=current_sr,
            target_sr=target_sr,
        )

    # Emit exactly the fields of the shared schema, nothing else.
    return {
        'audio': {'array': audio_array, 'sampling_rate': target_sr},
        'transcription': example['transcription'],
        'source': dataset_name,
    }

In [8]:
# Run every split of both corpora through process_example, casting the result
# to target_features. The source label is supplied via fn_kwargs.
openslr_ds = openslr_ds.map(
    process_example,
    fn_kwargs={'dataset_name': 'openslr'},
    features=target_features,
    num_proc=10,
    desc="Processing OpenSLR dataset",
)

fleurs_ds = fleurs_ds.map(
    process_example,
    fn_kwargs={'dataset_name': 'fleurs'},
    features=target_features,
    desc="Processing FLEURS dataset",
)

Processing OpenSLR dataset (num_proc=10):   0%|          | 0/2277 [00:00<?, ? examples/s]

Processing OpenSLR dataset (num_proc=10):   0%|          | 0/253 [00:00<?, ? examples/s]

Processing FLEURS dataset:   0%|          | 0/3938 [00:00<?, ? examples/s]

Processing FLEURS dataset:   0%|          | 0/384 [00:00<?, ? examples/s]

# Combine the Datasets

In [10]:
# For each split, stack the FLEURS rows first and the OpenSLR rows after them.
combined_train_ds, combined_test_ds = (
    concatenate_datasets([fleurs_ds[split], openslr_ds[split]])
    for split in ('train', 'test')
)

In [12]:
# Bundle the combined splits under their split names.
ds = DatasetDict(train=combined_train_ds, test=combined_test_ds)

# Upload to Hugging Face

In [13]:
# Publish both splits to the Hub.
# NOTE(review): the recorded output of this cell reports a commit to
# `chuuhtetnaing/myanmar-speech-dataset`, not the repo id used here; confirm
# which repository is intended.
ds.push_to_hub(repo_id="chuuhtetnaing/myanmar-speech-dataset-for-asr")

Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ?it/s]

Map:   0%|          | 0/1243 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Map:   0%|          | 0/1243 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Map:   0%|          | 0/1243 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Map:   0%|          | 0/1243 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Map:   0%|          | 0/1243 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/637 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/chuuhtetnaing/myanmar-speech-dataset/commit/cbee81b64e0a9d26bcd4a5a8e0a5713ba4d77b99', commit_message='Upload dataset', commit_description='', oid='cbee81b64e0a9d26bcd4a5a8e0a5713ba4d77b99', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/chuuhtetnaing/myanmar-speech-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='chuuhtetnaing/myanmar-speech-dataset'), pr_revision=None, pr_num=None)