In [1]:
!pip install torchcodec



In [2]:
from datasets import load_dataset, concatenate_datasets, Features, Audio, Value, DatasetDict, Dataset
import librosa
import numpy as np

# Load the Datasets

In [None]:
# Pull both source corpora from the Hugging Face Hub.
openslr_ds = load_dataset(
    path="chuuhtetnaing/myanmar-speech-dataset-openslr-80",
)
fleurs_ds = load_dataset(
    path="chuuhtetnaing/myanmar-speech-dataset-google-fleurs",
)

In [None]:
# Keep only `audio` and `raw_transcription`, exposing the latter as `transcription`.
# The guard makes the cell safe to re-run: once `raw_transcription` has been
# renamed, running it again would otherwise raise on the already-removed columns.
if "raw_transcription" in fleurs_ds["train"].column_names:
    fleurs_ds = fleurs_ds.remove_columns(
        ["id", "num_samples", "path", "gender", "lang_id", "language", "lang_group_id", "transcription"]
    ).rename_column("raw_transcription", "transcription")

In [None]:
# Inspect the schema of the FLEURS train split.
fleurs_ds['train'].features

{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'transcription': Value(dtype='string', id=None)}

In [None]:
# Inspect the schema of the OpenSLR train split for comparison with FLEURS.
openslr_ds['train'].features

{'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None),
 'transcription': Value(dtype='string', id=None)}

# Downsample the higher sampling rate data

In [3]:
# Common schema for every processed corpus: 16 kHz audio, the transcription
# text, and the name of the corpus the row came from.
target_features = Features(dict(
    audio=Audio(sampling_rate=16000),
    transcription=Value('string'),
    source=Value('string'),
))

def process_example(example, dataset_name, target_sr=16000):
    """Convert one example to a float32 waveform at ``target_sr`` and tag its source.

    Args:
        example: Mapping providing ``example['audio']['array']``,
            ``example['audio']['sampling_rate']`` and ``example['transcription']``.
        dataset_name: Label stored under ``'source'`` in the returned row.
        target_sr: Sampling rate (Hz) the waveform is resampled to when the
            input rate differs. Defaults to 16000.

    Returns:
        A dict with keys ``'audio'`` (the bare ``np.float32`` sample array,
        not an ``{'array', 'sampling_rate'}`` dict), ``'transcription'`` and
        ``'source'``.
    """
    # Look the audio entry up once instead of once per field.
    audio = example['audio']
    source_sr = audio['sampling_rate']

    # Normalise to a float32 numpy array; asarray skips the copy when the
    # input already has that type.
    waveform = np.asarray(audio['array'], dtype=np.float32)

    # Only resample when the input rate differs from the target.
    if source_sr != target_sr:
        waveform = librosa.resample(
            y=waveform,
            orig_sr=source_sr,
            target_sr=target_sr
        )

    return {
        'audio': waveform,  # just the samples, not an audio dict
        'transcription': example['transcription'],
        'source': dataset_name
    }

In [None]:
# Run both corpora through process_example, writing the result with the
# shared target schema. The corpus label is passed via fn_kwargs.
openslr_ds = openslr_ds.map(
    process_example,
    fn_kwargs={'dataset_name': 'openslr'},
    features=target_features,
    desc="Processing OpenSLR dataset",
    num_proc=10,
)

fleurs_ds = fleurs_ds.map(
    process_example,
    fn_kwargs={'dataset_name': 'fleurs'},
    features=target_features,
    desc="Processing FLEURS dataset",
)

Processing OpenSLR dataset (num_proc=10):   0%|          | 0/2277 [00:00<?, ? examples/s]

Processing OpenSLR dataset (num_proc=10):   0%|          | 0/253 [00:00<?, ? examples/s]

Processing FLEURS dataset:   0%|          | 0/3938 [00:00<?, ? examples/s]

Processing FLEURS dataset:   0%|          | 0/384 [00:00<?, ? examples/s]

# Combine the Datasets

In [None]:
# Concatenate FLEURS and OpenSLR split by split (train first, then test).
combined_train_ds, combined_test_ds = (
    concatenate_datasets([fleurs_ds[split_name], openslr_ds[split_name]])
    for split_name in ('train', 'test')
)

In [None]:
# Bundle the merged splits into a single DatasetDict.
ds = DatasetDict(train=combined_train_ds, test=combined_test_ds)

# Upload to HuggingFace

In [None]:
# Publish the merged train/test splits to the Hub.
ds.push_to_hub(
    repo_id="chuuhtetnaing/myanmar-speech-dataset-for-asr",
)

Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ?it/s]

Map:   0%|          | 0/1243 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Map:   0%|          | 0/1243 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Map:   0%|          | 0/1243 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Map:   0%|          | 0/1243 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Map:   0%|          | 0/1243 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/637 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/chuuhtetnaing/myanmar-speech-dataset/commit/cbee81b64e0a9d26bcd4a5a8e0a5713ba4d77b99', commit_message='Upload dataset', commit_description='', oid='cbee81b64e0a9d26bcd4a5a8e0a5713ba4d77b99', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/chuuhtetnaing/myanmar-speech-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='chuuhtetnaing/myanmar-speech-dataset'), pr_revision=None, pr_num=None)

# Add the Ko-Yin-Maung/mig-burmese-audio-transcription Dataset

In [4]:
# Load the MIG Burmese audio transcription corpus from the Hub.
mig_ds = load_dataset(
    path="Ko-Yin-Maung/mig-burmese-audio-transcription",
)

In [5]:
# Inspect the splits and columns of the MIG dataset as loaded.
mig_ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'speaker', 'sex', 'age', 'size', 'duration', 'title', 'category', 'type', 'bit_rate'],
        num_rows: 2682
    })
    test: Dataset({
        features: ['audio', 'transcription', 'speaker', 'sex', 'age', 'size', 'duration', 'title', 'category', 'type', 'bit_rate'],
        num_rows: 140
    })
})

In [6]:
# Keep only the two columns needed downstream. Selecting the columns to keep
# (instead of removing the rest) yields the same result and can be re-run
# without raising on columns that are already gone.
mig_ds = mig_ds.select_columns(['audio', 'transcription'])

In [7]:
# Check which columns remain after trimming.
mig_ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 2682
    })
    test: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 140
    })
})

In [8]:
# Process the MIG corpus with process_example. No `features=` is given here;
# the audio column is rebuilt with an explicit schema in a later cell.
mig_ds = mig_ds.map(
    process_example,
    fn_kwargs={'dataset_name': 'mig-burmese-audio-transcription'},
    desc="Processing MIG Burmese Audio Transcription Dataset",
    num_proc=2,
)

In [9]:
# Rebuild each MIG split under the shared `target_features` schema. Each row's
# `audio` is read as raw samples and wrapped in an {'array', 'sampling_rate'}
# dict. Reusing `target_features` (instead of repeating the schema here) keeps
# this corpus in step with the others.
final_mig_ds = {}

for split in ('train', 'test'):
    columns = {'audio': [], 'transcription': [], 'source': []}

    for example in mig_ds[split]:
        columns['audio'].append({
            # Ensure the samples are a float32 numpy array.
            'array': np.asarray(example['audio'], dtype=np.float32),
            # Must match the rate process_example resampled to.
            'sampling_rate': 16000,
        })
        columns['transcription'].append(example['transcription'])
        columns['source'].append(example['source'])

    # Create the dataset for this split with the shared schema.
    final_mig_ds[split] = Dataset.from_dict(columns, features=target_features)


In [10]:
# Load the dataset currently published on the Hub so the MIG rows can be appended.
ds = load_dataset(
    path="chuuhtetnaing/myanmar-speech-dataset-for-asr",
)

In [11]:
# Inspect the rebuilt MIG splits.
final_mig_ds

{'train': Dataset({
     features: ['audio', 'transcription', 'source'],
     num_rows: 2682
 }),
 'test': Dataset({
     features: ['audio', 'transcription', 'source'],
     num_rows: 140
 })}

In [12]:
# Append the MIG rows to the published dataset, split by split (train, then test).
combined_train_ds, combined_test_ds = (
    concatenate_datasets([ds[split_name], final_mig_ds[split_name]])
    for split_name in ('train', 'test')
)

In [13]:
# Bundle the extended splits into a single DatasetDict.
combined_ds = DatasetDict(train=combined_train_ds, test=combined_test_ds)

In [15]:
# Publish the extended dataset to the same Hub repository.
combined_ds.push_to_hub(
    repo_id="chuuhtetnaing/myanmar-speech-dataset-for-asr",
    commit_message="add Ko-Yin-Maung/mig-burmese-audio-transcription dataset",
)

Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ? shards/s]

Map:   0%|          | 0/1780 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :   3%|3         | 23.8MB /  712MB            

Map:   0%|          | 0/1780 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :   0%|          | 66.6kB /  725MB            

Map:   0%|          | 0/1779 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :   0%|          |  960kB /  386MB            

Map:   0%|          | 0/1779 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :   1%|          | 1.42MB /  207MB            

Map:   0%|          | 0/1779 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :   2%|2         | 3.75MB /  153MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :  15%|#4        | 30.6MB /  209MB            

CommitInfo(commit_url='https://huggingface.co/datasets/chuuhtetnaing/myanmar-speech-dataset-for-asr/commit/ed80affc62cba8270fa6ec55ae446f72af3a272a', commit_message='add Ko-Yin-Maung/mig-burmese-audio-transcription dataset', commit_description='', oid='ed80affc62cba8270fa6ec55ae446f72af3a272a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/chuuhtetnaing/myanmar-speech-dataset-for-asr', endpoint='https://huggingface.co', repo_type='dataset', repo_id='chuuhtetnaing/myanmar-speech-dataset-for-asr'), pr_revision=None, pr_num=None)