In [2]:
import getpass
import os

import datasets
import pandas as pd
import torchaudio

# Function to load audio dataset
def load_audio_data(audio_path, text_path):
    """Pair an audio file path with the transcript read from a text file.

    Args:
        audio_path: Path of the audio file. It is stored in the record as
            given; this function does not open or validate it.
        text_path: Path of a UTF-8 text file containing the transcript.

    Returns:
        A one-element list holding a dict with keys ``"audio"`` (the
        unchanged ``audio_path``) and ``"text"`` (the file contents with
        leading and trailing whitespace removed).

    Raises:
        OSError: If ``text_path`` cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the transcript is not valid UTF-8.
    """
    with open(text_path, "r", encoding="utf-8") as transcript_file:
        transcript = transcript_file.read().strip()

    record = {"audio": audio_path, "text": transcript}
    return [record]

# Build the record list for each split: one audio file and its transcript
# for training, and one for validation.
dataset_dict = dict(
    train=load_audio_data("./dataset/train.mp3", "./dataset/train.txt"),
    validation=load_audio_data("./dataset/validation.mp3", "./dataset/validation.txt"),
)

In [3]:
# Wrap each split's records in a Hugging Face Dataset (going through a pandas
# DataFrame) and gather the splits into a single DatasetDict.
dataset = datasets.DatasetDict(
    {
        split_name: datasets.Dataset.from_pandas(pd.DataFrame(records))
        for split_name, records in dataset_dict.items()
    }
)

# Re-type the "audio" column (file paths) as an Audio feature configured with
# a sampling rate of 16000.
dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16000))

In [4]:
# Show the DatasetDict summary: split names, feature columns and row counts.
print(dataset)

# Persist every split to the local "hf_audio_dataset" directory (path is
# relative to the notebook's working directory).
dataset.save_to_disk("hf_audio_dataset")

DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 1
    })
    validation: Dataset({
        features: ['audio', 'text'],
        num_rows: 1
    })
})


Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 50.37 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 104.55 examples/s]


In [6]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/Users/elifsorguc/Desktop/fonetika/fonetikadata/hf_audio_env/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
  File "/Users/elifsorguc/Desktop/fonetika/fonetikadata/hf_audio_env/lib/python3.9/site-packages/huggingface

In [7]:
# Upload all splits of the DatasetDict to the "elifsorguc/fonetika" dataset
# repository on the Hugging Face Hub. Requires the login from the previous
# cell; the call's return value (a CommitInfo in the recorded run) is shown
# as the cell output.
dataset.push_to_hub("elifsorguc/fonetika")

Map: 100%|██████████| 1/1 [00:00<00:00, 68.21 examples/s]<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 123.92ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.73s/it]
Map: 100%|██████████| 1/1 [00:00<00:00, 139.39 examples/s]?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 352.73ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.00s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/elifsorguc/fonetika/commit/4fe0429da66642ec88f4e20bc270fd661dddd3d3', commit_message='Upload dataset', commit_description='', oid='4fe0429da66642ec88f4e20bc270fd661dddd3d3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/elifsorguc/fonetika', endpoint='https://huggingface.co', repo_type='dataset', repo_id='elifsorguc/fonetika'), pr_revision=None, pr_num=None)