In [1]:
import os
import pandas as pd
import torchaudio
import datasets

# Function to load audio dataset
def load_audio_data(audio_files, text_files):
    """Pair each audio file path with the transcript read from its text file.

    Args:
        audio_files: Iterable of audio file paths. The paths are stored as-is;
            the audio files themselves are not opened here.
        text_files: Iterable of UTF-8 transcript file paths, in the same order
            as ``audio_files``.

    Returns:
        A list of ``{"audio": <path>, "text": <transcript>}`` dicts, one per
        (audio, transcript) pair, with leading/trailing whitespace stripped
        from each transcript.

    Raises:
        ValueError: If the two inputs do not contain the same number of paths.
        OSError: If a transcript file cannot be opened (propagated from ``open``).
    """
    # Materialise first so generators/iterators can be length-checked too.
    audio_files = list(audio_files)
    text_files = list(text_files)

    # zip() stops at the shorter input, which would silently drop samples
    # when the two lists get out of sync; fail loudly instead.
    if len(audio_files) != len(text_files):
        raise ValueError(
            f"audio_files and text_files must have the same length "
            f"(got {len(audio_files)} and {len(text_files)})"
        )

    dataset = []
    for audio_path, text_path in zip(audio_files, text_files):
        with open(text_path, "r", encoding="utf-8") as f:
            text = f.read().strip()  # Read the transcript

        dataset.append({"audio": audio_path, "text": text})
    return dataset

# Sample file lists: each transcript sits at the same index as its recording.
train_audio_files = ["./dataset/001.mp3", "./dataset/002.mp3"]
train_text_files = ["./dataset/001.txt", "./dataset/002.txt"]
val_audio_files = ["./dataset/003.mp3", "./dataset/004.mp3"]
val_text_files = ["./dataset/003.txt", "./dataset/004.txt"]

# Build the per-split record lists (plain Python dicts at this point).
dataset_dict = dict(
    train=load_audio_data(train_audio_files, train_text_files),
    validation=load_audio_data(val_audio_files, val_text_files),
)

# Wrap every split as a Hugging Face Dataset, going through a pandas DataFrame.
dataset = datasets.DatasetDict(
    {
        split_name: datasets.Dataset.from_pandas(pd.DataFrame(records))
        for split_name, records in dataset_dict.items()
    }
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Re-type the "audio" column (currently file paths) as a Hugging Face Audio
# feature configured for a 16 kHz sampling rate.
# NOTE(review): presumably decoding/resampling happens lazily on access -
# confirm against the `datasets` Audio documentation.
audio_feature = datasets.Audio(sampling_rate=16000)
dataset = dataset.cast_column("audio", audio_feature)

In [3]:
# Print the DatasetDict summary (splits, feature names, row counts)
print(dataset)

# Persist every split to the local "hf_audio_dataset" directory (path is
# relative to the notebook's working directory).
dataset.save_to_disk("hf_audio_dataset")

DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 2
    })
    validation: Dataset({
        features: ['audio', 'text'],
        num_rows: 2
    })
})


Saving the dataset (1/1 shards): 100%|██████████| 2/2 [00:00<00:00, 93.98 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 2/2 [00:00<00:00, 208.05 examples/s]


In [6]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/Users/elifsorguc/Desktop/fonetika/fonetikadata/hf_audio_env/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
  File "/Users/elifsorguc/Desktop/fonetika/fonetikadata/hf_audio_env/lib/python3.9/site-packages/huggingface

In [4]:
# Upload the DatasetDict to the Hub repository "elifsorguc/fonetika".
# NOTE(review): presumably requires valid Hub credentials to be available
# beforehand - confirm how they were supplied, since the recorded login cell
# ended in a traceback.
dataset.push_to_hub("elifsorguc/fonetika")


Map: 100%|██████████| 2/2 [00:00<00:00, 148.08 examples/s]?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 107.46ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.15s/it]
Map: 100%|██████████| 2/2 [00:00<00:00, 255.43 examples/s]?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 409.96ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.01s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/elifsorguc/fonetika/commit/8f73f9c4105c827a75180060441433bc28ccdbf9', commit_message='Upload dataset', commit_description='', oid='8f73f9c4105c827a75180060441433bc28ccdbf9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/elifsorguc/fonetika', endpoint='https://huggingface.co', repo_type='dataset', repo_id='elifsorguc/fonetika'), pr_revision=None, pr_num=None)