In [1]:
import os
import time
import shutil
from pathlib import Path

import pandas as pd
from tqdm import tqdm
from IPython.display import Audio, display
from mutagen.mp3 import MP3
from mutagen.wave import WAVE

from utils import download_audios, download_texts
from utils import usx_parser
from utils import audio_stats
# from utils import force_align_book

In [3]:
# Collect per-file audio statistics for a single language directory.
# NOTE(review): alignment_filter="only" presumably restricts the scan to the Alignment
# sub-directories (every row in the saved output lives under Alignment/) — confirm in utils.audio_stats.
# NOTE(review): in the saved output the "language" column reads "Alignment" and "testament_format"
# holds the book name, so the helper presumably expects the root audios directory — confirm.
audios_dir = "data/audios/Yoruba"
audio_df = audio_stats.get_all_audio_files(audios_dir=audios_dir, alignment_filter="only")

# Sum of all clip durations, converted from seconds to hours.
total_hours = audio_df["duration_seconds"].sum() / 3600
# Legacy name kept so any cell that still reads it keeps working; the total covers
# only `audios_dir`, not every language.
total_hours_all_languages = total_hours
print(f"TOTAL HOURS ({audios_dir}): {total_hours:.2f}")

# Longest clips first, to make duration outliers easy to spot.
audio_df.sort_values(by="duration_seconds", ascending=False)

Collecting audio files: 100%|██████████| 32289/32289 [01:10<00:00, 458.50it/s] 


TOTAL HOURS (all languages): 92.41


Unnamed: 0,language,testament_format,book,file_name,file_path,format,file_size_mb,duration_seconds
21628,Alignment,Jeremiah,JER_051_Verse_021.wav,JER_051_Verse_021.wav,data/audios/Yoruba/Alignment/Jeremiah/JER_051_...,wav,47.91,523.32
21630,Alignment,Jeremiah,JER_051_Verse_022.wav,JER_051_Verse_022.wav,data/audios/Yoruba/Alignment/Jeremiah/JER_051_...,wav,47.08,514.27
5157,Alignment,Luke,LUK_023_Verse_016.wav,LUK_023_Verse_016.wav,data/audios/Yoruba/Alignment/Luke/LUK_023_Vers...,wav,33.22,362.82
5159,Alignment,Luke,LUK_023_Verse_017.wav,LUK_023_Verse_017.wav,data/audios/Yoruba/Alignment/Luke/LUK_023_Vers...,wav,32.82,358.47
3794,Alignment,John,JHN_018_Verse_013.wav,JHN_018_Verse_013.wav,data/audios/Yoruba/Alignment/John/JHN_018_Vers...,wav,29.24,319.39
...,...,...,...,...,...,...,...,...
25223,Alignment,Leviticus,LEV_023_Verse_023.wav,LEV_023_Verse_023.wav,data/audios/Yoruba/Alignment/Leviticus/LEV_023...,wav,0.18,1.91
1962,Alignment,Acts,ACT_018_Verse_004.wav,ACT_018_Verse_004.wav,data/audios/Yoruba/Alignment/Acts/ACT_018_Vers...,wav,0.17,1.87
26935,Alignment,Numbers,NUM_026_Verse_052.wav,NUM_026_Verse_052.wav,data/audios/Yoruba/Alignment/Numbers/NUM_026_V...,wav,0.17,1.83
3608,Alignment,John,JHN_011_Verse_035.wav,JHN_011_Verse_035.wav,data/audios/Yoruba/Alignment/John/JHN_011_Vers...,wav,0.16,1.75


In [5]:
# Lookup table from book directory name to its testament label.
# Built once at module level from the two book lists below; key order is
# Old Testament books first, then New Testament books.
BOOK_TO_TESTAMENT = {
    **dict.fromkeys(
        (
            "Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy",
            "Joshua", "Judges", "Ruth",
            "1 Samuel", "2 Samuel", "1 Kings", "2 Kings",
            "1 Chronicles", "2 Chronicles",
            "Ezra", "Nehemiah", "Esther",
            "Job", "Psalms", "Proverbs", "Ecclesiastes", "Song of Songs",
            "Isaiah", "Jeremiah", "Lamentations", "Ezekiel", "Daniel",
            "Hosea", "Joel", "Amos", "Obadiah", "Jonah", "Micah",
            "Nahum", "Habakkuk", "Zephaniah", "Haggai", "Zechariah", "Malachi",
        ),
        "Old Testament",
    ),
    **dict.fromkeys(
        (
            "Matthew", "Mark", "Luke", "John", "Acts",
            "Romans", "1 Corinthians", "2 Corinthians",
            "Galatians", "Ephesians", "Philippians", "Colossians",
            "1 Thessalonians", "2 Thessalonians",
            "1 Timothy", "2 Timothy", "Titus", "Philemon",
            "Hebrews", "James", "1 Peter", "2 Peter",
            "1 John", "2 John", "3 John", "Jude", "Revelation",
        ),
        "New Testament",
    ),
}


def get_alignment_dataframe(language: str, base_dir: str = "data/audios") -> pd.DataFrame:
    """
    Load alignment data for a given language and return a DataFrame.

    Walks ``{base_dir}/{language}/Alignment`` recursively, keeps every ``.wav`` /
    ``.mp3`` file, and pairs each one with the ``.txt`` file that has the same
    stem. Files are visited in sorted order so the row order is the same on
    every run and every filesystem.

    Args:
        language: The language name (e.g., "Yoruba")
        base_dir: Base directory for audio files (default: "data/audios")

    Returns:
        DataFrame with columns: audio_file, text, testament, book, chapter,
        verse, duration_seconds (one row per audio file). The frame is empty
        when the alignment directory is missing or holds no audio files.

        - ``text`` is None when the transcript file is missing or unreadable.
        - ``book`` is the name of the directory that directly contains the file.
        - ``chapter`` is the second "_"-separated field of the file stem, or
          None when the stem has no "_".
        - ``verse`` is the last "_"-separated field of the file stem.
        - ``testament`` is looked up in BOOK_TO_TESTAMENT and is missing (NaN)
          for books that are not in that mapping.
    """
    alignment_dir = os.path.join(base_dir, language, "Alignment")

    # Collect all audio files recursively (.wav and .mp3)
    audio_files = []
    for root, dirs, files in os.walk(alignment_dir):
        # Sorting `dirs` in place fixes the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith((".wav", ".mp3")):
                audio_files.append(os.path.join(root, file))

    # Each audio file has a corresponding .txt file with the same name
    text_files = [os.path.splitext(path)[0] + ".txt" for path in audio_files]

    # Build initial dataframe with file paths
    df = pd.DataFrame({
        "audio_file": audio_files,
        "text_file": text_files,
    })

    def read_text_file(file_path):
        """Return the transcript text, or None if the file cannot be read or decoded."""
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        except (OSError, UnicodeError):
            # Missing/unreadable transcript: keep the row, leave the text empty.
            return None

    def stem_fields(file_path):
        """Split the extension-less file name on "_" (e.g. JER_051_Verse_021 -> 4 fields)."""
        stem = os.path.splitext(os.path.basename(file_path))[0]
        return stem.split("_")

    def chapter_of(file_path):
        """Second field of the stem; None for names that do not follow the pattern."""
        fields = stem_fields(file_path)
        return fields[1] if len(fields) > 1 else None

    # Read transcript text from each text file
    df["text"] = df["text_file"].apply(read_text_file)

    # Extract metadata from file path structure:
    # Format: .../Alignment/{Book}/{BOOK_CHAPTER_Verse_VERSE}.txt
    # os.path helpers are used instead of splitting on "/" so the parsing
    # follows the same separator that os.path.join produced above.
    df["book"] = df["text_file"].apply(lambda p: os.path.basename(os.path.dirname(p)))
    df["chapter"] = df["text_file"].apply(chapter_of)
    df["verse"] = df["text_file"].apply(lambda p: stem_fields(p)[-1])

    # Map book name to testament (Old/New)
    df["testament"] = df["book"].map(BOOK_TO_TESTAMENT)

    # Get audio duration in seconds
    df["duration_seconds"] = df["audio_file"].apply(audio_stats.get_audio_duration)

    # Reorder columns (text_file was only needed to derive the others)
    df = df[["audio_file", "text", "testament", "book", "chapter", "verse", "duration_seconds"]]

    return df

In [6]:
# Choose the language to inspect and build its alignment table
# (one row per audio clip), then preview the first rows.
base_dir = "data/audios"
language = "Yoruba"

alignment_df = get_alignment_dataframe(language=language, base_dir=base_dir)
alignment_df.head()

Unnamed: 0,audio_file,text,testament,book,chapter,verse,duration_seconds
0,data/audios/Yoruba/Alignment/1 Corinthians/1CO...,"Paulu, ẹni ti a pé láti jẹ́ aposteli Kristi Je...",New Testament,1 Corinthians,1,1,8.93
1,data/audios/Yoruba/Alignment/1 Corinthians/1CO...,"Sí ìjọ ènìyàn Ọlọ́run ni Kọrinti, sí àwọn ti a...",New Testament,1 Corinthians,1,2,17.72
2,data/audios/Yoruba/Alignment/1 Corinthians/1CO...,Oore-ọ̀fẹ́ àti àlàáfíà fún yín láti ọ̀dọ̀ Ọlọ́...,New Testament,1 Corinthians,1,3,8.9
3,data/audios/Yoruba/Alignment/1 Corinthians/1CO...,Nígbà gbogbo ni mo ń dúpẹ́ lọ́wọ́ Ọlọ́run fún ...,New Testament,1 Corinthians,1,4,6.91
4,data/audios/Yoruba/Alignment/1 Corinthians/1CO...,Nítorí nínú rẹ̀ ni a ti sọ yín di ọlọ́rọ̀ nínú...,New Testament,1 Corinthians,1,5,7.15


## Outlier removal

In [8]:
# Export the alignment table for the data-checker, renaming the audio-path and
# transcript columns on the way out (presumably the names data_checks.py reads — confirm).
alignment_df_renamed = alignment_df.rename(
    columns=dict(audio_file="wav_filename", text="transcript")
)
alignment_df_renamed.to_csv(path_or_buf="data-checker/files/Yoruba.csv")

In [13]:
# Display the renamed frame (pandas truncates the rendering to the first and last rows).
alignment_df_renamed

Unnamed: 0,wav_filename,transcript,testament,book,chapter,verse,duration_seconds
0,data/audios/Yoruba/Alignment/1 Corinthians/1CO...,"Paulu, ẹni ti a pé láti jẹ́ aposteli Kristi Je...",New Testament,1 Corinthians,001,001,8.93
1,data/audios/Yoruba/Alignment/1 Corinthians/1CO...,"Sí ìjọ ènìyàn Ọlọ́run ni Kọrinti, sí àwọn ti a...",New Testament,1 Corinthians,001,002,17.72
2,data/audios/Yoruba/Alignment/1 Corinthians/1CO...,Oore-ọ̀fẹ́ àti àlàáfíà fún yín láti ọ̀dọ̀ Ọlọ́...,New Testament,1 Corinthians,001,003,8.90
3,data/audios/Yoruba/Alignment/1 Corinthians/1CO...,Nígbà gbogbo ni mo ń dúpẹ́ lọ́wọ́ Ọlọ́run fún ...,New Testament,1 Corinthians,001,004,6.91
4,data/audios/Yoruba/Alignment/1 Corinthians/1CO...,Nítorí nínú rẹ̀ ni a ti sọ yín di ọlọ́rọ̀ nínú...,New Testament,1 Corinthians,001,005,7.15
...,...,...,...,...,...,...,...
31095,data/audios/Yoruba/Alignment/Zephaniah/ZEP_003...,"Ní ọjọ́ náà, wọn yóò sọ fún Jerusalẹmu pé, “Má...",Old Testament,Zephaniah,003,016,8.22
31096,data/audios/Yoruba/Alignment/Zephaniah/ZEP_003...,"Olúwa Ọlọ́run rẹ wà pẹ̀lú rẹ, Ó ní agbára láti...",Old Testament,Zephaniah,003,017,14.61
31097,data/audios/Yoruba/Alignment/Zephaniah/ZEP_003...,“Èmi ó kó àwọn tí ó ń banújẹ́ fún àjọ̀dún tí a...,Old Testament,Zephaniah,003,018,8.69
31098,data/audios/Yoruba/Alignment/Zephaniah/ZEP_003...,Ní àkókò náà ni èmi yóò dojúkọ àwọn tí ń ni yí...,Old Testament,Zephaniah,003,019,14.33


In [None]:
# Load the Yoruba ".BEST" file from the data-checker folder and report its (rows, columns) shape.
# NOTE(review): presumably this file is written by the data_checks.py run in the cell
# below — if so, that cell has to be executed first on a fresh kernel; confirm.
best = pd.read_csv("data-checker/files/Yoruba.BEST")
print(best.shape)

(29933, 16)


In [None]:
python data-checker/data_checks.py data-checker/files/Yoruba.csv 2.0

## Upload to Hugging Face

In [None]:
from datasets import Dataset, Audio
from huggingface_hub import HfApi

def upload_alignment_to_hf(
    alignment_df: pd.DataFrame,
    language: str,
    repo_id: str,
    private: bool = False,
    max_shard_size: str = "200MB",
    max_retries: int = 3,
):
    """
    Upload alignment data as a TTS dataset to Hugging Face Hub.

    The frame is converted to a ``datasets.Dataset``, its audio-path column is
    cast to the ``Audio`` feature, and the result is pushed as the "train"
    split of the config named ``language``. Errors that look like timeouts are
    retried with a growing wait (30s, 60s, ...); any other error is re-raised
    immediately. The input frame is not modified.

    Args:
        alignment_df: DataFrame with audio_file, text, and metadata columns
            (a column already named "audio" is accepted as well)
        language: Language name to use as the split/config name
        repo_id: Hugging Face repository ID (e.g., "username/bible-tts")
        private: Whether the dataset should be private
        max_shard_size: Maximum shard size for upload (smaller = more reliable)
        max_retries: Total number of upload attempts on timeout errors (>= 1)

    Raises:
        ValueError: If max_retries is below 1, the frame has no rows, or it has
            neither an "audio_file" nor an "audio" column.
        Exception: The last error raised by ``push_to_hub`` when the upload
            fails with a non-timeout error or every attempt timed out.
    """
    # Validate up front: with max_retries < 1 the retry loop would never run and
    # the function would return as if the upload had succeeded.
    if max_retries < 1:
        raise ValueError(f"max_retries must be >= 1, got {max_retries}")
    if alignment_df.empty:
        raise ValueError(f"No alignment rows to upload for '{language}'")

    # Rename audio_file to audio for HF convention (rename returns a new frame,
    # so the caller's DataFrame is left untouched).
    df = alignment_df.rename(columns={"audio_file": "audio"})
    if "audio" not in df.columns:
        raise ValueError("alignment_df needs an 'audio_file' (or 'audio') column")

    # Imported here under an alias because the notebook also imports
    # IPython.display.Audio; whichever import cell ran last would otherwise
    # decide what the bare name `Audio` means.
    from datasets import Audio as HFAudio, Dataset

    # Create HF Dataset
    dataset = Dataset.from_pandas(df, preserve_index=False)

    # Cast the audio column to Audio feature (this handles loading the actual audio files)
    dataset = dataset.cast_column("audio", HFAudio())

    # Push to hub with retry logic for timeout errors
    for attempt in range(1, max_retries + 1):
        try:
            dataset.push_to_hub(
                repo_id=repo_id,
                config_name=language,  # Use language as the config name
                split="train",  # HF requires a split name, but you can ignore it when loading
                private=private,
                max_shard_size=max_shard_size,  # Smaller shards for more reliable uploads
            )
        except Exception as e:
            message = str(e).lower()
            # Timeouts surface under several class names and messages
            # (e.g. ReadTimeout: "The read operation timed out").
            is_timeout = (
                isinstance(e, TimeoutError)
                or "timeout" in type(e).__name__.lower()
                or "timeout" in message
                or "timed out" in message
            )
            if not is_timeout:
                raise
            if attempt == max_retries:
                print(f"✗ Failed after {max_retries} attempts. The data may have uploaded - check the repo.")
                raise
            wait_time = attempt * 30  # 30s, 60s, 90s
            print(f"⚠ Timeout on attempt {attempt}/{max_retries}. Retrying in {wait_time}s...")
            time.sleep(wait_time)
        else:
            print(f"✓ Uploaded {len(dataset)} samples for '{language}' to {repo_id}")
            return

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Languages to upload; commented-out entries are skipped.
LANGUAGES = [
    'Apali',
    'Arabic Standard',
    'Assamese',
    'Bengali',
    'Central Kurdish',
    'Chhattisgarhi',
    'Chichewa',
    'Dawro',
    'Dholuo',
    'Ewe',
    'Gamo',
    'Gofa',
    'Gujarati',
    'Haitian Creole',
    # 'Haryanvi',
    'Hausa',
    'Hiligaynon',
    'Hindi',
    'Igbo',
    'Kannada',
    'Kikuyu',
    'Lingala',
    'Luganda',
    'Malayalam',
    'Marathi',
    'Ndebele',
    'Oromo',
    'Punjabi',
    'Shona',
    'Swahili',
    'Tamil',
    'Telugu',
    # 'Toma',
    'Turkish',
    'Twi (Akuapem)',
    'Twi (Asante)',
    'Ukrainian',
    'Urdu',
    'Vietnamese',
    'Yoruba'
]
base_dir = "data/audios"
repo_id = "davidguzmanr/bible-tts-resources"  # Change this to your repo ID

# Tunables that were previously inline magic numbers.
MAX_CLIP_SECONDS = 60      # clips at or above this duration are dropped as outliers
UPLOAD_SAMPLE_ROWS = 100   # only the first rows of each language are uploaded

# Languages whose processing raised, so they can be reviewed/retried after the run.
failed_languages = []

for language in tqdm(LANGUAGES, desc="Uploading languages"):
    print(f"\n--- Processing '{language}' ---")
    try:
        alignment_df = get_alignment_dataframe(language, base_dir)

        # Simple outlier removal, need to do better
        alignment_df = alignment_df[alignment_df["duration_seconds"] < MAX_CLIP_SECONDS]

        upload_alignment_to_hf(
            alignment_df=alignment_df.head(UPLOAD_SAMPLE_ROWS),
            language=language,
            repo_id=repo_id,
            private=False,
        )
    except Exception as e:
        # Best-effort batch: report the failure and move on to the next language.
        print(f"✗ Error uploading '{language}': {e}")
        failed_languages.append(language)
    else:
        print(f"✓ Finished uploading '{language}'")

if failed_languages:
    print(f"\n✗ {len(failed_languages)} language(s) failed: {', '.join(failed_languages)}")


Uploading languages:   0%|          | 0/38 [00:00<?, ?it/s]


--- Processing 'Apali' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 232.01 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00,  7.62ba/s]
Processing Files (1 / 1): 100%|██████████|  125MB /  125MB, 56.8MB/s  
New Data Upload: 100%|██████████|  125MB /  125MB, 56.8MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:05<00:00,  5.72s/ shards]
Uploading languages:   3%|▎         | 1/38 [00:35<21:40, 35.14s/it]

✓ Uploaded 100 samples for 'Apali' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Apali'

--- Processing 'Arabic Standard' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 478.27 examples/s]

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 16.49ba/s]
Processing Files (1 / 1): 100%|██████████| 37.5MB / 37.5MB, 37.4MB/s  
New Data Upload: 100%|██████████| 37.5MB / 37.5MB, 37.4MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.02s/ shards]
Uploading languages:   5%|▌         | 2/38 [01:46<33:53, 56.49s/it]

✓ Uploaded 100 samples for 'Arabic Standard' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Arabic Standard'

--- Processing 'Assamese' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 259.14 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  4.75ba/s]
Processing Files (1 / 1): 100%|██████████| 73.9MB / 73.9MB, 41.0MB/s  
New Data Upload: 100%|██████████| 73.9MB / 73.9MB, 41.0MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:05<00:00,  5.26s/ shards]
Uploading languages:   8%|▊         | 3/38 [03:11<40:26, 69.33s/it]

✓ Uploaded 100 samples for 'Assamese' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Assamese'

--- Processing 'Bengali' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 228.06 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00,  8.62ba/s]
Processing Files (1 / 1): 100%|██████████| 88.2MB / 88.2MB, 49.0MB/s  
New Data Upload: 100%|██████████| 88.2MB / 88.2MB, 49.0MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:07<00:00,  7.77s/ shards]
Uploading languages:  11%|█         | 4/38 [04:50<45:57, 81.10s/it]

✓ Uploaded 100 samples for 'Bengali' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Bengali'

--- Processing 'Central Kurdish' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 257.82 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 12.12ba/s]
Processing Files (1 / 1): 100%|██████████| 96.4MB / 96.4MB, 53.6MB/s  
New Data Upload: 100%|██████████| 96.4MB / 96.4MB, 53.6MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.08s/ shards]
Uploading languages:  13%|█▎        | 5/38 [06:10<44:19, 80.59s/it]

✓ Uploaded 100 samples for 'Central Kurdish' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Central Kurdish'

--- Processing 'Chhattisgarhi' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 256.93 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 11.15ba/s]
Processing Files (1 / 1): 100%|██████████| 96.2MB / 96.2MB, 53.4MB/s  
New Data Upload: 100%|██████████| 96.2MB / 96.2MB, 53.4MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:05<00:00,  5.99s/ shards]
Uploading languages:  16%|█▌        | 6/38 [07:34<43:37, 81.80s/it]

✓ Uploaded 100 samples for 'Chhattisgarhi' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Chhattisgarhi'

--- Processing 'Chichewa' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 370.13 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  5.34ba/s]
Processing Files (1 / 1): 100%|██████████| 55.0MB / 55.0MB, 39.3MB/s  
New Data Upload: 100%|██████████| 55.0MB / 55.0MB, 39.3MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.21s/ shards]
Uploading languages:  18%|█▊        | 7/38 [08:39<39:28, 76.39s/it]

✓ Uploaded 100 samples for 'Chichewa' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Chichewa'

--- Processing 'Dawro' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 307.75 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  8.93ba/s]
Processing Files (1 / 1): 100%|██████████| 45.9MB / 45.9MB, 38.2MB/s  
New Data Upload: 100%|██████████| 45.9MB / 45.9MB, 38.2MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.74s/ shards]
Uploading languages:  21%|██        | 8/38 [09:42<36:07, 72.24s/it]

✓ Uploaded 100 samples for 'Dawro' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Dawro'

--- Processing 'Dholuo' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 273.02 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  4.78ba/s]
Processing Files (1 / 1): 100%|██████████| 75.7MB / 75.7MB, 42.0MB/s  
New Data Upload: 100%|██████████| 75.7MB / 75.7MB, 42.0MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.90s/ shards]
Uploading languages:  24%|██▎       | 9/38 [10:54<34:50, 72.10s/it]

✓ Uploaded 100 samples for 'Dholuo' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Dholuo'

--- Processing 'Ewe' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 225.73 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 11.51ba/s]
Processing Files (1 / 1): 100%|██████████| 91.4MB / 91.4MB, 50.8MB/s  
New Data Upload: 100%|██████████| 91.4MB / 91.4MB, 50.8MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.36s/ shards]
Uploading languages:  26%|██▋       | 10/38 [12:10<34:09, 73.20s/it]

✓ Uploaded 100 samples for 'Ewe' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Ewe'

--- Processing 'Gamo' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 386.01 examples/s]

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 13.35ba/s]
Processing Files (1 / 1): 100%|██████████| 37.6MB / 37.6MB, 31.4MB/s  
New Data Upload: 100%|██████████| 37.6MB / 37.6MB, 31.4MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.56s/ shards]
Uploading languages:  29%|██▉       | 11/38 [13:15<31:51, 70.78s/it]

✓ Uploaded 100 samples for 'Gamo' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Gamo'

--- Processing 'Gofa' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 158.83 examples/s]

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 16.58ba/s]
Processing Files (1 / 1): 100%|██████████| 32.4MB / 32.4MB, 31.9MB/s  
New Data Upload: 100%|██████████| 32.4MB / 32.4MB, 31.9MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.63s/ shards]
Uploading languages:  32%|███▏      | 12/38 [14:19<29:49, 68.84s/it]

✓ Uploaded 100 samples for 'Gofa' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Gofa'

--- Processing 'Gujarati' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 245.21 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  6.08ba/s]
Processing Files (1 / 1): 100%|██████████| 82.0MB / 82.0MB, 41.0MB/s  
New Data Upload: 100%|██████████| 82.0MB / 82.0MB, 41.0MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.66s/ shards]
Uploading languages:  34%|███▍      | 13/38 [15:33<29:20, 70.42s/it]

✓ Uploaded 100 samples for 'Gujarati' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Gujarati'

--- Processing 'Haitian Creole' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 388.31 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  7.87ba/s]
Processing Files (1 / 1): 100%|██████████| 49.8MB / 49.8MB, 35.5MB/s  
New Data Upload: 100%|██████████| 49.8MB / 49.8MB, 35.5MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.93s/ shards]
Uploading languages:  37%|███▋      | 14/38 [16:41<27:46, 69.43s/it]

✓ Uploaded 100 samples for 'Haitian Creole' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Haitian Creole'

--- Processing 'Hausa' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 317.98 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  4.91ba/s]
Processing Files (1 / 1): 100%|██████████| 64.5MB / 64.5MB, 40.3MB/s  
New Data Upload: 100%|██████████| 64.5MB / 64.5MB, 40.3MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.92s/ shards]
Uploading languages:  39%|███▉      | 15/38 [17:16<22:39, 59.13s/it]

✓ Uploaded 100 samples for 'Hausa' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Hausa'

--- Processing 'Hiligaynon' ---



[A
[A
Map: 100%|██████████| 100/100 [00:00<00:00, 190.84 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00,  9.72ba/s]
Processing Files (1 / 1): 100%|██████████|  107MB /  107MB, 59.5MB/s  
New Data Upload: 100%|██████████|  107MB /  107MB, 59.5MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.89s/ shards]
Uploading languages:  42%|████▏     | 16/38 [19:12<27:58, 76.31s/it]

✓ Uploaded 100 samples for 'Hiligaynon' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Hiligaynon'

--- Processing 'Hindi' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 234.69 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00,  6.79ba/s]
Processing Files (1 / 1): 100%|██████████|  100MB /  100MB, 55.5MB/s  
New Data Upload: 100%|██████████|  100MB /  100MB, 55.5MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.55s/ shards]
Uploading languages:  45%|████▍     | 17/38 [20:31<26:57, 77.05s/it]

✓ Uploaded 100 samples for 'Hindi' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Hindi'

--- Processing 'Igbo' ---



[A
[A
Map: 100%|██████████| 100/100 [00:00<00:00, 194.00 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00,  5.73ba/s]
Processing Files (1 / 1): 100%|██████████|  106MB /  106MB, 48.2MB/s  
New Data Upload: 100%|██████████|  106MB /  106MB, 48.2MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:05<00:00,  5.21s/ shards]
Uploading languages:  47%|████▋     | 18/38 [21:52<26:07, 78.36s/it]

✓ Uploaded 100 samples for 'Igbo' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Igbo'

--- Processing 'Kannada' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 222.26 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00,  7.84ba/s]
Processing Files (1 / 1): 100%|██████████| 97.7MB / 97.7MB, 54.3MB/s  
New Data Upload: 100%|██████████| 97.7MB / 97.7MB, 54.3MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.49s/ shards]
Uploading languages:  50%|█████     | 19/38 [23:12<24:58, 78.88s/it]

✓ Uploaded 100 samples for 'Kannada' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Kannada'

--- Processing 'Kikuyu' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 538.17 examples/s]

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 10.97ba/s]
Processing Files (1 / 1): 100%|██████████| 40.7MB / 40.7MB, 33.9MB/s  
New Data Upload: 100%|██████████| 40.7MB / 40.7MB, 33.9MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.56s/ shards]
Uploading languages:  53%|█████▎    | 20/38 [24:16<22:16, 74.27s/it]

✓ Uploaded 100 samples for 'Kikuyu' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Kikuyu'

--- Processing 'Lingala' ---



[A
[A
Map: 100%|██████████| 100/100 [00:00<00:00, 166.66 examples/s]

[A
[A
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00,  6.69ba/s]
Processing Files (1 / 1): 100%|██████████|  115MB /  115MB, 52.2MB/s  
New Data Upload: 100%|██████████|  115MB /  115MB, 52.2MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:05<00:00,  5.41s/ shards]
Uploading languages:  55%|█████▌    | 21/38 [25:33<21:17, 75.13s/it]

✓ Uploaded 100 samples for 'Lingala' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Lingala'

--- Processing 'Luganda' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 422.70 examples/s]

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 15.55ba/s]
Processing Files (1 / 1): 100%|██████████| 40.4MB / 40.4MB, 28.9MB/s  
New Data Upload: 100%|██████████| 40.4MB / 40.4MB, 28.9MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.75s/ shards]
Uploading languages:  58%|█████▊    | 22/38 [26:41<19:26, 72.91s/it]

✓ Uploaded 100 samples for 'Luganda' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Luganda'

--- Processing 'Malayalam' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 244.77 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  6.73ba/s]
Processing Files (1 / 1): 100%|██████████| 72.5MB / 72.5MB, 24.2MB/s  
New Data Upload: 100%|██████████| 72.5MB / 72.5MB, 24.2MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:05<00:00,  5.11s/ shards]
Uploading languages:  61%|██████    | 23/38 [27:53<18:12, 72.84s/it]

✓ Uploaded 100 samples for 'Malayalam' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Malayalam'

--- Processing 'Marathi' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 287.23 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  7.04ba/s]
Processing Files (1 / 1): 100%|██████████| 72.9MB / 72.9MB, 40.5MB/s  
New Data Upload: 100%|██████████| 72.9MB / 72.9MB, 40.5MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.10s/ shards]
Uploading languages:  63%|██████▎   | 24/38 [29:08<17:06, 73.29s/it]

✓ Uploaded 100 samples for 'Marathi' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Marathi'

--- Processing 'Ndebele' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 245.40 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 11.65ba/s]
Processing Files (1 / 1): 100%|██████████| 87.2MB / 87.2MB, 43.6MB/s  
New Data Upload: 100%|██████████| 87.2MB / 87.2MB, 43.6MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.42s/ shards]
Uploading languages:  66%|██████▌   | 25/38 [30:39<17:02, 78.68s/it]

✓ Uploaded 100 samples for 'Ndebele' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Ndebele'

--- Processing 'Oromo' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 239.76 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 11.04ba/s]
Processing Files (1 / 1): 100%|██████████| 84.8MB / 84.8MB, 47.1MB/s  
New Data Upload: 100%|██████████| 84.8MB / 84.8MB, 47.1MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.19s/ shards]
Uploading languages:  68%|██████▊   | 26/38 [31:55<15:32, 77.72s/it]

✓ Uploaded 100 samples for 'Oromo' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Oromo'

--- Processing 'Punjabi' ---



[A
Map: 100%|██████████| 100/100 [00:00<00:00, 234.73 examples/s]

[A
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  4.40ba/s]
Processing Files (1 / 1): 100%|██████████| 85.1MB / 85.1MB, 42.5MB/s  
New Data Upload: 100%|██████████| 85.1MB / 85.1MB, 42.5MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.62s/ shards]
Uploading languages:  71%|███████   | 27/38 [33:11<14:10, 77.31s/it]

✓ Uploaded 100 samples for 'Punjabi' to davidguzmanr/bible-tts-resources
✓ Finished uploading 'Punjabi'

--- Processing 'Shona' ---


In [8]:
# Push the first 100 rows of the current `alignment_df` to the Hugging Face Hub
# under the config named by the current `language` ("Yoruba" when this was written).
# Authenticate first with: huggingface-cli login

repo_id = "davidguzmanr/bible-tts-resources"  # Change this to your repo ID

upload_alignment_to_hf(alignment_df.head(100), language, repo_id, private=False)

Map: 100%|██████████| 100/100 [00:00<00:00, 237.88 examples/s] shards/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 13.68ba/s]
Processing Files (1 / 1): 100%|██████████| 80.0MB / 80.0MB, 44.5MB/s  
New Data Upload: 100%|██████████| 80.0MB / 80.0MB, 44.5MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:05<00:00,  5.02s/ shards]


ReadTimeout: The read operation timed out

In [None]:
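# To load the dataset later, pass the same repo_id used for the upload
# ("davidguzmanr/bible-tts-resources" above, or your own if you changed it)
# and the language as the config name:
# from datasets import load_dataset
# ds = load_dataset("davidguzmanr/bible-tts-resources", "Yoruba")  # Load specific language
# ds = load_dataset("davidguzmanr/bible-tts-resources", "Hausa")   # Load another language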