# Download

## Audios and timing files

In [3]:
from utils import download_audios

In [None]:
# 1) Parse the saved HTML page into a dict keyed by book name.
#    Each value is itself a dict with 'url' and 'section' keys (see the cell output),
#    not a bare URL string.
html_path = "html_files/audios/Apali.html"
links = download_audios.extract_artifact_links(html_path)

# Display the parsed mapping for a quick visual check.
links


{'Matthew': {'url': 'https://openbible-api-1.biblica.com/artifactContent/62e2eabb5dafb527681c6e6d',
  'section': 'New Testament - mp3'},
 'Mark': {'url': 'https://openbible-api-1.biblica.com/artifactContent/62e2eabb5dafb527681c6e6f',
  'section': 'New Testament - mp3'},
 'Luke': {'url': 'https://openbible-api-1.biblica.com/artifactContent/62e2eabb5dafb527681c6e77',
  'section': 'New Testament - mp3'},
 'John': {'url': 'https://openbible-api-1.biblica.com/artifactContent/62e2eabb5dafb527681c6e70',
  'section': 'New Testament - mp3'},
 'Acts': {'url': 'https://openbible-api-1.biblica.com/artifactContent/62e2eabb5dafb527681c6e81',
  'section': 'New Testament - mp3'},
 'Romans': {'url': 'https://openbible-api-1.biblica.com/artifactContent/62e2eabb5dafb527681c6e7f',
  'section': 'New Testament - mp3'},
 '1 Corinthians': {'url': 'https://openbible-api-1.biblica.com/artifactContent/62e2eabb5dafb527681c6e74',
  'section': 'New Testament - mp3'},
 '2 Corinthians': {'url': 'https://openbible-api

In [None]:
# 2) Download + unzip into folders
# NOTE(review): `output_base_dir` and `lang_name` are not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel unless they are set elsewhere — confirm.
# Presumably `output_base_dir` is a pathlib.Path (it is combined with `/`) and `lang_name`
# is the language folder name — TODO confirm and define both in a config cell.
output_dir = output_base_dir / lang_name
# overwrite=False and timeout=60 are passed straight through to the helper.
download_audios.download_and_unzip_all(links, str(output_dir), overwrite=False, timeout=60)

# Parsing text files

## USX

In [1]:
from utils.usx_parser import usx_to_dataframe, usx_directory_to_dataframe

In [15]:
# Parse one USX book file (Yoruba Genesis) into a DataFrame with include_headings=False.
# The displayed frame has one row per verse with book / chapter / verse / text columns.
path = "data/texts/Yoruba/USX/release/USX_1/GEN.usx"
df = usx_to_dataframe(path, include_headings=False)
df

Unnamed: 0,book,chapter,verse,text
0,GEN,1,1,Ní ìbẹ̀rẹ̀ ohun gbogbo Ọlọ́run dá àwọn ọ̀run à...
1,GEN,1,2,"Ayé sì wà ní rúdurùdu, ó sì ṣófo, òkùnkùn sì w..."
2,GEN,1,3,"Ọlọ́run sì wí pé, “Jẹ́ kí ìmọ́lẹ̀ kí ó wà,” ìm..."
3,GEN,1,4,"Ọlọ́run rí i pé ìmọ́lẹ̀ náà dára, ó sì ya ìmọ́..."
4,GEN,1,5,"Ọlọ́run sì pe ìmọ́lẹ̀ náà ní “ọ̀sán,” àti òkùn..."
...,...,...,...,...
1528,GEN,50,22,Josẹfu sì ń gbé ní Ejibiti pẹ̀lú gbogbo ìdílé ...
1529,GEN,50,23,"Ó sì rí ìran kẹta ọmọ Efraimu-Àwọn ọmọ Makiri,..."
1530,GEN,50,24,Nígbà náà ni Josẹfu wí fún àwọn arákùnrin rẹ̀ ...
1531,GEN,50,25,Josẹfu sì mú kí àwọn ọmọ Israẹli búra májẹ̀mú ...


In [16]:
# Parse every USX file in the Yoruba release directory into a single DataFrame.
# Note: this rebinds `path` and `df` from the single-file cell.
path = "data/texts/Yoruba/USX/release/USX_1/"
df = usx_directory_to_dataframe(path, include_headings=False)
df

Unnamed: 0,book,chapter,verse,text
0,1CH,1,1,"Adamu, Seti, Enoṣi,"
1,1CH,1,2,"Kenani, Mahalaleli, Jaredi,"
2,1CH,1,3,"Enoku, Metusela, Lameki, Noa."
3,1CH,1,4,"Àwọn ọmọ Noa: Ṣemu, Hamu àti Jafeti."
4,1CH,1,5,"Àwọn ọmọ Jafeti ni: Gomeri, Magogu, Madai; Jaf..."
...,...,...,...,...
31082,ZEP,3,16,"Ní ọjọ́ náà, wọn yóò sọ fún Jerusalẹmu pé, “Má..."
31083,ZEP,3,17,"Olúwa Ọlọ́run rẹ wà pẹ̀lú rẹ, Ó ní agbára láti..."
31084,ZEP,3,18,“Èmi ó kó àwọn tí ó ń banújẹ́ fún àjọ̀dún tí a...
31085,ZEP,3,19,Ní àkókò náà ni èmi yóò dojúkọ àwọn tí ń ni yí...


In [14]:
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from utils.usx_parser import usx_to_dataframe


def test_usx_parser_all_files(texts_dir: str = "data/texts") -> pd.DataFrame:
    """
    Test usx_to_dataframe on all USX files in the texts directory.
    
    Args:
        texts_dir: Path to the texts directory containing language folders.
        
    Returns:
        DataFrame with columns: language, file_path, success, error_message, num_rows
    """
    texts_path = Path(texts_dir)
    
    # Find all USX files
    # Include all USX files under the texts_path (looks for any .usx at any subdirectory depth)
    usx_files = list(texts_path.glob("**/*.usx"))
    
    results = []
    for usx_file in tqdm(usx_files, desc="Testing USX files"):
        # Extract language from path (first directory under texts_dir)
        relative_path = usx_file.relative_to(texts_path)
        language = relative_path.parts[0]
        
        # Test parsing
        try:
            df = usx_to_dataframe(usx_file, include_headings=False)
            results.append({
                "language": language,
                "file_path": str(usx_file),
                "success": True,
                "error_message": None,
                "num_rows": len(df)
            })
        except Exception as e:
            results.append({
                "language": language,
                "file_path": str(usx_file),
                "success": False,
                "error_message": str(e),
                "num_rows": None
            })
    
    return pd.DataFrame(results)


# Run the check over the default "data/texts" tree and display the per-file results
results_df = test_usx_parser_all_files()
results_df

Testing USX files: 100%|██████████| 2514/2514 [00:33<00:00, 75.08it/s]


Unnamed: 0,language,file_path,success,error_message,num_rows
0,Apali,data/texts/Apali/USX/release/USX_1/1CO.usx,True,,420.0
1,Apali,data/texts/Apali/USX/release/USX_1/1JN.usx,True,,103.0
2,Apali,data/texts/Apali/USX/release/USX_1/1PE.usx,True,,103.0
3,Apali,data/texts/Apali/USX/release/USX_1/1TH.usx,True,,84.0
4,Apali,data/texts/Apali/USX/release/USX_1/1TI.usx,True,,109.0
...,...,...,...,...,...
2509,Yoruba,data/texts/Yoruba/USX/release/USX_1/RUT.usx,True,,85.0
2510,Yoruba,data/texts/Yoruba/USX/release/USX_1/SNG.usx,True,,117.0
2511,Yoruba,data/texts/Yoruba/USX/release/USX_1/TIT.usx,True,,46.0
2512,Yoruba,data/texts/Yoruba/USX/release/USX_1/ZEC.usx,True,,211.0


In [22]:
# Count the distinct languages that contributed at least one row to results_df
results_df["language"].nunique()

39

In [23]:
# Which language folders on disk produced no USX result at all?
texts_root = Path("data/texts")
all_files = os.listdir(texts_root)  # raw listing; name kept for any cell that still reads it
# Only directories can be language folders; a stray file (e.g. .DS_Store) would otherwise
# be reported as a "missing language".
language_dirs = {entry for entry in all_files if (texts_root / entry).is_dir()}
language_dirs - set(results_df["language"])

{'Toma'}

In [24]:
# Headline counts over every tested file
print("\n".join([
    f"Total files tested: {len(results_df)}",
    f"Successful: {results_df['success'].sum()}",
    f"Failed: {(~results_df['success']).sum()}",
    f"\nLanguages: {results_df['language'].nunique()}",
]))

# List the failing files, or confirm that there are none
failed_df = results_df.loc[~results_df['success']]
if failed_df.empty:
    print("\nAll files parsed successfully!")
else:
    print("\n--- Failed files ---")
    display(failed_df)

# Per-language roll-up: files that parsed, files tested, rows parsed, and success ratio
summary_by_lang = (
    results_df
    .groupby('language')
    .agg(
        successful_files=('success', 'sum'),
        total_files=('success', 'count'),
        total_rows=('num_rows', 'sum'),
    )
    .reset_index()
)
summary_by_lang['success_rate'] = (
    summary_by_lang['successful_files'] / summary_by_lang['total_files']
)
print("\n--- Summary by language ---")
summary_by_lang

Total files tested: 2514
Successful: 2513
Failed: 1

Languages: 39

--- Failed files ---


Unnamed: 0,language,file_path,success,error_message,num_rows
209,Bengali,data/texts/Bengali/USX/release/USX_1/MAT.usx,False,"Unexpected USX id format: 'MAT 18:10,11'",



--- Summary by language ---


Unnamed: 0,language,successful_files,total_files,total_rows,success_rate
0,Apali,29,29,10218.0,1.0
1,Arabic Standard,66,66,31103.0,1.0
2,Assamese,66,66,31092.0,1.0
3,Bengali,65,66,30026.0,0.984848
4,Central Kurdish,66,66,31103.0,1.0
5,Chhattisgarhi,66,66,31103.0,1.0
6,Chichewa,66,66,31104.0,1.0
7,Dawro,66,66,30815.0,1.0
8,Dholuo,66,66,31103.0,1.0
9,Ewe,66,66,31103.0,1.0
