In [105]:
from bs4 import BeautifulSoup
import os
from tqdm import tqdm
import pandas as pd

There are 27 chapters in the New Testament. Check [Bible](https://www.faithcomesbyhearing.com/audio-bible-resources/recordings-database) for the bible in different languages.

In [117]:
chapters = ['Mateo', 'Marcos', 'Lucas', 'Juan', 'Hechos', 'Romanos', 'Corintios-1', 'Corintios-2', 'Gálatas', 'Efesios', 'Filipenses',
            'Colosenses', 'Tesalonicenses-1', 'Tesalonicenses-2', 'Timoteo-1', 'Timoteo-2', 'Tito', 'Filemón', 'Hebreos', 'Santiago',
            'Pedro-1', 'Pedro-2', 'Juan-1', 'Juan-2', 'Juan-3', 'Judas', 'Apocalipsis']
num_subchapters = [28, 16, 24, 21, 28, 16, 16, 13, 6, 6, 4, 
                   4, 5, 3, 6, 4, 3, 1, 13, 5, 
                   5, 3, 5, 1, 1, 1, 22]
aliases = ['MAT', 'MRK', 'LUK', 'JHN', 'ACT', 'ROM', '1CO', '2CO', 'GAL', 'EPH', 'PHP', 
         'COL', '1TH', '2TH', '1TI', '2TI', 'TIT', 'PHM', 'HEB', 'JAS',
         '1PE', '2PE', '1JN', '2JN', '3JN', 'JUD', 'REV']

dict_bible = {chapter: {'num_subchapters': num, 'alias': alias} for (chapter, num, alias) in zip(chapters, num_subchapters, aliases)}

In [74]:
for chapter in chapters:
    os.system(f'mkdir Bible/Spanish/{chapter}')
    os.system(f'mkdir Bible/Nahuatl/Guerrero/{chapter}')

In [26]:
url_es = 'https://live.bible.is/bible/SPNNVI'
url_nah = 'https://live.bible.is/bible/NGUTBL' # Guerrero variant

# Download the New Testament in Spanish
for chapter in tqdm(dict_bible.keys()):
    num_subchapter = dict_bible[chapter]['num_subchapters']
    alias = dict_bible[chapter]['alias']
    
    for num in range(1, num_subchapter + 1):
        os.system(f'wget -P Bible/Spanish/{chapter} https://live.bible.is/bible/SPNNVI/{alias}/{num}')
        
# Download the New Testament in Nahuatl
for chapter in tqdm(dict_bible.keys()):
    num_subchapter = dict_bible[chapter]['num_subchapters']
    alias = dict_bible[chapter]['alias']
    
    for num in range(1, num_subchapter + 1):
        os.system(f'wget -P Bible/Nahuatl/Guerrero/{chapter} https://live.bible.is/bible/NGUTBL/{alias}/{num}')

100%|██████████| 27/27 [04:50<00:00, 10.76s/it]


We can align the translations using the chapter, subchapter and number of verse.

In [163]:
dict_spanish = {}
for chapter in tqdm(dict_bible.keys()):
    num_subchapter = dict_bible[chapter]['num_subchapters']
    alias = dict_bible[chapter]['alias']
    
    for num in range(1, num_subchapter + 1):
        raw_html = open(f'Bible/Spanish/{chapter}/{num}').read()
        html = BeautifulSoup(raw_html, 'html.parser')
        verses = html.find_all(name='span', attrs={'data-verseid': range(1,100), 'class': None})
        verses_ids = [int(verse.attrs['data-verseid']) for verse in verses]
        
        for verse_id, verse in zip(verses_ids, verses):
            dict_spanish[(chapter, num, verse_id)] = verse.text

100%|██████████| 27/27 [00:02<00:00,  9.29it/s]


In [164]:
dict_nahuatl = {}
for chapter in tqdm(dict_bible.keys()):
    num_subchapter = dict_bible[chapter]['num_subchapters']
    alias = dict_bible[chapter]['alias']
    
    for num in range(1, num_subchapter + 1):
        raw_html = open(f'Bible/Nahuatl/Guerrero/{chapter}/{num}').read()
        html = BeautifulSoup(raw_html, 'html.parser')
        verses = html.find_all(name='span', attrs={'data-verseid': range(1,100), 'class': None})
        verses_ids = [int(verse.attrs['data-verseid']) for verse in verses]
        
        for verse_id, verse in zip(verses_ids, verses):
            dict_nahuatl[(chapter, num, verse_id)] = verse.text

100%|██████████| 27/27 [00:03<00:00,  8.96it/s]


In [155]:
common_verses = set(dict_spanish.keys()).intersection(set(dict_nahuatl.keys()))
common_verses = sorted(common_verses)

len(dict_spanish.keys()), len(dict_nahuatl.keys()), len(common_verses)

(7941, 7933, 7916)

Lets see this in a dataframe to confirm we are aligning the verses. Observations:

- There are some missing verses, so it is important to be sure they are correctly aligned.
- Some of the verses are blank, so we will ignore them.

In [165]:
bible = pd.DataFrame(columns=['chapter', 'subchapter', 'verse', 'spanish', 'nahuatl'])

for verse_id in tqdm(dict_spanish):
    if verse_id in dict_nahuatl:
        bible.loc[len(bible)] = [verse_id[0], verse_id[1], verse_id[2], dict_spanish[verse_id], dict_nahuatl[verse_id]]
    
bible.to_csv('Bible/parallel_bible_guerrero.csv', index=False)

100%|██████████| 7941/7941 [00:16<00:00, 480.43it/s]


In [168]:
file_es = open('bible.es', mode='w') 
file_nah = open('bible_guerrero.nah', mode='w')

for verse_es, verse_nah in zip(bible['spanish'], bible['nahuatl']):
    if len(verse_es) > 1 and len(verse_nah) > 1: 
        file_es.write(verse_es + '\n')
        file_nah.write(verse_nah + '\n')

file_es.close()
file_nah.close()

In [144]:
file_es = open('bible.es').readlines() 
file_nah = open('bible_guerrero.nah').readlines()

len(file_es), len(file_nah)

(7908, 7908)