In [93]:
import re
import requests
from bs4 import BeautifulSoup

from bs4.element import NavigableString as n_str

from lxml import etree

In [2]:
# URLs components
base_url = 'https://www.churchofjesuschrist.org'

standard_works_base_url = base_url + '/study/scriptures'

standard_works = {
    '/ot':           'Old Testament',
    '/nt':           'New Testament',
    '/bofm':         'Book of Mormon',
    '/dc-testament': 'Doctrine and Covenants',
    '/pgp':          'Pearl of Great Price'
}


In [11]:
# Utility scraping methods
def get(url):
    page = requests.get(url)
    if page.status_code == 200:
        return BeautifulSoup(page.text, 'html.parser')
    
    raise ValueError(f"Failed to get {url}")
    
def get_first_from_standard_work(standard_work):
    'Obtains first content url for a given standard work on churchofjesuschrist.org'
    return base_url + get(
        standard_works_base_url + standard_work
    ).find(
        'nav'
    ).find(
        'ul'
    ).find_all(
        'li'
    )[1].find(
        'a'
    )['href'].split('?')[0]
    
# Finds the next link on page from https://www.churchofjesuschrist.org/study/scriptures/
next_url = lambda page: page.find('span', re.compile('nextLink')).find('a')['href'].split('?')[0]

def get_all_standard_work_pages(url, verbose=False):
    'Gets all standard work chapter URLs as a generator'
        
    while url:
        yield url
        page = get(url)

        if next_url(page):
            url = base_url + next_url(page)
            if verbose:
                print(url)
        else:
            url = None


In [4]:
get_first_from_standard_work(
    standard_works_base_url+'/ot'
)

'https://www.churchofjesuschrist.org/study/scriptures/ot/title-page'

In [12]:
list(
    get_all_standard_work_pages(
        get_first_from_standard_work(
            '/ot'
        ),
        verbose=True,
    )
)

https://www.churchofjesuschrist.org/study/scriptures/ot/dedication
https://www.churchofjesuschrist.org/study/scriptures/ot/gen/1
https://www.churchofjesuschrist.org/study/scriptures/ot/gen/2
https://www.churchofjesuschrist.org/study/scriptures/ot/gen/3
https://www.churchofjesuschrist.org/study/scriptures/ot/gen/4
https://www.churchofjesuschrist.org/study/scriptures/ot/gen/5
https://www.churchofjesuschrist.org/study/scriptures/ot/gen/6


KeyboardInterrupt: 

In [120]:
replacements = {                  # String replacement to process raw web strings
    'â\x80\x94'         : ' - ' , # Long m dash
    'â\x80\x99'         : "'"   , # Apostrophe : ’
    'Â¶ '               : ''    , # Paragraph mark : ¶
}

def process_raw_web_string(s):
    "Processes raw unicode strings from curchofjesuschrist.org"
    for old, new in replacements.items():
        s = s.replace(old, new)
    
    return s

def process_verse(verse):
    trim_sups = lambda v: process_raw_web_string(v) if isinstance(v, n_str) else ''.join(
        [
            trim_sups(c)
            for c in v.contents
            if c.name != 'sup'
        ])
    
    return ''.join([trim_sups(c) for c in verse.contents[1:]])

def process_chapter(standard_work, book, chapter):
    # Processes a chapter from a standard work
    url = f'{standard_works_base_url}{standard_work}/{book}/{chapter}'
    page = get(url)

    chapter = etree.Element('chapter')
    chapter.text = process_raw_web_string(page.find('p', **{'id': 'title_number1'}).text)

    summary = etree.Element('summary')
    summary.text = process_raw_web_string(page.find('p', **{'id': 'study_summary1'}).text)
    chapter.append(summary)

    body = page.find('div', **{'class': 'body-block'})

    for verse in body.find_all('p'):
        new = etree.Element('verse', number=verse['id'][1:])
        new.text = process_verse(verse)
        chapter.append(new)
        
    return chapter

for i in range(1, 34):
    chapter = process_chapter('/ot', 'deut', i)
    s = etree.tostring(chapter, pretty_print=True)
    print(i, '&' in s.decode('ascii'))




1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
15 False
16 False
17 False
18 False
19 False
20 False
21 False
22 False
23 False
24 False
25 False
26 False
27 False
28 False
29 False
30 False
31 False
32 False
33 False


In [103]:
page = get("https://www.churchofjesuschrist.org/study/scriptures/ot/deut/1")

p1 = page.find('span', **{'class': 'para-mark'})

In [112]:
b'\xc3\x82\xc2\xb6 '.decode('utf-8')

'Â¶ '

In [71]:
get(base_url+'/study/scriptures/ot/title-page')

'/study/scriptures/ot/dedication'

In [50]:

print(
    [
        a['href'].split('?')[0] for a in get(
            standard_works_base+'nt'
        ).find('ul', 'doc-map').find_all('a', href=True)
    ]
)

['/study/scriptures/nt/title-page', '/study/scriptures/nt/matt/1', '/study/scriptures/nt/mark/1', '/study/scriptures/nt/luke/1', '/study/scriptures/nt/john/1', '/study/scriptures/nt/acts/1', '/study/scriptures/nt/rom/1', '/study/scriptures/nt/1-cor/1', '/study/scriptures/nt/2-cor/1', '/study/scriptures/nt/gal/1', '/study/scriptures/nt/eph/1', '/study/scriptures/nt/philip/1', '/study/scriptures/nt/col/1', '/study/scriptures/nt/1-thes/1', '/study/scriptures/nt/2-thes/1', '/study/scriptures/nt/1-tim/1', '/study/scriptures/nt/2-tim/1', '/study/scriptures/nt/titus/1', '/study/scriptures/nt/philem/1', '/study/scriptures/nt/heb/1', '/study/scriptures/nt/james/1', '/study/scriptures/nt/1-pet/1', '/study/scriptures/nt/2-pet/1', '/study/scriptures/nt/1-jn/1', '/study/scriptures/nt/2-jn/1', '/study/scriptures/nt/3-jn/1', '/study/scriptures/nt/jude/1', '/study/scriptures/nt/rev/1']


In [None]:
# create XML 
root = etree.Element('root')
root.append(etree.Element('child'))
# another child with text
child = etree.Element('child')
child.text = 'some text'
root.append(child)

# pretty string
s = etree.tostring(root, pretty_print=True)
print s