http://drbo.org/

In [1]:
import re
import os
import json
import glob
from django.template.defaultfilters import slugify
from pywebber import Ripper

In [2]:
domain = "http://drbo.org/"

In [3]:
def normalize_filename(file_name):
    return "_".join([each.lower() for each in re.split(r"[\, *, \/]", file_name) if each != ''])

home = Ripper(domain)
home_links = list(home.links())

In [6]:
def get_all_books_page_links(raw_page_rip):
    """Get page links for each book"""
    nt = {}
    ot = {}
    OTIDS = []
    soup = raw_page_rip.soup   
    
    nt_soup = soup.find("td", class_="NT")
    ot1 = soup.find("td", class_="OT1")
    ot2 = soup.find("td", class_="OT2")
    
    for each in nt_soup.find_all("a", href=True):
        if 'class="b"' in str(each):
            href = each.get("href")
            name = each.text

            idd = re.search(r'\d{5}', href).group(0)
            nt[name] = [domain + href, idd]
            
    p = "D:\git\catholic\drbo_data"
            
    with open(os.path.join(p, "new_test.json"), "w+") as wh:
        json.dump(nt, wh)

    for each in ot1.find_all("a", href=True):
        if 'class="b"' in str(each):
            
            href = each.get("href")
            name = each.text
            idd = re.search(r'\d{5}', href).group(0)
                        
            if idd in OTIDS:
                ot[domain + href][0] = name + " or " + ot[domain + href][0]
                
            else:
                ot[domain + href] = [name, idd]
                OTIDS.append(idd)
                
    for each in ot2.find_all("a", href=True):
        if 'class="b"' in str(each):
            
            href = each.get("href")
            name = each.text
            idd = re.search(r'\d{5}', href).group(0)
                        
            if idd in OTIDS:
                ot[domain + href][0] = name + " or " + ot[domain + href][0]
                
            else:
                ot[domain + href] = [name, idd]
                OTIDS.append(idd)
                
    rev_old = {value[0] : [key, value[1]] for key, value in ot.items()}
    with open(os.path.join(p, "old_test.json"), "w+") as wh:
        json.dump(rev_old, wh)

In [7]:
get_all_books_page_links(Ripper(url=domain, save_path="D:\git\catholic\drbo_org_scrap"))

In [8]:
def get_all_chapters_and_write_to_json(book_name, book_link, idd):
    """Get all chapters and write to json"""
    chapters = {"01" : book_link}
    home = Ripper(book_link)
    
    for each in home.raw_links:
        str_each = str(each)
        excludes = ["next" in str_each, "previous" in str_each, "chapter" in str_each, "statcounter" in str_each,
                    "/x/" in str_each, "DRBO.ORG" in str_each, "theologica" in str_each]
        
        if not any(excludes):
            chapters[each.text] = domain + "chapter/" + each.get("href")

    book_chaps = {book_name : chapters}

    with open("douay/chapters/" + normalize_filename(book_name) + ".json", "w+") as wh:
        json.dump(book_chaps, wh)
    return book_chaps

In [9]:
def join_chapter_text(chapter_content_list):
    chapter_content_list = [each for each in chapter_content_list if each != "\n"]
    chap_text = {}
    tracker = 0

    for each in chapter_content_list:
        each = str(each)
        if re.search(r'\/x\/d\?b=drb', each):
            tracker += 1
            verse = re.search(r'\[(\d+)\]', each).group(1)
            chap_text[tracker] = ""
        else:
            chap_text[tracker] = chap_text[tracker] + each
    return chap_text

def get_all_text_for_chapter(location):
    """Get text for single chapter"""
    chapter_output_dictionary = {}
    chapter_contents_list = []

    soup = Ripper(location, parser="html5lib").soup
    text = soup.find("table", class_="texttable")

    for each in text.find_all("p"):
        attributes = each.attrs
        if attributes:
            if "desc" in attributes["class"]:
                pass
            elif "note" in attributes["class"]:
                pass
        else:
            new_cont = each.contents
            chapter_contents_list.extend(new_cont)
    return join_chapter_text(chapter_contents_list)

def get_all_text_for_book(book_file_name):
    with open(book_file_name, "r+") as rh:
        book = json.load(rh)
    chapter_text = {}

    for name, chapters_dictionary in book.items():
        
        for chap, location in chapters_dictionary.items():
            norm = normalize_filename("{}_{}.json".format(name, chap))
            outfile = "douay/verses/{}.json".format(norm)
            if os.path.exists(outfile):
                continue
            else:
                chapter_text[name + "__" + chap] = get_all_text_for_chapter(location)
                with open(outfile, "w+") as wh:
                    json.dump(chapter_text, wh)
                chapter_text = {}

In [10]:
def make_dictionary_from_each_commentary_text(commentary_parts_list):
    """Make a dictionary from each commentary text.
    Input is a list consisting of a 3 items.
    [verse, underlined text from bible, commentary text]
    """
    verse_string = str(commentary_parts_list[0])
    header_string = str(commentary_parts_list[1])
    
    verse = re.search(r"\n\[(\d+)\]", verse_string).group(1)
    header = re.search(r'<u>"(.+)"</u>', header_string).group(1)

    commentary_text = commentary_parts_list[2].replace(": ", "")
    key = verse + "__" + header
    
    return key, commentary_text

def get_commentary_for_chapter(location):
    """Get commentary text for single chapter"""
    chapter_commentary_dictionary = {}

    soup = Ripper(location, parser="html5lib").soup
    text = soup.find("table", class_="texttable")

    for each in text.find_all("p"):
        attributes = each.attrs
        if attributes:
            if "desc" in attributes["class"]:
                pass
            elif "note" in attributes["class"]:
                new_content = each.contents
                verse_header, text = make_dictionary_from_each_commentary_text(new_content)
                chapter_commentary_dictionary[verse_header] = text
        else:
            continue
    return chapter_commentary_dictionary

def get_commentary_for_all_book_chapters(book_file_name):
    with open(book_file_name, "r+") as rh:
        book = json.load(rh)
    chapter_text = {}

    for name, chapters_dictionary in book.items():
        
        for chap, location in chapters_dictionary.items():
            norm = normalize_filename("{}_{}".format(name, chap))
            outfile = "douay/commentary/challoner/{}.json".format(norm)
            
            if os.path.exists(outfile):
                continue
            else:
                chapter_text[name + "__" + chap] = get_commentary_for_chapter(location)
                with open(outfile, "w+") as wh:
                    json.dump(chapter_text, wh)
                chapter_text = {}

## Gateway

In [11]:
p = "D:\git\catholic\drbo_data"
            
for each in [os.path.join(p, "new_test.json"),os.path.join(p, "old_test.json")]:
    with open(each, "r+") as rh:
        books = json.load(rh)

    for name, link in books.items():
        link = link[0]
        idd = link[1]
        get_all_chapters_and_write_to_json(name, link, idd)

FileNotFoundError: [Errno 2] No such file or directory: 'douay/chapters/\n_matthew\n.json'

In [None]:
all_books = glob.glob("douay/chapters/*.json")

for each in all_books:
    get_all_text_for_book(each)

In [None]:
for each in all_books:
    get_commentary_for_all_book_chapters(each)