In [1]:
# Imports
# Standard library
import os
import re
import string
import unicodedata
from urllib.parse import urljoin

# Third-party
import git
import jsonlines
import numpy as np
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
# Build `link_dicts`: one record per sutta linked from the site's mobile index.
# Each record holds the book, optional sub-book, absolute URL and link text.

# Define master url, easiest to pull from mobile index
master_url = "https://www.dhammatalks.org/suttas/index_mobile.html"
# A link is kept when its href contains one of these substrings ...
books_of_interest = ["DN", "MN", "SN", "AN", "KN"]
# ... and none of these (front/back matter such as introductions, appendices).
avoid = ["histor", "endn", "bibl", "app", "ackn", "intro", "epi", "prol", "syll"]

# Define list to hold link dicts.
# Created before the request so later cells still find the name (as an empty
# list) when the index cannot be downloaded.
link_dicts = []

# Try to connect; the timeout stops a stalled connection hanging the kernel
response = requests.get(master_url, timeout=30)
if response.status_code == 200:
    # Redefine encoding
    response.encoding = "UTF-8"
    # Parse HTML content
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all links that actually carry an href (anchors without one would
    # make the substring tests below fail on None).
    # dict.fromkeys drops duplicate tags exactly like set() would, but keeps
    # document order so the resulting list is the same on every run.
    links = list(dict.fromkeys(soup.find_all("a", href=True)))

    for link in links:
        href = link["href"]

        is_wanted_book = any(book_of_note in href for book_of_note in books_of_interest)
        is_excluded = any(avoid_these in href for avoid_these in avoid)
        if not is_wanted_book or is_excluded:
            continue

        href_split = href.split("/")
        # The book name is read from the third path component; skip hrefs that
        # are too short to have one (e.g. in-page "#..." anchors).
        if len(href_split) < 3:
            continue

        link_dicts.append(
            {
                "book": href_split[2],
                # Only hrefs with an extra directory level have a sub-book
                "sub_book": href_split[3] if len(href_split) > 4 else "None",
                "url": "https://www.dhammatalks.org" + href,
                # Normalise unicode and collapse runs of spaces in the link text
                "sutta": re.sub(" +", " ", unicodedata.normalize("NFKD", link.get_text())),
            }
        )

else:
    print("Failed to connect to Thanissaro's Webpage")



In [3]:
# Resolve the repository's top-level directory so that output paths are built
# from the repo root rather than from wherever the notebook happens to run.
git_repo = git.Repo(
    path=os.getcwd(),
    search_parent_directories=True,  # also look in parent directories for the repo
)
# `git rev-parse --show-toplevel` prints the absolute path of the working tree root
git_root = git_repo.git.rev_parse("--show-toplevel")

In [None]:
# Scrape every sutta page listed in `link_dicts`, pull out title, intro, body,
# section blocks, notes and "see also" references, and append one JSON record
# per page to data/web/thanissaro_scraped.jsonl under the repository root.
#
# NOTE: records are appended, so re-running this cell adds duplicate lines to an
# existing output file. Remove the file first for a clean re-scrape.

# Characters stripped from the front of each note paragraph (digits,
# punctuation, whitespace), which removes leading markers such as "1. ".
nonalpha = string.digits + string.punctuation + string.whitespace

# Seconds to wait for a page before treating the request as failed
REQUEST_TIMEOUT = 30
# Markers the intro/body split is made on: "* * *" or "[ I ]"
INTRO_SPLIT_RE = re.compile(r'\* \* \*|\[ I \]')
# Real heading tags only; a plain `"h" in name` test also matches hr, th, thead
HEADING_NAME_RE = re.compile(r"h[1-6]")


def _clean(text):
    """Return ``text`` NFKD-normalised with surrounding whitespace stripped."""
    return unicodedata.normalize("NFKD", text).strip()


def _links_and_text(container, page_url):
    """Return ``(urls, texts)`` for every ``<a>`` with an href inside ``container``.

    hrefs are resolved against ``page_url`` so root-relative, relative, fragment
    and already-absolute links all yield valid absolute URLs. Anchors without an
    href are skipped in both lists, which keeps them aligned.
    """
    anchors = container.find_all("a", href=True)
    urls = [urljoin(page_url, anchor["href"]) for anchor in anchors]
    texts = [_clean(anchor.get_text()) for anchor in anchors]
    return urls, texts


def _extract_see_also(sutta_div, page_url):
    """Read the last ``<p class="seealso">`` in ``sutta_div`` and remove it from the tree.

    Returns a dict with keys see_also_text, see_also_links, see_also_suttas and
    see_also_text_and_link; every value is None when the page has no such tag.
    """
    see_also_tags = sutta_div.find_all("p", {"class": "seealso"})
    if not see_also_tags:
        return {
            "see_also_text": None,
            "see_also_links": None,
            "see_also_suttas": None,
            "see_also_text_and_link": None,
        }

    # Only the final tag is used
    see_also_tag = see_also_tags[-1]
    see_also_text = _clean(see_also_tag.get_text().split("See also: ")[-1])
    see_also_links, see_also_suttas = _links_and_text(see_also_tag, page_url)

    # Destroy the tag so it does not leak into the intro/body text
    see_also_tag.decompose()

    return {
        "see_also_text": see_also_text,
        "see_also_links": see_also_links,
        "see_also_suttas": see_also_suttas,
        "see_also_text_and_link": list(zip(see_also_suttas, see_also_links)),
    }


def _extract_notes(sutta_div, page_url):
    """Read the first ``<div class="note">`` in ``sutta_div`` and remove it from the tree.

    Returns ``(note_text, note_dicts)``: the raw text of the whole note div, and
    one dict per note paragraph (a ``<p>`` whose id contains "note") holding its
    label, text and links. Elements that follow a note paragraph, up to the next
    note paragraph, are folded into that note. Returns ``(None, None)`` when the
    page has no note div.
    """
    note_div = sutta_div.find("div", {"class": "note"})
    if note_div is None:
        return None, None

    # Captured before the title is removed, so it still includes the title text
    note_text = unicodedata.normalize("NFKD", note_div.get_text().strip())

    # First destroy notetitle, we don't need it in the per-note text
    note_title = note_div.find("p", {"class": "notetitle"})
    if note_title is not None:
        note_title.decompose()

    note_paragraphs = note_div.find_all("p", {"id": re.compile(r'note')})
    # Identity lookup: cheap way to ask "is this sibling the start of another note?"
    note_paragraph_ids = {id(paragraph) for paragraph in note_paragraphs}

    note_dicts = []
    for note_number, paragraph in enumerate(note_paragraphs, start=1):
        links, links_text = _links_and_text(paragraph, page_url)
        text_parts = [_clean(paragraph.get_text()).lstrip(nonalpha)]

        # Continuation elements belong to this note until the next note starts
        for sibling in paragraph.find_next_siblings():
            if id(sibling) in note_paragraph_ids:
                break
            text_parts.append(_clean(sibling.get_text()))
            sibling_links, sibling_links_text = _links_and_text(sibling, page_url)
            links += sibling_links
            links_text += sibling_links_text

        note_dicts.append(
            {
                "note_count": "note " + str(note_number),
                "note_text": "\n\n".join(text_parts),
                "links": links,
                "links_text": links_text,
                "link_and_text": list(zip(links_text, links)),
            }
        )

    # Once done looking at all note pars, can decompose
    note_div.decompose()
    return note_text, note_dicts


def _split_intro(body_text):
    """Split ``body_text`` at the first "* * *" / "[ I ]" marker.

    Returns ``(intro_text, sutta_text)``. ``intro_text`` is None when no marker
    is present. Splitting once keeps everything after the first marker in
    ``sutta_text`` instead of discarding the sections between later markers.
    """
    parts = INTRO_SPLIT_RE.split(body_text, maxsplit=1)
    intro_text = parts[0].strip() if len(parts) > 1 else None
    return intro_text, parts[-1].strip()


def _is_section_break(element):
    """True for a heading tag (h1-h6) or an element whose first class is 'stars'."""
    if HEADING_NAME_RE.fullmatch(element.name):
        return True
    classes = element.get_attribute_list("class")
    # `classes` is [None] without a class attribute and may be [] for class=""
    return bool(classes) and classes[0] == "stars"


def _extract_blocks(sutta_div):
    """Group the text of elements following each heading/stars break.

    Returns a dict mapping the break's text to the list of cleaned texts of the
    sibling elements up to the next break. Breaks that share the same text
    (e.g. repeated "* * *" dividers) are merged rather than overwriting each other.
    """
    sutta_blocks = {}
    for break_tag in sutta_div.find_all(_is_section_break):
        values = []
        for sibling in break_tag.find_next_siblings():
            # A new heading or stars divider starts the next block
            if _is_section_break(sibling):
                break
            values.append(_clean(sibling.text))
        sutta_blocks.setdefault(break_tag.text, []).extend(values)
    return sutta_blocks


counter = 0
# List to append failed link dictionaries to, for further investigation
no_go = []

# Define write filename and path once; they do not change between pages
link_dicts_write_filename = "thanissaro_scraped.jsonl"
link_dicts_write_path = git_root + "/" + "data/web" + "/" + link_dicts_write_filename
# Make sure the target directory exists before the first record is written
os.makedirs(os.path.dirname(link_dicts_write_path), exist_ok=True)

# Now cycle through dictionary of links and append text
for link_dict in tqdm(link_dicts):
    # A network error on one page must not abort the whole run
    try:
        response = requests.get(link_dict["url"], timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        no_go.append(link_dict)
        print(f"NOPE: {error}")
        continue

    # If failed response, append dictionary to no_go for further investigation
    if response.status_code != 200:
        no_go.append(link_dict)
        print(f"NOPE: {response}")
        continue

    # Replace encoding, then parse
    response.encoding = "UTF-8"
    soup = BeautifulSoup(response.text, "html.parser")

    # Master sutta div; without it there is nothing to extract
    sutta_div_original = soup.find("div", id="sutta")
    if sutta_div_original is None:
        no_go.append(link_dict)
        print(f"NOPE: no sutta div at {link_dict['url']}")
        continue

    # We work from the outside in, destroying each section once it has been
    # read so that what remains is only intro + body. The order below matters.
    all_text = _clean(sutta_div_original.get_text())

    # Title in h1
    title_attr = sutta_div_original.find("h1")
    if title_attr is not None:
        title_text = re.sub(r'\s{2,}', ' ', _clean(title_attr.get_text()))
        # Destroy h1
        title_attr.decompose()
    else:
        title_text = None

    # See also (last "seealso" paragraph), then notes
    see_also_dict = _extract_see_also(sutta_div_original, link_dict["url"])
    note_text, note_dicts = _extract_notes(sutta_div_original, link_dict["url"])
    # TODO: need to figure out how to set the notes as columns in a DataFrame

    # Left with intro and body, no way to split them nicely other than the markers
    ongoing_text = _clean(sutta_div_original.get_text())
    intro_text, sutta_text = _split_intro(ongoing_text)

    # Also set up sutta_blocks: heading text -> list of element texts
    sutta_blocks = _extract_blocks(sutta_div_original)

    ############################################### APPENDS ############################################
    # Append everything to link dict (key order is the field order in the JSON record)
    link_dict["all_text"] = all_text
    # Title
    link_dict["title_text"] = title_text
    # Intro
    link_dict["intro_text"] = intro_text
    # Sutta
    link_dict["sutta_text"] = sutta_text
    link_dict["sutta_headings_with_text_dict"] = sutta_blocks

    # Notes
    link_dict["note_text"] = note_text
    link_dict["note_dicts"] = note_dicts
    link_dict["note_text_dict"] = {
        el_['note_count']: el_['note_text'] for el_ in note_dicts
    } if note_dicts else None

    # See also
    link_dict.update(see_also_dict)

    counter += 1

    # Write link dict; opened per record so each page is on disk as soon as it is scraped
    with jsonlines.open(link_dicts_write_path, mode='a') as writer:
        writer.write(link_dict)
print(f"Count: {counter}")

  0%|          | 0/1420 [00:00<?, ?it/s]