In [1]:
# Imports
from bs4 import BeautifulSoup

import git
import os

import requests
import unicodedata

import re

import jsonlines

In [7]:
# Define master url, easiest to pull from mobile index
master_url = "https://www.dhammatalks.org/suttas/index_mobile.html"
books_of_interest = ["DN", "MN", "SN", "AN", "KN"]
avoid = ["histor", "endn", "bibl", "app", "ackn", "intro", "epi", "prol", "syll"]

# Try to connect
response = requests.get(master_url)
if response.status_code == 200:
    # Redefine encoding
    response.encoding = "UTF-8"
    # Parse HTML content
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all links
    links = soup.find_all("a")
    # Define list to hold link dicts
    link_dicts = []

    for link in links:
        href = link.get('href')

        if any(book_of_note in href for book_of_note in books_of_interest) and not any(avoid_these in href for avoid_these in avoid):
            href_split = link.get("href").split("/")
            link_dicts.append(
                {   "book": href_split[2],
                    "sub_book": href_split[3] if len(href_split)> 4 else "None",
                    "url": "https://www.dhammatalks.org" + link.get("href"),
                    "sutta": re.sub(" +", " ", unicodedata.normalize("NFKD", link.get_text()))#.replace("Â", ""))
                }
            )
    
else:
    print("Failed to connect to Thanissaro's Webpage")


In [8]:
# Get absolute file path
git_repo = git.Repo(os.getcwd(), search_parent_directories=True)
git_root = git_repo.git.rev_parse("--show-toplevel")

In [None]:
import string
nonalpha = string.digits + string.punctuation + string.whitespace

# Now cycle through dictionary of links and append text
# List to append failed request.get dictionaries to
no_go = []

for link_dict in link_dicts:
    response = requests.get(link_dict["url"]) 
    # If failed response, append dictionary to no_go for further investigation
    if response.status_code != 200:
        no_go.append(link_dict)
        print(f"NOPE: {response}")
    # Successful response means pull out all text
    else:
        print(link_dict["url"])
        # Replace encoding
        response.encoding = "UTF-8"
        # Scrape
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Define various fields of interest
        # We will work from the outside in, slowly destroying as we go.
        # Master sutta div
        sutta_div_original = soup.find("div", id="sutta") ###****###
        all_text = unicodedata.normalize("NFKD", sutta_div_original.get_text()).strip() if sutta_div_original else None ###****###
        
        # Title in h1
        title_attr = sutta_div_original.find("h1") ###****###
        title_text = re.sub('\s{2,}', ' ', unicodedata.normalize("NFKD", title_attr.get_text()).strip()) if title_attr else None ###****###
        # Destroy h1
        if title_attr:
            title_attr.decompose()

        # Now go to See also
        see_also_tag_original = sutta_div_original.find_all("p", {"class": "seealso"})
        # If a list of see also tag exists
        if see_also_tag_original:
            ## Set to the final tag, the footnotes
            see_also_tag_original = see_also_tag_original[-1] ###****###
            ## Pull out text
            see_also_text = unicodedata.normalize("NFKD", see_also_tag_original.get_text().split("See also: ")[-1]).strip() ###****###

            # Extract list of associated suttas
            see_also_a_tags = see_also_tag_original.find_all("a") ###****###
            see_also_links = ["https://www.dhammatalks.org" + link.get('href') for link in see_also_a_tags] ###****###
            see_also_suttas = [unicodedata.normalize("NFKD", link.get_text()).strip() for link in see_also_a_tags] ###****###
            see_also_text_and_link = [ (text_, link_) for text_, link_ in zip(see_also_suttas, see_also_links) ]

            # Destroy see_also_tag
            see_also_tag_original.decompose()
        # If no list exists, just set everything to None, and no need to decompose see_also_tag
        else:
            see_also_tag_original = None 
            see_also_text = None ###****###
            see_also_a_tags = None 
            see_also_links = None ###****###
            see_also_suttas = None ###****###
            see_also_text_and_link = None ###****###
        
        see_also_dict = {
            # "see_also_tag":str(see_also_tag),
            "see_also_text":see_also_text,
            # "see_also_a_tags":str(see_also_a_tags),
            "see_also_links":see_also_links,
            "see_also_suttas":see_also_suttas,
            "see_also_text_and_link": see_also_text_and_link
        }

        # Now go to note, second from bottom if it exists
        note_tag_original = sutta_div_original.find("div", {"class": "note"})

        # If note list exists, set to final of list
        if note_tag_original:
            note_text = unicodedata.normalize("NFKD", note_tag_original.get_text().strip()) ###****###

            # Extract all paragraphs
            # First destroy notetitle, we don't need it
            note_title = note_tag_original.find("p", {"class":"notetitle"})
            if note_title:
                note_title.decompose()
            # Extract all paragraphs
            note_paragraph_tags = note_tag_original.find_all("p", {"id":re.compile(r'note')})
            # Set note_counter to create dictionary of notes with associated text and links and associated resources
            note_dicts = [] ###****###
            note_counter = 0

            if note_paragraph_tags:
                # Go through each paragraph tag
                for par_ in note_paragraph_tags:
                    # Increment note_counter
                    note_counter += 1
                    # Find any links
                    par_a_tags = par_.find_all("a")

                    # Set up dict to store
                    temp_note_par_dict = {
                        "note_count":"note " + str(note_counter), # Set note counter
                        "note_text":unicodedata.normalize("NFKD", par_.get_text()).strip().lstrip(nonalpha),
                        "links":["https://www.dhammatalks.org" + link.get("href") for link in par_a_tags] if par_a_tags else [],
                        "links_text":[unicodedata.normalize("NFKD", link.get_text()).strip() for link in par_a_tags] if par_a_tags else [],
                    }

                    # Define all next siblings
                    for sibling in par_.find_next_siblings():
                        # If sibling is another paragraph tag, break
                        if sibling in note_paragraph_tags:
                            break
                        
                        # If no break, append text
                        temp_note_par_dict["note_text"] = temp_note_par_dict["note_text"] + f"""\n\n{unicodedata.normalize("NFKD", sibling.get_text()).strip()}"""
                        # Check if any links
                        sib_a_tags = sibling.find_all("a")
                        if sib_a_tags:
                            temp_note_par_dict["links"] = temp_note_par_dict["links"] + ["https://www.dhammatalks.org" + link.get("href") for link in sib_a_tags]
                            # Add the link text to the dictionary
                            temp_note_par_dict["links_text"] = temp_note_par_dict["links_text"] + [unicodedata.normalize("NFKD", link.get_text()).strip() for link in sib_a_tags]
                    # Now you've gone through all siblings, so append to list
                    # Create final link, text zip
                    temp_note_par_dict["link_and_text"] = [(text_, link_) for  text_, link_ in zip(temp_note_par_dict["links_text"], temp_note_par_dict["links"])    ] if temp_note_par_dict["links_text"] else []
                    note_dicts.append(temp_note_par_dict)

            ## Once done looking at all note pars, can decompose
            note_tag_original.decompose()
        else:
            note_text = None
            note_dicts = None
            ################################## NEED TO FIGURE OUT HOW TO SET AS COLUMNS IN DF

        # Left with intro and body, no way to split them nicely
        
        # Update ongoing text to drop title
        ongoing_text = unicodedata.normalize("NFKD", sutta_div_original.get_text()).strip() if sutta_div_original else None
        
        # Need to split on * * * or [ I ]
        split_on_intro = re.split(
            r'\* \* \*|\[ I \]'
            , ongoing_text
        )
        # For intro, pull out prior to split
        intro_text = split_on_intro[0].strip() if len(split_on_intro) > 1 else None ###****###
       
        # Body text, pull out after split
        sutta_text = split_on_intro[-1].strip() ###****###

        # Also set up sutta_blocks
        break_tags = sutta_div_original.find_all(
            lambda el: ("h" in el.name) | (el.get_attribute_list('class')[0] == 'stars') 
            )
        # Set up sutta_blocks to store.
        # Key with be heading text, values will be list of elements text
        sutta_blocks = {}
        # Loop through each break
        for break_ in break_tags:
            # Initialize list of contained elements
            values = []
            # Loop through list of elements after the break tag
            for sibling in break_.find_next_siblings():
                # If element is a header or stars, it means we want to add a new key to our dict, so move on to next break_
                if ("h" in sibling.name) or (sibling.get_attribute_list("class")[0] == "stars"):
                    break
                # Otherwise, append the element to the list of elements for the current break_
                values.append(unicodedata.normalize("NFKD", sibling.text).strip())
            # Append result to master_dict
            sutta_blocks[break_.text] = values

        ############################################### APPENDS ############################################
        # Append everything to link dict
        # link_dict["sutta_div_original"] = str(sutta_div_original)
        link_dict["all_text"] = all_text
        # link_dict["sutta_div_original"] = str(sutta_div_original)
        # Title
        # link_dict["title_attr"] = str(title_attr)
        link_dict["title_text"] = title_text
        # Intro
        link_dict["intro_text"] = intro_text
        # Sutta 
        link_dict["sutta_text"] = sutta_text
        link_dict["sutta_headings_with_text_dict"] = sutta_blocks 

        # Notes
        # link_dict["note_div"] = str(note_div)
        link_dict["note_text"] = note_text
        link_dict["note_dicts"] = note_dicts

        link_dict["note_text_dict"] = {
            el_['note_count']:el_['note_text'] for el_ in note_dicts
        } if note_dicts else None
        
        # See also
        for key_, value_ in see_also_dict.items():
            link_dict[key_] = value_

        # Write link dict
        # Define write filename
        link_dicts_write_filename = "thanissaro_scraped.jsonl"
        # Define write filepath
        link_dicts_write_path = git_root + "/" + "data/web" + "/" + link_dicts_write_filename

        with jsonlines.open(link_dicts_write_path, mode='a') as writer:
            writer.write(link_dict) 
        

SyntaxError: invalid syntax (1736741327.py, line 59)

In [6]:
link_dicts[-2]


{'book': 'KN',
 'sub_book': 'Thig',
 'url': 'https://www.dhammatalks.org/suttas/KN/Thig/thig13_5.html',
 'sutta': 'Thig 13:5',
 'all_text': 'Thig 13:5  Subhā the Goldsmith’s Daughter\n\n“I was a child, with clean clothes,\nwhen I first heard the Dhamma.\nAnd within me, heedful,\nwas a break-through to the truth.\nThen I arrived\nat an enormous dissatisfaction\nwith all sensuality.\nSeeing the danger\nin self-identity,\nI longed only\nfor renunciation.\nLeaving my circle of relatives,\nslaves, workers,\nprosperous villages & fields,\ndelightful, enticing possessions,\nI went forth,\nabandoning not-insignificant wealth.\n\nHaving gone out through conviction\nin the well-taught true Dhamma,\nit wouldn’t be proper for me—\naspiring to nothingness—\nhaving cast off gold & silver\nto take them back.\nGold & silver\ndon’t buy awakening,\ndon’t buy peace.\nThis [gold] isn’t proper for contemplatives.\nThis isn’t noble wealth.\nThis is\ngreediness, intoxication,\ndelusion, bondage to dust,\nsu

In [9]:
soup1 = BeautifulSoup(requests.get("https://www.dhammatalks.org/suttas/DN/DN01.html").text, "html.parser")

response2 = requests.get("https://www.dhammatalks.org/suttas/KN/Thig/thig14.html")
response2.encoding = "UTF-8"
soup2 = BeautifulSoup(response2.text, "html.parser")


In [None]:
sutta_div_original = soup1.find("div", id="sutta") ###****###
note_paragraph_tags = sutta_div_original.find_all("p", {"id":re.compile(r'note')})
note_paragraph_tags
# note_tag_original = sutta_div_original.find("div", {"class": "note"})

# # If note list exists, set to final of list
# if note_tag_original:
#     note_text = unicodedata.normalize("NFKD", note_tag_original.get_text().strip()) ###****###

#     # Extract all paragraphs
#     # First destroy notetitle, we don't need it
#     note_title = note_tag_original.find("p", {"class":"notetitle"})
#     print(f"note title: {note_title}")
#     note_title.decompose()
#     # Extract all paragraphs
#     note_paragraph_tags = note_tag_original.find_all("p")
# sutta1 = soup1.find('div', id='sutta')
# sutta = copy.copy(sutta1)
# sutta.h1.decompose()
# see_also_tag_original = sutta.find_all("p", {"class": "seealso"})
# # if see_also_tag_original:
# see_also_tag = see_also_tag_original[-1] 
# #     see_also_tag.decompose()
# see_also_tag.decompose()

[<p id="dn1note01">1. In many suttasâsuch as <a href="/suttas/DN/DN09.html">DN 9</a>, <a href="/suttas/MN/MN63.html">MN 63</a>, <a href="/suttas/MN/MN72.html">MN 72</a>, and <a href="/suttas/SN/SN44_7.html">SN 44:7â8</a>âthe Buddha and his disciples refuse to take a stand on whether the cosmos is eternal or not. See <a href="/books/SkillInQuestions/Section0013.html"><em>Skill in Questions,</em> Chapter 8</a>, for a discussion of the reasons for their refusal. As a general principle, the Buddha warns against speculation about the cosmos, saying in <a href="/suttas/AN/AN4_77.html">AN 4:77</a> that itâs one of four types of conjecture that can lead to madness. What he <em>does</em> say about how long there has been a cosmos (see, for instance, <a href="/suttas/SN/SN15_3.html">SN 15:3</a>) is that transmigration comes from an inconceivable beginning. As for the length of time the cosmos will last, in <a href="/suttas/SN/SN12_44.html">SN 12:44</a> he teaches the path to the end of t

In [225]:
note_paragraph_tags[0].find_all("a") + note_paragraph_tags[1].find_all("a")

[<a href="/suttas/DN/DN09.html">DN 9</a>,
 <a href="/suttas/MN/MN63.html">MN 63</a>,
 <a href="/suttas/MN/MN72.html">MN 72</a>,
 <a href="/suttas/SN/SN44_7.html">SN 44:7â8</a>,
 <a href="/books/SkillInQuestions/Section0013.html"><em>Skill in Questions,</em> Chapter 8</a>,
 <a href="/suttas/AN/AN4_77.html">AN 4:77</a>,
 <a href="/suttas/SN/SN15_3.html">SN 15:3</a>,
 <a href="/suttas/SN/SN12_44.html">SN 12:44</a>,
 <a href="/suttas/AN/AN10_95.html">AN 10:95</a>,
 <a href="/suttas/DN/DN02.html">DN 2</a>]

In [156]:
note_tag_original

<div class="note">
<!--end notetitle-->
<p id="dn1note01">1. In many suttasâsuch as <a href="/suttas/DN/DN09.html">DN 9</a>, <a href="/suttas/MN/MN63.html">MN 63</a>, <a href="/suttas/MN/MN72.html">MN 72</a>, and <a href="/suttas/SN/SN44_7.html">SN 44:7â8</a>âthe Buddha and his disciples refuse to take a stand on whether the cosmos is eternal or not. See <a href="/books/SkillInQuestions/Section0013.html"><em>Skill in Questions,</em> Chapter 8</a>, for a discussion of the reasons for their refusal. As a general principle, the Buddha warns against speculation about the cosmos, saying in <a href="/suttas/AN/AN4_77.html">AN 4:77</a> that itâs one of four types of conjecture that can lead to madness. What he <em>does</em> say about how long there has been a cosmos (see, for instance, <a href="/suttas/SN/SN15_3.html">SN 15:3</a>) is that transmigration comes from an inconceivable beginning. As for the length of time the cosmos will last, in <a href="/suttas/SN/SN12_44.html">SN 12:44<

In [90]:
note_tag_original = sutta.find("div", {'class': 'note'})
note_tag_original.decompose()

In [91]:
sutta

<div id="sutta">
<div id="DN01">

<h2 class="intro" id="toc_1">Introduction</h2>
<p><em>This suttaâthe first of the entire Sutta Piá¹­akaâintroduces the Buddha as a practitioner and as a teacher. Because its portrait focuses on the Dhamma qualities that he exemplifies, it acts as an introduction to the Dhamma he teaches as well.</em></p>
<p><em>The portrait falls into four sections, each presenting an aspect of the Buddhaâs accomplishments:</em></p>
<div class="iblock">
<p><em>his attitude toward praise and criticism,</em></p>
<p><em>his virtue,</em></p>
<p><em>his discernment,</em></p>
<p><em>his release.</em></p>
</div>
<p><em><strong>Praise &amp; criticism.</strong> In the first section of the sutta, the Buddha meets with the monks after a day and night in which he and the monks have had to listen to two wanderers of other sects arguing as to whether the Buddha should be criticized or praised. He counsels the monks not to let their minds be affected by such discussions, his re

In [63]:


note_tag = sutta.find_all(lambda el: ("h" in el.name) | (el.get_attribute_list('class')[0] == 'stars') )
sutta_blocks = {}
for heading in note_tag:
    values = []
    for sibling in heading.find_next_siblings():
        if ("h" in sibling.name) or (sibling.get_attribute_list("class")[0] == "stars"):
            break
        values.append(unicodedata.normalize("NFKD", sibling.text).strip())
    sutta_blocks[heading.text] = values

In [64]:
sutta_blocks

{'Introduction': ['This suttaâ\x80\x94the first of the entire Sutta Piá1\xadakaâ\x80\x94introduces the Buddha as a practitioner and as a teacher. Because its portrait focuses on the Dhamma qualities that he exemplifies, it acts as an introduction to the Dhamma he teaches as well.',
  'The portrait falls into four sections, each presenting an aspect of the Buddhaâ\x80\x99s accomplishments:',
  'his attitude toward praise and criticism,\nhis virtue,\nhis discernment,\nhis release.',
  'Praise & criticism. In the first section of the sutta, the Buddha meets with the monks after a day and night in which he and the monks have had to listen to two wanderers of other sects arguing as to whether the Buddha should be criticized or praised. He counsels the monks not to let their minds be affected by such discussions, his reasoning being that only if the mind is unaffected can it see clearly what is true or false in the words of criticism or praiseâ\x80\x94and only then can it respond approp

[]

In [264]:
d

{'a': None, 'b': [1, 2]}