In [None]:
import copy
import re
import unicodedata
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Define master url, easiest to pull from mobile index
master_url = "https://www.dhammatalks.org/suttas/index_mobile.html"
books_of_interest = ["DN", "MN", "SN", "AN", "KN"]
# Define list to hold link dicts up front, so later cells still find an
# (empty) list instead of raising NameError when the index request fails
link_dicts = []
# Try to connect
response = requests.get(master_url)
if response.status_code == 200:
    # Redefine encoding
    response.encoding = "UTF-8"
    # Parse HTML content
    soup = BeautifulSoup(response.text, "lxml")

    # Find all links
    links = soup.find_all("a")

    for link in links:
        href = link.get('href')
        # <a> tags without an href come back as None and cannot be searched
        if not href:
            continue
        # Keep only links that mention one of the books we care about
        if not any(book_of_note in href for book_of_note in books_of_interest):
            continue

        href_split = href.split("/")
        # Index 2 is read as the book below, so skip hrefs that are too short
        # (assumes hrefs look like "/suttas/<book>/..." - TODO confirm)
        if len(href_split) < 3:
            continue

        link_dicts.append(
            {
                "book": href_split[2],
                # The string "None" (not the None object) marks a missing sub-book
                "sub_book": href_split[3] if len(href_split) > 4 else "None",
                # urljoin resolves the href against the index page, which avoids
                # the doubled slash that plain concatenation produces
                "url": urljoin(master_url, href),
                # Normalize unicode, then collapse runs of spaces
                "sutta": re.sub(" +", " ", unicodedata.normalize("NFKD", link.get_text())),
            }
        )

else:
    print("Failed to connect to Thanissaro's Webpage")


In [None]:
# Pull the entire sutta div in one pass so that each page only has to be requested once



In [None]:
# Now cycle through dictionary of links and append text
# List to append failed request.get dictionaries to
no_go = []
# One dict of extracted fields per successfully scraped sutta page
sutta_records = []

# Seconds to wait for a single page before giving up on it
REQUEST_TIMEOUT = 30
# Tag names treated as section headings inside the sutta div
HEADING_NAME = re.compile('^h[1-9]$')


def _clean(text):
    """Return ``text`` NFKD-normalized with surrounding whitespace stripped."""
    return unicodedata.normalize("NFKD", text).strip()


def _finish_note(note):
    """Return a copy of ``note`` in which every empty field is replaced by None."""
    return {key_: (value_ or None) for key_, value_ in note.items()}


for link_dict in link_dicts:
    page_url = link_dict["url"]

    # A network error on one page should not abort the whole crawl
    try:
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        no_go.append(link_dict)
        continue

    # If failed response, append dictionary to no_go for further investigation
    if response.status_code != 200:
        no_go.append(link_dict)
        continue

    # Replace encoding
    response.encoding = "UTF-8"
    # Scrape
    soup = BeautifulSoup(response.text, "html.parser")

    # Define various fields of interest
    # We will work from the outside in, slowly destroying as we go.
    # Master sutta div
    sutta_div_original = soup.find("div", id="sutta")
    if sutta_div_original is None:
        # Page has no sutta div; keep it for further investigation
        no_go.append(link_dict)
        continue
    all_text = _clean(sutta_div_original.get_text())
    # Make copy to work through, so `soup` itself stays intact
    sutta_div = copy.copy(sutta_div_original)

    # Title in h1
    title_tag_in_div = sutta_div.h1
    title_attr = copy.copy(title_tag_in_div) if title_tag_in_div is not None else None
    title_text = _clean(title_attr.get_text()) if title_attr is not None else None
    # Destroy h1
    if title_tag_in_div is not None:
        title_tag_in_div.decompose()

    # Now go to See also
    see_also_tag_original = sutta_div.find_all("p", {"class": "seealso"})
    # If a list of see also tag exists
    if see_also_tag_original:
        ## Set to the final p tag, the footnotes
        see_also_tag = copy.copy(see_also_tag_original[-1])
        ## Pull out text
        see_also_text = _clean(see_also_tag.get_text().split("See also: ")[-1])

        # Extract list of associated suttas; href=True skips anchors without a
        # target so the link and text lists stay aligned
        see_also_a_tags = see_also_tag.find_all("a", href=True)
        see_also_links = [urljoin(page_url, a_tag.get('href')) for a_tag in see_also_a_tags]
        see_also_suttas = [a_tag.get_text().strip() for a_tag in see_also_a_tags]

        # Destroy the non-copy tag in the sutta_div we're working through.
        # find_all returns a list, so decompose the element, not the list.
        see_also_tag_original[-1].decompose()
    # If no list exists, just set everything to None, and no need to decompose
    else:
        see_also_tag = None
        see_also_text = None
        see_also_a_tags = None
        see_also_links = None
        see_also_suttas = None

    # Now go to notetitle
    note_tag_original = sutta_div.find("div", id="notetitle")

    if note_tag_original is not None:
        note_tag = copy.copy(note_tag_original)
        note_tag_text = _clean(note_tag.get_text())

        # First destroy notetitle (if the div has one)
        notetitle_par = note_tag.find("p", {"class": "notetitle"})
        if notetitle_par is not None:
            notetitle_par.decompose()

        # List of notes, each a dict with its text, links and link text
        note_dicts = []
        note_counter = 0
        # The note currently being filled; None until the first paragraph
        dict_to_store = None

        # Iterate through paragraph tags
        for par_ in note_tag.find_all("p"):
            par_id = par_.attrs.get("id") or ""

            # A paragraph whose id contains "note" starts a new note. A leading
            # paragraph without such an id also opens one so its text is kept.
            if "note" in par_id or dict_to_store is None:
                # Store the note we were filling before starting the next
                if dict_to_store is not None:
                    note_dicts.append(_finish_note(dict_to_store))

                note_counter += 1
                dict_to_store = {
                    "note_count": "note " + str(note_counter),
                    "note_id": par_.attrs.get('id'),
                    "links_tag": [],
                    "links": [],
                    "links_text": [],
                    "note_text": '',
                }

            # Append paragraph text to existing field
            dict_to_store['note_text'] += _clean(par_.get_text()) + "\n\n"
            # Find all a tags with a target in the paragraph
            par_a_tags = par_.find_all("a", href=True)
            if par_a_tags:
                dict_to_store['links_tag'] += par_a_tags
                dict_to_store["links"] += [urljoin(page_url, a_tag.get("href")) for a_tag in par_a_tags]
                dict_to_store["links_text"] += [a_tag.get_text() for a_tag in par_a_tags]

        # The final note is never followed by another note paragraph, so store it here
        if dict_to_store is not None:
            note_dicts.append(_finish_note(dict_to_store))

        ## Once done looking at all note pars, can decompose
        note_tag_original.decompose()
    else:
        # Reset explicitly so values from the previous page do not leak through
        note_tag = None
        note_tag_text = None
        note_dicts = None

    # Left with intro and body, no way to split them nicely

    # Update ongoing text: title, see-also and notes have been removed by now
    ongoing_text = _clean(sutta_div.get_text())

    # Need to split on * * * or [ I ]; split once so that later separators
    # stay inside the body instead of discarding the sections between them
    split_on_intro = re.split(r'\* \* \*|\[ I \]', ongoing_text, maxsplit=1)
    if len(split_on_intro) > 1:
        # For intro, pull out prior to split
        intro_text = split_on_intro[0].strip()
        # Body text, pull out after split
        sutta_text = split_on_intro[1].strip()
    else:
        # No separator found: the whole text is body and there is no intro
        intro_text = None
        sutta_text = ongoing_text

    # Also set up sections
    blocks = {}
    # Treat each h tag as a separator: a section is every sibling that follows
    # a heading, up to (not including) the next heading
    for heading_ in sutta_div.find_all(HEADING_NAME, {"id": re.compile(r'toc')}):
        section_texts = []
        for sibling_ in heading_.find_next_siblings():
            if HEADING_NAME.match(sibling_.name):
                break
            section_texts.append(_clean(sibling_.get_text()))
        blocks[_clean(heading_.get_text())] = section_texts

    ## Notes, should be able to just find note class (read from the untouched soup)
    note_attr = soup.find("div", {"class": "note"})
    note_text = _clean(note_attr.get_text()) if note_attr is not None else None

    ## To extract See also footnotes, need to pull out all seealsos and get last one, since footnotes
    see_also_pars = soup.find_all("p", {"class": "seealso"})
    see_also_attr = see_also_pars[-1].get_text() if see_also_pars else None

    # Keep everything extracted for this page; a list of dicts can be turned
    # into a DataFrame in one step later
    sutta_records.append(
        {
            **link_dict,
            "title": title_text,
            "all_text": all_text,
            "intro_text": intro_text,
            "sutta_text": sutta_text,
            "sections": blocks,
            "see_also_text": see_also_text,
            "see_also_links": see_also_links,
            "see_also_suttas": see_also_suttas,
            "note_tag_text": note_tag_text,
            "notes": note_dicts,
        }
    )
        
        
        

[{'book': 'DN',
  'sub_book': 'None',
  'href': 'https://www.dhammatalks.org/s/suttas/DN/DN01.html',
  'text': 'DN 1'},
 {'book': 'DN',
  'sub_book': 'None',
  'href': 'https://www.dhammatalks.org/s/suttas/DN/DN02.html',
  'text': 'DN 2'},
 {'book': 'DN',
  'sub_book': 'None',
  'href': 'https://www.dhammatalks.org/s/suttas/DN/DN09.html',
  'text': 'DN 9'},
 {'book': 'DN',
  'sub_book': 'None',
  'href': 'https://www.dhammatalks.org/s/suttas/DN/DN11.html',
  'text': 'DN 11'},
 {'book': 'DN',
  'sub_book': 'None',
  'href': 'https://www.dhammatalks.org/s/suttas/DN/DN12.html',
  'text': 'DN 12'},
 {'book': 'DN',
  'sub_book': 'None',
  'href': 'https://www.dhammatalks.org/s/suttas/DN/DN15.html',
  'text': 'DN 15'},
 {'book': 'DN',
  'sub_book': 'None',
  'href': 'https://www.dhammatalks.org/s/suttas/DN/DN16.html',
  'text': 'DN 16'},
 {'book': 'DN',
  'sub_book': 'None',
  'href': 'https://www.dhammatalks.org/s/suttas/DN/DN20.html',
  'text': 'DN 20'},
 {'book': 'DN',
  'sub_book': 'None

In [276]:
# Fetch two sample pages to experiment with. Both responses are forced to
# UTF-8 before parsing, matching how the other cells handle encoding.
response1 = requests.get("https://www.dhammatalks.org/suttas/DN/DN02.html")
response1.encoding = "UTF-8"
soup1 = BeautifulSoup(response1.text, "html.parser")

response2 = requests.get("https://www.dhammatalks.org/suttas/KN/Iti/iti2.html")
response2.encoding = "UTF-8"
soup2 = BeautifulSoup(response2.text, "html.parser")


In [309]:
# Split the sutta div into sections keyed by the text of each section marker
sutta = soup1.select('div#sutta')[0]


def _is_section_marker(el):
    """Return True for heading tags (h1, h2, ...) and elements with the ``stars`` class."""
    # Match the whole tag name, so tags that merely contain an "h" (th, thead, ...)
    # are not mistaken for headings
    if re.fullmatch(r"h[1-9]", el.name):
        return True
    # bs4 returns ``class`` as a list of names, so test membership rather than equality
    return "stars" in (el.get("class") or [])


# NOTE: despite its name, `note_tag` holds the section markers, not notes
note_tag = sutta.find_all(_is_section_marker)
blocks = {}
for heading in note_tag:
    values = []
    # Collect every following sibling up to the next section marker
    for sibling in heading.find_next_siblings():
        if _is_section_marker(sibling):
            break
        values.append(sibling.text)
    # Markers can share the same text; extend instead of overwriting so no
    # section is silently lost
    blocks.setdefault(heading.text, []).extend(values)

In [307]:
# Small self-contained document for experimenting with bs4 filters
html = """
<div id="my-div">
    <p class="my-class">Paragraph 1</p>
    <p class="my-class">Paragraph 2</p>
    <span id="my-span">Span</span>
</div>
"""

soup = BeautifulSoup(html, "html.parser")

# Find elements by ID or Class
# An ``attrs`` dict requires every entry to match at once, so "either/or"
# needs a function filter. ``class`` is a list in bs4, hence the membership test.
elements = soup.find_all(
    lambda tag: tag.get("id") == "my-div" or "my-class" in (tag.get("class") or [])
)

# The first <p> has no id attribute, so this evaluates to None
soup.find('p').attrs.get('id')

[]

In [264]:
# NOTE(review): `d` is not assigned in any visible cell, so this presumably relies on
# leftover kernel state and would fail after Restart & Run All - confirm or remove.
d

{'a': None, 'b': [1, 2]}