# Scraping the needed words from gutenberg.org

This Python script is designed for web scraping, specifically targeting a webpage on the Project Gutenberg website. It begins by fetching the HTML content of the page using the requests library. Upon successfully retrieving the content, the script employs BeautifulSoup from the bs4 library to parse the HTML. It then searches for text within bold tags, looking specifically for instances where the bold text is preceded by a '#' followed by a number, using a regular expression to identify these patterns. For each match, it pairs the matched pattern with the bold text as a (number, text) tuple and stores these tuples in a list.

The script also examines the HTML for table row (<tr>) elements, filtering them based on the presence of the terms "CLASS", "SECTION", or "DIVISION" within any <dt> elements they contain. Each filtered <tr> element is associated with the nearest preceding bold text that fits the earlier mentioned pattern. Special handling is applied to the first <tr> element in the "SECTION" and "CLASS" categories, where their texts are inserted at the beginning of the list.

Finally, the script writes the processed information to a text file named bold_words_with_numbers3.txt. For items in the list that are tuples (a matched number and its associated bold text), it writes both the number and the text to the file; texts of more than one word are split on commas or periods, and each piece is written on its own line. All other items are written directly to the file.


In [1]:
import requests
from bs4 import BeautifulSoup
import re

# get the html content

url = "https://www.gutenberg.org/files/10681/old/20040627-10681-h-body-pos.htm#1"

# Use requests to fetch the content of the URL
response = requests.get(url)

# Ensure the request was successful
if response.status_code == 200:
    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all bold tags (both 'b' and 'strong' could be used for bold text)
    bold_tags = soup.find_all(['b', 'strong'])

    # Initialize a list to hold the extracted bold words that meet the criteria
    bold_words_with_numbers = []

    # Regular expression to match a # followed by a number
    pattern = re.compile(r'#\d+')

    # Iterate through all found bold tags
    for tag in bold_tags:
        # Initialize a variable to hold the previous text element
        prev_text = None

        for sibling in tag.previous_elements:
            # Check if the sibling is a bold tag (either <b> or <strong>)
            if sibling.name in ['b', 'strong']:
                # Check if the bold tag contains non-empty text
                if sibling.text and sibling.text.strip():
                    prev_text = sibling.text.strip()
                    break  # Exit the loop once a non-empty, bold text node is found
            # Check if the sibling is a text node and has a bold parent (<b> or <strong>)
            elif sibling.string and sibling.string.strip() and sibling.parent.name in ['b', 'strong']:
                prev_text = sibling.string.strip()
                break  # Exit the loop once a non-empty, bold text node is found

        # Check if the previous text element matches the pattern
        if prev_text and pattern.search(prev_text):
            # If a match is found, add the text from the bold tag to the list
            # along with the matched pattern (the previous text element)
            # make a string of the previous text and the bold tag text

            final_text = prev_text + ' ' + tag.text

            bold_words_with_numbers.append((prev_text, tag.text))

    tr_elements = soup.find_all('tr')

    # Initialize a list to hold <tr> elements that meet the criteria
    tr_elements_with_class = []
    tr_elements_with_section = []
    tr_elements_with_division = []

    # Iterate through each <tr> element
    for tr in tr_elements:
        # Find all <dt> elements within the current <tr>
        dt_elements = tr.find_all('dt')

        # Check each <dt> for the presence of the word "CLASS"
        for dt in dt_elements:
            if 'CLASS' in dt.text:
                # If "CLASS" is found, add the <tr> to the list and break the loop
                tr_elements_with_class.append(tr)
                break  # No need to check other <dt> elements in the same <tr>

            if 'SECTION' in dt.text:
                # If "SECTION" is found, add the <tr> to the list and break the loop
                tr_elements_with_section.append(tr)
                break

            if 'DIVISION' in dt.text:
                # If "DIVISION" is found, add the <tr> to the list and break the loop
                tr_elements_with_division.append(tr)
                break

    # Find the closest bold tag that has the pattern # followed by a number for each <tr> element
    for tr in tr_elements_with_section:
        # for the first tr element add the section name to second index of the bold_words_with_numbers list
        if tr == tr_elements_with_section[0]:
            tr_text = ' '.join(tr.stripped_strings)
            tr_text_cleaned = re.sub(r'\s+', ' ', tr_text)
            bold_words_with_numbers.insert(0, tr_text_cleaned)
            continue
        # Find the closest bold tag that has the pattern # followed by a number
        bold_tag = tr.find_previous(['b', 'strong'], string=pattern)
        # If a matching bold tag is found, add the section name to the list after the bold word
        if bold_tag:
            tr_text = ' '.join(tr.stripped_strings)
            tr_text_cleaned = re.sub(r'\s+', ' ', tr_text)
            # find the index of the bold word in the bold_words_with_numbers list
            index = [i for i, v in enumerate(bold_words_with_numbers) if v[0] == bold_tag.text]
            # insert the tr text cleaned to the bold_words_with_numbers list in the number index
            bold_words_with_numbers.insert(int(index[0]) + 1, tr_text_cleaned)

    for tr in tr_elements_with_division:
        # Find the closest bold tag that has the pattern # followed by a number
        bold_tag = tr.find_previous(['b', 'strong'], string=pattern)

        # If a matching bold tag is found, add the division name to the list after the bold word
        if bold_tag:
            tr_text = ' '.join(tr.stripped_strings)
            tr_text_cleaned = re.sub(r'\s+', ' ', tr_text)
            # find the index of the bold word in the bold_words_with_numbers list
            index = [i for i, v in enumerate(bold_words_with_numbers) if v[0] == bold_tag.text]
            # insert the tr text cleaned to the bold_words_with_numbers list in the number index
            bold_words_with_numbers.insert(int(index[0]) + 1, tr_text_cleaned)

    # Find the closest bold tag that has the pattern # followed by a number for each <tr> element
    for tr in tr_elements_with_class:
        # for the first tr element skip go to the next tr element
        if tr == tr_elements_with_class[0]:
            tr_text = ' '.join(tr.stripped_strings)
            tr_text_cleaned = re.sub(r'\s+', ' ', tr_text)
            bold_words_with_numbers.insert(0, tr_text_cleaned)
            continue

        # Find the closest bold tag that has the pattern # followed by a number
        bold_tag = tr.find_previous(['b', 'strong'], string=pattern)

        # If a matching bold tag is found, add the class name to the list after the bold word
        if bold_tag:
            tr_text = ' '.join(tr.stripped_strings)
            tr_text_cleaned = re.sub(r'\s+', ' ', tr_text)
            # find the index of the bold word in the bold_words_with_numbers list
            index = [i for i, v in enumerate(bold_words_with_numbers) if v[0] == bold_tag.text]
            # insert the tr text cleaned to the bold_words_with_numbers list in the number index
            bold_words_with_numbers.insert(int(index[0]) + 1, tr_text_cleaned)

    # Write the extracted bold words with numbers to a file
    with open('bold_words_with_numbers3.txt', 'w', encoding='utf-8') as file:
        for item in bold_words_with_numbers:
            # Check if the item is a tuple, indicating it's in the special format
            if isinstance(item, tuple):
                # Unpack the tuple into number and word
                number, word = item
                # if word has 2 words, split by comma or . split the word and write the number and word to the file in seperate lines
                if len(word.split()) > 1:
                    if ',' in word:
                        words = word.split(',')
                        for word in words:
                            # remove any special characters
                            word = re.sub(r'[^\w\s]', '', word)
                            file.write(f"{number} {word.strip()}\n")
                    elif '.' in word:
                        words = word.split('.')
                        for word in words:
                            word = re.sub(r'[^\w\s]', '', word)
                            file.write(f"{number} {word.strip()}\n")
                    else:
                        file.write(f"{number} {word}\n")
                else:
                    # Write the number and word to the file
                    word = re.sub(r'[^\w\s]', '', word)
                    file.write(f"{number} {word}\n")

            else:
                # For any other format, write the item as it is
                file.write(str(item) + '\n')

else:
    print("Failed to retrieve the webpage.")