In [6]:
from glob import glob
from bs4 import BeautifulSoup, Comment, NavigableString
import lxml
import re

In [7]:
# Collect the saved pages from the working directory.
# glob() returns names in whatever order the OS lists them, so sort explicitly:
# everything downstream (parsed_html, raw_html, nodes) inherits this order and
# should be reproducible across machines and re-runs.
files = sorted(glob("*.htm"))
files

['page_1.htm',
 'page_10.htm',
 'page_11.htm',
 'page_12.htm',
 'page_2.htm',
 'page_3.htm',
 'page_4.htm',
 'page_5.htm',
 'page_6.htm',
 'page_7.htm',
 'page_8.htm',
 'page_9.htm']

In [8]:
# Load every page once, keeping both the raw markup and a parsed tree built
# from that same text. parsed_html[k] and raw_html[k] both belong to files[k].
parsed_html = []
raw_html = []
for file in files:
    # Decode explicitly instead of relying on the platform default encoding,
    # which is not UTF-8 everywhere and would garble non-ASCII text.
    # NOTE(review): assumes the saved pages are UTF-8 encoded - confirm.
    with open(file, "r", encoding="utf-8") as f:
        markup = f.read()
    raw_html.append(markup)
    soup = BeautifulSoup(markup, 'lxml')
    parsed_html.append(soup)

In [10]:
# Build one "node" dict per parsed page:
#   title          - text of <title> with its trailing suffix removed
#   links          - one dict per internal article link, in document order
#   page_length    - length of the trimmed content markup
#   lead_paragraph - concatenated text of the <p> tags that precede the first heading
#
# Each link dict holds: title, target_section, text, source_section,
# link_start_index, link_end_index and, when the enclosing sentence could be
# located, sentence, sentence_start_index, sentence_end_index. All indexes are
# offsets into the trimmed content markup (str(trimmed_content)).

# Number of trailing characters cut from each <title>; 12 == len(" - Wikipedia"),
# which is presumably the suffix being removed (confirm against the saved pages).
TITLE_SUFFIX_LENGTH = 12
# Prefix of article links; the link title is whatever follows it in the href.
WIKI_LINK_PREFIX = "/wiki/"

nodes = []
for html in parsed_html:
    node = {}
    node["title"] = html.find("title").text[:-TITLE_SUFFIX_LENGTH]
    node["links"] = []
    # find the div with class mw-parser-output
    content = html.find("div", {"id": "mw-content-text"})
    content = content.find("div", {"class": "mw-parser-output"})
    # NOTE: this is the same object as the parsed tree, so the removals below
    # modify the entries of parsed_html in place.
    trimmed_content = content
    # remove all table, figure and sup tags together with their contents
    for tag_name in ("table", "figure", "sup"):
        for unwanted in trimmed_content.find_all(tag_name):
            unwanted.extract()
    # remove all tags with class mw-editsection
    for edit in trimmed_content.find_all("span", {"class": "mw-editsection"}):
        edit.extract()
    # replace every meta tag by its own children (drops the tag, keeps the content)
    for meta in trimmed_content.find_all("meta"):
        meta.unwrap()

    text_content = str(trimmed_content)
    # Page-level fields are recorded here, not per link, so that they exist
    # even for a page from which no link is collected.
    node["page_length"] = len(text_content)
    node["lead_paragraph"] = ''

    section = ["Lead"]
    lead_paragraph = ''
    # Both cursors only move forward, so repeated markup is matched in document order.
    search_index_link = 0
    search_index_sentence = 0
    # iterate through the direct children of the content div
    for tag in trimmed_content:
        if isinstance(tag, NavigableString):
            continue
        if tag.get("role") == "note":
            continue
        # if the tag is a header tag
        if tag.name in ['h2', 'h3', 'h4']:
            # update section: h2 restarts the path, h3/h4 replace the deeper levels
            if tag.name == 'h2':
                section = [tag.text]
            elif tag.name == 'h3':
                section = section[:1] + [tag.text]
            elif tag.name == 'h4':
                section = section[:2] + [tag.text]
            # stop at the first of these top-level sections; nothing after it is processed
            if section in [["Notes"], ["References"], ["External links"], ["Further reading"]]:
                break

        if tag.name == 'p' and section == ["Lead"]:
            lead_paragraph += tag.text

        text_tag = str(tag)
        # split the tag's markup into sentences
        # use all punctuation marks and new line as sentence separators
        # Known limitation: the split runs over raw markup, so a link whose own
        # markup contains a separator character is cut apart and gets no
        # sentence fields.
        sentences = re.split(r'[.!?;:\n]', text_tag)

        for link in tag.find_all('a'):
            # skip external links, whatever other classes accompany "external"
            if "external" in (link.get("class") or []):
                continue
            # Skip anchors without an href and hrefs that are not article links
            # (e.g. in-page "#..." anchors or "/w/index.php?..." URLs): slicing
            # the prefix off those would produce a meaningless title.
            href = link.get("href")
            if href is None or not href.startswith(WIKI_LINK_PREFIX):
                continue
            link_data = {}

            # get the title of the link and the target anchor if it exists
            full_title = href[len(WIKI_LINK_PREFIX):]
            if "#" in full_title:
                title_pieces = full_title.split("#")
                link_data['title'] = title_pieces[0]
                link_data['target_section'] = title_pieces[1]
            else:
                link_data['title'] = full_title
                link_data['target_section'] = "Lead"

            # get the text of the link
            link_data['text'] = link.text

            # get the source section of the link
            link_data['source_section'] = '<sep>'.join(section)

            # get the start and end index of the link in the text
            link_markup = str(link)
            link_start = text_content.index(link_markup, search_index_link)
            search_index_link = link_start + len(link_markup)
            link_data['link_start_index'] = link_start
            link_data['link_end_index'] = search_index_link

            # get the sentence in which the link is present
            for sentence_position, sentence in enumerate(sentences):
                if link_markup in sentence:
                    link_data['sentence'] = sentence
                    sentence_start = text_content.index(sentence, search_index_sentence)
                    link_data['sentence_start_index'] = sentence_start
                    link_data['sentence_end_index'] = sentence_start + len(sentence)
                    # keep the cursor at the sentence start: the next link may
                    # sit in this same sentence
                    search_index_sentence = sentence_start
                    # Drop the sentences already passed. Only done on a match,
                    # so a link without a sentence cannot discard the sentences
                    # still needed by the links that follow it.
                    sentences = sentences[sentence_position:]
                    break

            # add the link to the node
            node["links"].append(link_data)

    # store the complete lead text, including lead paragraphs without links
    node["lead_paragraph"] = lead_paragraph
    nodes.append(node)

# Print a plain-text summary of every page: three header lines followed by
# one tab-indented line per collected link.
for node in nodes:
    print(f"Title: {node['title']}")
    print(f"Page length: {node['page_length']}")
    print(f"Lead paragraph: {node['lead_paragraph']}")
    # emit all link lines with a single write; nothing is printed when there are none
    if node["links"]:
        print("\n".join(f"\t{link}" for link in node["links"]))
    

Title: Jiří Pechar
Page length: 16337
Lead paragraph: 
Jiří Pechar (7 May 1929 – 22 August 2022) was a Czech philosopher and translator.

	{'title': 'P%C5%99%C3%ADbram', 'target_section': 'Lead', 'text': 'Příbram', 'source_section': 'Life and career', 'link_start_index': 1097, 'link_end_index': 1158, 'sentence': '<p>Born in <a href="/wiki/P%C5%99%C3%ADbram" title="Příbram">Příbram</a>, Pechar studied in history of literature at the <a href="/wiki/Charles_University" title="Charles University">Charles University</a>, and worked as an editor for the <a href="/wiki/Czechoslovak_Academy_of_Sciences" title="Czechoslovak Academy of Sciences">Czechoslovak Academy of Sciences</a>, before being dismissed for political reasons in 1958', 'sentence_start_index': 1086, 'sentence_end_index': 1505}
	{'title': 'Charles_University', 'target_section': 'Lead', 'text': 'Charles University', 'source_section': 'Life and career', 'link_start_index': 1207, 'link_end_index': 1291, 'sentence': '<p>Born in <a hr