In [32]:
from glob import glob
from bs4 import BeautifulSoup, Comment, NavigableString
from bs4.formatter import HTMLFormatter
import lxml
import re
import urllib

In [33]:
def process_title(title):
    """Convert an article title into its percent-encoded URL form.

    Spaces are replaced by underscores and the result is percent-encoded with
    ``urllib.parse.quote`` using its default ``safe`` set (so ``/`` is kept).

    Args:
        title: Article title as a string, e.g. ``"Charles University"``.

    Returns:
        The encoded title as a string, e.g. ``"Charles_University"``.
    """
    # A bare ``import urllib`` does not guarantee that the ``urllib.parse``
    # submodule has been loaded, so bring the function in explicitly.
    from urllib.parse import quote

    return quote(title.replace(' ', '_'))

In [34]:
# Collect the saved pages. glob() returns matches in whatever order the file
# system yields them, so sort to make the processing order reproducible.
files = sorted(glob("example_pages/*.htm"))
files

['example_pages\\page_1.htm',
 'example_pages\\page_10.htm',
 'example_pages\\page_11.htm',
 'example_pages\\page_12.htm',
 'example_pages\\page_2.htm',
 'example_pages\\page_3.htm',
 'example_pages\\page_4.htm',
 'example_pages\\page_5.htm',
 'example_pages\\page_6.htm',
 'example_pages\\page_7.htm',
 'example_pages\\page_8.htm',
 'example_pages\\page_9.htm']

In [35]:
class UnsortedAttributes(HTMLFormatter):
    """Formatter whose ``attributes`` hook emits a tag's attributes unsorted."""

    def attributes(self, tag):
        """Yield the ``(name, value)`` pairs of ``tag.attrs`` in iteration order."""
        yield from tag.attrs.items()

In [36]:
# Load every saved page exactly once: keep the raw markup string (the link and
# sentence offsets computed further down index into it) and build the parsed
# tree from that very same string so both views always agree.
parsed_html = []
raw_html = []
for file in files:
    # Explicit encoding so the decoded text does not depend on the platform's
    # default locale encoding.
    # NOTE(review): assumes the saved pages are UTF-8 - confirm for your files.
    with open(file, "r", encoding="utf-8") as f:
        markup = f.read()
    raw_html.append(markup)
    parsed_html.append(BeautifulSoup(markup, 'lxml'))

In [37]:
# Build one record ("node") per saved page: its title, lead paragraph, raw
# page length and every internal article link together with the chunk of raw
# HTML ("sentence") that contains it. All offsets index into the raw markup.

# Trailing part of the page <title> that is not part of the article name.
TITLE_SUFFIX = " - Wikipedia"
# Text of the section edit link; it ends up at the end of a heading's text.
EDIT_MARKER = "[edit]"
# Top-level sections at which link collection stops.
STOP_SECTIONS = ["Notes", "References", "External links", "Further reading"]
# Characters that may terminate a sentence.
SENTENCE_ENDINGS = ['.', '!', '?', '\n']
# Anchors whose class list is exactly one of these are ignored.
SKIPPED_LINK_CLASSES = [
    ["mw-file-description"],
    ["mw-redirect"],
    ["external", "text"],
    ["mw-selflink", "selflink"],
]

nodes = []
for i, html in enumerate(parsed_html):
    raw = raw_html[i]
    raw_length = len(raw)

    node = {}
    # Strip the site suffix only when it is really there, instead of blindly
    # cutting off the last 12 characters of the title.
    page_title = html.find("title").text
    if page_title.endswith(TITLE_SUFFIX):
        page_title = page_title[:-len(TITLE_SUFFIX)]
    node["title"] = page_title
    node["links"] = []
    # find the div with class mw-parser-output
    content = html.find("div", {"id": "mw-content-text"})
    content = content.find("div", {"class": "mw-parser-output"})

    # define the sentences
    # A terminator ends a sentence when the character two positions back is
    # alphanumeric and the next character is neither alphanumeric nor '_'.
    # While `safe` is set no split happens. `safe` is switched on by '<'
    # followed by 'a' and off by '>' preceded by 'a', which keeps anchors in
    # one piece (note: the test looks at a single character only, so other
    # tags whose name starts with 'a' trigger it too).
    temp_sentences = []
    sentence = ''
    safe = False
    for j, char in enumerate(raw):
        if (not safe and char in SENTENCE_ENDINGS
                and (j > 1 and raw[j-2] != '.' and raw[j-2].isalnum())
                and (j < raw_length - 1 and not raw[j+1].isalnum() and raw[j+1] != '_')):
            sentence += char
            temp_sentences.append(sentence)
            sentence = ''
        else:
            # The bounds check avoids an IndexError when '<' is the very last
            # character of the page.
            if not safe and char == '<' and j + 1 < raw_length and raw[j + 1] == 'a':
                safe = True
            if safe and char == '>' and raw[j - 1] == 'a':
                safe = False
            sentence += char
    temp_sentences.append(sentence)

    # Regex-based alternative, kept for reference:
    # temp_sentences = re.split(r'(?<!\w\.\w\.)(?<!\W\.\W\.)(?<=[A-Za-z0-9])[.?!\n](?![A-Za-z0-9])(?![^<]*>)', raw_html[i])

    # Split further at paragraph tags and record, for every piece, the
    # [start_index, end_index) span it occupies in the raw markup. The capture
    # group keeps the <p> / </p> tokens as pieces, so the lengths add up.
    sentences = []
    start_index = 0
    end_index = 0
    for sentence in temp_sentences:
        # split by <p> or </p>
        for part in re.split('(<p>|</p>)', sentence):
            end_index += len(part)
            sentences.append({'sentence': part, 'start_index': start_index, 'end_index': end_index})
            start_index = end_index

    section = ["Lead"]
    lead_paragraph = ''
    # Position in the raw markup from which the next link is searched, so that
    # repeated links resolve to successive occurrences.
    search_index_link = 0
    # iterate through all tags
    for tag in content:
        if isinstance(tag, NavigableString):
            continue
        if tag.get("role") == "note":
            continue
        # if the tag is a header tag
        if tag.name in ['h2', 'h3', 'h4']:
            # update section, add or trim section list
            if tag.name == 'h2':
                section = [tag.text]
            elif tag.name == 'h3':
                section = section[:1] + [tag.text]
            elif tag.name == 'h4':
                section = section[:2] + [tag.text]
            # The heading text can end with the "[edit]" marker (e.g.
            # "References[edit]"), so compare the name without it; otherwise
            # the stop sections are never recognised. `section` itself keeps
            # the raw heading text, so 'source_section' values are unchanged.
            top_level = section[0]
            if top_level.endswith(EDIT_MARKER):
                top_level = top_level[:-len(EDIT_MARKER)]
            if len(section) == 1 and top_level in STOP_SECTIONS:
                break

        if tag.name == 'p' and section == ["Lead"]:
            lead_paragraph += tag.text

        # iterate through the links of this element (no-op when there are none)
        for link in tag.find_all('a'):
            # skip file, redirect, external and self links
            if link.get("class") in SKIPPED_LINK_CLASSES:
                continue
            # if link has no href or if href does not start with "/wiki/" then skip it
            if not link.get("href") or not link["href"].startswith("/wiki/"):
                continue
            # if link has ':' in it then skip it
            if ':' in link["href"]:
                continue
            link_data = {}

            # check if there are invalid elements among the link's ancestors
            # invalid elements are: table, figure, sup
            # and anything with class "mw-editsection"
            invalid = False
            for parent in link.parents:
                if parent.name in ['table', 'figure', 'sup']:
                    invalid = True
                    break
                if parent.get("class") == ["mw-editsection"]:
                    invalid = True
                    break

            # get the title of the link and the target anchor if it exists
            full_title = link["href"][6:]
            if "#" in full_title:
                link_data['title'] = full_title.split("#")[0]
                link_data['target_section'] = full_title.split("#")[1]
            else:
                link_data['title'] = full_title
                link_data['target_section'] = "Lead"

            # get the text of the link
            link_data['text'] = link.text

            # get the source section of the link
            link_data['source_section'] = '<sep>'.join(section)

            # get the start and end index of the link in the text: locate the
            # href, walk back to the opening '<' and forward to the closing '>'
            index = raw.index(f"/wiki/{full_title}", search_index_link)
            while raw[index] != '<':
                index -= 1
            end_index = index
            while raw[end_index] != '>':
                end_index += 1
            search_index_link = end_index
            link_data['link_start_index'] = index
            link_data['link_end_index'] = end_index
            # drop sentences that lie entirely before this link
            while len(sentences) > 1 and sentences[1]['start_index'] < index:
                sentences.pop(0)

            # get the sentence in which the link is present
            # expensive process, only apply if the link is valid
            if invalid:
                continue
            for j, sentence in enumerate(sentences):
                # if the link is present in the sentence
                if f"/wiki/{full_title}" in sentence['sentence']:
                    link_data['sentence'] = sentence['sentence']
                    link_data['sentence_start_index'] = sentence['start_index']
                    link_data['sentence_end_index'] = sentence['end_index']
                    # Later links cannot be in earlier sentences; trim only on
                    # a hit so that a miss does not throw away the remaining
                    # sentences.
                    sentences = sentences[j:]
                    break
            else:
                print('Not found')
                print(sentences[0]['sentence'])

            # add the link to the node
            node["links"].append(link_data)
    node["page_length"] = raw_length
    node["lead_paragraph"] = lead_paragraph

    nodes.append(node)

# Dump every node: a three-line header followed by one tab-indented line per link.
for record in nodes:
    print(
        f"Title: {record['title']}",
        f"Page length: {record['page_length']}",
        f"Lead paragraph: {record['lead_paragraph']}",
        sep="\n",
    )
    for entry in record["links"]:
        print(f"\t{entry}")
    

Title: Jiří Pechar
Page length: 68007
Lead paragraph: 
Jiří Pechar (7 May 1929 – 22 August 2022) was a Czech philosopher and translator.

	{'title': 'P%C5%99%C3%ADbram', 'target_section': 'Lead', 'text': 'Příbram', 'source_section': 'Life and career[edit]', 'link_start_index': 36131, 'link_end_index': 36180, 'sentence': 'Born in <a href="/wiki/P%C5%99%C3%ADbram" title="Příbram">Příbram</a>, Pechar studied in history of literature at the <a href="/wiki/Charles_University" title="Charles University">Charles University</a>, and worked as an editor for the <a href="/wiki/Czechoslovak_Academy_of_Sciences" title="Czechoslovak Academy of Sciences">Czechoslovak Academy of Sciences</a>, before being dismissed for political reasons in 1958.', 'sentence_start_index': 36123, 'sentence_end_index': 36540}
	{'title': 'Charles_University', 'target_section': 'Lead', 'text': 'Charles University', 'source_section': 'Life and career[edit]', 'link_start_index': 36241, 'link_end_index': 36302, 'sentence': '