In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

## Parse the webpage and extract hyperlinks

For each webpage, build a table of its hyperlinks showing the position of each link and how often it is mentioned.

In [None]:
def _parse(url, timeout=10):
    """Fetch a web page and tabulate every hyperlink (``<a>`` tag) on it.

    For each anchor the table records the stripped link text, the offset of
    that text inside the text of the anchor's parent element, how many times
    the text occurs in the parent's text, and the absolute target URL.

    Parameters
    ----------
    url : str
        Address of the page to download; also the base used to resolve
        relative ``href`` values.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``position``, ``mentions`` and ``url``, sorted by
        ``mentions`` in descending order (ties keep document order), with a
        fresh 0..n-1 index. A page without anchors yields an empty frame
        that still has these four columns.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    # Without a timeout requests waits forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing the error page as content.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    records = []
    for hyperlink in soup.find_all("a"):
        name = hyperlink.text.strip()
        parent_text = hyperlink.parent.text
        if name:
            position = parent_text.index(name)
            num_mentions = parent_text.count(name)
        else:
            # Anchors without text (e.g. image links): str.count("") would
            # report len(parent_text) + 1 and push these rows to the top.
            position = 0
            num_mentions = 0
        records.append({
            "name": name,
            "position": position,
            "mentions": num_mentions,
            # urljoin returns the base url unchanged when href is missing.
            "url": urljoin(url, hyperlink.get('href')),
        })

    # Explicit columns keep the schema intact when no anchors were found.
    df = pd.DataFrame(records, columns=["name", "position", "mentions", "url"])
    # mergesort is stable, so equal mention counts stay in document order.
    df = df.sort_values('mentions', ascending=False, kind="mergesort")
    return df.reset_index(drop=True)

def _delete(df):
    df = df[df.name.str.strip().astype(bool)]  # this will remove rows where the Name is empty
    df = df[df.name.str.len() > 1]  # this will remove the rows where Name is a single character
    df = df[df.url.str.contains('Special') == False]
    df = df[df.mentions >= 2].reset_index(drop=True)
    return df

In [None]:
# Parse the seed article. Despite the plural name, `tables` is a single
# DataFrame with one row per hyperlink found on the page.
tables = _parse('https://en.wikipedia.org/wiki/Semantic_similarity')

## Iterate for child articles
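Repeat the parsing process for each of the prioritised child articles: fetch the linked page and record where and how often the link's name appears in it.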

In [None]:
# Narrow the link table to the rows that pass the name, URL and
# mention-count checks applied by `_delete`.
prioritised = _delete(tables)

In [None]:
def _function(mother, child, timeout=10):
    """Locate a term in the text of a linked page.

    Parameters
    ----------
    mother : str
        Text to search for (case-sensitive, exact substring match).
    child : str
        URL of the page to download and search.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    tuple
        ``(positioning, mentioning)``: the offset of the first occurrence of
        ``mother`` in the page's extracted text and the number of
        non-overlapping occurrences. ``(None, None)`` when ``mother`` is
        empty, the page answers with an HTTP error status, or the term does
        not occur.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the request times out.
    """
    # An empty term "matches" everywhere (index 0, count len(text) + 1);
    # report it as not found and skip the download.
    if not mother:
        return None, None

    # Without a timeout requests waits forever on an unresponsive server.
    response = requests.get(child, timeout=timeout)
    if not response.ok:
        # The body of a 4xx/5xx response is an error page, not the article.
        return None, None

    text = BeautifulSoup(response.content, "html.parser").get_text()
    positioning = text.find(mother)  # -1 when absent; no exception needed
    if positioning == -1:
        return None, None
    return positioning, text.count(mother)
    

In [None]:
# Look up every prioritised link on its target page. The results are gathered
# in a plain list first: DataFrame.apply(axis=1) on an empty frame returns the
# frame's own columns, which cannot be assigned to the two new columns.
mention_stats = [
    _function(name, url)
    for name, url in zip(prioritised['name'], prioritised['url'])
]
prioritised[['positioning', 'mentioning']] = (
    pd.DataFrame(
        mention_stats,
        columns=['positioning', 'mentioning'],
        index=prioritised.index,  # align the new columns with the existing rows
    )
    .fillna(0)    # (None, None) means "not found"; store it as 0
    .astype(int)  # whole-number columns regardless of which rows had misses
)


In [None]:
# Display the filtered link table, including the 'positioning' and
# 'mentioning' columns added in the previous cell.
prioritised