In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Load list of slugs, one per line.  Blank lines are skipped and repeated
# slugs collapse to their first occurrence, so every row/column label of the
# matrix below is non-empty and unique and `link_matrix.at[...]` always
# addresses exactly one cell.
with open("top_slugs.txt") as f:
    slugs = list(dict.fromkeys(
        stripped for stripped in (line.strip() for line in f) if stripped
    ))

# Initialize matrix: square table of zeros, rows = source page, columns =
# linked-to page.
n = len(slugs)
slug_set = set(slugs)
link_matrix = pd.DataFrame(0, index=slugs, columns=slugs)

# Wikipedia base URL
base_url = "https://en.wikipedia.org/wiki/"

# Section titles (exact heading text) whose links must not be counted.
excluded_sections = ["See also", "References", "Sources", "Further reading", "External links"]

def remove_after_first_excluded_section(content, excluded_sections):
    """Truncate ``content`` at the first heading whose title is excluded.

    The ``h2``/``h3`` headings inside ``content`` are scanned in document
    order.  At the first one whose stripped text is in ``excluded_sections``,
    that heading and every node that follows it *inside* ``content`` are
    detached from the tree.  Ancestors of the heading are kept (minus whatever
    followed the heading), nodes outside ``content`` are never touched, and
    nothing happens when no heading matches.

    Args:
        content: BeautifulSoup tag to prune; modified in place.
        excluded_sections: Collection of exact heading titles that mark the
            cut-off point.

    Returns:
        None.
    """
    for header in content.find_all(['h2', 'h3']):
        if header.get_text().strip() not in excluded_sections:
            continue

        # Climb from the heading up to (but not including) ``content`` and
        # drop whatever follows at each level.  Those siblings, with their
        # subtrees, are exactly the nodes that come after the heading in
        # document order, and stopping at ``content`` keeps the rest of the
        # page intact.
        node = header
        while node is not None and node is not content:
            # Snapshot first: extracting while walking the live sibling chain
            # would end the iteration early.
            for sibling in list(node.next_siblings):
                sibling.extract()
            node = node.parent

        # Finally remove the heading itself together with its own children.
        header.extract()
        break  # Only the first excluded section is a cut-off point.

# Loop through each person's Wikipedia page
for source_slug in slugs:
    print(f"Processing: {source_slug}")
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(base_url + source_slug, timeout=30)
        # HTTP error responses still carry an HTML body; fail loudly instead
        # of mining links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the main content div
        content = soup.find('div', {'id': 'mw-content-text'})
        if content is None:
            continue

        # Remove everything after first excluded section
        remove_after_first_excluded_section(content, excluded_sections)

        # Collect the slugs of all remaining article links.  Hrefs containing
        # ':' are skipped, and the '#fragment' part is dropped.
        # NOTE(review): hrefs are compared verbatim, so the slugs in
        # top_slugs.txt presumably need the same (percent-)encoding as the
        # page markup - confirm.
        wiki_prefix = '/wiki/'
        linked_slugs = set()
        for link in content.find_all('a', href=True):
            href = link['href']
            if href.startswith(wiki_prefix) and ':' not in href:
                # Slice the prefix off rather than splitting on it, so a title
                # that itself contains "/wiki/" is not cut short.
                linked_slugs.add(href[len(wiki_prefix):].split('#')[0])

        # Check if any link matches a target slug (self-links are ignored)
        for target_slug in slugs:
            if target_slug != source_slug and target_slug in linked_slugs:
                link_matrix.at[source_slug, target_slug] = 1

    except Exception as e:
        # Best effort: report the failure and fall through to the break below.
        print(f"Failed for {source_slug}: {e}")
    # NOTE(review): this unconditional break ends the loop after the first
    # slug that gets past the content lookup (or fails), so only one row of
    # link_matrix is ever filled.  It looks like a debugging limit for the
    # inspection cells that follow; remove it to crawl every slug.
    break

In [None]:
# Debug: materialize the direct children of `content` (the content div left
# over from the last page handled by the loop above) so they can be indexed.
children = list(content.children)

In [None]:
# Debug: size of the second top-level node of the content div.
len(children[1])

In [None]:
# Debug: look at the sixth child of the first top-level node.
list(children[0].children)[5]

In [None]:
# Debug: display the set of slugs extracted from the last processed page.
linked_slugs

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Load list of slugs, one per line.  Blank lines are skipped and repeated
# slugs collapse to their first occurrence, so every row/column label of the
# matrix below is non-empty and unique and `link_matrix.at[...]` always
# addresses exactly one cell.
with open("top_slugs.txt") as f:
    slugs = list(dict.fromkeys(
        stripped for stripped in (line.strip() for line in f) if stripped
    ))

# Initialize matrix: square table of zeros, rows = source page, columns =
# linked-to page.
n = len(slugs)
slug_set = set(slugs)
link_matrix = pd.DataFrame(0, index=slugs, columns=slugs)
# Wikipedia base URL
base_url = "https://en.wikipedia.org/wiki/"

# Sections to exclude (matched against the exact heading text)
excluded_sections = ["See also", "References", "Sources", "Further reading", "External links"]


def remove_excluded_sections(content, excluded_sections):
    """Remove excluded sections and their content from the HTML.

    A section starts at an ``h2``/``h3`` heading whose stripped text is in
    ``excluded_sections`` and runs until the next heading of the same or a
    higher rank, so the ``h3`` subsections of an excluded ``h2`` are removed
    together with it.  Only tags inside ``content`` are detached.  A tag that
    opens inside an excluded section but also encloses something that is kept
    (for example a wrapper around the next heading) is left in place, so the
    kept part never leaves the tree.

    Args:
        content: BeautifulSoup tag to prune; modified in place.
        excluded_sections: Collection of exact heading titles to drop.

    Returns:
        None.
    """
    heading_ranks = {'h2': 2, 'h3': 3}

    doomed = []        # tags that fall inside an excluded section
    protected = set()  # id()s of tags that enclose at least one kept tag
    cut_rank = None    # rank of the excluded heading currently open, if any

    # find_all(True) yields every descendant tag in document order, which
    # keeps the whole pass confined to ``content``.
    for tag in content.find_all(True):
        rank = heading_ranks.get(tag.name)
        if rank is not None:
            # A heading of the same or higher rank closes the open section.
            if cut_rank is not None and rank <= cut_rank:
                cut_rank = None
            # An excluded heading opens a new one (unless already inside one).
            if cut_rank is None and tag.get_text().strip() in excluded_sections:
                cut_rank = rank

        if cut_rank is not None:
            doomed.append(tag)
            continue

        # Kept tag: none of its ancestors may be removed.  Once an ancestor is
        # already protected, everything above it is protected as well.
        for ancestor in tag.parents:
            if id(ancestor) in protected:
                break
            protected.add(id(ancestor))

    # Detach only after the scan so the traversal above sees an intact tree.
    for tag in doomed:
        if id(tag) not in protected and tag.parent is not None:
            tag.extract()

# Loop through each person's Wikipedia page
for source_slug in slugs:
    print(f"Processing: {source_slug}")
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(base_url + source_slug, timeout=30)
        # HTTP error responses still carry an HTML body; fail loudly instead
        # of mining links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the main content div
        content = soup.find('div', {'id': 'mw-content-text'})
        if content is None:
            continue

        # Remove excluded sections
        remove_excluded_sections(content, excluded_sections)

        # Collect the slugs of all remaining article links.  Hrefs containing
        # ':' are skipped, and the '#fragment' part is dropped.
        # NOTE(review): hrefs are compared verbatim, so the slugs in
        # top_slugs.txt presumably need the same (percent-)encoding as the
        # page markup - confirm.
        wiki_prefix = '/wiki/'
        linked_slugs = set()
        for link in content.find_all('a', href=True):
            href = link['href']
            if href.startswith(wiki_prefix) and ':' not in href:
                # Slice the prefix off rather than splitting on it, so a title
                # that itself contains "/wiki/" is not cut short.
                linked_slugs.add(href[len(wiki_prefix):].split('#')[0])

        # Check if any link matches a target slug (self-links are ignored)
        for target_slug in slugs:
            if target_slug != source_slug and target_slug in linked_slugs:
                link_matrix.at[source_slug, target_slug] = 1

    except Exception as e:
        # Best effort: report the failure and fall through to the break below.
        print(f"Failed for {source_slug}: {e}")
    # NOTE(review): this unconditional break ends the loop after the first
    # slug that gets past the content lookup (or fails), so only one row of
    # link_matrix is ever filled.  It looks like a leftover debugging limit;
    # remove it to crawl every slug.
    break