In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
def profile_scraper(url, timeout=30):
    """
    Scrape the names and roles of people from one AHF Profiles listing page.

    Every link inside the listing page's ``post-title-grid`` div is treated as
    one profile: the link text is the person's name, and the link target is
    fetched to read the role links from that profile page's role heading.

    Parameters
    ----------
    url : str
        URL of the listing page to scrape.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up (default 30).

    Returns
    -------
    dict
        ``{'name': [...], 'roles': [...]}`` where ``roles[i]`` is the list of
        role strings for ``name[i]``. Both lists are empty when the listing
        page has no ``post-title-grid`` div, and a profile page without the
        role heading contributes an empty role list.

    Raises
    ------
    requests.HTTPError
        If the listing page or any profile page answers with an error status.
    """
    data = {'name': [], 'roles': []}

    # Fetch the listing page; the timeout keeps one stalled connection from
    # hanging a crawl that issues thousands of requests.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    listing_soup = BeautifulSoup(response.content, 'html.parser')

    # The profile links live inside this container. find() returns None when
    # it is absent, so report "no profiles" instead of failing on None.
    grid = listing_soup.find('div', {'class': 'post-title-grid'})
    if grid is None:
        return data

    for profile_link in grid.find_all('a', href=True):
        name_text = profile_link.text.strip()

        # Follow the link to the person's own page to read their roles.
        sub_response = requests.get(profile_link['href'], timeout=timeout)
        sub_response.raise_for_status()
        profile_soup = BeautifulSoup(sub_response.content, 'html.parser')

        # NOTE: a multi-class string is compared against the exact value of
        # the class attribute, so this lookup depends on the site keeping
        # these classes in this order.
        role_heading = profile_soup.find(
            'h6', {'class': 'inline-comma-separated-list mt-10 mb-10'}
        )
        if role_heading is None:
            # Profile without a role heading: keep the person, with no roles.
            sub_role_list = []
        else:
            sub_role_list = [role.text.strip() for role in role_heading.find_all('a')]

        data['name'].append(name_text)
        data['roles'].append(sub_role_list)

    return data

In [3]:
main_url = "https://ahf.nuclearmuseum.org/ahf/bios/"
data = {'name': [], 'roles': []}

# Walk listing pages 1 through 287. Page 1 is the bare listing URL; every
# later page is addressed with the `_paged` query parameter.
for page in range(1, 288):
    current_url = main_url if page == 1 else f'{main_url}?_paged={page}'

    # Fold this page's results into the running totals, keeping the name and
    # roles lists aligned index-for-index.
    scraped_data = profile_scraper(current_url)
    data['name'] += scraped_data['name']
    data['roles'] += scraped_data['roles']

In [4]:
# Build the table from the accumulated lists: one row per scraped profile,
# one column per key of `data`.
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,name,roles
0,A. Becher,"[Manhattan Project Veteran, Project Worker/Staff]"
1,A. Berkemeier,"[Manhattan Project Veteran, Project Worker/Staff]"
2,A. Bradley,"[Manhattan Project Veteran, Project Worker/Staff]"
3,A. Castle,"[Manhattan Project Veteran, Project Worker/Staff]"
4,A. Downey,"[Manhattan Project Veteran, Project Worker/Staff]"
...,...,...
14303,Zelmar Barson,"[Engineer, Manhattan Project Veteran, Military..."
14304,Zen B. Portwood,"[Manhattan Project Veteran, Project Worker/Staff]"
14305,Zenas J. Boone,"[Manhattan Project Veteran, Military Veteran, ..."
14306,Zola DiMartino,"[Manhattan Project Veteran, Project Worker/Staff]"


In [6]:
# Write the scraped table to disk as CSV, leaving out the row index column.
df.to_csv(path_or_buf='../../data/profile_names_roles.csv', index=False)