# Data Collection
## Source Info
This data is scraped off of the [UVA Miller Center Famous Presidential Speeches Archive](https://millercenter.org/the-presidency/presidential-speeches). I collected the transcript off of each speech and will clean/explore it in eda.ipynb for use as the document base in the RAG system. I use selenium to interact with the websites and beautifulsoup to extract the information off of the HTML.

## Setup

In [2]:
import logging
import os
import pandas as pd
import requests
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [4]:
# For laptop

mac = False
# mac = True

In [5]:
# Logging setup

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/scraping.log', mode='w', encoding='utf-8'),
        logging.StreamHandler()  # This will output logs to the console
    ]
)

logging.getLogger('selenium').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('httpx').setLevel(logging.WARNING)

logger = logging.getLogger('rag_project')

In [6]:
# Configure ChromeDriver

def setup_driver():
    logging.info("Setting up the Chrome driver")
    driver_path = '/Users/ethanvertal/Documents/chromedriver-mac-arm64/chromedriver' if mac else '/usr/local/bin/chromedriver'
    
    if not os.path.exists(driver_path):
        logging.error(f"ChromeDriver not found at {driver_path}")
        return None

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in background
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    service = Service(driver_path)
    
    try:
        driver = webdriver.Chrome(service=service, options=chrome_options)
        logging.info("Chrome driver setup complete")
    except Exception as e:
        logging.error(f"Failed to set up Chrome driver: {str(e)}")
        return None
    
    return driver

## Scraping functions

### Speeches

In [51]:
def scrape_speech(url, driver):
    if not driver:
        logging.error("Driver not initialized.")
        return None, None, None

    logging.info(f"Scraping URL: {url}")
    driver.get(url)
    
    try:
        WebDriverWait(driver, 20).until(
            EC.any_of(
                      EC.presence_of_element_located((By.CLASS_NAME, 'transcript-inner')),
                      EC.presence_of_element_located((By.CLASS_NAME, 'view-transcript'))
                     )
        )
        logging.info(f"Page loaded successfully for {url}")
    except Exception as e:
        logging.error(f"Error waiting for transcript elements on {url}: {str(e)}")
        return None, None, None
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    try:
        title = soup.find('h2', class_='presidential-speeches--title').text.strip()
        logging.info(f"Title found: {title}")
    except AttributeError:
        logging.error(f"Title not found on {url}")
        title = "Unknown Title"
    
    try:
        president = soup.find('label', class_='presidential-speeches--label').text.strip()
        logging.info(f"President found: {president}")
    except AttributeError:
        logging.error(f"President not found on {url}")
        president = "Unknown President"
    
    transcript_div = soup.find('div', class_='transcript-inner') or soup.find('div', class_='view-transcript')
    
    if not transcript_div:
        logging.error(f"Transcript container not found on {url}")
        return title, president, ""
    
    # Different transcript structures
    if transcript_div.find_all('p'):
        paragraphs = transcript_div.find_all('p')
        full_transcript = ' '.join([p.text.strip() for p in paragraphs])
    elif transcript_div.find_all('span'):
        spans = transcript_div.find_all('span')
        full_transcript = ' '.join([span.text.strip() for span in spans])
    elif transcript_div.find('p') and '<br>' in transcript_div.decode_contents():
        full_transcript = transcript_div.decode_contents().replace('<br>', '\n').strip()
    else:
        full_transcript = transcript_div.get_text(separator=' ', strip=True)
    
    logging.info(f"Transcript scraped for {title}")
    return title, president, full_transcript

In [52]:
def scrape_all_speeches(base_url):
    driver = setup_driver()
    speeches = []
    
    if not driver:
        logging.error("Driver setup failed. Exiting scrape_all_speeches.")
        return speeches
    
    logging.info(f"Starting to scrape all speeches from base URL: {base_url}")
    driver.get(base_url)
    
    logging.info("Scrolling...")
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    
    links = driver.find_elements(By.XPATH, "//div[contains(@class, 'views-field-title')]//span[@class='field-content']/a")
    speech_links = [link.get_attribute('href') for link in links]
    logging.info(f"Found {len(speech_links)} speech links")

    for link in speech_links:
        for attempt in range(2):  # Retry mechanism
            try:
                title, president, transcript = scrape_speech(link, driver)
                if title and transcript:
                    speeches.append({
                        'title': title,
                        'president': president,
                        'transcript': transcript,
                        'source': link,
                        'source_type': 'speech'
                    })
                    logging.info(f"Scraped: {title} by {president}")
                    break
            except Exception as e:
                logging.error(f"Error scraping {link} on attempt {attempt + 1}: {str(e)}")
                time.sleep(2)  # Wait before retrying
    
    driver.quit()
    logging.info("Finished scraping all speeches")
    return speeches

### White House FAQ

In [53]:
def scrape_single_question(question_url):
    logging.info(f"Sending request to {question_url}")

    # Send a request to the question page
    question_response = requests.get(question_url)
    if question_response.status_code == 200:
        logging.info(f"Request to {question_url} successful.")
        question_soup = BeautifulSoup(question_response.text, 'html.parser')

        # Extract the question from the h1 tag
        question = question_soup.find('h1').get_text(strip=True)
        logging.info(f"Extracted question: {question}")

        # Extract the answer content (paragraphs and lists)
        answer_parts = []
        main_content = question_soup.find('main')
        answer_wrapper = main_content.find('div', class_='wrapper wrapper--copy wysiwyg dropcap')
        if answer_wrapper:
            logging.info(f"Found main content for question: {question}")
            for tag in answer_wrapper.find_all(['p', 'ul']):
                if tag.name == 'p':
                    answer_text = tag.get_text(strip=True)
                    answer_parts.append(answer_text)
                    logging.info("Extracted paragraph.")
                elif tag.name == 'ul':
                    # If it's a list, concatenate the list items
                    list_items = [li.get_text(strip=True) for li in tag.find_all('li')]
                    answer_text = "\n".join(list_items)
                    answer_parts.append(answer_text)
                    logging.info("Extracted list.")

        answer = " ".join(answer_parts)
        logging.info(f"Compiled answer: {answer}")

        return question, answer
    else:
        logging.error(f"Failed to retrieve {question_url}, status code: {question_response.status_code}")
        return None

In [54]:
def scrape_all_questions():
    driver = setup_driver()
    base_url = "https://www.whitehousehistory.org/questions"
    driver.get(base_url)
    time.sleep(3)

    # Click the "Keep Looking?" button until all questions are loaded
    while True:
        try:
            load_more_button = driver.find_element(By.CLASS_NAME, "section__load-more")
            load_more_button.click()
            time.sleep(3)  # Allow time for new content to load
        except:
            logger.info("No more 'Keep Looking?' button found, all questions should be loaded.")
            break

    # Get all question links
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    question_links = soup.select(".related-content-list .entry-card__link")
    logger.info(f"Found {len(question_links)} question links.")

    # Scrape each question
    all_data = {'question': [], 'answer': [], 'source': [], 'source_type': []}
    for link in question_links:
        question_url = link['href']
        question, answer = scrape_single_question(question_url)
        if question and answer:
            all_data['question'].append(question)
            all_data['answer'].append(answer)
            all_data['source'].append(question_url)
            all_data['source_type'].append('faq')

    # Close the driver
    driver.quit()
    logging.info(f"Finished Scraping {len(all_data['question'])} questions!")
    return all_data

### White House Bios

In [55]:
def scrape_single_bio(bio_url):
    logger.debug(f"Sending request to {bio_url}")
    
    # Send a request to the bio page
    bio_response = requests.get(bio_url)
    if bio_response.status_code == 200:
        logger.debug(f"Request to {bio_url} successful.")
        bio_soup = BeautifulSoup(bio_response.text, 'html.parser')
        
        # Extract the name from the h1 tag
        name = bio_soup.find('h1').get_text(strip=True)
        logger.info(f"Extracted name: {name}")
        
        # Extract the biography content (paragraphs and br-separated lines)
        bio_parts = []
        main_content = bio_soup.find('main')
        bio_wrapper = main_content.find('div', class_='wrapper wrapper--copy wysiwyg')
        if bio_wrapper:
            logger.info(f"Found main content for bio: {name}")
            for tag in bio_wrapper.find_all(['p']):
                if tag.name == 'p':
                    # Check if the paragraph contains <br> tags
                    if tag.find('br'):
                        br_texts = [line.strip() for line in tag.get_text(separator="\n").split('\n')]
                        bio_parts.extend(br_texts)
                        logger.debug(f"Extracted br-separated lines.")
                    else:
                        bio_text = tag.get_text(strip=True)
                        bio_parts.append(bio_text)
                        logger.debug(f"Extracted paragraph.")
        
        bio = " ".join(bio_parts)
        logger.info(f"Compiled bio: {name}")
        
        return {"name": name, "bio": bio}
    else:
        logger.error(f"Failed to retrieve {bio_url}, status code: {bio_response.status_code}")
        return None

In [68]:
def scrape_all_bios():
    # Set up the Selenium WebDriver
    driver = webdriver.Chrome()  # Ensure chromedriver is installed and in your PATH
    base_url = "https://www.whitehousehistory.org/bios"
    driver.get(base_url)
    time.sleep(3)  # Allow time for the page to load
    
    # Loop through pagination until all bios are loaded
    all_data = {'name': [], 'bio': [], 'source': [], 'source_type': []}
    page_number = 1
    while True:
        logger.info(f"Scraping page {page_number}")
        # Get all bio links on the current page
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        bio_links = soup.select(".result-card__content .result-card__heading a")
        logger.info(f"Found {len(bio_links)} bio links on page {page_number}.")
        
        # Scrape each bio
        for link in bio_links:
            bio_url = link['href']
            data = scrape_single_bio(bio_url)
            if data:
                all_data['name'].append(data['name'])
                all_data['bio'].append(data['bio'])
                all_data['source'].append(bio_url)
                all_data['source_type'].append('bio')
        
        # Try to navigate to the next page
        if page_number > 12:
            logger.warning("No more pages found, all bios should be loaded.")
            break
            
        page_number += 1
        next_page_url = f"https://www.whitehousehistory.org/bios/p{page_number}"
        driver.get(next_page_url)
        time.sleep(3)  # Allow time for the next page to load
            
    
    # Close the driver
    driver.quit()
    return all_data

## Perform Scraping

In [58]:
base_url = 'https://millercenter.org/the-presidency/presidential-speeches'
speeches = scrape_all_speeches(base_url)
speeches = pd.DataFrame(data=speeches, columns=['title', 'president', 'transcript', 'source', 'source_type'])

2024-10-10 16:54:01,232 - INFO - Setting up the Chrome driver
2024-10-10 16:54:01,548 - INFO - Chrome driver setup complete
2024-10-10 16:54:01,549 - INFO - Starting to scrape all speeches from base URL: https://millercenter.org/the-presidency/presidential-speeches
2024-10-10 16:54:48,793 - INFO - Found 1053 speech links
2024-10-10 16:54:48,793 - INFO - Scraping URL: https://millercenter.org/the-presidency/presidential-speeches/september-24-2024-address-79th-united-nations-general-assembly
2024-10-10 16:54:50,292 - INFO - Page loaded successfully for https://millercenter.org/the-presidency/presidential-speeches/september-24-2024-address-79th-united-nations-general-assembly
2024-10-10 16:54:50,330 - INFO - Title found: September 24, 2024: Address before the 79th United Nations General Assembly
2024-10-10 16:54:50,331 - INFO - President found: Joe Biden Presidency
2024-10-10 16:54:50,334 - INFO - Transcript scraped for September 24, 2024: Address before the 79th United Nations General As

In [59]:
faqs = scrape_all_questions()
faqs = pd.DataFrame(faqs)

2024-10-10 17:14:30,409 - INFO - Setting up the Chrome driver
2024-10-10 17:14:30,739 - INFO - Chrome driver setup complete
2024-10-10 17:14:36,468 - INFO - No more 'Keep Looking?' button found, all questions should be loaded.
2024-10-10 17:14:36,501 - INFO - Found 12 question links.
2024-10-10 17:14:36,502 - INFO - Sending request to https://www.whitehousehistory.org/questions/has-every-president-worked-in-the-oval-office
2024-10-10 17:14:36,723 - INFO - Request to https://www.whitehousehistory.org/questions/has-every-president-worked-in-the-oval-office successful.
2024-10-10 17:14:36,755 - INFO - Extracted question: Has every president worked in the Oval Office?
2024-10-10 17:14:36,757 - INFO - Found main content for question: Has every president worked in the Oval Office?
2024-10-10 17:14:36,758 - INFO - Extracted paragraph.
2024-10-10 17:14:36,758 - INFO - Compiled answer: No! The Oval Office has been the primary presidential workspace since 1909, whenPresident William Howard Taftw

In [70]:
bios = scrape_all_bios()
bios = pd.DataFrame(bios)

2024-10-10 19:05:16,233 - INFO - Scraping page 1
2024-10-10 19:05:16,268 - INFO - Found 9 bio links on page 1.
2024-10-10 19:05:16,468 - INFO - Extracted name: Abigail Adams
2024-10-10 19:05:16,469 - INFO - Found main content for bio: Abigail Adams
2024-10-10 19:05:16,470 - INFO - Compiled bio: Abigail Adams
2024-10-10 19:05:16,561 - INFO - Extracted name: Abigail Powers Fillmore
2024-10-10 19:05:16,562 - INFO - Found main content for bio: Abigail Powers Fillmore
2024-10-10 19:05:16,563 - INFO - Compiled bio: Abigail Powers Fillmore
2024-10-10 19:05:16,643 - INFO - Extracted name: Abraham Lincoln
2024-10-10 19:05:16,645 - INFO - Found main content for bio: Abraham Lincoln
2024-10-10 19:05:16,645 - INFO - Compiled bio: Abraham Lincoln
2024-10-10 19:05:16,730 - INFO - Extracted name: Andrew Jackson
2024-10-10 19:05:16,731 - INFO - Found main content for bio: Andrew Jackson
2024-10-10 19:05:16,732 - INFO - Compiled bio: Andrew Jackson
2024-10-10 19:05:16,815 - INFO - Extracted name: Andre

## Save to CSV

In [61]:
filename='presidential_speeches.csv'
logging.info(f"Saving speeches to CSV file: {filename}")
speeches.to_csv(filename, index=False, encoding='utf-8')
logging.info("Speeches saved to CSV file successfully")

2024-10-10 17:18:51,246 - INFO - Saving speeches to CSV file: presidential_speeches.csv
2024-10-10 17:18:51,536 - INFO - Speeches saved to CSV file successfully


In [62]:
filename='whitehouse_faq.csv'
logging.info(f"Saving FAQ to CSV file: {filename}")
faqs.to_csv(filename, index=False, encoding='utf-8')
logging.info("FAQ saved to CSV file successfully")

2024-10-10 17:18:51,577 - INFO - Saving FAQ to CSV file: whitehouse_faq.csv
2024-10-10 17:18:51,580 - INFO - FAQ saved to CSV file successfully


In [71]:
filename='whitehouse_bios.csv'
logging.info(f"Saving Bios to CSV file: {filename}")
bios.to_csv(filename, index=False, encoding='utf-8')
logging.info("Bios saved to CSV file successfully")

2024-10-10 19:08:14,887 - INFO - Saving Bios to CSV file: whitehouse_bios.csv
2024-10-10 19:08:14,897 - INFO - Bios saved to CSV file successfully


In [64]:
bios

Unnamed: 0,name,bio
0,Abigail Adams,"Abigail Smith was born in Weymouth, Massachuse..."
1,Abigail Powers Fillmore,"Abigail Powers was born in Saratoga County, Ne..."
2,Abraham Lincoln,Abraham Lincoln warned the South in his first ...
3,Andrew Jackson,"Andrew Jackson was born on March 15, 1767 near..."
4,Andrew Johnson,"Andrew Johnson was born on December 29, 1808 i..."
...,...,...
112,Zachary Taylor,"Zachary Taylor was born on November 24, 1784 i..."
113,Woodrow Wilson,"Like Theodore Roosevelt before him, Woodrow Wi..."
114,Zachary Taylor,"Zachary Taylor was born on November 24, 1784 i..."
115,Woodrow Wilson,"Like Theodore Roosevelt before him, Woodrow Wi..."


In [65]:
faqs

Unnamed: 0,question,answer,source,source_type
0,Has every president worked in the Oval Office?,No! The Oval Office has been the primary presi...,https://www.whitehousehistory.org/questions/ha...,faq
1,How did the White House get its name?,There is a popular misconception that the Whit...,https://www.whitehousehistory.org/questions/ho...,faq
2,What is the Resolute Desk?,TheResoluteDesk is a double pedestal partners’...,https://www.whitehousehistory.org/questions/wh...,faq
3,Who saved the Gilbert Stuart painting of Georg...,"Completed in 1797, Gilbert Stuart’s painting o...",https://www.whitehousehistory.org/questions/ho...,faq
4,Has the White House ever been renovated or cha...,"Yes, many, many times! Today there is a marker...",https://www.whitehousehistory.org/questions/ha...,faq
5,How was the location of the White House selected?,"Located along the banks of the Potomac River, ...",https://www.whitehousehistory.org/questions/ho...,faq
6,Did President Abraham Lincoln actually sleep i...,President Abraham Lincolndid not sleep in the ...,https://www.whitehousehistory.org/questions/di...,faq
7,Have any presidents or first ladies died at th...,Two presidents and three first ladies have die...,https://www.whitehousehistory.org/questions/wh...,faq
8,How many weddings have been held at the White ...,There have been nineteen documented weddings h...,https://www.whitehousehistory.org/questions/ho...,faq
9,When did the White House host its first Easter...,The first annual White House Easter Egg Roll w...,https://www.whitehousehistory.org/questions/wh...,faq
