# Data Collection
## Source Info
This data is scraped off of the UVA Miller Center Famous Presidential Speeches Archive. I collected the transcript off of each speech and will clean it for use as the document base in the RAG system. 

In [6]:
import logging
import os
import pandas as pd
import requests
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [7]:
mac = True

In [8]:
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/scraping.log', mode='w', encoding='utf-8'),
        logging.StreamHandler()  # This will output logs to the console
    ]
)

logging.getLogger('selenium').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('httpx').setLevel(logging.WARNING)

logger = logging.getLogger('rag_project')

In [13]:
def setup_driver():
    logging.info("Setting up the Chrome driver")
    driver_path = '/Users/ethanvertal/Documents/chromedriver-mac-arm64/chromedriver' if mac else '/usr/local/bin/chromedriver'
    
    if not os.path.exists(driver_path):
        logging.error(f"ChromeDriver not found at {driver_path}")
        return None

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in background
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    service = Service(driver_path)
    
    try:
        driver = webdriver.Chrome(service=service, options=chrome_options)
        logging.info("Chrome driver setup complete")
    except Exception as e:
        logging.error(f"Failed to set up Chrome driver: {str(e)}")
        return None
    
    return driver

In [5]:
def scrape_speech(url, driver):
    if not driver:
        logging.error("Driver not initialized.")
        return None, None, None

    logging.info(f"Scraping URL: {url}")
    driver.get(url)
    
    try:
        WebDriverWait(driver, 20).until(
            EC.any_of(
                      EC.presence_of_element_located((By.CLASS_NAME, 'transcript-inner')),
                      EC.presence_of_element_located((By.CLASS_NAME, 'view-transcript'))
                     )
        )
        logging.info(f"Page loaded successfully for {url}")
    except Exception as e:
        logging.error(f"Error waiting for transcript elements on {url}: {str(e)}")
        return None, None, None
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    try:
        title = soup.find('h2', class_='presidential-speeches--title').text.strip()
        logging.info(f"Title found: {title}")
    except AttributeError:
        logging.error(f"Title not found on {url}")
        title = "Unknown Title"
    
    try:
        president = soup.find('label', class_='presidential-speeches--label').text.strip()
        logging.info(f"President found: {president}")
    except AttributeError:
        logging.error(f"President not found on {url}")
        president = "Unknown President"
    
    transcript_div = soup.find('div', class_='transcript-inner') or soup.find('div', class_='view-transcript')
    
    if not transcript_div:
        logging.error(f"Transcript container not found on {url}")
        return title, president, ""
    
    # Different transcript structures
    if transcript_div.find_all('p'):
        paragraphs = transcript_div.find_all('p')
        full_transcript = ' '.join([p.text.strip() for p in paragraphs])
    elif transcript_div.find_all('span'):
        spans = transcript_div.find_all('span')
        full_transcript = ' '.join([span.text.strip() for span in spans])
    elif transcript_div.find('p') and '<br>' in transcript_div.decode_contents():
        full_transcript = transcript_div.decode_contents().replace('<br>', '\n').strip()
    else:
        full_transcript = transcript_div.get_text(separator=' ', strip=True)
    
    logging.info(f"Transcript scraped for {title}")
    return title, president, full_transcript

In [6]:
def scrape_all_speeches(base_url):
    driver = setup_driver()
    speeches = []
    
    if not driver:
        logging.error("Driver setup failed. Exiting scrape_all_speeches.")
        return speeches
    
    logging.info(f"Starting to scrape all speeches from base URL: {base_url}")
    driver.get(base_url)
    
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    
    links = driver.find_elements(By.XPATH, "//div[contains(@class, 'views-field-title')]//span[@class='field-content']/a")
    speech_links = [link.get_attribute('href') for link in links]
    logging.info(f"Found {len(speech_links)} speech links")

    for link in speech_links:
        for attempt in range(2):  # Retry mechanism
            try:
                title, president, transcript = scrape_speech(link, driver)
                if title and transcript:
                    speeches.append({
                        'title': title,
                        'president': president,
                        'transcript': transcript,
                        'url': link
                    })
                    logging.info(f"Scraped: {title} by {president}")
                    break
            except Exception as e:
                logging.error(f"Error scraping {link} on attempt {attempt + 1}: {str(e)}")
                time.sleep(2)  # Wait before retrying
    
    driver.quit()
    logging.info("Finished scraping all speeches")
    return speeches

In [7]:
def save_to_csv(speeches, filename):
    logging.info(f"Saving speeches to CSV file: {filename}")
    df = pd.DataFrame(speeches)
    df.to_csv(filename, index=False, encoding='utf-8')
    logging.info("Speeches saved to CSV file successfully")


In [9]:
base_url = 'https://millercenter.org/the-presidency/presidential-speeches'
speeches = scrape_all_speeches(base_url)
    

2024-08-09 16:47:10,682 - INFO - Setting up the Chrome driver
2024-08-09 16:47:11,523 - INFO - Chrome driver setup complete
2024-08-09 16:47:11,524 - INFO - Starting to scrape all speeches from base URL: https://millercenter.org/the-presidency/presidential-speeches
2024-08-09 16:48:07,608 - INFO - Found 1050 speech links
2024-08-09 16:48:07,609 - INFO - Scraping URL: https://millercenter.org/the-presidency/presidential-speeches/may-31-2024-remarks-middle-east
2024-08-09 16:48:09,890 - INFO - Page loaded successfully for https://millercenter.org/the-presidency/presidential-speeches/may-31-2024-remarks-middle-east
2024-08-09 16:48:09,958 - INFO - Title found: May 31, 2024: Remarks on the Middle East
2024-08-09 16:48:09,962 - INFO - President found: Joe Biden Presidency
2024-08-09 16:48:09,965 - INFO - Transcript scraped for May 31, 2024: Remarks on the Middle East
2024-08-09 16:48:09,966 - INFO - Scraped: May 31, 2024: Remarks on the Middle East by Joe Biden Presidency
2024-08-09 16:48:0

In [12]:
speeches = pd.DataFrame(data=speeches, columns=['title', 'president', 'transcript', 'url'])

## Save to CSV

In [13]:
filename='presidential_speeches.csv'
logging.info(f"Saving speeches to CSV file: {filename}")
speeches.to_csv(filename, index=False, encoding='utf-8')
logging.info("Speeches saved to CSV file successfully")

2024-08-09 17:40:28,224 - INFO - Saving speeches to CSV file: presidential_speeches.csv
2024-08-09 17:40:28,615 - INFO - Speeches saved to CSV file successfully


In [10]:
def scrape_single_question(question_url):
    full_url = f"https://www.whitehousehistory.org{question_url}"
    logging.info(f"Sending request to {full_url}")
    
    # Send a request to the question page
    question_response = requests.get(full_url)
    if question_response.status_code == 200:
        logging.info(f"Request to {full_url} successful.")
        question_soup = BeautifulSoup(question_response.text, 'html.parser')
        
        # Extract the question from the h1 tag
        question = question_soup.find('h1').get_text(strip=True)
        logging.info(f"Extracted question: {question}")
        
        # Extract the answer content (paragraphs and lists)
        answer_parts = []
        main_content = question_soup.find('main')
        if main_content:
            logging.info(f"Found main content for question: {question}")
            for tag in main_content.find_all(['p', 'ul']):
                if tag.name == 'p':
                    answer_text = tag.get_text(strip=True)
                    answer_parts.append(answer_text)
                    logging.info(f"Extracted paragraph: {answer_text}")
                elif tag.name == 'ul':
                    # If it's a list, concatenate the list items
                    list_items = [li.get_text(strip=True) for li in tag.find_all('li')]
                    answer_text = "; ".join(list_items)
                    answer_parts.append(answer_text)
                    logging.info(f"Extracted list: {answer_text}")
        
        answer = " ".join(answer_parts)
        logging.info(f"Compiled answer: {answer}")
        
        return {"Question": question, "Answer": answer}
    else:
        logging.error(f"Failed to retrieve {full_url}, status code: {question_response.status_code}")
        return None

In [11]:
def scrape_all_questions():
    # Set up the Selenium WebDriver
    driver = setup_driver()  # Ensure chromedriver is installed and in your PATH
    base_url = "https://www.whitehousehistory.org/questions"
    driver.get(base_url)
    time.sleep(3)  # Allow time for the page to load
    
    # Click the "Keep Looking?" button until all questions are loaded
    while True:
        try:
            load_more_button = driver.find_element(By.CLASS_NAME, "section__load-more")
            load_more_button.click()
            time.sleep(3)  # Allow time for new content to load
        except:
            logger.info("No more 'Keep Looking?' button found, all questions should be loaded.")
            break
    
    # Get all question links
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    question_links = soup.select(".related-content-list .entry-card__link")
    logger.info(f"Found {len(question_links)} question links.")
    
    # Scrape each question
    all_data = []
    for link in question_links:
        question_url = link['href']
        data = scrape_single_question(question_url)
        if data:
            all_data.append(data)
    
    # Close the driver
    driver.quit()
    logging.info(f"Finished Scraping {len(all_data)} questions!")
    return all_data

In [16]:
faqs = scrape_all_questions()

2024-10-10 14:28:17,128 - INFO - Setting up the Chrome driver
2024-10-10 14:28:22,684 - INFO - Chrome driver setup complete
2024-10-10 14:28:28,186 - INFO - No more 'Keep Looking?' button found, all questions should be loaded.
2024-10-10 14:28:28,228 - INFO - Found 12 question links.
2024-10-10 14:28:28,229 - INFO - Sending request to https://www.whitehousehistory.orghttps://www.whitehousehistory.org/questions/has-every-president-worked-in-the-oval-office


ConnectionError: HTTPSConnectionPool(host='www.whitehousehistory.orghttps', port=443): Max retries exceeded with url: /www.whitehousehistory.org/questions/has-every-president-worked-in-the-oval-office (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x1098f6790>: Failed to resolve 'www.whitehousehistory.orghttps' ([Errno 8] nodename nor servname provided, or not known)"))

In [None]:
filename='whitehouse_faq.csv'
logging.info(f"Saving speeches to CSV file: {filename}")
speeches.to_csv(filename, index=False, encoding='utf-8')
logging.info("Q&A saved to CSV file successfully")