In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import time


In [7]:
def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in background
    service = Service('/Users/ethanvertal/Documents/chromedriver-mac-arm64/chromedriver')  # Update this path
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver


def scrape_speech(url):
    driver.get(url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "transcript-inner"))
    )
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    title = soup.find('h2', class_='presidential-speeches--title').text.strip()
    
    transcript_div = soup.find('div', class_='transcript-inner')
    paragraphs = transcript_div.find_all('p')
    full_transcript = ' '.join([p.text.strip() for p in paragraphs])
    
    return title, full_transcript


def scrape_all_speeches(base_url):
    driver = setup_driver()
    speeches = []
    
    driver.get(base_url)
   
    
    # Scroll to load all speeches
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Wait for page to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    
    # Now parse the fully loaded page
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    speech_links = soup.find_all('span', class_='field-content')
    
    for link in speech_links:
        a_tag = link.find('a')
        if a_tag and 'href' in a_tag.attrs:
            speech_url = a_tag['href']
            full_url = base_url + speech_url if speech_url.startswith('/') else speech_url
            
            try:
                title, transcript = scrape_speech(full_url, driver)
                speeches.append({
                    'title': title,
                    'transcript': transcript,
                    'url': full_url
                })
                print(f"Scraped: {title}")
            except Exception as e:
                print(f"Error scraping {full_url}: {str(e)}")
    
    driver.quit()
    return speeches


def save_to_csv(speeches, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['title', 'transcript', 'url'])
        writer.writeheader()
        for speech in speeches:
            writer.writerow(speech)

In [8]:
base_url = 'https://millercenter.org/the-presidency/presidential-speeches'
speeches, speech_links = scrape_all_speeches(base_url)


Error scraping https://millercenter.org/the-presidency/presidential-speeches/may-31-2024-remarks-middle-east: scrape_speech() takes 1 positional argument but 2 were given
Error scraping https://millercenter.org/the-presidency/presidential-speeches/march-7-2024-state-union-address: scrape_speech() takes 1 positional argument but 2 were given
Error scraping https://millercenter.org/the-presidency/presidential-speeches/january-5-2024-speech-third-anniversary-january-6th-attack: scrape_speech() takes 1 positional argument but 2 were given
Error scraping https://millercenter.org/the-presidency/presidential-speeches/october-20-2023-remarks-us-response-support-israel-and-ukraine: scrape_speech() takes 1 positional argument but 2 were given
Error scraping https://millercenter.org/the-presidency/presidential-speeches/february-21-2023-remarks-one-year-anniversary-ukraine-war: scrape_speech() takes 1 positional argument but 2 were given
Error scraping https://millercenter.org/the-presidency/presi

ValueError: not enough values to unpack (expected 2, got 0)

In [None]:
speech_links

In [None]:
pd.DataFrame(data=speeches, columns=['title', 'transcript', 'url'])