In [55]:
import csv
import logging
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [56]:
# Root logging setup: records at DEBUG and above are sent both to
# 'scraping.log' (opened with mode='w', so it is truncated when this runs)
# and to a console stream handler.
_log_format = '%(asctime)s - %(levelname)s - %(message)s'
_file_handler = logging.FileHandler('scraping.log', mode='w', encoding='utf-8')
_console_handler = logging.StreamHandler()
logging.basicConfig(
    level=logging.DEBUG,
    format=_log_format,
    handlers=[_file_handler, _console_handler],
)

# Named logger used by the scraping functions in this notebook.
logger = logging.getLogger('rag_project')

In [57]:
def setup_driver(
    driver_path='/Users/ethanvertal/Documents/chromedriver-mac-arm64/chromedriver',
    headless=True,
):
    """Create a Chrome WebDriver for scraping.

    Args:
        driver_path: Filesystem path of the chromedriver executable. The
            default is a machine-specific location; pass your own path when
            running on another machine.
        headless: When true (the default), Chrome is started with
            ``--headless`` so no browser window is shown.

    Returns:
        The ``webdriver.Chrome`` instance. This function does not close it;
        the caller is responsible for calling ``quit()``.
    """
    logger.debug("Setting up the Chrome driver")
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")  # Run in background
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    logger.debug("Chrome driver setup complete")
    return driver

def scrape_speech(url, driver):
    """Load one speech page and extract its title and transcript text.

    Args:
        url: Address of the speech page to load.
        driver: Selenium WebDriver used to fetch and render the page.

    Returns:
        A ``(title, transcript)`` tuple:

        * ``(None, None)`` if waiting for a transcript container failed
          (for example, nothing appeared within 10 seconds);
        * ``(title, "")`` if the page loaded but no transcript ``div`` was
          found in the parsed HTML;
        * otherwise the title (``"Unknown Title"`` when the heading is
          missing) and the transcript text.
    """
    logger.debug(f"Scraping URL: {url}")
    driver.get(url)

    try:
        # One CSS selector matches either container class. Writing
        # `(locator_a) or (locator_b)` would always evaluate to the first
        # tuple, so pages that only have "view-transcript" would time out.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".transcript-inner, .view-transcript")
            )
        )
        logger.debug(f"Page loaded successfully for {url}")
    except Exception as e:
        logger.error(f"Error waiting for transcript elements on {url}: {str(e)}")
        return None, None

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    title_tag = soup.find('h2', class_='presidential-speeches--title')
    if title_tag is not None:
        title = title_tag.text.strip()
        logger.debug(f"Title found: {title}")
    else:
        logger.error(f"Title not found on {url}")
        title = "Unknown Title"

    transcript_div = (
        soup.find('div', class_='transcript-inner')
        or soup.find('div', class_='view-transcript')
    )

    if not transcript_div:
        logger.error(f"Transcript container not found on {url}")
        return title, ""

    # Prefer <p> elements, then <span> elements, then the container's own text.
    paragraphs = transcript_div.find_all('p')
    if paragraphs:
        full_transcript = ' '.join(p.text.strip() for p in paragraphs)
    else:
        spans = transcript_div.find_all('span')
        if spans:
            full_transcript = ' '.join(span.text.strip() for span in spans)
        else:
            # BeautifulSoup serialises line breaks as "<br/>", so a textual
            # replace of "<br>" never matches. Swap the <br> nodes for
            # newlines in the tree and take the remaining text instead.
            for br in transcript_div.find_all('br'):
                br.replace_with('\n')
            full_transcript = transcript_div.get_text().strip()

    logger.debug(f"Transcript scraped for {title}")
    return title, full_transcript

def scrape_all_speeches(base_url):
    """Scrape every speech linked from the listing page at ``base_url``.

    Opens a Chrome driver, scrolls the listing page until its height stops
    growing, collects the speech links, then scrapes each link in turn.

    Args:
        base_url: Address of the listing page that links to the speeches.

    Returns:
        A list of dicts with keys ``'title'``, ``'transcript'`` and ``'url'``,
        one per speech for which both a title and a non-empty transcript were
        obtained. A failure on an individual speech is logged and skipped.
    """
    driver = setup_driver()
    speeches = []

    # try/finally guarantees the browser is shut down even when loading or
    # scrolling the listing page raises.
    try:
        logger.debug(f"Starting to scrape all speeches from base URL: {base_url}")
        driver.get(base_url)

        # Keep scrolling to the bottom until the page height stops changing,
        # pausing 2 seconds after each scroll before re-measuring.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        links = driver.find_elements(By.XPATH, "//div[@class='views-row']/a")
        # Collect the hrefs up front, before the driver navigates away from
        # the listing page; anchors without an href are skipped.
        hrefs = (link.get_attribute('href') for link in links)
        speech_links = [href for href in hrefs if href]
        logger.debug(f"Found {len(speech_links)} speech links")

        for link in speech_links:
            try:
                title, transcript = scrape_speech(link, driver)
                if title and transcript:
                    speeches.append({
                        'title': title,
                        'transcript': transcript,
                        'url': link
                    })
                    logger.info(f"Scraped: {title}")
            except Exception as e:
                # Best effort: one failing page must not abort the whole run.
                logger.error(f"Error scraping {link}: {str(e)}")
    finally:
        driver.quit()

    logger.debug("Finished scraping all speeches")
    return speeches

def save_to_csv(speeches, filename):
    """Write the scraped speeches to ``filename`` as a UTF-8 CSV file.

    The file is opened in write mode (created or overwritten). It starts with
    a header row of ``title``, ``transcript`` and ``url``, followed by one row
    per dict in ``speeches``.
    """
    logger.debug(f"Saving speeches to CSV file: {filename}")
    columns = ['title', 'transcript', 'url']
    with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(speeches)
    logger.debug("Speeches saved to CSV file successfully")


In [58]:
# Listing page whose links scrape_all_speeches collects and visits.
base_url = 'https://millercenter.org/the-presidency/presidential-speeches'
# Starts a browser session and loads every linked speech page in turn, so
# this cell can take a while to finish.
speeches = scrape_all_speeches(base_url)
    

In [59]:
# Tabulate the scraped speeches. The frame is bound to a name so that later
# cells can reuse it instead of rebuilding it from the list of dicts.
speeches_df = pd.DataFrame(data=speeches, columns=['title', 'transcript', 'url'])
speeches_df

Unnamed: 0,title,transcript,url
