In [5]:
import csv
import logging
import os
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [9]:
# Create the log directory up front: logging.FileHandler opens its file as soon
# as it is constructed and raises FileNotFoundError when 'logs/' is missing,
# which would break this cell on a fresh checkout.
os.makedirs('logs', exist_ok=True)

# Root logger configuration: records at INFO and above are written both to a
# log file that is truncated on open (mode='w') and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/scraping.log', mode='w', encoding='utf-8'),
        logging.StreamHandler()  # This will output logs to the console
    ]
)

# Raise the threshold of the third-party loggers so only warnings and errors
# from them reach the handlers above.
logging.getLogger('selenium').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('httpx').setLevel(logging.WARNING)

# Logger used by the scraping helpers defined in this notebook.
logger = logging.getLogger('rag_project')

In [10]:
def setup_driver(chromedriver_path='/Users/ethanvertal/Documents/chromedriver-mac-arm64/chromedriver',
                 headless=True):
    """Create and return a Chrome WebDriver.

    Args:
        chromedriver_path: Filesystem path to the chromedriver executable.
            The default is the location used on the original author's
            machine; pass your own path when running elsewhere.
        headless: When true (the default) Chrome is started with the
            ``--headless`` argument so no browser window is shown.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    logger.debug("Setting up the Chrome driver")
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")  # Run in background
    service = Service(chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    logger.debug("Chrome driver setup complete")
    return driver

def scrape_speech(url, driver):
    """Load one speech page and extract its title and transcript text.

    Args:
        url: Address of the speech page to open.
        driver: Selenium WebDriver used to load and render the page.

    Returns:
        A ``(title, transcript)`` tuple:

        * ``(None, None)`` if waiting (up to 20 seconds) for an element with
          class ``transcript-inner`` or ``view-transcript`` fails;
        * ``(title, "")`` if the parsed page contains no such ``div``;
        * otherwise the title (``"Unknown Title"`` when the heading is
          missing) and the transcript joined into a single string.

    Exceptions raised by ``driver.get`` are not caught here and propagate to
    the caller.
    """
    logger.debug(f"Scraping URL: {url}")
    driver.get(url)

    # The transcript is only usable once one of the two known container
    # classes is present in the DOM.
    try:
        WebDriverWait(driver, 20).until(
            EC.any_of(
                EC.presence_of_element_located((By.CLASS_NAME, 'transcript-inner')),
                EC.presence_of_element_located((By.CLASS_NAME, 'view-transcript'))
            )
        )
        logger.debug(f"Page loaded successfully for {url}")
    except Exception as e:
        logger.error(f"Error waiting for transcript elements on {url}: {str(e)}")
        return None, None

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    title_tag = soup.find('h2', class_='presidential-speeches--title')
    if title_tag is None:
        logger.error(f"Title not found on {url}")
        title = "Unknown Title"
    else:
        title = title_tag.text.strip()
        logger.debug(f"Title found: {title}")

    transcript_div = (
        soup.find('div', class_='transcript-inner')
        or soup.find('div', class_='view-transcript')
    )
    if not transcript_div:
        logger.error(f"Transcript container not found on {url}")
        return title, ""

    # Different transcript structures: prefer <p> elements, then <span>
    # elements, and fall back to the container's whole text. (A former
    # "<p> with <br>" branch was removed: it could only run when no <p>
    # existed, while itself requiring a <p>, so it was unreachable.)
    paragraphs = transcript_div.find_all('p')
    if paragraphs:
        full_transcript = ' '.join(p.text.strip() for p in paragraphs)
    else:
        spans = transcript_div.find_all('span')
        if spans:
            full_transcript = ' '.join(span.text.strip() for span in spans)
        else:
            full_transcript = transcript_div.get_text(separator=' ', strip=True)

    logger.debug(f"Transcript scraped for {title}")
    return title, full_transcript

def scrape_all_speeches(base_url):
    """Scrape every speech linked from the listing page at ``base_url``.

    The listing page is scrolled to the bottom repeatedly until its height
    stops growing, the speech links are collected, and each link is passed to
    ``scrape_speech`` with up to two attempts.

    Args:
        base_url: Address of the page that lists the speeches.

    Returns:
        A list of dicts with the keys ``'title'``, ``'transcript'`` and
        ``'url'``, one per speech for which both a title and a non-empty
        transcript were obtained.

    The Chrome driver is always shut down, including when loading the listing
    page, scrolling or link collection raises; such exceptions still propagate.
    """
    driver = setup_driver()
    speeches = []

    try:
        logger.debug(f"Starting to scrape all speeches from base URL: {base_url}")
        driver.get(base_url)

        # Keep scrolling to the bottom until the document height no longer
        # changes, pausing 2 s after each scroll before re-measuring.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        links = driver.find_elements(By.XPATH, "//div[contains(@class, 'views-field-title')]//span[@class='field-content']/a")
        # Read the hrefs now: the same driver navigates away from this page
        # below, so the element handles are not used after this point.
        speech_links = [link.get_attribute('href') for link in links]
        logger.debug(f"Found {len(speech_links)} speech links")

        for link in speech_links:
            for attempt in range(2):  # Retry mechanism
                try:
                    title, transcript = scrape_speech(link, driver)
                    if title and transcript:
                        speeches.append({
                            'title': title,
                            'transcript': transcript,
                            'url': link
                        })
                        logger.info(f"Scraped: {title}")
                        break
                except Exception as e:
                    logger.error(f"Error scraping {link} on attempt {attempt + 1}: {str(e)}")
                    time.sleep(2)  # Wait before retrying
            else:
                # Reached only when no attempt hit `break`, i.e. the speech
                # was never stored; record it instead of dropping it silently.
                logger.warning(f"Giving up on {link}: no transcript scraped after 2 attempts")
    finally:
        # Release the browser process even if something above raised.
        driver.quit()

    logger.debug("Finished scraping all speeches")
    return speeches

def save_to_csv(speeches, filename):
    """Write the scraped speeches to a UTF-8 CSV file.

    Args:
        speeches: Iterable of dicts with the keys ``'title'``,
            ``'transcript'`` and ``'url'``.
        filename: Path of the CSV file to create; an existing file is
            overwritten.

    The file starts with a header row followed by one row per speech.
    """
    logger.debug(f"Saving speeches to CSV file: {filename}")
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['title', 'transcript', 'url'])
        writer.writeheader()
        writer.writerows(speeches)
    # Logged once; the original emitted this message twice.
    logger.debug("Speeches saved to CSV file successfully")


In [11]:
# Run the full scrape: visit the listing page at base_url, then every speech
# page linked from it. `speeches` becomes a list of dicts with the keys
# 'title', 'transcript' and 'url', which the cells below display and save.
base_url = 'https://millercenter.org/the-presidency/presidential-speeches'
speeches = scrape_all_speeches(base_url)
    

2024-07-10 14:33:45,378 - DEBUG - Setting up the Chrome driver
2024-07-10 14:33:46,715 - DEBUG - Chrome driver setup complete
2024-07-10 14:33:46,718 - DEBUG - Starting to scrape all speeches from base URL: https://millercenter.org/the-presidency/presidential-speeches
2024-07-10 14:34:28,802 - DEBUG - Found 1050 speech links
2024-07-10 14:34:28,809 - DEBUG - Scraping URL: https://millercenter.org/the-presidency/presidential-speeches/may-31-2024-remarks-middle-east
2024-07-10 14:34:29,313 - DEBUG - Page loaded successfully for https://millercenter.org/the-presidency/presidential-speeches/may-31-2024-remarks-middle-east
2024-07-10 14:34:29,332 - DEBUG - Title found: May 31, 2024: Remarks on the Middle East
2024-07-10 14:34:29,334 - DEBUG - Transcript scraped for May 31, 2024: Remarks on the Middle East
2024-07-10 14:34:29,336 - INFO - Scraped: May 31, 2024: Remarks on the Middle East
2024-07-10 14:34:29,337 - DEBUG - Scraping URL: https://millercenter.org/the-presidency/presidential-spee

In [13]:
import pandas as pd

In [14]:
# Show the scraped records as a table, one column per dict key.
pd.DataFrame(data=speeches, columns=['title', 'transcript', 'url'])

Unnamed: 0,title,transcript,url
0,"May 31, 2024: Remarks on the Middle East","THE PRESIDENT: Hello, folks. (The President ch...",https://millercenter.org/the-presidency/presid...
1,"March 7, 2024: State of Union Address","Good evening. Good evening. If I were smart, I...",https://millercenter.org/the-presidency/presid...
2,"January 5, 2024: Speech on the Third Anniversa...","THE PRESIDENT: Thank you, thank you, thank you...",https://millercenter.org/the-presidency/presid...
3,"October 20, 2023: Remarks on the US Response i...","Good evening, my fellow Americans. We’re facin...",https://millercenter.org/the-presidency/presid...
4,"February 21, 2023: Remarks on the One-Year Ann...","THE PRESIDENT: Hello, Poland! One of our grea...",https://millercenter.org/the-presidency/presid...
...,...,...,...
1045,"December 29, 1790: Talk to the Chiefs and Coun...","I the President of the United States, by my ow...",https://millercenter.org/the-presidency/presid...
1046,"December 8, 1790: Second Annual Message to Con...",Fellow citizens of the Senate and House of Rep...,https://millercenter.org/the-presidency/presid...
1047,"January 8, 1790: First Annual Message to Congress",Fellow Citizens of the Senate and House of Rep...,https://millercenter.org/the-presidency/presid...
1048,"October 3, 1789: Thanksgiving Proclamation",Whereas it is the duty of all Nations to ackno...,https://millercenter.org/the-presidency/presid...


In [15]:
# Write the scraped speeches to presidential_speeches.csv (a relative path);
# the file is opened in 'w' mode, so an existing file is overwritten.
save_to_csv(speeches, 'presidential_speeches.csv')

2024-07-10 15:21:52,784 - DEBUG - Saving speeches to CSV file: presidential_speeches.csv
2024-07-10 15:21:53,534 - DEBUG - Speeches saved to CSV file successfully
2024-07-10 15:21:53,549 - DEBUG - Speeches saved to CSV file successfully
