In [26]:
import logging
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

In [27]:
# The file handler below does not create missing parent directories, so make
# sure 'logs/' exists before the handler tries to open its file.
os.makedirs('logs', exist_ok=True)

# Send every record both to a log file (truncated on each run, mode='w') and
# to the console. Note: basicConfig only takes effect if the root logger has
# no handlers yet.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/scraping.log', mode='w', encoding='utf-8'),
        logging.StreamHandler()  # This will output logs to the console
    ]
)

# Keep the third-party libraries quiet: only warnings and above get through.
logging.getLogger('selenium').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('httpx').setLevel(logging.WARNING)

# Named project logger; it has no handlers of its own in this cell.
logger = logging.getLogger('rag_project')

In [None]:
def setup_driver(driver_path='/usr/local/bin/chromedriver'):
    """Start a headless Chrome WebDriver.

    Args:
        driver_path: Path of the chromedriver executable. Defaults to the
            location this notebook was originally written for.

    Returns:
        A ``webdriver.Chrome`` instance, or ``None`` when ``driver_path`` is
        not an existing file or Chrome could not be started. Both failures
        are logged.
    """
    logging.info("Setting up the Chrome driver")

    # isfile (rather than exists) so that a directory at this path is rejected
    # here instead of failing later inside Selenium.
    if not os.path.isfile(driver_path):
        logging.error(f"ChromeDriver not found at {driver_path}")
        return None

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in background
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    service = Service(driver_path)

    try:
        driver = webdriver.Chrome(service=service, options=chrome_options)
    except Exception as e:
        logging.error(f"Failed to set up Chrome driver: {str(e)}")
        return None

    logging.info("Chrome driver setup complete")
    return driver

In [None]:
def _extract_transcript_text(transcript_div):
    """Return the text of a transcript container element as a single string."""
    # Replace explicit line breaks with newline text first; otherwise the words
    # on either side of a <br> are concatenated once the tag is dropped.
    for br in transcript_div.find_all('br'):
        br.replace_with('\n')

    # Different transcript structures: paragraphs, then spans, then raw text.
    paragraphs = transcript_div.find_all('p')
    if paragraphs:
        return ' '.join(p.text.strip() for p in paragraphs)

    spans = transcript_div.find_all('span')
    if spans:
        return ' '.join(span.text.strip() for span in spans)

    return transcript_div.get_text(separator=' ', strip=True)


def scrape_speech(url, driver):
    """Load one speech page and extract its title, president and transcript.

    Args:
        url: Address of the speech page.
        driver: Selenium WebDriver used to load the page.

    Returns:
        A ``(title, president, transcript)`` tuple.
        ``(None, None, None)`` if ``driver`` is falsy or no transcript element
        appeared within 20 seconds. If the heading or label is missing, the
        placeholders ``"Unknown Title"`` / ``"Unknown President"`` are used.
        If the transcript container is absent from the parsed page, the
        transcript is ``""``.

    Exceptions raised by ``driver.get`` are not caught here.
    """
    if not driver:
        logging.error("Driver not initialized.")
        return None, None, None

    logging.info(f"Scraping URL: {url}")
    driver.get(url)

    # Wait until either of the two known transcript containers is present.
    try:
        WebDriverWait(driver, 20).until(
            EC.any_of(
                EC.presence_of_element_located((By.CLASS_NAME, 'transcript-inner')),
                EC.presence_of_element_located((By.CLASS_NAME, 'view-transcript'))
            )
        )
        logging.info(f"Page loaded successfully for {url}")
    except Exception as e:
        logging.error(f"Error waiting for transcript elements on {url}: {str(e)}")
        return None, None, None

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    title_tag = soup.find('h2', class_='presidential-speeches--title')
    if title_tag is not None:
        title = title_tag.text.strip()
        logging.info(f"Title found: {title}")
    else:
        logging.error(f"Title not found on {url}")
        title = "Unknown Title"

    president_tag = soup.find('label', class_='presidential-speeches--label')
    if president_tag is not None:
        president = president_tag.text.strip()
        logging.info(f"President found: {president}")
    else:
        logging.error(f"President not found on {url}")
        president = "Unknown President"

    transcript_div = (
        soup.find('div', class_='transcript-inner')
        or soup.find('div', class_='view-transcript')
    )

    if not transcript_div:
        logging.error(f"Transcript container not found on {url}")
        return title, president, ""

    full_transcript = _extract_transcript_text(transcript_div)

    logging.info(f"Transcript scraped for {title}")
    return title, president, full_transcript

In [None]:
def scrape_all_speeches(base_url, max_attempts=2, pause_seconds=2):
    """Scrape every speech linked from the index page at ``base_url``.

    The index page is scrolled to the bottom repeatedly until its height stops
    growing, then each collected link is passed to ``scrape_speech``.

    Args:
        base_url: Address of the speech index page.
        max_attempts: How many times each speech page is tried before it is
            skipped.
        pause_seconds: Seconds to wait after each scroll and between retries.

    Returns:
        A list of dicts with the keys ``'title'``, ``'president'``,
        ``'transcript'`` and ``'url'``. Empty if the driver could not be set
        up. Speeches without a title or transcript are left out.
    """
    driver = setup_driver()
    speeches = []

    if not driver:
        logging.error("Driver setup failed. Exiting scrape_all_speeches.")
        return speeches

    # try/finally so the browser process is shut down even if loading or
    # scrolling the index page raises.
    try:
        logging.info(f"Starting to scrape all speeches from base URL: {base_url}")
        driver.get(base_url)

        # Keep scrolling until the page height no longer changes, i.e. no more
        # entries are being appended to the list.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(pause_seconds)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        links = driver.find_elements(By.XPATH, "//div[contains(@class, 'views-field-title')]//span[@class='field-content']/a")
        # Resolve hrefs up front and drop anchors that have none.
        hrefs = (link.get_attribute('href') for link in links)
        speech_links = [href for href in hrefs if href]
        logging.info(f"Found {len(speech_links)} speech links")

        for link in speech_links:
            for attempt in range(max_attempts):  # Retry mechanism
                try:
                    title, president, transcript = scrape_speech(link, driver)
                except Exception as e:
                    logging.error(f"Error scraping {link} on attempt {attempt + 1}: {str(e)}")
                else:
                    if title and transcript:
                        speeches.append({
                            'title': title,
                            'president': president,
                            'transcript': transcript,
                            'url': link
                        })
                        logging.info(f"Scraped: {title} by {president}")
                        break
                # Wait before retrying, but not after the final attempt.
                if attempt + 1 < max_attempts:
                    time.sleep(pause_seconds)
            else:
                # Every attempt failed or returned no usable content.
                logging.warning(f"Skipping {link} after {max_attempts} failed attempts")
    finally:
        driver.quit()

    logging.info("Finished scraping all speeches")
    return speeches

In [32]:
def save_to_csv(speeches, filename):
    """Persist scraped speeches to a CSV file.

    Args:
        speeches: Records to store, in any form ``pd.DataFrame`` accepts
            (for example a list of dicts).
        filename: Destination path of the CSV file.
    """
    logging.info(f"Saving speeches to CSV file: {filename}")
    # One row per record, UTF-8 encoded; the positional index is not written.
    pd.DataFrame(speeches).to_csv(filename, encoding='utf-8', index=False)
    logging.info("Speeches saved to CSV file successfully")


In [33]:
# Index page listing the speeches; scrape_all_speeches scrolls it to collect
# the individual speech links and then loads each linked page in turn.
base_url = 'https://millercenter.org/the-presidency/presidential-speeches'
# Result: a list of dicts with the keys 'title', 'president', 'transcript', 'url'.
speeches = scrape_all_speeches(base_url)
    

2024-07-13 11:17:43,188 - INFO - Setting up the Chrome driver
2024-07-13 11:17:43,373 - INFO - Chrome driver setup complete
2024-07-13 11:17:43,374 - INFO - Starting to scrape all speeches from base URL: https://millercenter.org/the-presidency/presidential-speeches
2024-07-13 11:18:22,757 - INFO - Found 1050 speech links
2024-07-13 11:18:22,758 - INFO - Scraping URL: https://millercenter.org/the-presidency/presidential-speeches/may-31-2024-remarks-middle-east
2024-07-13 11:18:23,536 - INFO - Page loaded successfully for https://millercenter.org/the-presidency/presidential-speeches/may-31-2024-remarks-middle-east
2024-07-13 11:18:23,567 - INFO - Title found: May 31, 2024: Remarks on the Middle East
2024-07-13 11:18:23,569 - INFO - President found: Joe Biden Presidency
2024-07-13 11:18:23,571 - INFO - Transcript scraped for May 31, 2024: Remarks on the Middle East
2024-07-13 11:18:23,572 - INFO - Scraped: May 31, 2024: Remarks on the Middle East by Joe Biden Presidency
2024-07-13 11:18:2

In [37]:
# Display the scraped records as a table with a fixed column order.
# The frame is only shown as the cell output; it is not assigned to a name.
pd.DataFrame(data=speeches, columns=['title', 'president', 'transcript', 'url'])

Unnamed: 0,title,president,transcript,url
0,"May 31, 2024: Remarks on the Middle East",Joe Biden Presidency,"THE PRESIDENT: Hello, folks. (The President ch...",https://millercenter.org/the-presidency/presid...
1,"March 7, 2024: State of Union Address",Joe Biden Presidency,"Good evening. Good evening. If I were smart, I...",https://millercenter.org/the-presidency/presid...
2,"January 5, 2024: Speech on the Third Anniversa...",Joe Biden Presidency,"THE PRESIDENT: Thank you, thank you, thank you...",https://millercenter.org/the-presidency/presid...
3,"October 20, 2023: Remarks on the US Response i...",Joe Biden Presidency,"Good evening, my fellow Americans. We’re facin...",https://millercenter.org/the-presidency/presid...
4,"February 21, 2023: Remarks on the One-Year Ann...",Joe Biden Presidency,"THE PRESIDENT: Hello, Poland! One of our grea...",https://millercenter.org/the-presidency/presid...
...,...,...,...,...
1045,"December 29, 1790: Talk to the Chiefs and Coun...",George Washington Presidency,"I the President of the United States, by my ow...",https://millercenter.org/the-presidency/presid...
1046,"December 8, 1790: Second Annual Message to Con...",George Washington Presidency,Fellow citizens of the Senate and House of Rep...,https://millercenter.org/the-presidency/presid...
1047,"January 8, 1790: First Annual Message to Congress",George Washington Presidency,Fellow Citizens of the Senate and House of Rep...,https://millercenter.org/the-presidency/presid...
1048,"October 3, 1789: Thanksgiving Proclamation",George Washington Presidency,Whereas it is the duty of all Nations to ackno...,https://millercenter.org/the-presidency/presid...


## Save to CSV

In [38]:
# Write the scraped speeches to disk via save_to_csv, which builds a DataFrame
# from `speeches` itself. Calling `.to_csv` directly on `speeches` only works
# if it already is a DataFrame, but scrape_all_speeches returns a plain list.
filename = 'presidential_speeches.csv'
save_to_csv(speeches, filename)

2024-07-13 11:55:52,423 - INFO - Saving speeches to CSV file: presidential_speeches.csv
2024-07-13 11:55:52,894 - INFO - Speeches saved to CSV file successfully
