In [47]:

import pandas as pd
import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support import 
from selenium.webdriver.support.ui import WebDriverWait

def get_current_datetime_as_intstring():
    """Return the current local date and time as a digits-only string.

    The layout is MMDDYYYYHHMMSS (14 characters), which makes the value
    convenient as a filename suffix.
    """
    now = datetime.datetime.now()
    return f"{now:%m%d%Y%H%M%S}"

In [None]:
# CSS selectors shared by the helpers below.
_NEXT_PAGE_CSS = ".pager__item--next a"
_ACTIVE_PAGE_CSS = "li.pager__item.is-active a"


def _select_year(driver, wait, year, attempts=2):
    """Choose `year` in the year drop-down and wait for the filtered listing.

    The selection is repeated (up to `attempts` times in total) when the
    "next page" link does not pick up the `year=` query parameter in time.
    The error from the final attempt is propagated to the caller.
    """
    for attempt in range(attempts):
        select_year = Select(driver.find_element(by=By.ID, value="edit-year"))
        print(f"Selected year: {select_year.all_selected_options[0].text}")
        select_year.select_by_value(f"{year}")
        try:
            wait.until(
                EC.text_to_be_present_in_element_attribute(
                    (By.CSS_SELECTOR, _NEXT_PAGE_CSS), "href", f"year={year}"
                )
            )
            return
        except Exception:
            # Only the last attempt is allowed to fail loudly.
            if attempt == attempts - 1:
                raise


def _advance_to_page(driver, wait, page_num, attempts=2):
    """Click the "next" pager link and wait until `page_num` is the active page.

    Clicking "next" often does not take on the first try, so the click is
    repeated once if the active page indicator does not change in time.
    Raises NoSuchElementException when there is no "next" link.
    """
    for attempt in range(attempts):
        driver.find_element(by=By.CSS_SELECTOR, value=_NEXT_PAGE_CSS).click()
        try:
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, _ACTIVE_PAGE_CSS), f"{page_num}"
                )
            )
            return
        except Exception:
            if attempt == attempts - 1:
                raise


def get_article_links_with_selenium(start_url="https://www.wsscwater.com/newsroom", years=range(2022, 2027)):
    """Collect the date, title and URL of every alert listed in the newsroom.

    Opens `start_url` in Chrome, ticks the alerts news-type filter, then for
    each year in `years` selects that year and walks the pager until no
    "next" link is found.

    Parameters:
        start_url: address of the newsroom listing page.
        years: iterable of years to select in the year drop-down.

    Returns:
        pandas.DataFrame with the columns "Date", "Title" and "URL", one row
        per listed article.

    Raises:
        Whatever Selenium raises when an expected element is missing or a
        wait times out twice in a row. The browser is closed in every case.
    """
    # Rows are gathered in a plain list and turned into a DataFrame once at the
    # end; appending to a DataFrame row by row copies data on every insert.
    records = []

    print(start_url)
    driver = webdriver.Chrome()
    try:
        driver.delete_all_cookies()
        driver.get(start_url)
        driver.implicitly_wait(0.5)

        wait = WebDriverWait(driver, timeout=2, poll_frequency=0.2)

        # Click on the News Type checkbox for alerts
        alert_filter = driver.find_element(by=By.XPATH, value="//label[@for='edit-type-253']")
        alert_filter.click()

        # Wait to make sure the page reloaded with just the alerts
        wait.until(
            EC.text_to_be_present_in_element_attribute(
                (By.CSS_SELECTOR, _NEXT_PAGE_CSS), "href", "type%5B253%5D=253"
            )
        )

        for year in years:
            print(f"Current year: {year}")
            _select_year(driver, wait, year)

            current_page_num = 0
            while True:
                current_page_num += 1

                for alert in driver.find_elements(by=By.CSS_SELECTOR, value=".view-content article"):
                    link = alert.find_element(by=By.CSS_SELECTOR, value="h3 a")
                    date = alert.find_element(by=By.CSS_SELECTOR, value="time")
                    records.append({
                        "Date": date.text,
                        "Title": link.text,
                        "URL": link.get_attribute("href"),
                    })

                try:
                    _advance_to_page(driver, wait, current_page_num + 1)
                except NoSuchElementException:
                    # No "next page" link: this was the last page for the year.
                    print("Break!")
                    break
    finally:
        # Always release the browser process, even when scraping fails midway.
        driver.quit()

    return pd.DataFrame(records, columns=["Date", "Title", "URL"])

article_links_df = get_article_links_with_selenium()

# Write without the index so the file round-trips through pd.read_csv without
# gaining an extra "Unnamed: 0" column (and matches the timestamped copy below).
article_links_df.to_csv("output/article_links.csv", index=False)

# We'll also save a copy of the dataframe with a timestamp in the name, so we have a version
# of it saved that won't get overwritten if we run the scraper again later
timestamp = get_current_datetime_as_intstring()
article_links_df.to_csv(f"output/article_links_{timestamp}.csv", index=False)
article_links_df.head()


https://www.wsscwater.com/newsroom
Current year: 2022
Selected year: - Current Year (2026) -
Break!
Current year: 2023
Selected year: 2022
Selected year: 2022
Break!
Current year: 2024
Selected year: 2023
Selected year: 2023
Break!
Current year: 2025
Selected year: 2024
Selected year: 2024
Break!
Current year: 2026
Selected year: 2025
Selected year: 2025
Break!


Unnamed: 0,Date,Title,URL
0,"December 30, 2022",Emergency Water Main Repair - Takoma Park,https://www.wsscwater.com/news/2022/december/e...
1,"December 29, 2022",Emergency Water Main Repair - Capitol Heights,https://www.wsscwater.com/news/2022/december/e...
2,"December 29, 2022",Emergency Water Main Repair - Oxon Hill,https://www.wsscwater.com/news/2022/december/e...
3,"December 28, 2022",Emergency Water Main Repair - Silver Spring,https://www.wsscwater.com/news/2022/december/e...
4,"December 28, 2022",Emergency Water Main Repair - Greenbelt,https://www.wsscwater.com/news/2022/december/e...
...,...,...,...
439,"January 23, 2026",System Maintenance Alert - January 24,https://www.wsscwater.com/news/2026/january/sy...
440,"January 23, 2026",Scheduled Water Main Repair - Temple Hills,https://www.wsscwater.com/news/2026/january/sc...
441,"January 21, 2026",Emergency Water Main Repair - Landover,https://www.wsscwater.com/news/2026/january/em...
442,"January 9, 2026",Emergency Water Main Repair - Greenbelt,https://www.wsscwater.com/news/2026/january/em...


In [None]:
from bs4 import BeautifulSoup
import requests, re, time, random


In [8]:
# This function is borrowed from here, with some light tweaking: https://www.datacamp.com/blog/ethical-web-scraping
def fetch_with_retry(url, headers, max_retries=3, timeout=30):
    """GET `url`, retrying with exponential backoff when the request fails.

    Parameters:
        url: address to fetch.
        headers: dict of HTTP request headers to send.
        max_retries: total number of attempts before giving up.
        timeout: seconds to wait for the server before an attempt counts as
            failed; without it a stalled connection would block forever.

    Returns:
        The `requests` response on success, or None when every attempt
        failed (a message is printed in that case).
    """
    for attempt in range(max_retries):
        try:
            # `headers` must be passed by keyword: the second positional
            # argument of requests.get is `params` (the query string).
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Raise exception for HTTP errors
            return response
        except requests.RequestException:
            if attempt == max_retries - 1:
                # Last attempt failed, log and give up
                print(f"Failed to fetch {url} after {max_retries} attempts")
                return None

            # Wait with exponential backoff + small random offset
            wait_time = (2 ** attempt) + random.uniform(0, 1)
            print(f"Attempt {attempt+1} failed, waiting {wait_time:.2f}s before retry")
            time.sleep(wait_time)

    # Reached only when max_retries <= 0 and no attempt was made.
    return None

# Pulls the pipe diameter and street address out of the alert body text.
_ALERT_RE = re.compile(r'(?P<diameter>\d{1,3})-inch water main at (?P<address>[\w,\s\.-]+?)\. Customers', re.IGNORECASE)


def _text_or_none(soup, selector):
    """Return the stripped text of the first match for `selector`, or None."""
    node = soup.select_one(selector)
    return node.text.strip() if node is not None else None


def scrape_wssc_alert_page(url, headers):
    """Download one alert page and extract its key fields.

    Parameters:
        url: address of the alert page.
        headers: dict of HTTP request headers, passed on to fetch_with_retry.

    Returns:
        None when the page could not be fetched. Otherwise a dict with the
        keys "title", "date", "pipe_diameter", "address" and "full_text".
        "title" and "date" are None when the page has no matching element;
        "pipe_diameter" and "address" are None when the body text does not
        match the expected "<N>-inch water main at <address>. Customers"
        wording.
    """
    response = fetch_with_retry(url=url, headers=headers)

    if response is None:
        return None

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")

    title = _text_or_none(soup, "h1")
    date = _text_or_none(soup, "time")
    full_text = "\n".join(
        item.get_text() for item in soup.select(".node__content .field--type-text-long p")
    )

    match = _ALERT_RE.search(full_text)

    return {
        "title": title,
        "date": date,
        "pipe_diameter": match.group("diameter") if match is not None else None,
        "address": match.group("address") if match is not None else None,
        "full_text": full_text,
    }


In [None]:
article_links_df = pd.read_csv("output/article_links.csv")

# na=False treats rows with a missing title as "not an emergency" instead of
# producing NaN in the mask, which boolean indexing rejects.
is_emergency = article_links_df["Title"].str.contains("Emergency", na=False)
emergency_repair_links = article_links_df[is_emergency]
emergency_repair_links.head()



Unnamed: 0,Date,Title,URL
0,"December 30, 2022",Emergency Water Main Repair - Takoma Park,https://www.wsscwater.com/news/2022/december/e...
1,"December 29, 2022",Emergency Water Main Repair - Capitol Heights,https://www.wsscwater.com/news/2022/december/e...
2,"December 29, 2022",Emergency Water Main Repair - Oxon Hill,https://www.wsscwater.com/news/2022/december/e...
3,"December 28, 2022",Emergency Water Main Repair - Silver Spring,https://www.wsscwater.com/news/2022/december/e...
4,"December 28, 2022",Emergency Water Main Repair - Greenbelt,https://www.wsscwater.com/news/2022/december/e...


In [48]:

data = []
failed_urls = []
headers = {
    "USER-AGENT": "wssc_alerts scraper (efurth@montgomerycollege.edu)"
}

for link in emergency_repair_links["URL"]:
    alert = scrape_wssc_alert_page(link, headers)
    if alert is None:
        # The page could not be fetched; keep the URL so it can be retried
        # later rather than putting a None row into the DataFrame.
        failed_urls.append(link)
    else:
        data.append(alert)
    # Pause between requests to avoid hammering the site.
    time.sleep(3)

if failed_urls:
    print(f"Skipped {len(failed_urls)} page(s) that could not be fetched")

df = pd.DataFrame(data)
df.head()

Unnamed: 0,title,date,pipe_diameter,address,full_text
0,Emergency Water Main Repair - Takoma Park,"December 30, 2022",8,602 Ethan Allen Avenue in Takoma Park,"Laurel, MD – December 30, 2022: WSSC Water is ..."
1,Emergency Water Main Repair - Capitol Heights,"December 29, 2022",12,6180 Old Central Avenue at Rollins Avenue in C...,"Laurel, MD – December 29, 2022: WSSC Water is ..."
2,Emergency Water Main Repair - Oxon Hill,"December 29, 2022",8,801 Owens Road in Oxon Hill,"Laurel, MD – December 29, 2022: WSSC Water is ..."
3,Emergency Water Main Repair - Silver Spring,"December 28, 2022",8,12001 Old Columbia Pike in Silver Spring,"Laurel, MD – December 28, 2022: WSSC Water is ..."
4,Emergency Water Main Repair - Greenbelt,"December 28, 2022",10,9115 Springhill Ln. in Greenbelt,"Laurel, MD – December 28, 2022: WSSC Water is ..."


In [49]:
# Persist the scraped alert details; the DataFrame index is written as the
# first column.
alerts_csv_path = "output/wssc_alerts.csv"
df.to_csv(alerts_csv_path)