In [1]:
import json
import re
import time
from collections import deque
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# ✅ SET YOUR TARGET BASE URL HERE
# The crawl starts at this URL, and its network location (host) is what
# discovered links are compared against to decide whether they are internal.
BASE_URL = "https://www.stevens.edu/"

In [None]:
def get_driver():
    """Create and return a Chrome WebDriver configured to run headless.

    The chromedriver location is whatever ``ChromeDriverManager().install()``
    returns; it is wrapped in a ``Service`` and handed to ``webdriver.Chrome``.
    """
    chrome_flags = (
        '--headless=new',  # Use new headless mode
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
    )
    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=chrome_options)

In [None]:
def is_internal(link, base_netloc):
    """Return True when ``link`` belongs to the site identified by ``base_netloc``.

    Args:
        link: URL string to classify; may be relative or absolute.
        base_netloc: Network location (``host[:port]``) of the site being crawled.

    Returns:
        bool: True for links without a network location (relative links) and
        for links whose network location equals ``base_netloc``.
    """
    link_netloc = urlparse(link).netloc
    if not link_netloc:
        # No host component at all: the link is relative to the current site.
        return True
    # Host names are case-insensitive, so compare them case-folded; otherwise
    # "https://WWW.Example.com/" would be treated as an external site.
    return link_netloc.lower() == base_netloc.lower()

In [None]:
def is_valid_href(href):
    """Return True when ``href`` looks like a followable page link.

    Rejects empty values as well as ``mailto:``, ``tel:``, ``javascript:``
    links and pure in-page anchors (``#...``).

    Args:
        href: Raw ``href`` attribute value; may be ``None`` or empty.

    Returns:
        bool: Always a real boolean (never ``''`` or ``None``).
    """
    if not href:
        return False
    # URL schemes are case-insensitive and attribute values may carry stray
    # whitespace, so normalise before testing the prefix.
    candidate = href.strip().lower()
    if not candidate:
        return False
    # Avoid mailto:, tel:, javascript:, etc.
    return not candidate.startswith(('mailto:', 'tel:', 'javascript:', '#'))

In [None]:
def is_question(text):
    """Heuristically decide whether ``text`` reads like a question.

    A text counts as a question when, after trimming surrounding whitespace,
    it ends with ``?`` or starts with one of a fixed set of interrogative
    words (matched case-insensitively as whole words).

    Args:
        text: Candidate text; ``None`` and empty strings are treated as
            non-questions.

    Returns:
        bool: Always a real boolean, never a ``re.Match`` object or ``None``.
    """
    if not text:
        return False
    candidate = text.strip()
    if candidate.endswith('?'):
        return True
    starts_with_interrogative = re.match(
        r'^(how|what|why|when|where|who|is|can|does|should|do|did)\b',
        candidate,
        re.I,
    )
    return starts_with_interrogative is not None

In [None]:
def extract_qa_pairs(soup):
    """Collect question/answer pairs from a parsed HTML page.

    Two page structures are recognised:

    1. A heading (``h1``-``h6``) whose text passes ``is_question``; the answer
       is the text of the following sibling elements up to the next heading
       or heading container.
    2. A ``<details>`` element whose ``<summary>`` passes ``is_question``; the
       answer is the rest of the ``<details>`` content.

    Args:
        soup: BeautifulSoup document (or tag) to search.

    Returns:
        list[dict]: One ``{'Question': ..., 'Answer': ...}`` dict per distinct
        question text that has a non-empty answer, in discovery order.
    """
    heading_re = re.compile('^h[1-6]$')
    # Elements that end the answer following a heading. The previous prefix
    # test (name starts with 'h') also stopped at <hr> and at any custom
    # element whose name starts with "h", which truncated answers.
    answer_boundary_re = re.compile(r'^(h[1-6]|header|hgroup)$')

    qa_pairs = []
    seen = set()

    for tag in soup.find_all(heading_re):
        question = tag.get_text(strip=True)
        if question in seen or not is_question(question):
            continue
        parts = []
        for sib in tag.find_next_siblings():
            if sib.name and answer_boundary_re.match(sib.name):
                break
            parts.append(sib.get_text(separator="\n", strip=True))
        answer = "\n".join(parts).strip()
        if answer:
            qa_pairs.append({'Question': question, 'Answer': answer})
            seen.add(question)

    for detail in soup.find_all('details'):
        summary = detail.find('summary')
        if not summary:
            continue
        question = summary.get_text(strip=True)
        if question in seen or not is_question(question):
            continue
        # Work on a re-parsed copy so that removing the <summary> does not
        # mutate the caller's tree.
        detail_copy = detail.encode_contents().decode()
        soup_copy = BeautifulSoup(detail_copy, 'lxml')
        summary_copy = soup_copy.find('summary')
        if summary_copy is not None:
            summary_copy.decompose()
        answer = soup_copy.get_text(separator="\n", strip=True)
        if answer:
            qa_pairs.append({'Question': question, 'Answer': answer})
            seen.add(question)

    return qa_pairs

In [None]:
def crawl_site(base_url, max_pages=None, delay=1.5):
    """Breadth-first crawl of one site, collecting Q&A pairs from every page.

    Starting at ``base_url``, each page is loaded in a headless browser,
    parsed, passed to ``extract_qa_pairs``, and scanned for further internal
    links (same network location as ``base_url``). URL fragments are ignored
    when deciding whether a page has already been seen.

    Args:
        base_url: URL the crawl starts from.
        max_pages: Optional cap on the number of pages attempted; ``None``
            (the default) crawls until no unvisited internal links remain.
        delay: Seconds to wait after each page load before reading the page
            source. Defaults to 1.5.

    Returns:
        list[dict]: All Q&A dicts found, in crawl order. Pages that fail to
        load are reported on stdout and skipped.
    """
    base_parts = urlparse(base_url)
    base_netloc = base_parts.netloc
    # Only follow web links (plus whatever scheme the start URL itself uses);
    # this keeps out things like "sms:" or "data:" hrefs, which have an empty
    # netloc and would otherwise be classified as internal.
    allowed_schemes = {'http', 'https', base_parts.scheme}

    visited = set()
    to_visit = deque([base_url])
    # Every URL ever enqueued, for O(1) duplicate checks.
    queued = {base_url}
    all_qas = []

    driver = get_driver()
    try:
        while to_visit:
            if max_pages is not None and len(visited) >= max_pages:
                break

            url = to_visit.popleft()
            clean_url = urlparse(url)._replace(fragment='').geturl()

            if clean_url in visited:
                continue
            visited.add(clean_url)

            try:
                driver.get(clean_url)
                time.sleep(delay)
                soup = BeautifulSoup(driver.page_source, 'lxml')
            except Exception as e:
                print(f"❌ Failed to load {clean_url}: {e}")
                continue

            qa = extract_qa_pairs(soup)
            all_qas.extend(qa)
            print(f"✅ {len(qa)} Q&A from: {clean_url}")

            for a in soup.find_all('a', href=True):
                href = a['href'].split('#')[0].strip()
                if not is_valid_href(href):
                    continue
                parsed = urlparse(urljoin(clean_url, href))
                if parsed.scheme not in allowed_schemes:
                    continue
                norm_url = parsed._replace(fragment='').geturl()
                if not is_internal(norm_url, base_netloc):
                    continue
                if norm_url not in visited and norm_url not in queued:
                    queued.add(norm_url)
                    to_visit.append(norm_url)
    finally:
        # Always shut the browser down, even if the crawl is interrupted or
        # page processing raises, so no Chrome process is left behind.
        driver.quit()

    return all_qas

In [None]:
if __name__ == "__main__":
    # Crawl the configured site, then save every collected Q&A pair as
    # indented, non-ASCII-escaped JSON.
    print(f"🔍 Starting scrape from: {BASE_URL}")
    qa_data = crawl_site(BASE_URL)
    with open("questions_answers.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(qa_data, ensure_ascii=False, indent=2))
    print(f"✅ Done. {len(qa_data)} Q&A pairs saved to 'questions_answers.json'")