In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import json
import time
import re
import tldextract
from urllib.parse import urlparse, urljoin, urlunparse, parse_qsl, urlencode
from collections import deque
import logging

In [2]:
# Root logger setup for the notebook: INFO and above, each record prefixed
# with a timestamp and its severity.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)

In [3]:
class UnifiedScraper:
    """Same-domain crawler that collects de-duplicated visible page text.

    Pages are loaded in a headless Chrome session, the text of ``<body>`` is
    cleaned, near-duplicate pages are dropped, and same-domain links are
    followed breadth-first until ``max_depth`` or ``max_pages`` is reached.
    ``crawl()`` runs the whole pipeline and writes the collected text to disk.
    """

    # Bracketed spans such as "[edit]" or "[1]"; DOTALL lets a span cross lines.
    _BRACKET_RE = re.compile(r'\[.*?\]', flags=re.DOTALL)

    # Boilerplate markers. Everything from the marker to the end of ITS LINE is
    # removed ('.' does not match a newline here). The word alternatives keep
    # the leading \b; the symbols must not have one, because "©" preceded by
    # whitespace has no word boundary in front of it and would never match.
    _BOILERPLATE_RE = re.compile(
        r'(?:\b(?:copyright|trademark|all[ \t]+rights[ \t]+reserved)|[®©]).*',
        flags=re.IGNORECASE,
    )

    def __init__(self, base_url, max_depth=3, max_pages=50):
        """Set up crawl limits, crawl state, and the browser session.

        Args:
            base_url: URL the crawl starts from; its registered domain defines
                which links are followed.
            max_depth: Largest link distance from ``base_url`` that is visited.
            max_pages: Upper bound on the number of URLs visited.
        """
        self.base_url = base_url
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.base_domain = tldextract.extract(base_url).registered_domain
        self.driver = self._init_driver()
        self.visited = set()          # normalized URLs already attempted
        self.queue = deque()          # pending (url, depth) pairs, FIFO
        self.combined_content = []    # cleaned text of each unique page

    def _init_driver(self):
        """Start and return a headless Chrome WebDriver.

        A Chrome binary must be installed on the machine; the driver
        downloaded by ``ChromeDriverManager`` is not sufficient on its own
        (a missing browser surfaces as ``WebDriverException: cannot find
        Chrome binary``).
        """
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        service = Service(ChromeDriverManager().install())
        return webdriver.Chrome(service=service, options=options)

    def _clean_text(self, text):
        """Return ``text`` without bracketed spans and boilerplate lines.

        Boilerplate is stripped *before* whitespace is collapsed so that only
        the remainder of the offending line is removed. Collapsing first would
        turn the page into a single line and discard everything after the
        first "copyright"/"trademark" occurrence.
        """
        text = self._BRACKET_RE.sub('', text)
        text = self._BOILERPLATE_RE.sub('', text)
        # Collapse last so the removals above do not leave double spaces.
        return re.sub(r'\s+', ' ', text).strip()

    def _is_same_domain(self, url):
        """Return True when ``url`` shares the registered domain of ``base_url``."""
        return tldextract.extract(url).registered_domain == self.base_domain

    def _normalize_url(self, url):
        """Return a canonical form of ``url`` used for visit de-duplication.

        Drops the fragment, ``utm_*`` tracking parameters, and trailing slashes
        on the path. Blank-valued parameters (``?q=``) are kept: they can
        select a different page, and ``parse_qsl`` discards them by default.
        """
        parsed = urlparse(url)
        kept_params = [
            (key, value)
            for key, value in parse_qsl(parsed.query, keep_blank_values=True)
            if not key.lower().startswith('utm_')
        ]
        return urlunparse((
            parsed.scheme,
            parsed.netloc,
            parsed.path.rstrip('/'),
            '',
            urlencode(kept_params),
            '',
        ))

    def is_similar_content(self, new_text, threshold=0.9):
        """Return True if ``new_text`` is a near-duplicate of stored content.

        Similarity is the Jaccard index of the whitespace-separated word sets;
        a stored page matches when the index is at least ``threshold``. Empty
        text never matches anything.
        """
        new_words = set(new_text.split())
        if not new_words:
            return False
        for content in self.combined_content:
            existing_words = set(content.split())
            if not existing_words:
                continue
            shared = len(new_words & existing_words)
            total = len(new_words | existing_words)
            if shared / total >= threshold:
                return True
        return False

    def scrape_page(self, url, depth=0):
        """Crawl breadth-first starting at ``url`` (treated as being at ``depth``).

        Pending pages are kept in ``self.queue`` instead of recursing, so
        shallow pages are visited before deep ones and the ``max_pages`` budget
        is not spent along a single branch. A page that fails to load is
        logged and skipped; it does not abort the crawl.
        """
        self.queue.append((url, depth))
        while self.queue:
            current_url, current_depth = self.queue.popleft()
            if current_depth > self.max_depth:
                continue
            if len(self.visited) >= self.max_pages:
                # Budget exhausted: drop whatever is still pending.
                self.queue.clear()
                break

            norm_url = self._normalize_url(current_url)
            if norm_url in self.visited:
                logging.info(f"Skipping visited URL: {norm_url}")
                continue

            self.visited.add(norm_url)
            logging.info(f"Scraping URL: {norm_url} (depth={current_depth})")

            try:
                links = self._process_page(norm_url)
            except Exception as e:
                logging.error(f"Error processing {norm_url}: {e}")
                continue

            # Children of a page at max_depth would be rejected anyway.
            if current_depth < self.max_depth:
                self.queue.extend((link, current_depth + 1) for link in links)

    def _process_page(self, norm_url):
        """Load ``norm_url``, store its text if new, and return its crawlable links."""
        self.driver.get(norm_url)
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'body'))
        )
        time.sleep(1)  # brief pause after <body> appears, before reading text

        page_text = self._clean_text(self.driver.find_element(By.TAG_NAME, 'body').text)
        if not page_text:
            # Nothing to store; empty entries would only add blank output.
            logging.info(f"No text content at {norm_url}")
        elif self.is_similar_content(page_text):
            logging.info(f"Duplicate content at {norm_url}")
        else:
            self.combined_content.append(page_text)

        return self._collect_links(norm_url)

    def _collect_links(self, page_url):
        """Return unvisited same-domain http(s) links on the currently loaded page."""
        found = []
        seen = set()  # normalized URLs already collected from this page
        for anchor in self.driver.find_elements(By.TAG_NAME, 'a'):
            try:
                href = anchor.get_attribute('href')
                if not href or href.startswith(('javascript:', 'mailto:', '#')):
                    continue
                absolute_url = urljoin(page_url, href)
                # Only http(s) targets can be loaded as pages (skips tel:, ftp:, ...).
                if urlparse(absolute_url).scheme not in ('http', 'https'):
                    continue
                if not self._is_same_domain(absolute_url):
                    continue
                key = self._normalize_url(absolute_url)
                if key in seen or key in self.visited:
                    continue
                seen.add(key)
                found.append(absolute_url)
            except Exception as e:
                logging.warning(f"Error retrieving href: {e}")
        return found

    def crawl(self):
        """Crawl from ``base_url``, then close the browser and save the results.

        The browser is closed and the collected text is written even if the
        crawl is interrupted, so a partial run neither leaks a Chrome process
        nor loses what was gathered.
        """
        try:
            self.scrape_page(self.base_url)
        finally:
            try:
                self.driver.quit()
            except Exception as e:
                logging.warning(f"Error closing browser: {e}")
            self._save_results()

    def _save_results(self, output_path='combined_content.txt'):
        """Write all collected page texts to ``output_path``, blank-line separated."""
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(self.combined_content))
        logging.info(f"Saved combined content to {output_path}")

In [4]:
if __name__ == "__main__":
    # Start at MIT's home page, follow links up to 10 hops away, and stop
    # once 70 pages have been visited.
    scraper = UnifiedScraper("https://web.mit.edu/", max_depth=10, max_pages=70)
    scraper.crawl()

2025-04-10 00:07:14,959 INFO Get LATEST chromedriver version for google-chrome
2025-04-10 00:07:14,995 INFO About to download new driver from https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip
2025-04-10 00:07:15,030 INFO Driver downloading response is 200
2025-04-10 00:07:15,390 INFO Get LATEST chromedriver version for google-chrome
2025-04-10 00:07:15,647 INFO Get LATEST chromedriver version for google-chrome
2025-04-10 00:07:15,850 INFO Driver has been saved in cache [/home/dhruv_dixit/.wdm/drivers/chromedriver/linux64/114.0.5735.90]


WebDriverException: Message: unknown error: cannot find Chrome binary
Stacktrace:
#0 0x55bb9a8474e3 <unknown>
#1 0x55bb9a576c76 <unknown>
#2 0x55bb9a59d757 <unknown>
#3 0x55bb9a59c029 <unknown>
#4 0x55bb9a5daccc <unknown>
#5 0x55bb9a5da47f <unknown>
#6 0x55bb9a5d1de3 <unknown>
#7 0x55bb9a5a72dd <unknown>
#8 0x55bb9a5a834e <unknown>
#9 0x55bb9a8073e4 <unknown>
#10 0x55bb9a80b3d7 <unknown>
#11 0x55bb9a815b20 <unknown>
#12 0x55bb9a80c023 <unknown>
#13 0x55bb9a7da1aa <unknown>
#14 0x55bb9a8306b8 <unknown>
#15 0x55bb9a830847 <unknown>
#16 0x55bb9a840243 <unknown>
#17 0x7f54ee6ed609 start_thread
