In [None]:
# 📁 Standard libraries
import os
import re
import json
import socket
import ipaddress

# 🧪 Third-party libraries
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# 🌐 Selenium (Web Automation)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException
from webdriver_manager.chrome import ChromeDriverManager


In [None]:
"""
Alternative approach, for reference: https://chatgpt.com/share/68794437-7a24-8007-ab41-d416bdbde92d
"""

def wait_for_page_load(driver, timeout=10):
    """Block until the page's <body> element exists, or report why it did not.

    Args:
        driver: The browser driver handed to ``WebDriverWait``.
        timeout: Maximum wait passed to ``WebDriverWait`` (default 10).

    Returns:
        None. Any exception raised while waiting is printed, not propagated.
    """
    try:
        waiter = WebDriverWait(driver, timeout)
        body_locator = (By.TAG_NAME, "body")
        waiter.until(EC.presence_of_element_located(body_locator))
    except Exception as err:
        # Best effort: a slow or broken page must not abort the caller.
        print(f"[!] Timeout or error during page load: {err}")


def is_safe_url(url):
    """Return True only if *url* is http(s) and resolves solely to public addresses.

    The host is resolved with ``socket.getaddrinfo`` and *every* returned
    address (IPv4 and IPv6) is checked; a single private, loopback, reserved,
    link-local, multicast or unspecified address makes the URL unsafe.

    Args:
        url: The absolute URL to validate.

    Returns:
        bool: True when the URL may be fetched, False otherwise. Any parsing
        or resolution failure yields False (fail closed).
    """
    try:
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https") or parsed.netloc == "":
            return False

        host = parsed.hostname
        if not host:
            return False

        # getaddrinfo returns all A/AAAA results; gethostbyname would expose
        # only one IPv4 address and fail outright on IPv6-only hosts.
        infos = socket.getaddrinfo(host, None, proto=socket.IPPROTO_TCP)
        if not infos:
            return False

        for info in infos:
            # sockaddr[0] may carry an IPv6 zone id ("fe80::1%eth0"); drop it.
            address = info[4][0].split("%", 1)[0]
            ip = ipaddress.ip_address(address)

            # Judge IPv4-mapped IPv6 (::ffff:a.b.c.d) by the embedded IPv4.
            mapped = getattr(ip, "ipv4_mapped", None)
            if mapped is not None:
                ip = mapped

            if (
                ip.is_private
                or ip.is_loopback
                or ip.is_reserved
                or ip.is_link_local
                or ip.is_multicast
                or ip.is_unspecified
            ):
                return False
    except Exception:
        # Fail closed: anything we cannot parse or resolve is treated as unsafe.
        return False
    return True


# CONFIGURATION
# Start URL for the crawl; run_full_scraper passes it to collect_links, which
# only follows links whose netloc matches this URL's netloc.
BASE_URL = "https://www.microwebtec.com/"
# DELAY = 30  # seconds
# Directory (relative path) that save_content creates and writes page files into.
OUTPUT_DIR = "scraped_pages"
# CHROMEDRIVER_PATH = "C:\Tools\chromedriver\chromedriver.exe"  # for Windows

# Set up headless Chrome
def init_driver():
    """Create a headless Chrome driver, store it in the module-level ``driver``, and return it.

    The chromedriver binary is obtained through ``ChromeDriverManager().install()``.

    Returns:
        The newly created ``webdriver.Chrome`` instance (also bound to the
        global ``driver``).
    """
    global driver

    chrome_flags = (
        "--headless",
        # "--disable-gpu",
        "--disable-dev-shm-usage",
        "--disable-extensions",
        "--disable-blink-features=AutomationControlled",
    )

    options = Options()
    for flag in chrome_flags:
        options.add_argument(flag)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    return driver


def safe_get(url):
    """Load *url* in the module-level ``driver``, restarting the browser once on driver failure.

    On a ``WebDriverException`` the current driver is quit (best effort), a
    new one is created with ``init_driver()`` and bound to the global
    ``driver``, and the navigation is retried once. Failures of the retry and
    any non-driver errors are printed rather than raised.

    Args:
        url: The URL to navigate to.

    Returns:
        None.

    Raises:
        Whatever ``init_driver()`` raises if the replacement browser cannot
        be started.
    """
    global driver
    try:
        driver.get(url)
        wait_for_page_load(driver)
    except WebDriverException as e:
        # InvalidSessionIdException is a subclass of WebDriverException, so one
        # clause covers both. Naming it separately raised NameError here,
        # because only WebDriverException is imported in this module.
        print(f"[!] Driver error on {url}: {e}. Restarting driver.")
        try:
            driver.quit()
        except Exception as quit_error:
            # A dead session can fail to quit; that must not block the restart.
            print(f"[!] Could not quit old driver cleanly: {quit_error}")

        driver = init_driver()
        try:
            driver.get(url)
            wait_for_page_load(driver)
        except Exception as retry_error:
            print(f"[!] Retry failed on {url}: {retry_error}")
    except Exception as e:
        print(f"[!] Other error on {url}: {e}")


# Collect all internal links
def collect_links(base_url, driver):
    """Crawl from *base_url* and return every same-host URL that was visited.

    URLs are taken from the end of a work list (depth-first order). Each page
    is loaded with ``safe_get`` and its ``<a href>`` targets are queued when
    they share ``base_url``'s netloc, have not been visited, and pass
    ``is_safe_url``.

    Args:
        base_url: The URL to start from; its netloc defines "internal".
        driver: The browser driver to read page source from. It is replaced by
            the module-level ``driver`` after each navigation, because
            ``safe_get`` navigates (and may restart) that global instance.

    Returns:
        set[str]: All URLs that were visited, including *base_url*.
    """
    base_netloc = urlparse(base_url).netloc  # invariant; computed once
    visited = set()
    to_visit = [base_url]

    while to_visit:
        url = to_visit.pop()
        if url in visited:
            continue
        visited.add(url)

        safe_get(url)
        # safe_get works on the module-level driver and may have quit and
        # replaced it; re-read it so we never use a stale, closed session.
        driver = globals().get("driver", driver)

        try:
            page_source = driver.page_source
        except Exception as e:
            # One unreadable page should not end the whole crawl.
            print(f"[!] Could not read page source for {url}: {e}")
            continue

        soup = BeautifulSoup(page_source, "html.parser")

        for a in soup.find_all("a", href=True):
            full_url = urljoin(url, a["href"])

            if urlparse(full_url).netloc != base_netloc:
                continue
            if full_url in visited:
                continue
            if is_safe_url(full_url):
                to_visit.append(full_url)

    return visited


# Extract clean content
def extract_clean_content(html):
    """Return the text of headings, paragraphs, list items and code blocks in *html*.

    Args:
        html: HTML markup to parse with BeautifulSoup's ``html.parser``.

    Returns:
        str: The stripped, non-empty text of every ``h1``/``h2``/``h3``/``p``/
        ``li``/``code``/``pre`` tag in document order, one per line.
    """
    wanted_tags = ["h1", "h2", "h3", "p", "li", "code", "pre"]
    document = BeautifulSoup(html, "html.parser")

    texts = (node.get_text(strip=True) for node in document.find_all(wanted_tags))
    # Tags whose text is empty after stripping are left out.
    return "\n".join(piece for piece in texts if piece)


# Save content to files
def sanitize_filename(s):
    """Return *s* with every character other than word chars, '-', '_' and '.' replaced by '_'."""
    unsafe_chars = re.compile(r"[^\w\-_.]")
    return unsafe_chars.sub("_", s)

def save_content(url, content, count):
    """Write *content* for *url* to a numbered text file inside ``OUTPUT_DIR``.

    The file is named ``page_<count+1>_<sanitized last 30 chars of url>.txt``
    and begins with a ``URL: ...`` header line followed by a blank line.

    Args:
        url: The page URL; recorded in the header and used for the file name.
        content: The text to store.
        count: Zero-based page index; the file name uses ``count + 1``.

    Returns:
        None. The saved path is printed.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    url_tail = sanitize_filename(url[-30:])  # or use slugify
    filepath = os.path.join(OUTPUT_DIR, f"page_{count + 1}_{url_tail}.txt")

    header = f"URL: {url}\n\n"
    with open(filepath, "w", encoding="utf-8") as out:
        out.writelines((header, content))
    print(f"✅ Saved: {filepath}")


# Full workflow
def run_full_scraper():
    """Crawl ``BASE_URL``, extract each page's text, and save it under ``OUTPUT_DIR``.

    Steps: start the browser, collect all internal links, then visit each
    link, extract its clean text and save non-empty results. The browser is
    restarted every 100 pages, and is always quit at the end, even when the
    crawl raises.

    Returns:
        None. Progress and per-page failures are printed.
    """
    global driver
    print("🔍 Starting browser...")
    driver = init_driver()

    try:
        print("🔍 Collecting links...")
        # collect_links returns a set; sort it so page numbering (and thus the
        # output file names) is the same from one run to the next.
        links = sorted(collect_links(BASE_URL, driver))
        print(f"✅ Found {len(links)} pages.")

        for i, link in enumerate(links):
            try:
                # Restart driver every 100 pages to avoid session expiration
                if i % 100 == 0 and i != 0:
                    print(f"[#] Restarting driver at item {i}")
                    driver.quit()
                    driver = init_driver()

                print(f"📄 Scraping ({i + 1}/{len(links)}): {link}")
                safe_get(link)
                html = driver.page_source
                content = extract_clean_content(html)

                if content.strip():
                    save_content(link, content, i)
            except Exception as e:
                # One bad page must not stop the remaining pages.
                print(f"❌ Failed to scrape {link}: {e}")
    finally:
        # Always release the browser process, and never let a failing quit
        # mask an exception that is already propagating.
        try:
            driver.quit()
        except Exception as quit_error:
            print(f"[!] Could not quit driver cleanly: {quit_error}")

    print("✅ Done scraping.")


# Entry point: run the whole crawl only when this module's __name__ is
# "__main__" (i.e. it is executed directly rather than imported).
if __name__ == "__main__":
    run_full_scraper()

