<a href="https://colab.research.google.com/github/entropymark/Colab-output/blob/main/URL_Scraper_for_NotebookLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import sys

def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_all_website_links(url):
    """
    Returns all URLs that is found on `url` in which it belongs to the same website
    """
    urls = set()
    try:
        # User Agent is important so websites don't block the script immediately
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8' # Ensure correct encoding

        # If the page is not accessible, return empty
        if response.status_code != 200:
            print(f"  [!] Error accessing {url}: Status {response.status_code}")
            return urls

        soup = BeautifulSoup(response.text, "html.parser")

        # Find all link tags
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                continue

            # Join relative URLs (e.g. '/about') with base URL to make absolute
            href = urljoin(url, href)

            # Remove query parameters/anchors to clean up URLs (optional, helps deduplication)
            # href = href.split("?")[0].split("#")[0]
            # Note: Keeping queries for now as some sites rely on them for navigation

            parsed_href = urlparse(href)

            # Clean the URL (remove trailing slash for consistency)
            href = href.rstrip('/')

            # Validate URL and ensure we stay on the same domain (internal links only)
            if is_valid(href) and href not in urls:
                original_domain = urlparse(url).netloc
                target_domain = parsed_href.netloc

                # Check if domains match (handling subdomains slightly loosely)
                if original_domain in target_domain:
                    urls.add(href)

    except Exception as e:
        print(f"  [!] Error parsing {url}: {e}")

    return urls

def crawl(url, max_depth=2, max_urls=200):
    """
    Crawls a web page and extracts all links.

    :param url: The starting URL
    :param max_depth: How many clicks deep to go (1 = just the page, 2 = page + links on it, etc.)
    :param max_urls: Safety limit to stop the script from running forever
    """
    global visited_urls
    visited_urls = set()
    urls_to_visit = [(url, 0)] # Queue of (url, current_depth)
    visited_urls.add(url)

    print(f"[*] Starting crawl on: {url}")
    print(f"[*] Max Depth: {max_depth} | Max URLs limit: {max_urls}")
    print("-" * 50)

    processed_count = 0

    while urls_to_visit and processed_count < max_urls:
        current_url, current_depth = urls_to_visit.pop(0)

        print(f"[{processed_count + 1}] Crawling (Depth {current_depth}): {current_url}")

        # If we haven't reached max depth, look for more links
        if current_depth < max_depth:
            found_links = get_all_website_links(current_url)

            for link in found_links:
                if link not in visited_urls:
                    visited_urls.add(link)
                    urls_to_visit.append((link, current_depth + 1))

        processed_count += 1

        # Be polite to the server
        time.sleep(0.5)

    return visited_urls

def save_to_file(urls, filename="scraped_urls.txt"):
    """
    Saves the set of URLs to a text file.
    """
    try:
        with open(filename, "w", encoding='utf-8') as f:
            for url in sorted(urls):
                f.write(url + "\n")
        print("-" * 50)
        print(f"[*] Successfully saved {len(urls)} URLs to {filename}")
        print("[*] You can now upload this file to NotebookLM.")
    except Exception as e:
        print(f"[!] Error saving file: {e}")

if __name__ == "__main__":
    print("=== Website URL Scraper for NotebookLM ===")
    start_url = input("Enter the website URL to scrape (include http:// or https://): ").strip()

    # Defaults
    depth_input = input("Enter Depth (1=One Page, 2=Drill Down 1 Level, 3=Deep): [Default: 2] ")
    max_depth = int(depth_input) if depth_input.isdigit() else 2

    if not start_url.startswith("http"):
        print("Error: Please include http:// or https:// at the start of the URL.")
    else:
        # Run the crawler
        all_urls = crawl(start_url, max_depth=max_depth, max_urls=300)
        save_to_file(all_urls)

=== Website URL Scraper for NotebookLM ===
Enter the website URL to scrape (include http:// or https://): https://docs.cloud.google.com/vertex-ai/docs
Enter Depth (1=One Page, 2=Drill Down 1 Level, 3=Deep): [Default: 2] 1
[*] Starting crawl on: https://docs.cloud.google.com/vertex-ai/docs
[*] Max Depth: 1 | Max URLs limit: 300
--------------------------------------------------
[1] Crawling (Depth 0): https://docs.cloud.google.com/vertex-ai/docs


  for a_tag in soup.findAll("a"):


[2] Crawling (Depth 1): https://docs.cloud.google.com/vertex-ai/docs/model-registry/delete-model
[3] Crawling (Depth 1): https://docs.cloud.google.com/vertex-ai/docs/featurestore/latest/serve-historical-features
[4] Crawling (Depth 1): https://docs.cloud.google.com/vertex-ai/docs/explainable-ai/limitations
[5] Crawling (Depth 1): https://docs.cloud.google.com/vertex-ai/docs/featurestore/latest/update-featuregroup
[6] Crawling (Depth 1): https://docs.cloud.google.com/vertex-ai/docs/tutorials/image-classification-custom
[7] Crawling (Depth 1): https://docs.cloud.google.com/vertex-ai/docs/training/cloud-storage-file-system
[8] Crawling (Depth 1): https://docs.cloud.google.com/vertex-ai/docs/image-data/classification/prepare-data
[9] Crawling (Depth 1): https://docs.cloud.google.com/video-stitcher/docs/concepts/overview
[10] Crawling (Depth 1): https://docs.cloud.google.com/vertex-ai/docs/tutorials/jupyter-notebooks
[11] Crawling (Depth 1): https://docs.cloud.google.com/docs
[12] Crawling 