In [15]:
import requests
from bs4 import BeautifulSoup
import os
import re

# HTTP headers passed to requests.get() for each page download. A browser-like
# User-Agent replaces the library default.
# NOTE(review): presumably set because some doc sites reject the default
# python-requests agent — confirm this is still required.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Registry of documentation pages to scrape, keyed by a short source id.
# Each entry holds:
#   "framework": display name; the scraping loop slugifies it to name the
#                output folder under BASE_DIR
#   "selector":  CSS selector handed to BeautifulSoup's select_one() to locate
#                the main content element; when nothing matches, the text of
#                the whole page is used instead
#   "urls":      the pages to download for that source
# NOTE(review): the selectors are not validated anywhere in this file —
# confirm each one still matches the live site's markup.
DOCUMENTATION_SOURCES = {

    # ======================================
    # FASTAPI
    # ======================================
    "fastapi": {
        "framework": "FastAPI",
        "selector": ".md-content",
        "urls": [
            # Installation & setup
            "https://fastapi.tiangolo.com/installation/",
            "https://fastapi.tiangolo.com/python-types/",
            "https://fastapi.tiangolo.com/async/",
            "https://fastapi.tiangolo.com/virtual-environments/",
            "https://fastapi.tiangolo.com/fastapi-cli/",
            "https://fastapi.tiangolo.com/deployment/server-workers/",
            "https://fastapi.tiangolo.com/deployment/docker/",

            # Basics
            "https://fastapi.tiangolo.com/tutorial/first-steps/",
            "https://fastapi.tiangolo.com/tutorial/path-params/",
            "https://fastapi.tiangolo.com/tutorial/query-params/",
            "https://fastapi.tiangolo.com/tutorial/body/",
            "https://fastapi.tiangolo.com/tutorial/body-fields/",
            "https://fastapi.tiangolo.com/tutorial/body-nested-models/",
            "https://fastapi.tiangolo.com/tutorial/cookie-params/",
            "https://fastapi.tiangolo.com/tutorial/header-params/",
            "https://fastapi.tiangolo.com/tutorial/request-forms/",
            "https://fastapi.tiangolo.com/tutorial/request-files/",
            "https://fastapi.tiangolo.com/tutorial/response-model/",
            "https://fastapi.tiangolo.com/tutorial/response-status-code/",
            "https://fastapi.tiangolo.com/tutorial/handling-errors/",

            # Intermediate
            "https://fastapi.tiangolo.com/tutorial/dependencies/",
            "https://fastapi.tiangolo.com/tutorial/dependencies/classes-as-dependencies/",
            "https://fastapi.tiangolo.com/tutorial/dependencies/sub-dependencies/",
            "https://fastapi.tiangolo.com/tutorial/security/",
            "https://fastapi.tiangolo.com/tutorial/security/oauth2-jwt/",
            "https://fastapi.tiangolo.com/tutorial/middleware/",
            "https://fastapi.tiangolo.com/tutorial/cors/",
            "https://fastapi.tiangolo.com/tutorial/sql-databases/",
            "https://fastapi.tiangolo.com/tutorial/background-tasks/",
            "https://fastapi.tiangolo.com/tutorial/static-files/",
            "https://fastapi.tiangolo.com/tutorial/testing/",
            "https://fastapi.tiangolo.com/tutorial/debugging/",

            # Advanced
            "https://fastapi.tiangolo.com/advanced/additional-status-codes/",
            "https://fastapi.tiangolo.com/advanced/response-directly/",
            "https://fastapi.tiangolo.com/advanced/custom-response/",
            "https://fastapi.tiangolo.com/advanced/response-cookies/",
            "https://fastapi.tiangolo.com/advanced/response-headers/",
            "https://fastapi.tiangolo.com/advanced/websockets/",
            "https://fastapi.tiangolo.com/advanced/events/",
            "https://fastapi.tiangolo.com/advanced/sub-applications/",
            "https://fastapi.tiangolo.com/advanced/middleware/",
        ],
    },

    # ======================================
    # REACT
    # ======================================
    "react": {
        "framework": "React",
        "selector": "main, article",
        "urls": [
            "https://react.dev/learn",
            "https://react.dev/learn/thinking-in-react",
            "https://react.dev/learn/describing-the-ui",
            "https://react.dev/learn/adding-interactivity",
            "https://react.dev/learn/managing-state",

            # Hooks
            "https://react.dev/reference/react/useState",
            "https://react.dev/reference/react/useEffect",
            "https://react.dev/reference/react/useContext",
            "https://react.dev/reference/react/useReducer",
            "https://react.dev/reference/react/useRef",
            "https://react.dev/reference/react/useMemo",
            "https://react.dev/reference/react/useCallback",

            # Advanced
            "https://react.dev/learn/escape-hatches",
            "https://react.dev/learn/you-might-not-need-an-effect",
            "https://react.dev/reference/react/Component",
        ],
    },

    # ======================================
    # NEXT.JS
    # ======================================
    "nextjs": {
        "framework": "Next.js",
        "selector": "main, article",
        "urls": [
            "https://nextjs.org/docs/app/building-your-application/routing",
            "https://nextjs.org/docs/app/building-your-application/routing/defining-routes",
            "https://nextjs.org/docs/app/building-your-application/routing/pages-and-layouts",
            "https://nextjs.org/docs/app/building-your-application/routing/dynamic-routes",
            "https://nextjs.org/docs/app/building-your-application/routing/loading-ui-and-streaming",

            # Data
            "https://nextjs.org/docs/app/building-your-application/data-fetching",
            "https://nextjs.org/docs/app/building-your-application/data-fetching/server-actions-and-mutations",
            "https://nextjs.org/docs/app/building-your-application/caching",

            # API
            "https://nextjs.org/docs/app/building-your-application/routing/route-handlers",
            "https://nextjs.org/docs/app/api-reference/functions/server-actions",

            # Components
            "https://nextjs.org/docs/app/api-reference/components/image",
            "https://nextjs.org/docs/app/api-reference/components/link",
            "https://nextjs.org/docs/app/building-your-application/optimizing",

            # Deployment
            "https://nextjs.org/docs/app/building-your-application/deploying",
        ],
    },

    # ======================================
    # EXPRESS
    # ======================================
    "express": {
        "framework": "Express.js",
        "selector": "#page-doc",
        "urls": [
            "https://expressjs.com/en/starter/hello-world.html",
            "https://expressjs.com/en/starter/basic-routing.html",
            "https://expressjs.com/en/guide/routing.html",
            "https://expressjs.com/en/guide/using-middleware.html",
            "https://expressjs.com/en/guide/writing-middleware.html",
            "https://expressjs.com/en/guide/error-handling.html",
            "https://expressjs.com/en/guide/database-integration.html",
            "https://expressjs.com/en/advanced/best-practice-security.html",
            "https://expressjs.com/en/advanced/best-practice-performance.html",
        ],
    },

    # ======================================
    # POSTGRESQL
    # ======================================
    "postgresql": {
        "framework": "PostgreSQL",
        "selector": ".SECT1, .SECT2",
        "urls": [
            "https://www.postgresql.org/docs/current/tutorial-select.html",
            "https://www.postgresql.org/docs/current/tutorial-join.html",
            "https://www.postgresql.org/docs/current/tutorial-agg.html",
            "https://www.postgresql.org/docs/current/tutorial-views.html",
            "https://www.postgresql.org/docs/current/tutorial-transactions.html",
            "https://www.postgresql.org/docs/current/ddl.html",
            "https://www.postgresql.org/docs/current/indexes.html",
        ],
    },

    # ======================================
    # MONGODB
    # ======================================
    "mongodb": {
        "framework": "MongoDB",
        "selector": ".body",
        "urls": [
            "https://www.mongodb.com/docs/manual/tutorial/getting-started/",
            "https://www.mongodb.com/docs/manual/tutorial/insert-documents/",
            "https://www.mongodb.com/docs/manual/tutorial/query-documents/",
            "https://www.mongodb.com/docs/manual/tutorial/update-documents/",
            "https://www.mongodb.com/docs/manual/tutorial/remove-documents/",
            "https://www.mongodb.com/docs/manual/aggregation/",
            "https://www.mongodb.com/docs/manual/indexes/",
        ],
    },

    # ======================================
    # DOCKER
    # ======================================
    "docker": {
        "framework": "Docker",
        "selector": "article",
        "urls": [
            "https://docs.docker.com/get-started/",
            "https://docs.docker.com/get-started/02_our_app/",
            "https://docs.docker.com/get-started/03_updating_app/",
            "https://docs.docker.com/get-started/04_sharing_app/",
            "https://docs.docker.com/get-started/05_persisting_data/",
            "https://docs.docker.com/get-started/06_bind_mounts/",
            "https://docs.docker.com/get-started/07_multi_container/",
            "https://docs.docker.com/compose/gettingstarted/",
        ],
    },

    # ======================================
    # AWS
    # ======================================
    "aws": {
        "framework": "AWS",
        "selector": ".awsui-util-container",
        "urls": [
            # S3
            "https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html",
            "https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html",

            # Lambda
            "https://docs.aws.amazon.com/lambda/latest/dg/getting-started.html",
            "https://docs.aws.amazon.com/lambda/latest/dg/python-handler.html",

            # EC2
            "https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html",
            "https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/launching-instance.html",

            # DynamoDB
            "https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/GettingStartedDynamoDB.html",
        ],
    },

    # ======================================
    # PANDAS
    # ======================================
    "pandas": {
        "framework": "Pandas",
        "selector": ".body",
        "urls": [
            "https://pandas.pydata.org/docs/user_guide/10min.html",
            "https://pandas.pydata.org/docs/user_guide/dsintro.html",
            "https://pandas.pydata.org/docs/user_guide/indexing.html",
            "https://pandas.pydata.org/docs/user_guide/merging.html",
            "https://pandas.pydata.org/docs/user_guide/groupby.html",
            "https://pandas.pydata.org/docs/user_guide/reshaping.html",
            "https://pandas.pydata.org/docs/user_guide/io.html",
        ],
    },

    # ======================================
    # NUMPY
    # ======================================
    "numpy": {
        "framework": "NumPy",
        "selector": "article",
        "urls": [
            "https://numpy.org/doc/stable/user/quickstart.html",
            "https://numpy.org/doc/stable/user/basics.creation.html",
            "https://numpy.org/doc/stable/user/basics.indexing.html",
            "https://numpy.org/doc/stable/user/basics.broadcasting.html",
            "https://numpy.org/doc/stable/reference/routines.array-manipulation.html",
        ],
    },

    # ======================================
    # TYPESCRIPT
    # ======================================
    "typescript": {
        "framework": "TypeScript",
        "selector": "article",
        "urls": [
            "https://www.typescriptlang.org/docs/handbook/2/basic-types.html",
            "https://www.typescriptlang.org/docs/handbook/2/everyday-types.html",
            "https://www.typescriptlang.org/docs/handbook/2/narrowing.html",
            "https://www.typescriptlang.org/docs/handbook/2/functions.html",
            "https://www.typescriptlang.org/docs/handbook/2/objects.html",
            "https://www.typescriptlang.org/docs/handbook/2/classes.html",
            "https://www.typescriptlang.org/docs/handbook/2/generics.html",
        ],
    },

    # ======================================
    # TAILWIND
    # ======================================
    "tailwind": {
        "framework": "Tailwind CSS",
        "selector": "#content-wrapper",
        "urls": [
            "https://tailwindcss.com/docs/utility-first",
            "https://tailwindcss.com/docs/responsive-design",
            "https://tailwindcss.com/docs/hover-focus-and-other-states",
            "https://tailwindcss.com/docs/dark-mode",
            "https://tailwindcss.com/docs/adding-custom-styles",
            "https://tailwindcss.com/docs/installation",
        ],
    },

    # ======================================
    # PRISMA
    # ======================================
    "prisma": {
        "framework": "Prisma ORM",
        "selector": "article",
        "urls": [
            "https://www.prisma.io/docs/getting-started/quickstart",
            "https://www.prisma.io/docs/concepts/components/prisma-schema",
            "https://www.prisma.io/docs/concepts/components/prisma-client",
            "https://www.prisma.io/docs/concepts/components/prisma-client/crud",
            "https://www.prisma.io/docs/concepts/components/prisma-client/relation-queries",
            "https://www.prisma.io/docs/guides/performance-and-optimization",
        ],
    },

    # ======================================
    # REDIS
    # ======================================
    "redis": {
        "framework": "Redis",
        "selector": ".prose",
        "urls": [
            "https://redis.io/docs/getting-started/",
            "https://redis.io/docs/data-types/strings/",
            "https://redis.io/docs/data-types/lists/",
            "https://redis.io/docs/data-types/sets/",
            "https://redis.io/docs/data-types/hashes/",
            "https://redis.io/docs/manual/patterns/",
        ],
    },
}


# Root directory for the scraped text files; the scraping loop creates one
# subfolder per framework beneath it.
BASE_DIR = "docs_dataset"
# Create the root up front; exist_ok=True keeps re-running the cell from
# failing when the directory is already there.
os.makedirs(BASE_DIR, exist_ok=True)

def slugify(text):
    """Return *text* as a lowercase, hyphen-separated, filesystem-safe slug.

    The input is lowercased, every run of characters outside ``a-z`` / ``0-9``
    collapses into a single ``-``, and hyphens at either end are removed.
    Input without any ASCII letter or digit yields an empty string.
    """
    return re.sub(r"[^a-z0-9]+", "-", text.lower()).strip("-")

# Seconds to wait on a server before giving up on a request, so that a single
# unresponsive host cannot stall the whole run.
REQUEST_TIMEOUT = 30


def _fetch_html(url):
    """Download *url* and return its HTML text, or ``None`` on failure.

    Connection errors, timeouts and HTTP error statuses (4xx/5xx) are
    reported on stdout rather than raised, so the caller can skip the page
    and carry on with the remaining URLs instead of aborting the run or
    saving an error page as if it were documentation.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"   ✖ Skipped {url}: {exc}")
        return None
    return response.text


def _unique_slug(url, taken):
    """Return a filename slug for *url* that is not yet in *taken*.

    The slug is normally the slugified last path segment. When that is
    already taken (two pages of one framework sharing a final segment, e.g.
    ``tutorial/middleware/`` and ``advanced/middleware/``), the parent
    segment is prepended; if that is taken as well, a numeric suffix is
    appended. The chosen slug is added to *taken* before returning, so the
    set is mutated in place.
    """
    parts = [part for part in url.rstrip("/").split("/") if part]
    base = (slugify(parts[-1]) if parts else "") or "index"

    candidates = [base]
    if len(parts) >= 2:
        candidates.append(slugify("-".join(parts[-2:])))

    for candidate in candidates:
        if candidate and candidate not in taken:
            taken.add(candidate)
            return candidate

    # Last resort: number the duplicates (base-2, base-3, ...).
    counter = 2
    while f"{base}-{counter}" in taken:
        counter += 1
    slug = f"{base}-{counter}"
    taken.add(slug)
    return slug


# Download every configured page and store its text as
# <BASE_DIR>/<framework-slug>/<page-slug>.txt, with the page title on the
# first line followed by a blank line and the extracted content.
for cfg in DOCUMENTATION_SOURCES.values():
    framework = cfg["framework"]
    selector = cfg["selector"]
    urls = cfg["urls"]

    print(f"\n📘 Processing framework: {framework}")

    # One output folder per framework.
    folder = os.path.join(BASE_DIR, slugify(framework))
    os.makedirs(folder, exist_ok=True)

    # Slugs already used in this folder, so a later page can never silently
    # overwrite an earlier one that shares its last URL segment.
    used_slugs = set()

    for url in urls:
        print(f" → Fetching {url}")

        html = _fetch_html(url)
        if html is None:
            continue

        soup = BeautifulSoup(html, "html.parser")

        # Page title: first <h1> if present, otherwise the URL itself.
        h1 = soup.find("h1")
        title = h1.get_text(strip=True) if h1 else url

        # Main content via the framework's selector. Falling back to the
        # whole page is kept as a best effort, but it is reported because it
        # pulls navigation/footer text into the dataset.
        content = soup.select_one(selector)
        if content is None:
            print(f"   ⚠ Selector {selector!r} matched nothing; using full page text")
            content = soup
        text = content.get_text(separator="\n", strip=True)

        slug = _unique_slug(url, used_slugs)
        filepath = os.path.join(folder, f"{slug}.txt")

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(title + "\n\n" + text)

        print(f"   ✔ Saved → {filepath}")


📘 Processing framework: FastAPI
 → Fetching https://fastapi.tiangolo.com/installation/
   ✔ Saved → docs_dataset/fastapi/installation.txt
 → Fetching https://fastapi.tiangolo.com/python-types/
   ✔ Saved → docs_dataset/fastapi/python-types.txt
 → Fetching https://fastapi.tiangolo.com/async/
   ✔ Saved → docs_dataset/fastapi/async.txt
 → Fetching https://fastapi.tiangolo.com/virtual-environments/
   ✔ Saved → docs_dataset/fastapi/virtual-environments.txt
 → Fetching https://fastapi.tiangolo.com/fastapi-cli/
   ✔ Saved → docs_dataset/fastapi/fastapi-cli.txt
 → Fetching https://fastapi.tiangolo.com/deployment/server-workers/
   ✔ Saved → docs_dataset/fastapi/server-workers.txt
 → Fetching https://fastapi.tiangolo.com/deployment/docker/
   ✔ Saved → docs_dataset/fastapi/docker.txt
 → Fetching https://fastapi.tiangolo.com/tutorial/first-steps/
   ✔ Saved → docs_dataset/fastapi/first-steps.txt
 â†’ Fetching https://fastapi.tiangolo.com/tuto