# Scraper

In [14]:
from bs4 import BeautifulSoup as bs
import requests
import json
import os
from tqdm import tqdm
from typing import Literal

In [15]:
# Documentation links
# Module index page of the Python 3.12 documentation; this is the page that is
# downloaded and parsed to build the module index.
DOCS_INDEX_URL = "https://docs.python.org/3.12/py-modindex.html"
# Prefix that is prepended to the hrefs found in the index to build each
# module's "link" value.
DOCS_URL_PREFIX = "https://docs.python.org/3.12/"

# FileSystem paths
# Folder under which the cached index JSON and the scraped text files are stored.
SAVE_FOLDER = "../data"

## Utils

In [16]:
def save_json(path: str, data: dict):
    """Serialize ``data`` as indented JSON into the file at ``path``.

    Missing parent directories are created first, so saving into a folder
    that does not exist yet no longer fails with ``FileNotFoundError``.

    Args:
        path: Destination file path; an existing file is overwritten.
        data: JSON-serializable mapping to store.
    """
    parent = os.path.dirname(path)
    # A bare file name has an empty dirname, and os.makedirs("") would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)


def load_json(path: str) -> dict:
    """Read the UTF-8 encoded file at ``path`` and return its parsed JSON content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [17]:
def load_context(url: str, timeout: float = 30.0) -> bs:
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would hang the whole scraping run.

    Returns:
        The response text parsed with the ``html.parser`` backend.

    Raises:
        RuntimeError: If the response status code is not 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"Response code is {response.status_code}")
    return bs(response.text, "html.parser")

## Keys loading

In [None]:
class ModulesIndex:
    """Index of documented modules, scraped from the module index page and cached as JSON.

    The index maps a module name to a dict with two keys: ``"link"`` (the
    module's documentation URL) and ``"description"`` (the text of the
    description cell). The result is cached in a JSON file under
    ``SAVE_FOLDER`` whose name encodes the two filter settings, so each
    combination of settings gets its own cache file.
    """

    def __init__(
        self, skip_platform_specific: bool = True, skip_deprecated: bool = True
    ) -> None:
        """Configure the filters and immediately load (or scrape) the index.

        Args:
            skip_platform_specific: Skip rows whose module cell contains an
                ``<em>`` element with text starting with ``"("``.
            skip_deprecated: Skip rows whose description starts with
                ``"Deprecated"``.
        """
        self._skip_platform_specific = skip_platform_specific
        self._skip_deprecated = skip_deprecated

        self._name = "index"
        # Both flags are encoded as 0/1 so every settings combination maps to
        # a distinct cache file name.
        self._settings = (
            f"{int(self._skip_platform_specific)}_{int(self._skip_deprecated)}"
        )
        self._path = os.path.join(
            SAVE_FOLDER,
            f"{self._name}_{self._settings}.json",
        )

        self._load()

    def _load(self):
        """Populate ``self._data`` from the cache file, scraping it first if the file is missing."""
        if os.path.exists(self._path):
            self._data = load_json(self._path)
            return
        self._data = self._scrap(load_context(DOCS_INDEX_URL))
        # The cache folder may not exist on a first run; create it so the
        # freshly scraped index is not lost to a FileNotFoundError on save.
        folder = os.path.dirname(self._path)
        if folder:
            os.makedirs(folder, exist_ok=True)
        save_json(self._path, self._data)

    def _scrap(self, soup: bs) -> dict[str, dict[str, str]]:
        """Extract the module entries from the parsed index page.

        Args:
            soup: Parsed module index page.

        Returns:
            Mapping of module name to ``{"link": ..., "description": ...}``.
        """
        data: dict[str, dict[str, str]] = {}
        for row in soup.select("table > tr"):
            cells = row.select("td")
            # Only rows with exactly three cells carry a module entry; skip
            # anything else instead of failing on the tuple unpacking below.
            if len(cells) != 3:
                continue
            _, module_info, desc_info = cells

            platform_specific_part = module_info.select_one("em")
            if self._skip_platform_specific and (
                platform_specific_part is not None
                and platform_specific_part.text.startswith("(")
            ):
                continue

            # Rows without a link in the module cell have nothing to index.
            module_a = module_info.select_one("a")
            if module_a is None:
                continue

            # Names with a leading underscore are left out of the index.
            module_name: str = module_a.text
            if module_name.startswith("_"):
                continue

            module_ref = module_a.get("href")
            if module_ref is None:
                continue

            desc: str = desc_info.text.strip()
            if not desc:
                continue
            if self._skip_deprecated and desc.startswith("Deprecated"):
                continue

            data[module_name] = {
                "link": f"{DOCS_URL_PREFIX}{module_ref}",
                "description": desc,
            }
        return data

    @property
    def data(self) -> dict[str, dict[str, str]]:
        """Mapping of module name to its ``"link"`` and ``"description"``."""
        return self._data

    @property
    def settings(self) -> str:
        """Filter settings encoded as ``"<skip_platform_specific>_<skip_deprecated>"`` with 0/1 values."""
        return self._settings

    def __len__(self) -> int:
        """Return the number of indexed modules."""
        return len(self._data)


# Build the index with the default filters; the constructor loads the cached
# JSON when it exists and scrapes the documentation otherwise.
modules_index = ModulesIndex()

# Show how many modules ended up in the index.
print(f"{len(modules_index)=}")

len(modules_index)=241


## Modules Scraper

In [None]:
class ModulesScrapper:
    """Scrape object descriptions from every indexed module page into text files.

    For each module in the index the page is downloaded, the elements matching
    the requested object kinds are selected, and one ``<full name>.txt`` file
    per documented signature is written into a folder under
    ``SAVE_FOLDER/scrapped``. The folder name encodes the selected kinds and
    the index settings.
    """

    def __init__(
        self,
        modules_index: ModulesIndex,
        include: "list[Literal['class', 'function', 'exception', 'data']] | None" = None,
    ) -> None:
        """Prepare the selector and the output folder.

        Args:
            modules_index: Index providing the module names and links to scrape.
            include: Kinds of documented objects to collect; each one becomes a
                ``.py.<kind>`` CSS selector. Defaults to ``["function"]``.

        Raises:
            RuntimeError: If ``include`` is empty.
        """
        # Copy the argument (and avoid a mutable default) so the instance
        # never shares its list with the caller or with other instances.
        self._include = list(include) if include is not None else ["function"]
        self._modules_index = modules_index
        self._length = 0

        self._build_selector()

        self._name = f"{'_'.join(sorted(self._include))}__{self._modules_index.settings}"
        self._path = os.path.join(SAVE_FOLDER, "scrapped", self._name)
        os.makedirs(self._path, exist_ok=True)

    def _build_selector(self):
        """Build the CSS selector that matches every requested object kind."""
        if not self._include:
            raise RuntimeError("Selector is empty!")
        self._selector = ", ".join(f".py.{selector}" for selector in self._include)

    def _save(self, name: str, data: str):
        """Write ``data`` to ``<name>.txt`` inside the output folder."""
        with open(
            os.path.join(self._path, f"{name}.txt"),
            "w",
            encoding="utf-8",
        ) as f:
            f.write(data)

    def _scrap_and_save(self):
        """Download every indexed module page and save one text file per signature."""
        # The folder is created in __init__, but make sure it still exists so
        # a forced re-scrape works even if it was removed in the meantime.
        os.makedirs(self._path, exist_ok=True)

        modules_data = self._modules_index.data
        for module_name, info in tqdm(
            modules_data.items(), desc="Scrapping", total=len(modules_data)
        ):
            module_context = load_context(info["link"])

            # TODO: add classes handling

            for element in module_context.select(self._selector):
                # Use only the element's direct <dd> children: a descendant
                # search would also return nested <dd> tags, whose text is
                # already contained in the outer <dd>, duplicating it.
                description = "\n".join(
                    dd.text.strip() for dd in element.find_all("dd", recursive=False)
                )
                for signature in element.select("dt.sig-object"):
                    # Name
                    pre_name = signature.select_one("span.sig-prename")
                    name = signature.select_one("span.sig-name")

                    # Signatures lacking either name part are skipped.
                    if pre_name is None or name is None:
                        continue
                    full_name = f"{pre_name.text.strip()}{name.text.strip()}"

                    # Parameters
                    parameters_info = ", ".join(
                        param.text for param in signature.select("em.sig-param")
                    )

                    # Save
                    self._save(
                        full_name,
                        f"{full_name} FROM {module_name}\n\nPARAMETERS\n{parameters_info}\n\nDESCRIPTION\n{description}",
                    )

    def load(
        self,
        force: bool = False,
    ):
        """Scrape the documentation if needed and record the number of saved files.

        Args:
            force: Scrape again even when the output folder already has files.
        """
        if force or not os.path.exists(self._path) or not os.listdir(self._path):
            self._scrap_and_save()
        self._length = len(os.listdir(self._path))

    def __len__(self) -> int:
        """Return the file count recorded by the last ``load`` call (0 before it)."""
        return self._length


# Scrape with the default settings; load() only downloads when the output
# folder is missing or empty.
scrapper = ModulesScrapper(modules_index)
scrapper.load()

# Show how many files are in the output folder.
print(f"{len(scrapper)=}")

len(scrapper)=2018
