In [26]:
from langchain_core.documents import Document
from langchain_community.document_loaders import WebBaseLoader
from urllib.parse import urljoin, urlparse


In [25]:
url = "https://aitutor.liduos.com/"

# html_loader = AsyncHtmlLoader(web_path=url)
# document = html_loader.load()[0]

In [20]:
# from langchain.document_loaders import WebBaseLoader

# # def get_same_domain_links(url):
#     # Load the webpage
# loader = WebBaseLoader(url)
# document = loader.load()[0] # 返回的是markdown格式的文档
# url = "https://aitutor.liduos.com/"


In [34]:
from typing import Iterator
from bs4 import BeautifulSoup
from loguru import logger

def _build_metadata(soup, url: str) -> dict:
    """Build metadata from BeautifulSoup output."""
    metadata = {"source": url}
    if title := soup.find("title"):
        metadata["title"] = title.get_text()
    if description := soup.find("meta", attrs={"name": "description"}):
        metadata["description"] = description.get("content", "No description found.")
    if html := soup.find("html"):
        metadata["language"] = html.get("lang", "No language found.")
    return metadata

class WebRecursiveLoader(WebBaseLoader):
    def __init__(self, root_url:str, depth: int = 1, **kwargs):
        super().__init__(web_path=root_url, **kwargs)
        self.domain = urlparse(root_url).netloc
        self.depth = depth

    def lazy_load(self) -> Iterator[Document]:
        loaded_set = set()
        def __inner_load__(url: str, depth: int):
            if depth < 0:
                return
            soup:BeautifulSoup = self._scrape(url, bs_kwargs=self.bs_kwargs)
            text = soup.get_text(**self.bs_get_text_kwargs)
            metadata = _build_metadata(soup, url)
            yield Document(page_content=text, metadata=metadata)
            loaded_set.add(url)
            links = soup.find_all("a", href=True)
            for link in links:
                full_url = urljoin(url, link["href"])
                parsed_url = urlparse(full_url)
                if parsed_url.netloc == self.domain and full_url not in loaded_set:
                    yield from __inner_load__(full_url, depth - 1)
        yield from __inner_load__(self.web_path, self.depth)


In [28]:
loader = WebRecursiveLoader(root_url=url, depth=2)
documents = loader.load()

In [29]:
len(documents)

42

In [33]:
documents[3].metadata

{'source': 'https://aitutor.liduos.com/01-llm/01-3.html',
 'title': 'OpenAI 文档解读 · LLM 应用开发实践笔记',
 'description': '',
 'language': 'zh-hans'}

In [35]:
try:
    loader = WebRecursiveLoader(root_url=url, depth=3, requests_per_second=10)
    documents = loader.load()
except Exception as e:
    logger.exception(e)

In [37]:
len(documents),documents[1].metadata, documents[2].metadata, documents[3].metadata, documents[4].metadata

(42,
 {'source': 'https://aitutor.liduos.com/01-llm/01-1.html',
  'title': '大语言模型概况 · LLM 应用开发实践笔记',
  'description': '',
  'language': 'zh-hans'},
 {'source': 'https://aitutor.liduos.com/01-llm/01-2.html',
  'title': '你好, ChatGPT · LLM 应用开发实践笔记',
  'description': '',
  'language': 'zh-hans'},
 {'source': 'https://aitutor.liduos.com/01-llm/01-3.html',
  'title': 'OpenAI 文档解读 · LLM 应用开发实践笔记',
  'description': '',
  'language': 'zh-hans'},
 {'source': 'https://aitutor.liduos.com/01-llm/01-4.html',
  'title': '动手实现聊天机器人 · LLM 应用开发实践笔记',
  'description': '',
  'language': 'zh-hans'})

In [39]:
documents[3].page_content

'\n\n\n\n\n\nOpenAI 文档解读 · LLM 应用开发实践笔记\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n作者：莫尔索\n\n\n\n\n            \n                    \n                    前言\n            \n                \n\n\n\n            \n                    \n                    关注《莫尔索随笔》\n            \n                \n\n\n\n            \n                    \n                    大语言模型概述\n            \n                \n\n\n\n            \n                    \n                    大语言模型概况\n            \n                \n\n\n\n            \n                    \n                    你好, ChatGPT\n            \n                \n\n\n\n            \n                    \n                    OpenAI 文档解读\n            \n                \n\n\n\n            \n                    \n                    动手实现聊天机器人\n            \n                \n\n\n\n            \n                    \n                    基于 OpenAI API 搭建一个端到端问答系统\n            \n                \n\n\n\n            \n            