In [1]:
from langchain.retrievers import ParentDocumentRetriever

from util import DiskStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


# This text splitter is used to create the child documents
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="chunks", embedding_function=OpenAIEmbeddings(), persist_directory="./data/chroma"
)
# The storage layer for the parent documents
doc_store = DiskStore("./data/doc_store")
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=doc_store,
    child_splitter=child_splitter,
)

loaders = [
    TextLoader("synth.txt"),
    TextLoader("musicgen.txt"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())
retriever.add_documents(docs, ids=None)



In [16]:
vectorstore.get("3947a49f-38d4-4719-8970-b0578c86857f")

{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [2]:
list(doc_store.yield_keys())



['3947a49f-38d4-4719-8970-b0578c86857f',
 '53555131-5962-47b6-b4c2-59b9a9002114']

In [5]:
sub_docs = vectorstore.similarity_search("music generation")

In [6]:
sub_docs[0]

Document(metadata={'doc_id': '3947a49f-38d4-4719-8970-b0578c86857f', 'source': 'musicgen.txt'}, page_content='---\ntitle: "生成音樂的夢"\ndate: 2022-06-10T23:34:05+08:00\ndraft: false\nimage: "https://i.imgur.com/64Dk0jO.png"\ncategories: music\nsummary: "音樂像文字，還是圖像?"\n---\n![Image](https://i.imgur.com/64Dk0jO.png#center)\n\n夢的意思是我根本不知道做不做得出來。\n\n## 音樂像文字，還是圖像?')

In [5]:
retrieved_docs = retriever.invoke("wave equation")

Failed to multipart ingest runs: langsmith.utils.LangSmithAuthError: Authentication failed for https://api.smith.langchain.com/runs/multipart. HTTPError('401 Client Error: Unauthorized for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Invalid token"}')trace=b8f7d5f3-6dcb-4492-8f7d-aabdab8330c0,id=b8f7d5f3-6dcb-4492-8f7d-aabdab8330c0


Failed to multipart ingest runs: langsmith.utils.LangSmithAuthError: Authentication failed for https://api.smith.langchain.com/runs/multipart. HTTPError('401 Client Error: Unauthorized for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Invalid token"}')trace=b8f7d5f3-6dcb-4492-8f7d-aabdab8330c0,id=b8f7d5f3-6dcb-4492-8f7d-aabdab8330c0


In [4]:
from langchain_community.document_loaders import WebBaseLoader
WebBaseLoader(web_paths=("https://www.google.com/search?q=eri24816",)).load()


[Document(metadata={'source': 'https://www.google.com/search?q=eri24816', 'title': 'eri24816 - Google 搜尋', 'language': 'zh-TW'}, page_content="eri24816 - Google 搜尋Google×如果系統沒有在數秒鐘後將您重新導向，請按一下這裡。    全部圖片影片地圖 新聞 購物 書籍 搜尋工具    不限語言不限語言搜尋所有中文網頁搜尋繁體中文網頁不限時間不限時間 過去 1 小時 過去 24 小時 過去 1 週 過去 1 個月 過去 1 年所有結果所有結果一字不差查看以下內容的搜尋結果：eri24816歌曲： Level Up、1205和0904eri24816's blogeri24816.github.io希望在大學的壓力下不要丟失我的初衷。 那就是：我喜愛這個世界，所以想以最high level 的角度理解世界，幫各種我喜歡的東西建立模型。例如音樂生成模型、style\xa0...eri24816's blogeri24816.tw我想以最high level 的角度理解世界，幫各種我喜歡的東西建立模型。例如音樂生成模型、style transfer、物理模擬。 ... Listen to my music on spotify!程品奕 eri24816 - GitHubgithub.com › eri24816Build a model for anything. No model is just a bad model. 44 followers · 32 following. NCKU Computer Science. Taiwan; https://eri24816.tw/ · @eri24816\xa0...幫各種我喜歡的東西建立模型。例如音樂生成模型 - eri24816's blogeri24816.tw › ...我想以最high level 的角度理解世界，幫各種我喜歡的東西建立模型。例如音樂生成模型、style transfer、物理模擬。 ... Listen to my music on spotify!Eric Chen (@eri24816) • Threads, Say moreww

In [19]:
from typing import Iterator, Any


from bs4 import BeautifulSoup
from langchain_core.documents import Document

from bs4 import BeautifulSoup, NavigableString, CData, Tag

from typing import Optional, Union


def _build_metadata(soup: Any, url: str) -> dict:
    """Build metadata from BeautifulSoup output."""
    metadata = {"source": url}
    if title := soup.find("title"):
        metadata["title"] = title.get_text()
    if description := soup.find("meta", attrs={"name": "description"}):
        metadata["description"] = description.get("content", "No description found.")
    if html := soup.find("html"):
        metadata["language"] = html.get("lang", "No language found.")
    return metadata


class MyBeautifulSoup(BeautifulSoup):
    def _all_strings(self, strip=False, types=(NavigableString, CData)):
        for descendant in self.descendants:
            # return "a" string representation if we encounter it
            if isinstance(descendant, Tag) and descendant.name == "a":
                yield str(descendant)

            # skip an inner text node inside "a"
            if (
                isinstance(descendant, NavigableString)
                and descendant.parent.name == "a"
            ):
                continue

            # default behavior
            if (types is None and not isinstance(descendant, NavigableString)) or (
                types is not None and (type(types) is not object ) and type(descendant) not in types
            ):
                continue

            if strip:
                descendant = descendant.strip()
                if len(descendant) == 0:
                    continue
            yield descendant




class MyWebLoader(WebBaseLoader):
    def _scrape(
        self,
        url: str,
        parser: Union[str, None] = None,
        bs_kwargs: Optional[dict] = None,
    ) -> MyBeautifulSoup:
    
        if parser is None:
            if url.endswith(".xml"):
                parser = "xml"
            else:
                parser = self.default_parser
    
        self._check_parser(parser)
    
        html_doc = self.session.get(url, **self.requests_kwargs)
        if self.raise_for_status:
            html_doc.raise_for_status()
    
        if self.encoding is not None:
            html_doc.encoding = self.encoding
        elif self.autoset_encoding:
            html_doc.encoding = html_doc.apparent_encoding
        return MyBeautifulSoup(html_doc.text, parser, **(bs_kwargs or {}))
    
    def lazy_load(self) -> Iterator[Document]:
        for path in self.web_paths:
            soup: MyBeautifulSoup = self._scrape(path, bs_kwargs=self.bs_kwargs)
            text = soup.get_text(**self.bs_get_text_kwargs)
            metadata = _build_metadata(soup, path)
            yield Document(page_content=text, metadata=metadata)


In [20]:
list(
MyWebLoader(web_paths=("https://www.google.com/search?q=eri24816",)).load())

TypeError: sequence item 1: expected str instance, Tag found