In [1]:
from langchain.retrievers import ParentDocumentRetriever

from util import DiskStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Splitter that cuts each parent document into child chunks (chunk_size=400)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# Vector store that indexes the embedded child chunks; it lives in the
# "chunks" collection under ./data/chroma
vectorstore = Chroma(
    collection_name="chunks",
    embedding_function=OpenAIEmbeddings(),
    persist_directory="./data/chroma",
)

# Storage layer that keeps the full parent documents
doc_store = DiskStore("./data/doc_store")

# Retriever that ties the splitter, the chunk index and the parent store together
retriever = ParentDocumentRetriever(
    child_splitter=child_splitter,
    docstore=doc_store,
    vectorstore=vectorstore,
)

# Source files to ingest; one loader per file.
loaders = [
    TextLoader("synth.txt"),
    TextLoader("musicgen.txt"),
]
# Flatten the documents produced by every loader into one list.
docs = [doc for loader in loaders for doc in loader.load()]

# The Chroma collection is persisted (persist_directory) and the parent store
# appears to be disk-backed as well. add_documents(ids=None) assigns fresh ids
# on each call, so re-running this cell would index the same files again as
# duplicates. Ingest only when the parent store has no keys yet.
# NOTE: to re-ingest (e.g. after editing the source files), clear ./data first.
# NOTE(review): this checks only the parent store; if ./data/chroma is wiped
# on its own, clear ./data/doc_store too so both stores are rebuilt together.
if next(iter(doc_store.yield_keys()), None) is None:
    retriever.add_documents(docs, ids=None)



In [16]:
# Sanity check: look this id up directly in the vector store. The id is one of
# the doc_store keys (a parent document) and the lookup comes back empty, so
# the vector store is presumably keyed by separate per-chunk ids; chunks seem
# to reference their parent through the `doc_id` metadata field instead.
# NOTE(review): the id is hard-coded from one ingest run; a freshly built store
# will likely have different ids - confirm before relying on this cell.
vectorstore.get("3947a49f-38d4-4719-8970-b0578c86857f")

{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [2]:
# List every key currently held in the parent-document store.
list(doc_store.yield_keys())



['3947a49f-38d4-4719-8970-b0578c86857f',
 '53555131-5962-47b6-b4c2-59b9a9002114']

In [5]:
# Query the vector store directly (bypassing the retriever) to see which
# child chunks match the query.
sub_docs = vectorstore.similarity_search("music generation")

In [6]:
# Show the first matching chunk; its metadata carries the `doc_id` of the
# parent document alongside the `source` file.
sub_docs[0]

Document(metadata={'doc_id': '3947a49f-38d4-4719-8970-b0578c86857f', 'source': 'musicgen.txt'}, page_content='---\ntitle: "生成音樂的夢"\ndate: 2022-06-10T23:34:05+08:00\ndraft: false\nimage: "https://i.imgur.com/64Dk0jO.png"\ncategories: music\nsummary: "音樂像文字，還是圖像?"\n---\n![Image](https://i.imgur.com/64Dk0jO.png#center)\n\n夢的意思是我根本不知道做不做得出來。\n\n## 音樂像文字，還是圖像?')

In [5]:
# Run the same kind of query through the retriever rather than the raw vector store.
# NOTE(review): the "Failed to multipart ingest runs ... 401 ... Invalid token"
# messages printed by this cell come from LangSmith trace upload, not from the
# retrieval itself (results are still returned). Presumably tracing is enabled
# with a stale API key - fix the key or disable tracing in the environment.
retrieved_docs = retriever.invoke("wave equation")

Failed to multipart ingest runs: langsmith.utils.LangSmithAuthError: Authentication failed for https://api.smith.langchain.com/runs/multipart. HTTPError('401 Client Error: Unauthorized for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Invalid token"}')trace=b8f7d5f3-6dcb-4492-8f7d-aabdab8330c0,id=b8f7d5f3-6dcb-4492-8f7d-aabdab8330c0


Failed to multipart ingest runs: langsmith.utils.LangSmithAuthError: Authentication failed for https://api.smith.langchain.com/runs/multipart. HTTPError('401 Client Error: Unauthorized for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Invalid token"}')trace=b8f7d5f3-6dcb-4492-8f7d-aabdab8330c0,id=b8f7d5f3-6dcb-4492-8f7d-aabdab8330c0


In [7]:
# Show the first retrieved document: here the metadata holds only `source` and
# the page content is the long parent text rather than a short child chunk.
retrieved_docs[0]

Document(metadata={'source': 'synth.txt'}, page_content='---\ntitle: A Physics Based Synth for Piano\ndate: 2024-11-26\nauthors:\n  - eri24816\nimage: https://i.imgur.com/1dGGCaN.png\ndraft: false\ntags:\n  - audio\n  - Juce\n  - Cpp\ncategories: Uncategorized\nseries: \nsummary:\n---\nI really like the sounds of piano. They are a subset of all possible sound waves, with some specific mathematical characteristics which make them sound bright but gentle at the same time. I\'ve been trying to understand what\'s the magic inside the sounds of piano from i was maybe 12 till now, but I guess my math is still too bad to actually understand it from a fundamental aspect.\n\nRecently I try to make a synthesizer for piano by directly simulate the vibration of piano string and sample audio from it. Although the result sounds not actually like a real piano, the journey of implementing the synth is interesting enough to be written down here.\n\n# The Model of a String\n\nThe simplest form of the eq