In [None]:
import os
import openai
import sys
from dotenv import load_dotenv, find_dotenv

import numpy as np


#Embedding
from langchain_openai import OpenAIEmbeddings

#Vector Database
from langchain_chroma import Chroma

#Loader
from langchain_community.document_loaders import PyPDFLoader

#Splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

#Self-Query Retriever
from langchain_openai import OpenAI

from langchain_openai import ChatOpenAI
from langchain_classic.chains.query_constructor.schema import AttributeInfo
from langchain_classic.retrievers.self_query.base import SelfQueryRetriever



In [56]:
sys.path.append('../..')
_ = load_dotenv(find_dotenv()) 
api_key = os.environ['OPENAI_API_KEY']

In [57]:
embedding = OpenAIEmbeddings()

In [58]:
vectordb = Chroma(
    collection_name="example_collection",
    embedding_function= embedding,
    persist_directory="./database/chroma_langchain_db",
)

loaders = [
    PyPDFLoader("./docs/2208.01758v4.pdf"),
    PyPDFLoader("./docs/G-CNN.pdf"),
    PyPDFLoader("./docs/G-CNN_for_NQS.pdf"),
    PyPDFLoader("./docs/information-15-00517.pdf"),
    PyPDFLoader("./docs/RBM_paper_hinton.pdf"),
    PyPDFLoader("./docs/Solving the quantum many-body problem with artificial neural networks(2017).pdf")
]

docs = []
for loader in loaders:
    docs.extend(loader.load())


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

splits = text_splitter.split_documents(docs)

vectordb.add_documents(documents=splits)

  docs = []


['df02e527-e44d-4200-9a17-2dc7eb8096ff',
 '3fb10489-fe91-41a9-9a86-432c51a9a59c',
 'd4e05bc3-81b7-406e-b196-bb07f5f6174c',
 '88205817-010e-49a5-89ec-79fc5a498478',
 '8ed028cb-56e2-4cf7-8be6-5d1d6d5611bf',
 'd31887e9-aae1-4dcd-9137-5244f0487124',
 'ca825498-6cb8-413b-91a3-f6ec47ea3eb3',
 'fba151ad-a356-4c86-b3df-a19aeb6a043e',
 'a2eafc38-3187-4aee-ae34-aba7db11bc04',
 '7a736f38-f1d6-4169-be24-e742d190edd5',
 '9e8c326b-bf78-448e-ac39-6f3ed3ecc233',
 '4b995328-6c03-4ab5-b81b-20aa46505bf4',
 '4a9545e7-8016-49c9-8a4b-1da95671b8f8',
 'ed6191e0-86e0-4660-b5f4-caa3027f03c8',
 '3d5360b7-317b-4fe0-874f-2bf8c359d5b1',
 '198011ae-b008-4187-80a1-78e9c4e179e4',
 '39157076-0837-4280-8150-96695b4dbd67',
 'f0c1c73a-2fa2-4c99-8aa8-b27536536f7d',
 'd47a1a43-c04d-466c-84ad-5a2db7bedb49',
 '99095482-0394-48a2-b489-9443fee57a03',
 '767e1a5d-8bb3-4e55-8202-9b87cfa139f9',
 'c8304ebc-05c8-44a3-afb4-b48e684efb54',
 '5f87de27-b5ed-4511-bd7c-2697a2764562',
 '1b900e58-fe2a-432b-a1fd-550636b184f5',
 '131e7db7-7b41-

In [59]:
print(vectordb._collection.count())

1782


In [60]:
texts = ["""ML helps computers find patterns in data and make predictions automatically.""",
"""Training a model means feeding it labeled examples until it can guess new answers on its own.""",
"""After deployment you monitor the model and retrain if incoming data shifts.""",
"""Sport is for your body important"""]

In [61]:
smalldb = Chroma.from_texts(texts, embedding=embedding)

In [62]:
question = "Tell me about training in ML"

In [63]:
smalldb.similarity_search(question, k = 2)

[Document(id='5f1445e7-2b2b-40b1-8e6d-7a7fa307d001', metadata={}, page_content='ML helps computers find patterns in data and make predictions automatically.'),
 Document(id='cd23acd9-2230-4acf-b6ba-8bde0fd37756', metadata={}, page_content='ML helps computers find patterns in data and make predictions automatically.')]

In [64]:
smalldb.max_marginal_relevance_search(question, k=2, fetch_k=3)

[Document(id='5f1445e7-2b2b-40b1-8e6d-7a7fa307d001', metadata={}, page_content='ML helps computers find patterns in data and make predictions automatically.'),
 Document(id='cd23acd9-2230-4acf-b6ba-8bde0fd37756', metadata={}, page_content='ML helps computers find patterns in data and make predictions automatically.')]

In [65]:
question = "what is G-CNN?"
docs_ss = vectordb.similarity_search(question,k=3)

In [66]:
docs_ss[0].page_content[:400]

'a group may not be feasible if the group is large.\nFinally, we hope that the current work can serve as a con-\ncrete example of the general philosophy of “structured rep-\nresentations”, outlined in section 2. We believe that adding\nmathematical structure to a representation (making sure\nthat maps between representations preserve this structure),\ncould enhance the ability of neural nets to see abstr'

In [67]:
docs_ss[1].page_content[200:500]

'tion 2. We believe that adding\nmathematical structure to a representation (making sure\nthat maps between representations preserve this structure),\ncould enhance the ability of neural nets to see abstract sim-\nilarities between superﬁcially different concepts.\n10. Conclusion\nWe have introduced G-CNNs'

In [68]:
docs_mmr = vectordb.max_marginal_relevance_search(question,k=3)

In [69]:
docs_mmr[0].page_content[:100]

'a group may not be feasible if the group is large.\nFinally, we hope that the current work can serve '

In [70]:
docs_mmr[1].page_content[:100]

'Group Equivariant Convolutional Networks\nTaco S. Cohen T.S.COHEN @UVA.NL\nUniversity of Amsterdam\nMax'

#### Working with metadata

In [71]:
question = "Tell me more about the NQS"

In [72]:
docs = vectordb.similarity_search(
    question, 
    k = 3, 
    filter={"source":"./docs/Solving the quantum many-body problem with artificial neural networks(2017).pdf"}
)

In [73]:
for d in docs:
    print(d.metadata)

{'source': './docs/Solving the quantum many-body problem with artificial neural networks(2017).pdf', 'creator': 'Arbortext Advanced Print Publisher 9.1.440/W Unicode', 'moddate': '2025-11-02T03:00:31-08:00', 'page_label': '4', 'producer': 'Acrobat Distiller 15.0 (Windows); modified using iText 4.2.0 by 1T3XT', 'creationdate': '2017-02-07T05:01:49+08:00', 'subject': 'Science 2017.355:602-606', 'title': 'Solving the quantum many-body problem with artificial neural networks', 'total_pages': 4, 'page': 3}
{'creationdate': '2017-02-07T05:01:49+08:00', 'title': 'Solving the quantum many-body problem with artificial neural networks', 'producer': 'Acrobat Distiller 15.0 (Windows); modified using iText 4.2.0 by 1T3XT', 'source': './docs/Solving the quantum many-body problem with artificial neural networks(2017).pdf', 'total_pages': 4, 'creator': 'Arbortext Advanced Print Publisher 9.1.440/W Unicode', 'page_label': '4', 'page': 3, 'subject': 'Science 2017.355:602-606', 'moddate': '2025-11-02T03:

In [74]:
metadata_field_info = [
    AttributeInfo(
        name="title of paper",
        description="This is the title of the paper that was published",
        type="integer",
    ),
]

In [75]:

document_content_description = "The full text of academic research papers"

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectordb,
    document_contents=document_content_description,  # <- YES for langchain_classic
    metadata_field_info=metadata_field_info,
    verbose=True,
)

question = "What architectures have they used for NQS?"
docs = await retriever.ainvoke(question)
for doc in docs:
    print(doc.metadata)

{'creator': 'Arbortext Advanced Print Publisher 9.1.440/W Unicode', 'producer': 'Acrobat Distiller 15.0 (Windows); modified using iText 4.2.0 by 1T3XT', 'moddate': '2025-11-02T03:00:31-08:00', 'total_pages': 4, 'subject': 'Science 2017.355:602-606', 'page': 3, 'source': './docs/Solving the quantum many-body problem with artificial neural networks(2017).pdf', 'title': 'Solving the quantum many-body problem with artificial neural networks', 'creationdate': '2017-02-07T05:01:49+08:00', 'page_label': '4'}
{'creator': 'Arbortext Advanced Print Publisher 9.1.440/W Unicode', 'subject': 'Science 2017.355:602-606', 'creationdate': '2017-02-07T05:01:49+08:00', 'source': './docs/Solving the quantum many-body problem with artificial neural networks(2017).pdf', 'producer': 'Acrobat Distiller 15.0 (Windows); modified using iText 4.2.0 by 1T3XT', 'title': 'Solving the quantum many-body problem with artificial neural networks', 'page_label': '4', 'moddate': '2025-11-02T03:00:31-08:00', 'total_pages': 