In [85]:
import os
import openai
import sys
from dotenv import load_dotenv, find_dotenv

import numpy as np


#Embedding
from langchain_openai import OpenAIEmbeddings

#Vector Database
from langchain_chroma import Chroma

#Loader
from langchain_community.document_loaders import PyPDFLoader

#Splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [86]:
sys.path.append('../..')
_ = load_dotenv(find_dotenv()) 
api_key = os.environ['OPENAI_API_KEY']

In [87]:
embeddings_model = OpenAIEmbeddings()

In [88]:
sentence1 = "Neural Networks Quantum State (NQS) has halp us to understand the quantum many body systems better, such as finding ground state"
sentence2 = "We like to apply CNN architecture to the NQS model, because we believe convolution based architecture is good"
sentence3 = "This building has a really good architecture, I think I might buy it"
sentence4 = "Quantum Monte carlo is statistical approach for many body problems"

In [89]:
embedded_1 = embeddings_model.embed_query(sentence1)
embedded_2 = embeddings_model.embed_query(sentence2)
embedded_3 = embeddings_model.embed_query(sentence3)
embedded_4 = embeddings_model.embed_query(sentence4)

In [90]:
def cosine_similarity(a, b):
    a = np.array(a, dtype=float)
    b = np.array(b, dtype=float)
    return a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [91]:
pairs = [
    ("1 vs 2", embedded_1, embedded_2),
    ("1 vs 3", embedded_1, embedded_3),
    ("1 vs 4", embedded_1, embedded_4),
    ("2 vs 3", embedded_2, embedded_3),
    ("2 vs 4", embedded_2, embedded_4),
    ("3 vs 4", embedded_3, embedded_4),
]

for label, vec_a, vec_b in pairs:
    print(f"{label}: {cosine_similarity(vec_a, vec_b):.6f}")

1 vs 2: 0.824805
1 vs 3: 0.711113
1 vs 4: 0.846385
2 vs 3: 0.776912
2 vs 4: 0.775635
3 vs 4: 0.728063


In [92]:
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function= embeddings_model,
    persist_directory="./database/chroma_langchain_db",
)

##### Loading Docs

In [93]:
loaders = [
    PyPDFLoader("./docs/2208.01758v4.pdf"),
    PyPDFLoader("./docs/G-CNN.pdf"),
    PyPDFLoader("./docs/G-CNN_for_NQS.pdf"),
    PyPDFLoader("./docs/information-15-00517.pdf"),
    PyPDFLoader("./docs/RBM_paper_hinton.pdf"),
    PyPDFLoader("./docs/Solving the quantum many-body problem with artificial neural networks(2017).pdf")
]

docs = []
for loader in loaders:
    docs.extend(loader.load())

##### Splitting

In [94]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

splits = text_splitter.split_documents(docs)

len(splits)

297

##### Embedding and Adding to vector database

In [95]:

vector_store.add_documents(documents=splits)

['f9f5fb77-65ea-4684-8741-b255e77acc7d',
 'ec612792-c9fa-403d-a2da-04bd0c60b0ec',
 'a3be4d32-643b-42d4-b614-e4dc36913a76',
 'b1529c28-9224-46eb-9e80-15b2d8de2573',
 '29fd5750-e0a5-4865-800d-aa9a78ebcc33',
 '488ae3cd-0814-490c-b9e6-dc8345fce60a',
 'c8cf5961-dfd7-4812-b55f-f13c19dc8042',
 'b153bb24-cdea-4b6a-b96b-20dbd0fe5dc7',
 '5c801a11-7ded-4b26-b1d2-2f1b8bfbd0a9',
 'a5f52875-01b4-4b84-9dac-a7b0141aa61e',
 '2b280771-dcf5-4a60-bb84-de231b94a813',
 '77fe762b-b272-42a6-9fb3-7d753de49dc4',
 '3fc6bf11-49f9-4a7c-bf21-91d138594481',
 'd3693f74-ca3d-420e-a81e-e015640c755e',
 'bfaba957-1f6c-49f3-8666-167fbc5ff294',
 '83046003-49ef-4aab-af17-025b27187db6',
 'b2eb440e-51b3-4ec5-bbc2-a6b7ace43266',
 'd373e1bc-41ea-4dee-bf16-e366cf877872',
 '3de8247d-c124-4f46-82a0-2e34f2db16ca',
 'd73e5e60-8b8c-421c-b458-049450a4fa70',
 'd3d911f1-28f2-4ab0-a86e-8274149da475',
 'c5e47e00-4735-48ad-b125-d34842e27f67',
 '22fccaa5-159e-4b86-8331-a3ef3563efa7',
 'b478b906-07d6-4ccb-bb8e-aaffb768324f',
 'c69c5690-7f91-

In [96]:
print(vector_store._collection.count())

1188


In [97]:
question1 = "Explain NQS shortly?"

In [98]:
search_docs = vector_store.similarity_search(question1, k=3)

In [99]:
len(search_docs)

3

In [100]:
search_docs[0].page_content

'problem being solved is fully characterized and free from\nunintended disorder. Neural network quantum states\n(NQS) ([2–9]) have emerged as a competitive tool for\nunderstanding the low temperature properties of quan-\ntum many-body physics models. Unlike traditional vari-\national Monte-Carlo (VMC) methods, such as Gutzwiller\nprojection, neural networks have the advantage, and also\nthe disadvantage, of being free from inductive biases\nabout the structure of the solution. They compensate\nfor the absence of an informed bias by using an enor-\nmous number of parameters. As long as a wide enough\nmodel is used, neural networks contain arbitrarily accu-\nrate solutions [10].\nAlthough reasonably large neural networks are guar-\nanteed to harbor good solutions in the space of possible\nparameters, there is no guarantee that these solutions\ncan be found in a reasonable time. One way to acceler-\nate training is to constrain the search space by removing\nnon-solutions. This is especial

In [101]:
question2 = "Tell me about CNN for NQS."

In [102]:
search_docs = vector_store.similarity_search(question2, k=3)

In [103]:
search_docs[1].page_content

'problem being solved is fully characterized and free from\nunintended disorder. Neural network quantum states\n(NQS) ([2–9]) have emerged as a competitive tool for\nunderstanding the low temperature properties of quan-\ntum many-body physics models. Unlike traditional vari-\national Monte-Carlo (VMC) methods, such as Gutzwiller\nprojection, neural networks have the advantage, and also\nthe disadvantage, of being free from inductive biases\nabout the structure of the solution. They compensate\nfor the absence of an informed bias by using an enor-\nmous number of parameters. As long as a wide enough\nmodel is used, neural networks contain arbitrarily accu-\nrate solutions [10].\nAlthough reasonably large neural networks are guar-\nanteed to harbor good solutions in the space of possible\nparameters, there is no guarantee that these solutions\ncan be found in a reasonable time. One way to acceler-\nate training is to constrain the search space by removing\nnon-solutions. This is especial