In [34]:
import os
import openai
import sys
from dotenv import load_dotenv, find_dotenv

import numpy as np


#Embedding
from langchain_openai import OpenAIEmbeddings

#Vector Database
from langchain_chroma import Chroma

#Loader
from langchain_community.document_loaders import PyPDFLoader

#Splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [35]:
sys.path.append('../..')
_ = load_dotenv(find_dotenv()) 
api_key = os.environ['OPENAI_API_KEY']

In [36]:
embeddings_model = OpenAIEmbeddings()

In [37]:
sentence1 = "Neural Networks Quantum State (NQS) has halp us to understand the quantum many body systems better, such as finding ground state"
sentence2 = "We like to apply CNN architecture to the NQS model, because we believe convolution based architecture is good"
sentence3 = "This building has a really good architecture, I think I might buy it"
sentence4 = "Quantum Monte carlo is statistical approach for many body problems"

In [38]:
embedded_1 = embeddings_model.embed_query(sentence1)
embedded_2 = embeddings_model.embed_query(sentence2)
embedded_3 = embeddings_model.embed_query(sentence3)
embedded_4 = embeddings_model.embed_query(sentence4)

In [39]:
def cosine_similarity(a, b):
    a = np.array(a, dtype=float)
    b = np.array(b, dtype=float)
    return a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [40]:
pairs = [
    ("1 vs 2", embedded_1, embedded_2),
    ("1 vs 3", embedded_1, embedded_3),
    ("1 vs 4", embedded_1, embedded_4),
    ("2 vs 3", embedded_2, embedded_3),
    ("2 vs 4", embedded_2, embedded_4),
    ("3 vs 4", embedded_3, embedded_4),
]

for label, vec_a, vec_b in pairs:
    print(f"{label}: {cosine_similarity(vec_a, vec_b):.6f}")

1 vs 2: 0.824805
1 vs 3: 0.711113
1 vs 4: 0.846385
2 vs 3: 0.776912
2 vs 4: 0.775635
3 vs 4: 0.728063


In [41]:
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function= embeddings_model,
    persist_directory="./database/chroma_langchain_db",
)

##### Loading Docs

In [43]:
loaders = [
    PyPDFLoader("./docs/2208.01758v4.pdf"),
    PyPDFLoader("./docs/G-CNN.pdf"),
    PyPDFLoader("./docs/G-CNN_for_NQS.pdf"),
    PyPDFLoader("./docs/information-15-00517.pdf"),
    PyPDFLoader("./docs/RBM_paper_hinton.pdf"),
    PyPDFLoader("./docs/Solving the quantum many-body problem with artificial neural networks(2017).pdf")
]

docs = []
for loader in loaders:
    docs.extend(loader.load())

##### Splitting

In [44]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

splits = text_splitter.split_documents(docs)

len(splits)

297

##### Embedding and Adding to vector database

In [45]:

vector_store.add_documents(documents=splits)

['efc7e664-07ff-4e0f-8230-7b60effa8d84',
 'b7dfaed5-067b-4010-90cb-82f4073c1518',
 'ae293fe6-e8ec-40e7-a9c6-de9d4608fcab',
 '1af6578a-0cba-423c-a209-9ed0f0b20f37',
 '1a533eec-1144-4d8c-8e08-c9109b22a02a',
 '847b6882-6724-4f0c-afe2-ec0b743f9b80',
 'cbfbc6bb-3224-47d8-8f34-366d1cff7a54',
 '0f3ca64b-f900-4fe8-a558-c89040f27407',
 '615e1ee3-1dcc-4b36-aafe-518dd73da9cb',
 'ca4b30bd-a518-4c81-8fb3-572e6b71942c',
 'bb9956f0-517c-4e5a-aace-a693e2dc9da8',
 'c4d8bbd6-43e4-449c-84cb-78d7ca43063b',
 'ca12361c-c7c2-49f8-a6a5-c082c6ded7b6',
 '83108835-0fee-42b9-803a-de8516497393',
 'e0d325cd-6a54-4217-bc11-3f765a0e5e1b',
 '06e3cb22-9e9c-4ec2-b9f7-8b8ffb17507d',
 'e4134b80-f318-4084-92ca-4bc5641f2bec',
 '68f28540-a0fd-4450-85a8-0bc17ec44d61',
 'f9cf73bb-286e-4058-85ee-f6f5cd99bdda',
 '865902d8-2df7-4206-8830-3324fb9f936d',
 '750b217c-4a4c-4d6c-88d0-88b0930f0965',
 'cb78180f-d7a7-4088-9daf-e482e2ac6672',
 '324c26cf-6a03-4d77-aa83-3331ec5620e4',
 '2b0208c5-9931-43d3-8aaa-4821954c0229',
 '72319e2b-ff29-

In [46]:
print(vector_store._collection.count())

297


In [52]:
question1 = "Explain NQS shortly?"

In [55]:
search_docs = vector_store.similarity_search(question1, k=3)

In [49]:
len(search_docs)

3

In [56]:
search_docs[0].page_content

'problem being solved is fully characterized and free from\nunintended disorder. Neural network quantum states\n(NQS) ([2–9]) have emerged as a competitive tool for\nunderstanding the low temperature properties of quan-\ntum many-body physics models. Unlike traditional vari-\national Monte-Carlo (VMC) methods, such as Gutzwiller\nprojection, neural networks have the advantage, and also\nthe disadvantage, of being free from inductive biases\nabout the structure of the solution. They compensate\nfor the absence of an informed bias by using an enor-\nmous number of parameters. As long as a wide enough\nmodel is used, neural networks contain arbitrarily accu-\nrate solutions [10].\nAlthough reasonably large neural networks are guar-\nanteed to harbor good solutions in the space of possible\nparameters, there is no guarantee that these solutions\ncan be found in a reasonable time. One way to acceler-\nate training is to constrain the search space by removing\nnon-solutions. This is especial

In [58]:
question2 = "Tell me about CNN for NQS."

In [61]:
search_docs = vector_store.similarity_search(question2, k=3)

In [64]:
search_docs[1].page_content

'training deeper models, speciﬁcally convolutional neural\nnetworks (CNN) [8, 11, 16]. This is motivated by the\nﬁeld of computer vision [17, 18], in which deep convolu-\ntional neural networks have excelled at pattern recogni-\ntion. While both images and quantum ground states of-\nten have translational symmetry, quantum ground states\nfrequently have additional point group symmetry. In this\npaper we leverage literature from the machine learning\ncommunity [1] to generalize convolutional NQS to the full\nwallpaper group of highly symmetric lattices.\nWe show that group equivariant convolutional net-\nworks (G-CNN) [1] provide the most expressive\nperceptron-like model for wavefunctions with particular\nsymmetry over discrete groups. G-CNNs are constructed\nusing equivariant convolutions, which are the most com-\nplex linear operations that preserve the structure of a\ndiscrete symmetry group [1]. This approach contrasts\nwith previous research [7, 11, 12], which has utilized\na symm