# Which vectorstore should I use?

CONCLUSION: going without a vectorstore for now. I need a more fine-grained approach to how I load/parse PDFs and extract what I need. I'd also like to store what I need in a more structured way, because LLMs aren't answering questions from these docs reliably enough for a demo-able product.

In [None]:
from langchain.schema.document import Document
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader

from meche_copilot.chains.helpers.specs_retriever import SpecsRetriever
from meche_copilot.utils.envars import OPENAI_API_KEY

In [None]:
# Read every reference PDF with langchain's PyPDFLoader and pool the loaded
# documents into one flat list.
ref_docs = ["demo-01/engineering_design_drawings.pdf"]

documents = []
for fpath in ref_docs:
  loader = PyPDFLoader(str(fpath))
  documents.extend(loader.load())

# len(documents) -> 36 when last checked: each page is a document

In [1]:
# create the embedding function and the chat llm (both use the same API key)
openai_embedding_func = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
llm = ChatOpenAI(
  model="gpt-4",
  temperature=0,
  openai_api_key=OPENAI_API_KEY,
)

In [None]:
# create vectorstore: a Chroma store whose persist_directory is data/.chroma_db
# and whose texts are embedded with openai_embedding_func
chroma_kwargs = {
  "persist_directory": "data/.chroma_db",
  "embedding_function": openai_embedding_func,
}
chroma_db = Chroma(**chroma_kwargs)


In [None]:
# add docs to the vectorstore
# Re-running a plain add_documents() call inserts the same pages again, so
# first look up which sources are already stored and only add the new ones.
# NOTE(review): a source counts as "already stored" if ANY entry carries its
# path in the "source" metadata; a partially loaded source is not topped up.
source_is_stored = {}
new_documents = []
for doc in documents:
  source = doc.metadata.get("source")
  if source not in source_is_stored:
    # Documents without a source cannot be looked up, so they are always added.
    source_is_stored[source] = bool(source) and bool(
      chroma_db.get(where={"source": source})["ids"]
    )
  if not source_is_stored[source]:
    new_documents.append(doc)

if new_documents:
  chroma_db.add_documents(new_documents)

In [None]:
# can delete by path? yes: fetch the ids whose "source" metadata matches the
# path, then delete those ids
source_matches = chroma_db.get(where={"source": "demo-01/engineering_design_drawings.pdf"})
ids = source_matches['ids']
chroma_db.delete(ids)

In [None]:
# can get by page? query the store with a filter on the "page" metadata field
page_filter = {"page": 35}
res = chroma_db.get(where=page_filter)

In [None]:
# specs retriever: get relevant docs for specs lookup?

retriever_llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY, model="gpt-4")
# The original passed `self.source`, but there is no `self` at the top level of
# a notebook cell, so the call raised NameError before the retriever was built.
# NOTE(review): assumes `source` is the PDF path used as the "source" metadata
# in the vectorstore -- confirm against SpecsRetriever's signature.
specs_source = "demo-01/engineering_design_drawings.pdf"
specs_retriever = SpecsRetriever(llm=retriever_llm, vectorstore=chroma_db, source=specs_source)


In [None]:
### CUSTOM VECTORSTORE ###

from langchain import PromptTemplate
from numpy import char
import pandas as pd
# from esd_copilot.esd_toolkit.schemas import Esd, EsdConfig
# from esd_copilot.utils.config import load_config, find_config

# esd_config_box = load_config(find_config('esd-config.yaml'))
# esd_config = EsdConfig(**esd_config_box)
# esd = Esd.from_config(config=esd_config)

# # fan eq test
# eq = esd.equipments[2]
# char_desc_dict = eq.char_descs
# spec_results_dict = {}
# spec_results_dict['spec description'] = char_desc_dict
# for i, inst in enumerate(eq.instances):
#     char_result_dict = {key: "(SPEC, PAGE)" for key in char_desc_dict.keys()}
#     # name = inst.inst_name
#     # name = name.replace("template", "").replace(" ", "")
#     spec_results_dict[inst.inst_name] = char_result_dict

# pd.DataFrame(spec_results_dict)
    
# Document loader: read the drawings PDF into langchain documents
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("demo-01/engineering_design_drawings.pdf")
data = loader.load()

# Split the loaded documents (chunk_size=500, no overlap between chunks)
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=500)
all_splits = text_splitter.split_documents(data)

# Store the chunks in a Chroma vectorstore, embedded with OpenAI embeddings
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from meche_copilot.utils.envars import OPENAI_API_KEY

chunk_embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
vectorstore = Chroma.from_documents(documents=all_splits, embedding=chunk_embeddings)

# Retriever
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever

# This LLM only has to write a few alternative phrasings of the query, so a
# small completion budget is enough. The original max_tokens=5000 is more than
# the default gpt-3.5-turbo chat model accepts for a completion, so the request
# was rejected by the API.
# NOTE(review): 512 is a conservative pick -- adjust if the rephrasings get cut off.
query_rewrite_llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY, max_tokens=512)
retriever_from_llm = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=query_rewrite_llm)

# Run chain: question answering over whatever the multi-query retriever returns
from langchain.chains import RetrievalQA
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=OPENAI_API_KEY)
# chain_type_kwargs = {"prompt": prompt}
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever_from_llm)

# Documents retrieved for the schedule-table query; used as prompt context below
unique_docs = retriever_from_llm.get_relevant_documents(query="Get the whole contents of the exhaust fan schedule table")

# Example blob shown to the model: one "spec description" entry that explains
# each spec, followed by one entry per instance whose values are all the
# '(SPEC, PAGE)' placeholder the model is asked to replace.
example_spec_results_dict = {
    "spec description": {
        "spec1": "spec1 is an example spec",
        "spec2": "spec2 is another example spec",
    },
    **{
        instance_name: {spec_name: "(SPEC, PAGE)" for spec_name in ("spec1", "spec2")}
        for instance_name in ("instance1", "instance2")
    },
}

# Prompt template for the spec-filling task. It has three placeholders:
#   {example_spec_results_dict} - an example of a filled-in blob,
#   {context}                   - the reference documents,
#   {spec_results_dict}         - the blob whose '(SPEC, PAGE)' values should be replaced.
# The model is told to answer with its blob prefixed by `specs_json_blob:`.
template = """
Your job is to fillout specs (short for specifications) and return a valid json blob.

To do this, replace '(SPEC, PAGE)' in the specs_json_blob with the specs you find in the reference documents or '(UNK, UNK)' if you can't find the spec that matches the spec description.

Here is an example of a valid $JSON_BLOB:
```json
    "answer": {example_spec_results_dict},
```
Reference Documents: {context}

specs_json_blob: {spec_results_dict}

ONLY RESPOND WITH your updated specs_json_blob as a valid json blob like this:
```json
specs_json_blob: your answer here
```
"""

# `spec_results_dict` is only assigned by the Esd-based code that is commented
# out earlier in this cell, so formatting the prompt raised NameError. Until
# that code is restored, fall back to a blob with the same shape as the example.
# NOTE(review): the fallback is placeholder data -- replace it with the real
# per-equipment dict before trusting any answers.
try:
    spec_results_dict
except NameError:
    spec_results_dict = example_spec_results_dict

# Fill the template's three placeholders to get the final prompt text.
prompt = PromptTemplate.from_template(template=template)
INPUT_PROMPT = prompt.format(
    context=unique_docs,
    spec_results_dict=spec_results_dict,
    example_spec_results_dict=example_spec_results_dict,
)

print(INPUT_PROMPT)

import ast
import json


def _parse_specs_blob(text):
    """Return the dict found after the answer label in the model reply *text*.

    The prompt asks the model to answer with ``specs_json_blob: <blob>``; the
    older ``specification_results:`` label is still accepted. If neither label
    is present, the first ``{`` in the reply is taken as the start of the blob.
    The blob is parsed as JSON first and as a Python literal second, because
    the prompt shows dicts in Python repr form (single quotes).

    Raises:
        ValueError: if the reply contains no ``{...}`` span at all, or if the
            Python-literal fallback rejects the span.
        SyntaxError: if the Python-literal fallback cannot parse the span.
    """
    for label in ("specs_json_blob:", "specification_results:"):
        _, found, remainder = text.partition(label)
        if found:
            text = remainder
            break
    # Drop the trailing commentary some replies append after the blob.
    text = text.split("\n\nReplacing")[0]

    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"no dict found in model reply: {text!r}")
    try:
        # raw_decode stops at the end of the first JSON value, so a closing
        # code fence or other trailing text is ignored.
        parsed, _ = json.JSONDecoder().raw_decode(text[start:])
    except json.JSONDecodeError:
        parsed = ast.literal_eval(text[start:end + 1])
    return parsed


result = qa_chain({"query": INPUT_PROMPT})
res = result['result']

# Convert the model's reply to a dictionary. The original split on
# "specification_results: ", which the prompt never asks for, so the [1] index
# raised IndexError whenever the model followed the instructions.
dict_result = _parse_specs_blob(res)
pd.DataFrame(dict_result)

print(dict_result)


In [None]:
### AUTOCREATE VECTORSTORE ###

from langchain.document_loaders import PyPDFLoader, PDFMinerLoader, PDFPlumberLoader, PyPDFDirectoryLoader
from langchain.indexes import VectorstoreIndexCreator

from meche_copilot.utils.envars import OPENAI_API_KEY

######## Autocreate vectorstore from docs ########
# Document loader
loader = PyPDFLoader("demo-01/engineering_design_drawings.pdf")
# Index that wraps above steps
# The API key belongs to the embedding model. Passing it through
# vectorstore_kwargs hands it to the vectorstore instead, so it never reaches
# the embeddings.
# NOTE(review): confirm `embedding=` against the installed langchain version.
from langchain.embeddings import OpenAIEmbeddings
index_embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
index = VectorstoreIndexCreator(embedding=index_embeddings).from_loaders([loader])
# Question-answering
question = "What is on page two of the engineering design drawings?"
index.query(question)


####### Retrieval QA with Sources chain ########

# Split (use Grobid for context aware splitting)
# The loader uses GROBID to parse PDFs into Documents that retain metadata associated with the section of text.
from langchain.document_loaders.parsers import GrobidParser
from langchain.document_loaders.generic import GenericLoader

# Document loader: pick up the drawings PDF under demo-01/ and parse it with Grobid
grobid_parser = GrobidParser(segment_sentences=False)
loader = GenericLoader.from_filesystem(
    "demo-01/",
    parser=grobid_parser,
    suffixes=[".pdf"],
    glob="engineering_design_drawings.pdf",
)
docs = loader.load()

# Document loader: read the drawings PDF into langchain documents
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("demo-01/engineering_design_drawings.pdf")
data = loader.load()

# Split the loaded documents (chunk_size=500, no overlap between chunks)
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=500)
all_splits = text_splitter.split_documents(data)

# Store the chunks in a Chroma vectorstore, embedded with OpenAI embeddings
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from meche_copilot.utils.envars import OPENAI_API_KEY
split_embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
vectorstore = Chroma.from_documents(documents=all_splits, embedding=split_embeddings)


from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever
# Pass the API key explicitly, as every other ChatOpenAI/OpenAIEmbeddings
# construction in this notebook does, instead of relying on the environment.
multi_query_llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
retriever_from_llm = MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=multi_query_llm)

# Documents the multi-query retriever returns for the fan-schedule question
unique_docs = retriever_from_llm.get_relevant_documents(query='What is in the exhaust fan schedule?')