# Advanced RAG

In [42]:
import os
import chromadb

from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain_core.documents import Document as lancghain_Document
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Pinecone

from operator import itemgetter
from langchain.memory import ConversationBufferMemory
from langchain.schema import format_document
from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
from langchain_core.runnables import RunnableParallel
from langchain.prompts.prompt import PromptTemplate

from langchain.text_splitter import RecursiveCharacterTextSplitter

from pinecone import Pinecone as pinecone_client

from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv(),override=True)

local_db_path='../db/'
embedding_model='text-embedding-ada-002'

llm = ChatOpenAI(openai_api_key=os.getenv('OPENAI_API_KEY'), model_name="gpt-3.5-turbo", temperature=0.1)
query_model=OpenAIEmbeddings(model=embedding_model,openai_api_key=os.getenv('OPENAI_API_KEY'))


In [43]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
def num_tokens_from_string(string: str, encoding_name: str = "gpt-3.5-turbo") -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [44]:
docs=['../data/AMS/AMS_2020.pdf','../data/AMS/AMS_2018.pdf']

In [45]:
docs_out=[]
for doc in docs:
    loader = PyPDFLoader(doc)
    data = loader.load()
    docs_out.extend(data)

In [46]:
docs_out[101]

Document(page_content='92 REACT Qualification C ampaign  \n \nA complete qualification campaign has been defined for each model and temperature variant, for a total of \n6 campaigns. Eac h campaign includes 3 different units covering different mechanical interfaces for the \ncustomers. Figure 7 summarizes the qualification campaign sequence for each variant.  \n \n \nFigure 7. REACT Qualification Campaign  \n \nThe qualification campaigns of the REACT 5kN and 15kN Standard Temperature  units  have already been \ncompleted, while the rest of campaigns are on- going. The main results and findings are:  \n• The units are being qualified to a lifetime over 50 cycles, which enables for at least 10 cycles on \nground and one in orbit, according to ECSS. Extension to 70 cycles during qualification to enable up \nto 15 on ground uses is being implemented on new campaigns. REACT 15kN Standard temperature \nhas already completed its qualification for 70 cycles.  \n• All the units withst ood thei

In [47]:
num_tokens_from_string(docs_out[101].page_content)

649

In [63]:
chunk_size=400
k_parent=5
# parent_splitter=CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size*k_parent, chunk_overlap=0)
# child_splitter=CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=0)

# I don't think the splitters above work for what I want!
child_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
parent_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size*k_parent, chunk_overlap=0)

In [66]:
split_docs_child=child_splitter.split_documents(docs_out)
split_docs_child[:5]

[Document(page_content='i NASA/CP-20205009766  \n45th Aerospace Mechanisms Symposium  \nC\nompiled/Edited by:  Edward A. Boesiger \nP\nroceedings of a symposium  \nHosted by the NASA Johnson Space Center  and \nLockheed Martin Space \nSponsored and Organized by the Mechanisms Education Association \n2020', metadata={'source': '../data/AMS/AMS_2020.pdf', 'page': 0}),
 Document(page_content='ii', metadata={'source': '../data/AMS/AMS_2020.pdf', 'page': 1}),
 Document(page_content='iii PREFACE \n \nThe Aerospace Mechanisms Symposium (AMS) provides a unique forum for those \nactive in the design, production and use of aerospace mechanisms. A major focus is the \nreporting of problems and solutions associated with the development and flight \ncertification of new mechanisms. Sponsored and organized by the Mechanisms', metadata={'source': '../data/AMS/AMS_2020.pdf', 'page': 2}),
 Document(page_content='Education Association, responsibility for hosting the AMS is shared by the National \nAeron

In [67]:
split_docs_parent=parent_splitter.split_documents(docs_out)
split_docs_parent[:5]

[Document(page_content='i NASA/CP-20205009766  \n45th Aerospace Mechanisms Symposium  \nC\nompiled/Edited by:  Edward A. Boesiger \nP\nroceedings of a symposium  \nHosted by the NASA Johnson Space Center  and \nLockheed Martin Space \nSponsored and Organized by the Mechanisms Education Association \n2020', metadata={'source': '../data/AMS/AMS_2020.pdf', 'page': 0}),
 Document(page_content='ii', metadata={'source': '../data/AMS/AMS_2020.pdf', 'page': 1}),
 Document(page_content='iii PREFACE \n \nThe Aerospace Mechanisms Symposium (AMS) provides a unique forum for those \nactive in the design, production and use of aerospace mechanisms. A major focus is the \nreporting of problems and solutions associated with the development and flight \ncertification of new mechanisms. Sponsored and organized by the Mechanisms \nEducation Association, responsibility for hosting the AMS is shared by the National \nAeronautics and Space Administr ation and Lockheed Martin Space.  \n \nThe 45th AMS was sc

In [None]:
questions=["What are some challenges associated angular contact of ball bearings?"]

In [None]:
from langchain import hub
from langchain.prompts.prompt import PromptTemplate

# Prompts on the hub: https://smith.langchain.com/hub/my-prompts?organizationId=45eb8917-7353-4296-978d-bb461fc45c65
CONDENSE_QUESTION_PROMPT = hub.pull("dmueller/ams-chatbot-qa-condense-history")
QA_PROMPT=hub.pull("dmueller/ams-chatbot-qa-retrieval")
QA_WSOURCES_PROMPT=hub.pull("dmueller/ams-chatbot-qa-retrieval-wsources")
QA_GENERATE_PROMPT=hub.pull("dmueller/generate_qa_prompt")
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")

In [None]:
# Combine documents, from queries.py

def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    return document_separator.join(doc_strings)

In [None]:
# From queries.py

memory = ConversationBufferMemory(
    return_messages=True, output_key="answer", input_key="question"
)

## Parent-Child with Full Parent Retrieval

In [77]:
type='standard'
# type='parent-child'

persistent_client = chromadb.PersistentClient(path=local_db_path+'/chromadb')   
try:
    persistent_client.delete_collection(name="standard-test")
except:
    pass   
vectorstore = Chroma(collection_name='standard-test',
                     embedding_function=query_model)


In [71]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
# page_chunks = text_splitter.split_documents(docs_out)
# vectorstore.add_documents(page_chunks)

store=InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)
child_splitter._chunk_size

400

In [73]:
retriever.add_documents(docs_out[:500],ids=None)

In [74]:
# Original pdf pages stored in memory
len(list(store.yield_keys()))

500

In [75]:
# Retrieved chunks with size chunk_size
sub_docs = vectorstore.similarity_search("bearing preload")
print(sub_docs[0].page_content)
print(len(sub_docs[0].page_content))


Shimming and grinding procedures were confirmed to 
produce the desired preload, which was measured 
indirectly (by measuring the gap between the clamp and 
shaft/hub)  during assembly of the bear ings. Review of the 
clamp design analysis with a high- fidelity  Finite Element 
Analysis (FEA) model confirmed good correlation with the 
load vs. deflection curves seen in in the as -built assembly.
398


In [76]:
# Original pdf page stored in memory
retrieved_docs = retriever.get_relevant_documents("bearing preload")
print(retrieved_docs[0].page_content)
print(len(retrieved_docs[0].page_content))
print('tokens: '+str(num_tokens_from_string(retrieved_docs[0].page_content)))


325 the bearings was exonerated by measuring the hardness of a set of bearings from the same lot as the 
bearings in the EM unit . Tests resulted in an  average hardness of 57.6 on the Rockwell Hardness Scale C 
(HRC), which was considered in- family with the specified requirement of 58- 60 for the 440C stainless steel 
bearings .  
Bearing analysis via Orbis 3.0 indicated that an angular misalignment of 0.00028 in (7 µm) could introduce 
localized Maximum Mean Hertzian Contact Stress (MMHCS) greater than 335 ksi  (2310 MPa) . Both a 
coordinate measuring machine (CMM) and computed tomography  (CT) (Figure 11) scans were used to 
evaluate this condition. CMM measurements showed a 0.0004- in (10-µm) difference across the bearing 
diameter while computerized tomography ( CT) scans did not show any signs of misalignment. However, 
the CT scans only had a voxel resolution of 0.002 in (51 µm) , which is not fine enough to detect the potential 
misalignment. The CMM measurements could only b

## Parent-child with partial parent retrieval
https://colab.research.google.com/github/datastax/ragstack-ai/blob/main/examples/notebooks/advancedRAG.ipynb

In [78]:
type='standard'
# type='parent-child'

persistent_client = chromadb.PersistentClient(path=local_db_path+'/chromadb')   
try:
    persistent_client.delete_collection(name="pc-test")
except:
    pass   
vectorstore = Chroma(client=persistent_client,
                    collection_name='pc-test',
                    embedding_function=query_model)


In [79]:
store_pc=InMemoryStore()
parent_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store_pc,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [80]:
# Split and load the documents into the vector and parent stores
parent_retriever.add_documents(docs_out[:500])

In [81]:
len(list(store_pc.yield_keys()))

842

In [82]:
# Child retrieved chunks with size chunk_size
sub_docs = vectorstore.similarity_search("bearing preload")
print(sub_docs[0].page_content)
print(len(sub_docs[0].page_content))

cases are high enough in axial force to counter balance the cross -moment load, these  bearings are not 
required to support the cross -moment  through prel oad, allowing the preload to be low, between 44- 88 N 
(10-20 lb) . Preload is therefore set to maintain ram t ip alignment throughout the stroke of the mechanism,
320


In [83]:
# Parent chunked documents with chunk_size*k_parent
retrieved_docs = retriever.get_relevant_documents("bearing preload")
print(retrieved_docs[0].page_content)
print(len(retrieved_docs[0].page_content))
print('tokens: '+str(num_tokens_from_string(retrieved_docs[0].page_content)))


325 the bearings was exonerated by measuring the hardness of a set of bearings from the same lot as the 
bearings in the EM unit . Tests resulted in an  average hardness of 57.6 on the Rockwell Hardness Scale C 
(HRC), which was considered in- family with the specified requirement of 58- 60 for the 440C stainless steel 
bearings .  
Bearing analysis via Orbis 3.0 indicated that an angular misalignment of 0.00028 in (7 µm) could introduce 
localized Maximum Mean Hertzian Contact Stress (MMHCS) greater than 335 ksi  (2310 MPa) . Both a 
coordinate measuring machine (CMM) and computed tomography  (CT) (Figure 11) scans were used to 
evaluate this condition. CMM measurements showed a 0.0004- in (10-µm) difference across the bearing 
diameter while computerized tomography ( CT) scans did not show any signs of misalignment. However, 
the CT scans only had a voxel resolution of 0.002 in (51 µm) , which is not fine enough to detect the potential 
misalignment. The CMM measurements could only b

# Loading existing store
https://stackoverflow.com/questions/77438251/langchain-parentdocumetretriever-save-and-load