# Advanced RAG

In [None]:
import os
import chromadb

from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain_core.documents import Document as lancghain_Document
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Pinecone

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from langchain.storage import LocalFileStore
from pathlib import Path
import uuid

from operator import itemgetter
from langchain.memory import ConversationBufferMemory
from langchain.schema import format_document
from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
from langchain_core.runnables import RunnableParallel
from langchain.prompts.prompt import PromptTemplate

from langchain.text_splitter import RecursiveCharacterTextSplitter

from pinecone import Pinecone as pinecone_client

from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv(),override=True)

# Directory (relative to the working directory) under which the local Chroma
# database is created by the cells below.
local_db_path='../db/'
# OpenAI embedding model name handed to OpenAIEmbeddings below.
embedding_model='text-embedding-ada-002'

# Chat model and embedding client. Both read OPENAI_API_KEY from the process
# environment, which load_dotenv above may have populated from a .env file.
llm = ChatOpenAI(openai_api_key=os.getenv('OPENAI_API_KEY'), model_name="gpt-3.5-turbo", temperature=0.1)
query_model=OpenAIEmbeddings(model=embedding_model,openai_api_key=os.getenv('OPENAI_API_KEY'))


In [None]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
def num_tokens_from_string(string: str, encoding_name: str = "gpt-3.5-turbo") -> int:
    """Count the tokens that tiktoken produces for ``string``.

    Args:
        string: Text to tokenize.
        encoding_name: Despite the name, this value is passed to
            ``tiktoken.encoding_for_model`` and must therefore be a *model*
            name (e.g. ``"gpt-3.5-turbo"``), not an encoding name.

    Returns:
        The length of the encoded token sequence.
    """
    # The tokenizer is looked up on every call; the module-level ``encoding``
    # above is not used here.
    tokens = tiktoken.encoding_for_model(encoding_name).encode(string)
    return len(tokens)

In [None]:
# PDF files to ingest, given as paths relative to the working directory.
docs=['../data/AMS/AMS_2020.pdf','../data/AMS/AMS_2018.pdf']

In [None]:
# Load every PDF listed in ``docs`` and gather the loaded Documents into one
# flat list. NOTE(review): PyPDFLoader presumably yields one Document per PDF
# page (later cells refer to these as "pdf pages") — confirm.
docs_out=[]
for doc in docs:
    loader = PyPDFLoader(doc)
    data = loader.load()
    docs_out.extend(data)

In [None]:
# Child chunk size, and the multiplier used to derive the parent chunk size.
chunk_size=400
k_parent=5
# parent_splitter=CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size*k_parent, chunk_overlap=0)
# child_splitter=CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=0)

# I don't think the splitters above work for what I want!
# NOTE(review): the splitters below are built without a tiktoken length
# function, so chunk_size is presumably measured in characters rather than
# tokens (unlike the commented-out variants above) — confirm this is intended.
child_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
# Parent chunks are k_parent times larger than child chunks.
parent_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size*k_parent, chunk_overlap=0)

In [None]:
# Split all loaded documents into child-sized chunks (for inspection).
split_docs_child=child_splitter.split_documents(docs_out)
# split_docs_child[:5]

In [None]:
# Split all loaded documents into parent-sized chunks (for inspection).
split_docs_parent=parent_splitter.split_documents(docs_out)
# split_docs_parent[:5]

In [None]:
# Sample question(s) for exercising the retrieval setups in this notebook.
questions=["What are some challenges associated with angular preload on ball bearings?"]

In [None]:
from langchain import hub
from langchain.prompts.prompt import PromptTemplate

# Prompts on the hub: https://smith.langchain.com/hub/my-prompts?organizationId=45eb8917-7353-4296-978d-bb461fc45c65
# Each hub.pull call fetches a prompt by its "owner/name" identifier, so this
# cell needs network access to the LangChain hub.
CONDENSE_QUESTION_PROMPT = hub.pull("dmueller/ams-chatbot-qa-condense-history")
QA_PROMPT=hub.pull("dmueller/ams-chatbot-qa-retrieval")
QA_WSOURCES_PROMPT=hub.pull("dmueller/ams-chatbot-qa-retrieval-wsources")
QA_GENERATE_PROMPT=hub.pull("dmueller/generate_qa_prompt")
# Default per-document template: render only the document's page_content.
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")

In [None]:
# Combine documents, from queries.py

def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
):
    """Render each document with a prompt and join the results into one string.

    Args:
        docs: Iterable of documents accepted by ``format_document``.
        document_prompt: Prompt template applied to every document; defaults
            to ``DEFAULT_DOCUMENT_PROMPT``.
        document_separator: String placed between consecutive rendered
            documents.

    Returns:
        The rendered documents joined by ``document_separator``.
    """
    return document_separator.join(
        format_document(document, document_prompt) for document in docs
    )

In [None]:
# From queries.py

# Conversation memory configured to return message objects, reading the user
# turn from the "question" key and the model turn from the "answer" key.
memory = ConversationBufferMemory(
    return_messages=True, output_key="answer", input_key="question"
)

## Parent-Child with Full Parent Retrieval

In [None]:
# Label for the retrieval mode being exercised.
# NOTE(review): this name shadows the builtin ``type``; it is kept unchanged
# because other cells may read it.
type='standard'
# type='parent-child'

# Open the on-disk Chroma database and drop any collection left over from a
# previous run, so the experiment starts from an empty collection.
persistent_client = chromadb.PersistentClient(path=local_db_path+'/chromadb')
try:
    persistent_client.delete_collection(name="standard-test")
except Exception:
    # Best effort: on a first run the collection does not exist yet. Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
    pass
# Bind the vector store to the same persistent client whose collection was
# just reset. Without ``client=`` the store would use a separate default
# client, and the delete above would have no effect on it.
vectorstore = Chroma(client=persistent_client,
                     collection_name='standard-test',
                     embedding_function=query_model)


In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
# page_chunks = text_splitter.split_documents(docs_out)
# vectorstore.add_documents(page_chunks)

# Parent documents are kept in an in-memory store (lost when the kernel stops).
store=InMemoryStore()
# Only a child splitter is supplied and no parent splitter, so the documents
# passed to add_documents are presumably stored whole as the parents while
# their child chunks are embedded into ``vectorstore`` — confirm.
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)
# Show the configured child chunk size as the cell output (private attribute).
child_splitter._chunk_size

In [None]:
# Index at most the first 500 loaded documents. ids=None presumably lets the
# retriever generate the parent ids itself — confirm.
retriever.add_documents(docs_out[:500],ids=None)

In [None]:
# Original pdf pages stored in memory
# Count the keys currently held by the parent docstore.
len(list(store.yield_keys()))

In [None]:
# Retrieved chunks with size chunk_size
# Query the vector store directly, which returns the embedded child chunks;
# print the top hit and its length in characters.
sub_docs = vectorstore.similarity_search("bearing preload")
print(sub_docs[0].page_content)
print(len(sub_docs[0].page_content))


In [None]:
# Original pdf page stored in memory
# Query through the retriever instead of the raw vector store; print the top
# result, its length in characters, and its token count.
retrieved_docs = retriever.get_relevant_documents("bearing preload")
print(retrieved_docs[0].page_content)
print(len(retrieved_docs[0].page_content))
print('tokens: '+str(num_tokens_from_string(retrieved_docs[0].page_content)))


## Parent-child with partial parent retrieval
https://colab.research.google.com/github/datastax/ragstack-ai/blob/main/examples/notebooks/advancedRAG.ipynb

In [None]:
# Label for the retrieval mode being exercised.
# NOTE(review): this name shadows the builtin ``type``; it is kept unchanged
# because other cells may read it.
type='standard'
# type='parent-child'

# Open the on-disk Chroma database and drop any "pc-test" collection left over
# from a previous run, so the experiment starts from an empty collection.
persistent_client = chromadb.PersistentClient(path=local_db_path+'/chromadb')
try:
    persistent_client.delete_collection(name="pc-test")
except Exception:
    # Best effort: on a first run the collection does not exist yet. Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
    pass
# Vector store for the child chunks, backed by the persistent client above.
vectorstore = Chroma(client=persistent_client,
                    collection_name='pc-test',
                    embedding_function=query_model)


In [None]:
# In case you want to store on a remote sql db
#
# SECURITY: earlier revisions of this cell embedded an Astra DB token, a Redis
# password and an Upstash token in plain text. They are replaced by
# environment lookups here; the exposed credentials should be rotated.

# from langchain_community.storage import AstraDBStore

# ASTRA_DB_API_ENDPOINT = 'https://f17fd04c-2051-4034-85e5-2f7fc320ab0a-us-east-1.apps.astra.datastax.com'
# ASTRA_DB_APPLICATION_TOKEN = os.getenv('ASTRA_DB_APPLICATION_TOKEN')
# collecton_name='test'

# store_pc = AstraDBStore(
#     api_endpoint=ASTRA_DB_API_ENDPOINT,
#     token=ASTRA_DB_APPLICATION_TOKEN,
#     collection_name="my_store",
# )

# import redis

# r = redis.Redis(
#   host='usw2-devoted-ladybird-30146.upstash.io',
#   port=30146,
#   password=os.getenv('UPSTASH_REDIS_PASSWORD')
# )

# r.set('foo', 'bar')
# print(r.get('foo'))

# from langchain.storage import UpstashRedisByteStore
# from upstash_redis import Redis

# URL = 'https://usw2-devoted-ladybird-30146.upstash.io'
# TOKEN = os.getenv('UPSTASH_REDIS_REST_TOKEN')

# redis_client = Redis(url=URL, token=TOKEN)
# store_pc = UpstashRedisByteStore(client=redis_client, ttl=None, namespace="test-ns")

In [None]:
# store_pc=InMemoryByteStore()

# root_path = Path.cwd() / "data"  # can also be a path set by a string

# Work with at most the first 500 loaded documents; these become the parents.
docs_temp=docs_out[:500]

In [None]:
# Metadata key under which each child chunk records the id of its parent.
id_key = "doc_id"
# Parent documents are persisted as files under <parent of cwd>/db/chromadb/parent-docs.
root_path = Path.cwd().parent / 'db/chromadb/parent-docs'
store_pc = LocalFileStore(root_path)
# One random UUID per parent document, in the same order as docs_temp.
doc_ids = [str(uuid.uuid4()) for _ in docs_temp]

# Retriever that searches child chunks in ``vectorstore`` and presumably uses
# the ``id_key`` metadata to fetch the matching parents from ``store_pc`` —
# confirm against the MultiVectorRetriever documentation.
parent_retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store_pc,
    id_key=id_key,
)

In [None]:
# Split every parent document into child chunks and stamp each chunk's
# metadata with its parent's id so the parent can be looked up later.
sub_docs = []
for parent_id, parent_doc in zip(doc_ids, docs_temp):
    children = child_splitter.split_documents([parent_doc])
    for child in children:
        child.metadata[id_key] = parent_id
    sub_docs.extend(children)

In [None]:
# Embed the tagged child chunks into the vector store ...
parent_retriever.vectorstore.add_documents(sub_docs)
# ... and store each parent document under its generated id.
parent_retriever.docstore.mset(list(zip(doc_ids, docs_temp)))

In [None]:
# Child retrieved chunks with size chunk_size
# Query the vector store directly; print the top child chunk and its length
# in characters.
sub_docs = vectorstore.similarity_search("bearing preload")
print(sub_docs[0].page_content)
print(len(sub_docs[0].page_content))

In [None]:
# Parent documents for the best-matching child chunks. The parents stored in
# this section are the unsplit entries of docs_temp, not chunks of size
# chunk_size*k_parent.
# Query parent_retriever (the MultiVectorRetriever built in this section); the
# earlier ``retriever`` belongs to the full-parent-retrieval experiment.
retrieved_docs = parent_retriever.get_relevant_documents("bearing preload")
print(retrieved_docs[0].page_content)
print(len(retrieved_docs[0].page_content))
print('tokens: '+str(num_tokens_from_string(retrieved_docs[0].page_content)))


## Try opening existing retriever and local store

In [None]:
# Re-open the file-backed parent store at the same root_path, checking that
# previously written parent documents can be read back.
store_pc_exists = LocalFileStore(root_path)

id_key = "doc_id"
# Second retriever over the same vector store, using the re-opened file store.
parent_retriever_exists = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store_pc_exists,
    id_key=id_key,
)

In [None]:
# Child retrieved chunks with size chunk_size
# Search the vector store through the re-opened retriever; print the top
# child chunk and its length in characters.
sub_docs = parent_retriever_exists.vectorstore.similarity_search("bearing preload")
print(sub_docs[0].page_content)
print(len(sub_docs[0].page_content))

In [None]:
# Parent documents fetched through the retriever backed by the re-opened
# LocalFileStore; print the top result and its length in characters.
# Author's note (kept from the original): "Only works when you create the
# local vector store. So it works!"
retrieved_docs = parent_retriever_exists.get_relevant_documents("bearing preload")
print(retrieved_docs[0].page_content)
print(len(retrieved_docs[0].page_content))