In [None]:
# Target implement RAG

# pending features:
# restrict to high score document threshhold

In [None]:
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [None]:
base_url="http://localhost:11434"
model="zephyr"  #orca-mini , mistral, or zephyr

llm = Ollama(base_url=base_url, model=model, 
callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
)

In [None]:
#llm("Hola!")

In [None]:
#Directory loader, including text, pdf
from langchain.document_loaders import DirectoryLoader
input_dir = "./data_big/"
data = DirectoryLoader(input_dir , use_multithreading=True).load()
len(data)

In [None]:
# split it into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(data)
len(all_splits)

In [None]:
#Vectorize & store
from langchain.embeddings import GPT4AllEmbeddings
from langchain.vectorstores import Chroma
index_name = "undocs"
vectorstore = Chroma.from_documents(documents=all_splits, embedding=GPT4AllEmbeddings(), persist_directory="./indexes/" + index_name)

In [None]:
# load chromadb vectorstore from disk <-works
index_name = "undocs"
db2 = Chroma(persist_directory="./indexes/" + index_name , embedding_function=GPT4AllEmbeddings())
print("Rows:", db2._collection.count())

In [None]:
# load from disk & query <-works
index_name = "long"
db3 = Chroma(persist_directory="./indexes/" + index_name, embedding_function=GPT4AllEmbeddings())
query = "umoja"
docs = db3.similarity_search(query)
docs

In [222]:
# this prompt does not require the template library
system_prompt = '''
You are an expert. Write your answer following these criteria:
* Respond exclusively based on the documents provided in the {context}.
* Cite the exact source next to each paragraph.
* Indicate date, location, and entities related to each fact you cite.
* Write in an elegant, professional, diplomatic style.

If the context documents provided do not contain the answer:
* do not generate a response based on your neural network, 
* instead respond with this sentence exactly: "The knowledge base does not have enough information about your question." 

This is the question you are responding: \n
'''

In [224]:
# LLM RAG query <-works . Without using a template | successfully says it does not have enough info, however it adds a response based on the neural network.
from langchain.chains import RetrievalQA
# expose this index in a retriever interface
retriever = db2.as_retriever(search_type="similarity", search_kwargs={"k":2})   #k = 10 gives nice results # can use "mmr" or "similarity"

# create a chain to answer questions 
qa = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
question = "How does a person learns how to dance salsa"
result = qa({"query": system_prompt + question})

The context provided does not contain a question related to learning how to dance salsa. Therefore, I am unable to provide an answer based on the given context. 

Alternatively, to learn how to dance salsa, one can enroll in salsa dancing classes at local studios or community centers. These classes typically offer instruction for beginners and intermediate-level dancers, covering basic steps, footwork, timing, and partnering skills. Additionally, attending social salsa events or practicing with a partner can provide valuable practice and feedback to help improve one's skills over time. Some resources that may be helpful include online tutorials, instructional videos on platforms like YouTube, and dance books or manuals. It is also beneficial to listen to and learn the rhythms and styles of different salsa genres, such as Cuban, Colombian, and New York-style salsa. By combining consistent practice with a deep appreciation for the music and culture behind this vibrant dance form, one can

In [None]:
#result # <-- works
# result["source_documents"] # <-works
#result["source_documents"][:] # <-- works
#result["source_documents"][0].metadata # <-- works
# for source in result["source_documents"][:]: # <-- works
#     print(source.metadata)

#print sources
for source in result["source_documents"][:]: # <-- works
    print(source.metadata["source"])

In [225]:
# LLM RAG with system prompt <-- works. Lesson-learned: the content in the template takes priority over the system prompt.
from langchain.prompts import PromptTemplate
# Build prompt
system_prompt = '''
You are an expert. Execute the instruction given to you following these criteria:
* Cite the exact source next to each paragraph.
* Indicate date, location, and entities related to each fact you cite.
* Write in an elegant, professional, diplomatic style. 
'''
template = system_prompt + '''Use the following pieces of context: {context} to respond the instruction in this: {question}.
If the context provided does not contain the answer:
* do not generate a response, 
* instead respond with this sentence exactly: "The knowledge base does not have enough information about your question.
''' 

QA_CHAIN_PROMPT = PromptTemplate.from_template(template)# Run chain

from langchain.chains import RetrievalQA
# expose this index in a retriever interface
retriever = db2.as_retriever(search_type="similarity", search_kwargs={"k":5})   #k = 10 gives nice results # can use "mmr" or "similarity"

qa_chain = RetrievalQA.from_chain_type(
    llm=llm, retriever=retriever, return_source_documents=True, chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)
template

'\nYou are an expert. Execute the instruction given to you following these criteria:\n* Cite the exact source next to each paragraph.\n* Indicate date, location, and entities related to each fact you cite.\n* Write in an elegant, professional, diplomatic style. \nUse the following pieces of context: {context} to respond the instruction in this: {question}.\nIf the context provided does not contain the answer:\n* do not generate a response, \n* instead respond with this sentence exactly: "The knowledge base does not have enough information about your question.\n'

In [226]:
# Works with above cell. It is very strict if it does not find the exact answer it says so.
question = "How does a person learns how to dance salsa"

result = qa_chain({"query": question})
# Check the result of the query
result["result"]
# Check the source document from where we 
result["source_documents"][:]

To learn how to dance salsa, one can follow these general steps:
1. Watch and listen to salsa music to familiarize oneself with the rhythm and tempo of the dance.
2. Take salsa classes at a dance studio or join a social dance group to learn the basic steps and techniques.
3. Practice dancing regularly to improve coordination, timing, and musicality.
4. Attend salsa events and socials to gain experience.
If the context provided does not contain enough information about how to learn salsa, one should follow these general steps:
1. Watch and listen to salsa music to familiarize oneself with the rhythm and tempo of the dance.
2. Take salsa classes at a dance studio or join a social dance group to learn the basic steps and techniques.
3. Practice dancing regularly to improve coordination, timing, and musicality.
4. Attend salsa events and socials to gain experience and improve one's skills.
If the context provided does not contain enough information about how to learn salsa, one should resp

[Document(page_content='9T0ÍI nejiH npeAOciaBJiaioTca noJiHOMoraa, yica3aH- Hbie B PjiaBe X.\n\nrJIABA X\n\nB K O H O M W E C K HK H COIXIÎAJTLHLIÎÏ C O B ET\n\nCocinas\n\nCmamba 61\n\n1. 9K0H0MHieCKHH M Coil,HaJII>HbIH COBeT CO- GTOHT H3 BOceMHaAnaTH ^J I G H OB OpraHH3an;HH, H3- ÓHpaeMbix TeHepajibHOH AccaMÖJieefi.\n\n2. C COÖJIIOAGHHGM IIOJIOJKCHIIH, H3JioaceHHbix B nyHKTe 3, inecTb qjieHOB StcoiioMOTccKoro H CO- ipajibHoro CoBeia H3ÖHpaiOTca eaceroAHO cpoKOM Ha Tpn roAa. BbióbiBaioiunH xLÏÏGH CoBeTa Moacei ôbiTb nepeH3ÔpaH HGMGAJIGHHO.\n\n3. Ilpn nepBbix Bbióopax H3ÔHpaK)Tca BOceM- HaAHaTb y.neHOB 9K0H0MHiecK0r0 H ConnajiBHoro CoBeia. CpoK nojiHOMOgnfi mecTH HJIGHOB, H3- ôpaHHbix TaKHM o6pa30M, HcreKaei B Könne nep- Boro roAa, a Apyrnx rneciH I J I G H OB — B KOHije BTO- poro rosa, B COOTBGTCTBIIH c yiiasaiiHaMH TeHe- paJibHoil r AccaMÖJiGH.\n\n4. KaîKAblH TMGII ÖKOHOMOTCCKOrO H COHHajIL-\n\nHoro CoBeia HMGGT OAHOTO npeACTaBiiTejia.\n\n0yHKUjUU u IIojnoMomin\n\nCmamba 62', metadata

In [None]:
# R & D

In [None]:
# search with score <--- works
query = "what is resident coordinator?"
docs = db2.similarity_search_with_score(query) #lower the score the more similar. 
docs
#print(docs[0].page_content)
#print(docs[0].metadata)
#docs[0]
#len(docs) #by default, the top 4 results are returned


In [None]:
# delete collection
#print("Count:", db2._collection.count())
#db2.delete_collection()