In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

# Access environment variables
langchain_tracing_v2 = os.getenv('LANGCHAIN_TRACING_V2')
langchain_endpoint = os.getenv('LANGCHAIN_ENDPOINT')
langchain_api_key = os.getenv('LANGCHAIN_API_KEY')
langchain_project = os.getenv('LANGCHAIN_PROJECT')
openai_api_key = os.getenv('OPENAI_API_KEY')

In [2]:
import os
import fitz  # PyMuPDF
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate

In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings


In [4]:
# Load PDF Document from local machine
local_pdf_path = "../data/Rich-Dad-Poor-Dad.pdf" 
# local_pdf_path = "../data/DerejeHinsermuSenbatu-CV.pdf"  

# Make sure the file exists
if not os.path.exists(local_pdf_path):
    raise FileNotFoundError(f"The file at {local_pdf_path} was not found.")

# Load PDF document
loader = PyPDFLoader(local_pdf_path)
docs = loader.load()

In [5]:
# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [6]:
# better to split text into sentences that gives us a full context
import fitz 
from tqdm.auto import tqdm

def text_formatter(text: str) -> str: 
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()

    # Potentially more text formatting functions can go here
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = [] 
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({"page_number": page_number - 16,
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_setence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4, # 1 token = ~4 characters
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=local_pdf_path)
pages_and_texts[:2]


  from .autonotebook import tqdm as notebook_tqdm
0it [00:00, ?it/s]

241it [00:00, 405.18it/s]


[{'page_number': -16,
  'page_char_count': 108,
  'page_word_count': 26,
  'page_setence_count_raw': 2,
  'page_token_count': 27.0,
  'text': 'Robert T . Kiyosaki W ha t The Rich Teach Their Kids About M oney –  That The Poor And M iddle Class Do Not!'},
 {'page_number': -15,
  'page_char_count': 125,
  'page_word_count': 28,
  'page_setence_count_raw': 1,
  'page_token_count': 31.25,
  'text': '“Rich Dad Poor Dad is a starting point for anyone looking to   gain control of their financial future.” \t \t \t \t \t – USA TODAY'}]

In [7]:
import random
random.sample(pages_and_texts, k =2)


[{'page_number': 118,
  'page_char_count': 2171,
  'page_word_count': 421,
  'page_setence_count_raw': 27,
  'page_token_count': 542.75,
  'text': 'Chapter Six: Lesson 6 120 one reason, and it was not for the benefits. I was a shy person, and the  thought of selling was the most frightening subject in the world. Xerox  has one of the best sales-training programs in America. Rich dad was proud of me. My educated dad was ashamed. Being  an intellectual, he thought that salespeople were below him. I worked  with Xerox for four years until I overcame my fear of knocking on  doors and being rejected. Once I could consistently be in the top five  in sales, I again resigned and moved on, leaving behind another great  career with an excellent company. In 1977, I formed my first company. Rich dad had groomed  Mike and me to take over companies. So I now had to learn to form  them and put them together. My first product, the nylon-and-Velcro  wallet, was manufactured in the Far East and shipped 

In [8]:
#check page number 50 -16 =34
pages_and_texts[50]

{'page_number': 34,
 'page_char_count': 2141,
 'page_word_count': 392,
 'page_setence_count_raw': 26,
 'page_token_count': 535.25,
 'text': 'Chapter One: Lesson 1 36 It hurts the poor people the most, so they have worse health than  those with money. Because the doctors raise their fees, the attorneys  raise their fees. Because the attorneys’ fees have gone up, schoolteachers  want a raise, which raises our taxes, and on and on and on. Soon there  will be such a horrifying gap between the rich and the poor that chaos  will break out and another great civilization will collapse. History  proves that great civilizations collapse when the gap between the haves  and have-nots is too great. Sadly, America is on that same course  because we haven’t learned from history. We only memorize historical  dates and names, not the lesson.” “Aren’t prices supposed to go up?” I asked. “In an educated society with a well-run government, prices should   actually come down. Of course, that is often only 

In [9]:
# pip install spacy
# !python -m spacy download en_core_web_sm

In [10]:
# split page text into sentences
from spacy.lang.en import English
from spacy.lang.en import English
# nlp = spacy.load("en_core_web_sm")
nlp = English()
nlp.add_pipe("sentencizer")
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    # Make sure all sentences are strings (the default type is a spaCy datatype)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # Count the sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])


100%|██████████| 241/241 [00:00<00:00, 338.75it/s]


In [11]:
random.sample(pages_and_texts, k=1)

[{'page_number': 167,
  'page_char_count': 2250,
  'page_word_count': 451,
  'page_setence_count_raw': 33,
  'page_token_count': 562.5,
  'text': 'Rich Dad Poor Dad 169 you have a second party who wants to deal. Most sellers ask too much.  It is rare that a seller asks a price that is less than something is worth. Moral of the story: Make offers. People who are not investors have no  idea what it feels like to try to sell something. I have had a piece of real  estate that I wanted to sell for months. I would have welcomed any offer.  They could have offered me 10 pigs, and I would have been happy— not at the offer, but just because someone was interested. I would have  countered, maybe for a pig farm in exchange. But that’s how the game  works. The game of buying and selling is fun. Keep that in mind. It’s fun  and only a game. Make offers. Someone might say yes. I always make offers with escape clauses. In real estate, I make an  offer with language that details “subject-to” contingen

In [12]:
import pandas as pd
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)


Unnamed: 0,page_number,page_char_count,page_word_count,page_setence_count_raw,page_token_count,page_sentence_count_spacy
count,241.0,241.0,241.0,241.0,241.0,241.0
mean,104.0,1549.12,297.89,17.66,387.28,20.47
std,69.71,764.91,149.7,10.05,191.23,12.0
min,-16.0,0.0,1.0,1.0,0.0,0.0
25%,44.0,935.0,184.0,9.0,233.75,10.0
50%,104.0,1860.0,367.0,19.0,465.0,23.0
75%,164.0,2169.0,421.0,26.0,542.25,29.0
max,224.0,2594.0,474.0,39.0,648.5,48.0


### so we splitted our text into sentences, our next step is chuncking our sentences together

In [13]:
# lets do it

# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 12

# Create a function to split lists of texts recursively into chunk size
# e.g. [20] -> [10, 10] or [25] -> [10, 10, 5]
def split_list(input_list: list[str],
               slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

# test_list = list(range(25))
# split_list(test_list)


In [14]:
# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])


100%|██████████| 241/241 [00:00<00:00, 325443.42it/s]


In [15]:
random.sample(pages_and_texts, k=1)

[{'page_number': 163,
  'page_char_count': 2063,
  'page_word_count': 427,
  'page_setence_count_raw': 22,
  'page_token_count': 515.75,
  'text': 'Rich Dad Poor Dad 165 10.\tTeach and you shall receive: the power of giving  Both of my dads were teachers. My rich dad taught me a lesson  I have carried all my life: the necessity of being charitable or giving.  My educated dad gave a lot of his time and knowledge, but almost never gave away money.  He usually said that he would give when he had some extra money,  but of course there was rarely any extra. My rich dad gave money as well as education. He believed firmly  in tithing. “If you want something, you first need to give,” he would  always say. When he was short of money, he gave money to his church  or to his favorite charity. If I could leave one single idea with you, it is that idea. Whenever  you feel short or in need of something, give what you want first and  it will come back in buckets. That is true for money, a smile, love,

In [16]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)


Unnamed: 0,page_number,page_char_count,page_word_count,page_setence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,241.0,241.0,241.0,241.0,241.0,241.0,241.0
mean,104.0,1549.12,297.89,17.66,387.28,20.47,2.17
std,69.71,764.91,149.7,10.05,191.23,12.0,1.07
min,-16.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,44.0,935.0,184.0,9.0,233.75,10.0,1.0
50%,104.0,1860.0,367.0,19.0,465.0,23.0,2.0
75%,164.0,2169.0,421.0,26.0,542.25,29.0,3.0
max,224.0,2594.0,474.0,39.0,648.5,48.0,4.0


### Splitting each chunk into its own item

In [17]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts): 
    for sentence_chunk in item["sentence_chunks"]: 
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences together into a paragraph-like structure, aka join the list of sentences into one paragraph
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" => ". A" (will work for any captial letter)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get some stats on our chunks
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 chars

        pages_and_chunks.append(chunk_dict) 

len(pages_and_chunks)


100%|██████████| 241/241 [00:00<00:00, 10570.19it/s]


522

In [18]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 202,
  'sentence_chunk': 'In the seminar, I went into my emotions and briefly touched my spirit. I cried a lot because I had a lot to cry about. I had done and seen things no one should ever be asked to do. During the seminar, I hugged a man, something I had never done before, not even with my father. On Sunday night, it was difficult leaving this self-development workshop. The seminar had been a gentle, loving, honest environment. Monday morning was a shock to once again be surrounded by young egotistical pilots, dedicated to flying, killing and dying for country. After that weekend seminar, I knew it was time to change. I knew developing myself emotionally and spiritually to become a kinder, gentler, and more compassionate person would be the hardest thing I could do. It went against all my years at the military academy and flight school. I never returned to traditional education again. I had no desire to study for grades, degrees, promotions, or credentials again.',

### Embedding our text chunks

In [19]:
#  pip install sentence-transformers

In [21]:
from langchain_openai import OpenAIEmbeddings

# from langchain.embeddings import HuggingFaceEmbeddings  # Ensure you have the correct import
from langchain_community.embeddings import HuggingFaceEmbeddings

# Initialize the HuggingFace embedding model
hf_model_name = "BAAI/bge-base-en-v1.5"
embedding = HuggingFaceEmbeddings(model_name=hf_model_name)

# Create an instance of OpenAIEmbeddings
# embedding = OpenAIEmbeddings()

# Create a list of sentences
sentences = ["The Sentence Transformer library provides an easy way to create embeddings.",
             "Sentences can be embedded one by one or in a list.",
             "I like horses!"]

# Sentences are encoded/embedded by calling embedding.embed()
embeddings = [embedding.embed_documents(sentence) for sentence in sentences]
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print(f"Sentence: {sentence}")
    print(f"Embedding: {embedding}")
    print("")





Sentence: The Sentence Transformer library provides an easy way to create embeddings.
Embedding: [[0.022168699651956558, 0.011722829192876816, -0.028966065496206284, 0.0014183141756802797, -2.6371195417596027e-05, -0.0020959232933819294, 0.03957924246788025, 0.036203477531671524, -0.044172145426273346, -0.03688161075115204, -0.04799896851181984, 0.00395927345380187, -0.05969937890768051, 0.023139962926506996, -0.04816706106066704, 0.07522429525852203, 0.0580630861222744, -0.04448765143752098, 0.035138294100761414, -0.023722296580672264, 0.011092978529632092, -0.01972087286412716, -6.694190960843116e-05, -0.004711703862994909, 0.03133523091673851, -0.007973152212798595, 0.0022103870287537575, 0.007873154245316982, -0.01111793052405119, -0.0010815794812515378, 0.03489679470658302, -0.020524978637695312, -0.03412974253296852, -0.03458615019917488, -0.011266320943832397, 0.010660963132977486, -0.00339316064491868, -0.006199222058057785, -0.015644319355487823, -0.04910874366760254, -0.04419

In [22]:
# Initialize the embedding model
# embedding_model = OpenAIEmbeddings()
embedding_model = HuggingFaceEmbeddings(model_name=hf_model_name)

# Apply embeddings to each chunk and print progress
for i, chunk in enumerate(pages_and_chunks):
    chunk["embedding"] = embedding_model.embed_documents(chunk["sentence_chunk"])
    print(f"Processed chunk {i + 1}/{len(pages_and_chunks)}: {chunk['sentence_chunk']}")


Processed chunk 1/522: Robert T . Kiyosaki W ha t The Rich Teach Their Kids About M oney – That The Poor And M iddle Class Do Not!
Processed chunk 2/522: “Rich Dad Poor Dad is a starting point for anyone looking to  gain control of their financial future.”	 	 	 	 	 – USA TODAY
Processed chunk 3/522: What The Rich Teach Their Kids About Money— That The Poor And Middle Class Do Not!By Robert T. Kiyosaki RICH DAD POOR DAD
Processed chunk 4/522: If you purchase this book without a cover, or purchase a PDF , jpg, or tiff copy of this book, it is likely stolen property or a counterfeit. In that case, neither the authors, the publisher, nor any of their employees or agents has received any payment for the copy. Furthermore, counterfeiting is a known avenue of financial support for organized crime and terrorist groups. We urge you to please not purchase any such copy and to report any instance of someone selling such copies to Plata Publishing LLC. This publication is designed to provide compe

In [117]:
#check random chuncking text
# pages_and_chunks[50]
# random.sample(pages_and_chunks, k=1)

In [25]:
from langchain.schema import Document

In [26]:
# Prepare documents for the vector store
documents = [
    Document(
        page_content=chunk["sentence_chunk"],
        metadata={"page_number": chunk["page_number"]}
    )
    for chunk in pages_and_chunks
]

In [27]:
# Save to vector store
vectorstore = Chroma.from_documents(documents=documents, embedding=embedding_model, persist_directory="../db")


In [28]:
# Persist the vector store to disk
vectorstore.persist()

  warn_deprecated(


In [13]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize the HuggingFace embedding model (needed for loading the vector store)
hf_model_name = "BAAI/bge-base-en-v1.5"
embedding_model = HuggingFaceEmbeddings(model_name=hf_model_name)

# Load the vector store from the persisted directory
persist_directory = "../db"
vectorstore = Chroma(persist_directory=persist_directory, embedding_function = embedding_model)

# Now you can use `vectorstore` to query or perform operations as needed




In [14]:
query = "What did the president say about Ketanji Brown Jackson"
docs = vectorstore.similarity_search(query)
print(docs[0].page_content)

We have child labor laws, you know. My dad works for the government, you know.” “Wow!”said rich dad. “Now you sound just like most of the people who used to work for me—people I’ve either fired or who have quit.” “So what do you have to say?”I demanded, feeling pretty brave for a little kid. “You lied to me. I’ve worked for you, and you have not kept your word. You haven’t taught me anything.” “How do you know that I’ve not taught you anything?”asked rich  dad calmly. “


In [33]:
# Define the query
query = "what poor dad and rich dad advises"
print(f"Query: {query}")

# Perform similarity search
docs = vectorstore.similarity_search(query, k=5)  # Adjust 'k' for the number of top results you want

# Print the top-k results
for i, doc in enumerate(docs):
    print(f"Result {i + 1}:")
    print(f"Page Number: {doc.metadata['page_number']}")
    print(f"Text: {doc.page_content}")
    print("")

# Example: Print the content of the most similar document
print("Most Similar Document Content:")
print(docs[0].page_content)

Query: what poor dad and rich dad advises
Result 1:
Page Number: 212
Text: For those of you who may not have read Rich Dad Poor Dad, it is about the different lessons my two dads taught me about money and life choices. One was my real dad, and the other was my best friend’s dad. One was highly educated and the other was a high school dropout. One was poor, and the other was rich. 2 Introduction

Result 2:
Page Number: 2
Text: Introduction 4 One dad recommended, “Study hard so you can find a good company to work for.”The other recommended, “Study hard so you can find a good company to buy.”One dad said, “The reason I’m not rich is because I have you kids.”The other said, “The reason I must be rich is because I have you kids.”One encouraged talking about money and business at the dinner table, while the other forbade the subject of money to be discussed over a meal. One said, “When it comes to money, play it safe. Don’t take risks.”The other said, “Learn to manage risk.”One believed, “Ou

In [47]:
# Define the query
query = "good foods for protein"
print(f"Query: '{query}'\n")
#
# Perform similarity search with scores
results = vectorstore.similarity_search_with_score(query, k=5)

# Print the top-k results with scores
print("Results:")
for i, (doc, score) in enumerate(results):
    print(f"Result {i + 1}:")
    print(f"Score: {score:.4f}")
    print(f"Page Number: {doc.metadata['page_number']}")
    print(f"Text: {doc.page_content}")
    print("")

# Example: Print the content of the most similar document
print("Most Similar Document Content:")
print(results[0][0].page_content)


Query: 'good foods for protein'

Results:
Result 1:
Score: 0.9277
Page Number: 131
Text: Do not do what poor and middle-class people do: put their few eggs in many baskets. Put a lot of your eggs in a few baskets and FOCUS: Follow One Course Until Successful.

Result 2:
Score: 0.9985
Page Number: 206
Text: • “Why is money so important to you?” • “Money won’t make you happy.” • “Just live below your means.” • “Play it safe. Don’t go for your dreams.”Diet and Exercise I mention emotional and spiritual development because that is what it takes to make a permanent change in life. For example, it rarely works to tell an overweight person, “Just eat less and exercise more.”Diet and exercise may make sense mentally, but most people who are overweight do not eat because they are hungry. They eat to feed an emptiness in their emotions and their soul. When a person goes on a diet-and-exercise program, they are only working on their mind and their body. Without emotional development and spiritual

In [53]:

def retrieve_relevant_resources(query: str, vectorstore: Chroma, k: int = 5):
    """
    Perform similarity search with scores and return the top k results.
    """
    results = vectorstore.similarity_search_with_score(query, k)
    return results

def print_top_results_and_scores(query: str, vectorstore: Chroma, k: int = 5):
    """
    Finds relevant passages given a query and prints them out along with their scores.
    """
    results = retrieve_relevant_resources(query, vectorstore, k)

    print("Results:")
    for i, (doc, score) in enumerate(results):
        print(f"Result {i + 1}:")
        print(f"Score: {score:.4f}")
        print(f"Page Number: {doc.metadata['page_number']}")
        print(f"Text: {doc.page_content}")
        print("")



In [54]:
# Example usage
query = "good foods for protein"
print(f"Query: '{query}'\n")
print_top_results_and_scores(query=query, vectorstore=vectorstore, k=5)

# Example: Print the content of the most similar document
results = retrieve_relevant_resources(query=query, vectorstore=vectorstore, k=5)
print("Most Similar Document Content:")
print(results[0][0].page_content)

Query: 'good foods for protein'

Results:
Result 1:
Score: 0.9277
Page Number: 131
Text: Do not do what poor and middle-class people do: put their few eggs in many baskets. Put a lot of your eggs in a few baskets and FOCUS: Follow One Course Until Successful.

Result 2:
Score: 0.9985
Page Number: 206
Text: • “Why is money so important to you?” • “Money won’t make you happy.” • “Just live below your means.” • “Play it safe. Don’t go for your dreams.”Diet and Exercise I mention emotional and spiritual development because that is what it takes to make a permanent change in life. For example, it rarely works to tell an overweight person, “Just eat less and exercise more.”Diet and exercise may make sense mentally, but most people who are overweight do not eat because they are hungry. They eat to feed an emptiness in their emotions and their soul. When a person goes on a diet-and-exercise program, they are only working on their mind and their body. Without emotional development and spiritual

In [75]:
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate

prompt_template = ChatPromptTemplate(
    input_variables=['context', 'question'],
    messages=[
        HumanMessagePromptTemplate(
            prompt=PromptTemplate(
                input_variables=['context', 'question'],
                template=(
                    "You are an assistant for question-answering tasks. Only use the following pieces of retrieved context to answer the question. "
                    "Do not use any outside knowledge. If you don't know the answer based on the context, just say that you don't know. "
                    "Use three sentences maximum and keep the answer concise.\n"
                    "Question: {question} \n"
                    "Context: {context} \n"
                    "Answer:"
                )
            )
        )
    ]
)


In [121]:
# from langchain.llms import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)


In [79]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


In [81]:
def prompt_formatter(query: str, context_items: list):
    """
    Format the prompt for the LLM based on the query and context items.
    """
    context = "- " + "\n- ".join([item.page_content for item in context_items])

    base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""
    formatted_prompt = base_prompt.format(context=context, query=query)
    return formatted_prompt


In [122]:

def retrieve_relevant_resources(query: str, vectorstore: Chroma, k: int = 5):
    """
    Perform similarity search with scores and return the top k results.
    """
    results = vectorstore.similarity_search_with_score(query, k)
    return [doc for doc, score in results]

def create_rag_chain(vectorstore, prompt_template, llm):
    def retrieve_and_format(query):
        docs = retrieve_relevant_resources(query, vectorstore, k=5)
        return docs

    # Define the LLM chain
    llm_chain = LLMChain(llm=llm, prompt=prompt_template)

    # Define the retrieval and generation chain
    def rag_chain(question):
        context_items = retrieve_and_format(question)
        formatted_prompt = prompt_formatter(question, context_items)
        response = llm_chain.invoke({"context": formatted_prompt, "question": question})
        #debug
        # print("Full Response:", response)
        return response
    return rag_chain

# Create the RAG chain
rag_chain = create_rag_chain(vectorstore, prompt_template, llm)

In [123]:
# Example usage
question = "what does it mean poor and rich dad"
response = rag_chain(question)
print("Response:")
print(response['text'])


Response:
The concept of "Rich Dad Poor Dad" refers to the contrasting perspectives and financial philosophies presented by the author's two fathers. The "poor dad" represents a mindset focused on scarcity, while the "rich dad" embodies a mindset of abundance and financial empowerment. Through these two figures, the book explores the impact of beliefs and attitudes towards money on one's financial success and life choices.


Embed the Query: Use the embedding model to embed the query.
Perform Similarity Search: Use the embedded query to perform the similarity search.

In [124]:
# Getting an LLM for local generation

def retrieve_relevant_resources(query: str, vectorstore: Chroma, embedding_model, k: int = 5):
    """
    Embed the query, perform similarity search with scores, and return the top k results.
    """
    # Embed the query
    query_embedding = embedding_model.embed_query(query)
    
    # Perform similarity search
    results = vectorstore.similarity_search_by_vector(query_embedding, k)
    return results

def create_rag_chain(vectorstore, prompt_template, llm, embedding_model):
    def retrieve_and_format(query):
        docs = retrieve_relevant_resources(query, vectorstore, embedding_model, k=5)
        return docs

    # Define the LLM chain
    llm_chain = LLMChain(llm=llm, prompt=prompt_template)

    # Define the retrieval and generation chain
    def rag_chain(question):
        context_items = retrieve_and_format(question)
        formatted_prompt = prompt_formatter(question, context_items)
        response = llm_chain.invoke({"context": formatted_prompt, "question": question})
        
        # Extract the answer from the response
        answer = response["text"]

        return answer

    return rag_chain

# Create the RAG chain
rag_chain = create_rag_chain(vectorstore, prompt_template, llm, embedding_model)

In [126]:
# Example usage
question = "what does it mean poor and rich dad"
response = rag_chain(question)
print("Response:")
print(response)


Response:
"Rich Dad Poor Dad" refers to the book's concept of contrasting perspectives on money and life lessons from two father figures - one rich and one poor. The rich dad emphasizes the importance of money and financial literacy, while the poor dad has a different mindset towards wealth. The book explores how these different views shape one's approach to finances and life choices.


In [7]:
# splits

In [6]:
# Check embeddings (debug)
print(f"Number of splits: {len(splits)}")

Number of splits: 4


# check embedding before save to vector db

In [8]:
# Embed
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
# Embed
# vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(),persist_directory="../db/chroma_db")
#load from the disk
# vectorstore = Chroma(persist_directory="../db/Chroma_db",embedding = OpenAIEmbeddings())


In [None]:
print(dir(vectorstore))

['_Chroma__query_collection', '_LANGCHAIN_DEFAULT_COLLECTION_NAME', '__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_asimilarity_search_with_relevance_scores', '_client', '_client_settings', '_collection', '_cosine_relevance_score_fn', '_embedding_function', '_euclidean_relevance_score_fn', '_get_retriever_tags', '_max_inner_product_relevance_score_fn', '_persist_directory', '_select_relevance_score_fn', '_similarity_search_with_relevance_scores', 'aadd_documents', 'aadd_texts', 'add_documents', 'add_images', 'add_texts', 'adelete', 'afrom_documents', 'afrom_texts', 'amax_marginal_relevance_search', 'amax_marginal_re

In [None]:
# # Directly access and print the document splits at specific indices
# indices_to_check = [0, 1, 2]
# for index in indices_to_check:
#     if index < len(splits):
#         embedded_doc = splits[index]
#         print(f"Document at index {index}:")
#         print(embedded_doc.page_content)
#     else:
#         print(f"Index {index} is out of range.")

# retriever = vectorstore.as_retriever()


In [9]:
retriever = vectorstore.as_retriever()

In [9]:
print(dir(retriever))

['Config', 'InputType', 'OutputType', '__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__class_vars__', '__config__', '__custom_root_type__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__exclude_fields__', '__fields__', '__fields_set__', '__format__', '__ge__', '__get_validators__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__include_fields__', '__init__', '__init_subclass__', '__iter__', '__json_encoder__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__or__', '__orig_bases__', '__parameters__', '__post_root_validators__', '__pre_root_validators__', '__pretty__', '__private_attributes__', '__reduce__', '__reduce_ex__', '__repr__', '__repr_args__', '__repr_name__', '__repr_str__', '__rich_repr__', '__ror__', '__schema_cache__', '__setattr__', '__setstate__', '__signature__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__try_update_forward_refs__', '__validators__', '__weakref__', '_abatch_with_confi

In [10]:
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x77dc4c2cdab0>)

In [11]:
# Define the prompt template
prompt_template = ChatPromptTemplate(
    input_variables=['context', 'question'],
    messages=[
        HumanMessagePromptTemplate(
            prompt=PromptTemplate(
                input_variables=['context', 'question'],
                template=(
                    "You are an assistant for question-answering tasks. Only use the following pieces of retrieved context to answer the question. "
                    "Do not use any outside knowledge. If you don't know the answer based on the context, just say that you don't know. "
                    "Use three sentences maximum and keep the answer concise.\n"
                    "Question: {question} \n"
                    "Context: {context} \n"
                    "Answer:"
                )
            )
        )
    ]
)


In [28]:
# LLM
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Chain
def create_rag_chain(retriever, prompt_template, llm):
    return (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt_template
        | llm
        | StrOutputParser()
    )

rag_chain = create_rag_chain(retriever, prompt_template, llm)

# Question
response = rag_chain.invoke("Can you provide contact information for Dereje?")
print(response)



# Evalaute Rag  with Ragas

In [29]:
# Define the questions and ground truth
questions = [
    "What is Dereje's professional background and area of expertise?",
    "Can you provide contact information for Dereje?",
    "what soft skills does Dereje posses?"
]

# groudn truth
ground_truths = [
    ["Dereje Hinsermu is an experienced Machine Learning Engineer with a background in Computer Engineering. His area of expertise is in generative AI and delivering impactful technologies for business growth."],
    ["Contact information for Dereje Hinsermu: Phone Nos: +25124629640, +251973401776, E-mail: derejehinsermu2@gmail.com. LinkedIn profile: https://www.linkedin.com/in/dereje-hinsermu-519a26161/. Github profile: https://github.com/derejehinsermu"],
    ["Dereje soft skilss: Excellent communication and interpersonal Proven leadership and team collaboration Strong organizational and project management"]

]

In [33]:
answers = []
contexts = []

# Inference
for query in questions:
    response = rag_chain.invoke(query)
    answers.append(response)
    relevant_docs = retriever.get_relevant_documents(query)
    contexts.append([doc.page_content for doc in relevant_docs])

# Prepare data dictionary
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}

In [39]:
from datasets import Dataset

In [41]:
# convert dict to dataset
dataset = Dataset.from_dict(data)
dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truths'],
    num_rows: 3
})

In [43]:
# pip install ragas

In [45]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

result = evaluate(
    dataset = dataset, 
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
)

df = result.to_pandas()

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`


Evaluating: 100%|██████████| 12/12 [00:13<00:00,  1.15s/it]


In [46]:
df.head()

Unnamed: 0,question,answer,contexts,ground_truths,ground_truth,context_precision,context_recall,faithfulness,answer_relevancy
0,What is Dereje's professional background and a...,Dereje Hinsermu is an experienced Machine Lear...,"[Dereje\nHinsermu\nSenbatu\nTogo\nst,\nAddis\n...",[Dereje Hinsermu is an experienced Machine Lea...,Dereje Hinsermu is an experienced Machine Lear...,1.0,1.0,1.0,0.923795
1,Can you provide contact information for Dereje?,Contact information for Dereje Hinsermu: Phone...,"[Dereje\nHinsermu\nSenbatu\nTogo\nst,\nAddis\n...",[Contact information for Dereje Hinsermu: Phon...,Contact information for Dereje Hinsermu: Phone...,1.0,1.0,1.0,0.932876
2,what soft skills does Dereje posses?,Dereje possesses excellent communication and i...,"[Dereje\nHinsermu\nSenbatu\nTogo\nst,\nAddis\n...",[Dereje soft skilss: Excellent communication a...,Dereje soft skilss: Excellent communication an...,1.0,1.0,0.666667,0.963963


# Generate Prompt

In [16]:

if os.path.exists(local_pdf_path):
    # Load and split PDF document
    loader = PyPDFLoader(local_pdf_path)
    docs = loader.load_and_split()

    # Split
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)

    # Embed
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

    retriever = vectorstore.as_retriever()
else:
    retriever = None

In [18]:
from langchain_core.messages import HumanMessage, AIMessage
#### PROMPT GENERATION ####

def generate_optimized_prompts(query, num_prompts=2):
    # Initialize ChatOpenAI
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7)

    # Define the base prompt
    base_prompt = f"User Query: {query}\n\nGenerate an optimized prompt:"

    # Use the retriever to get relevant context from the user's documents (if available)
    context_text = ""
    if retriever:
        context = retriever.get_relevant_documents(query)
        if context:
            context_text = "\n\n".join(doc.page_content for doc in context)

    # Generate multiple optimized prompts using ChatOpenAI with the retrieved context
    generated_prompts = []
    for _ in range(num_prompts):
        messages = [
            AIMessage(content="""
                      "You are an assistant specialized in generating optimized prompts. 
                      example1:original prompmt :Interior furniture design with rocks. and 
                      optimized prompt:Interior furniture design with rocks, rustic, earthy, minimalist, natural, organic, textured, contemporary, modern, Scandinavian, zen, Japanese, wood, stone, sustainable, eco-friendly, neutral colors, clean lines, spatial, cozy.
                      example2:Write me programming job candidate requirements and optimized prompt
                      You are a senior software engineer responsible for assessing the ideal candidate for a programming job. Your role involves analyzing technical skills, experience, and personality traits that contribute to successful software development. With extensive knowledge of programming languages, frameworks, and algorithms, you can accurately evaluate candidates' potential to excel in the field. As an expert in this domain, you can easily identify the qualities necessary to thrive in a programming role. Please provide a detailed yet concise description of the ideal candidate, covering technical skills, personal qualities, communication abilities, and work experience. Focus your knowledge and experience on creating a guide for our recruiting process.

                      """),
            HumanMessage(content=base_prompt + "\n\nContext: " + context_text)
        ]
        response = llm(messages=messages)
        generated_prompts.append(response.content.strip())
    return generated_prompts

In [22]:
# Example usage
# queries = "who is Dereje?"
queries = "Write me programming job candidate requirements"

# queries = ' what does it mean poor dad and rich dad in the books of rebort'

# for query in queries:
    # Generate multiple optimized prompts using RAG
generated_prompts = generate_optimized_prompts(queries)

# Print the generated prompts
print(f"Query: {queries}")
for prompt in generated_prompts:
    print(f"Optimized Prompt: {prompt}\n")

Query: Write me programming job candidate requirements
Optimized Prompt: Optimized Prompt:

Seeking a skilled Machine Learning Engineer for a Deep Learning Based Afaan Oromo talking Robot project. Ideal candidates should possess the following qualifications:

SOFT SKILLS:
- Excellent communication and interpersonal skills
- Proven leadership and team collaboration abilities
- Strong organizational and project management capabilities

TECHNICAL SKILLS:
- Proficiency in GCP, Django, Flask, TensorFlow
- Programming expertise in Python, C, Git & Github
- Experience in fine-tuning large language models

EXPERIENCE:
- Chief AI Officer at Atriv.ai, Netherlands (Remote) leading AI division
- Part-Time Machine Learning Engineer at Gebeya Inc, Addis Ababa, Ethiopia, specializing in large language models
- Freelance Machine Learning Engineer on Upwork focusing on predictive modeling and NLP

Additional Details:
- Located in Addis Ababa, Ethiopia
- Contact: +25124629640, +251973401776
- Email: der

# Evaluate Prompt

In [None]:
# evaluate Code Goes here