In [1]:
! py -m pip install litellm
! py -m pip install qdrant-client
! py -m pip install -U duckduckgo_search



In [2]:
import os 
import PyPDF2
from tqdm.notebook import tqdm
import re
import json

# Ensure CA certificates for secure connection
os.environ['CURL_CA_BUNDLE'] = 'C:/Users/RACHANAA/Downloads/cacert.pem'

In [3]:
import PyPDF2
import PyPDF2
def read_pdfs_from_folder(folder_path):
    pdf_list = []
    
    # Loop through all the files from a folder
    for filename in tqdm(os.listdir(folder_path)):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            
            # open each pdf file
            with open(pdf_path, "rb") as file:
                reader = PyPDF2.PdfReader(file)
                content = ""
                
                # read each page's content and append it to a string
                for page_num in range(len(reader.pages)):
                    page = reader.pages[page_num]
                    content += page.extract_text()
                
                # add the pdf content to the list
                pdf_list.append({"content": content, "filename": filename})
                
    return pdf_list

folder_path = ""

#### Reading Web URLs

In [4]:
from typing import Optional
import requests

def fetch_url_content(url: str) -> Optional[str]:
    """
    Fetches the content of a web page given its URL.
    Args:
        url (str): The URL of the web page to fetch.
    Returns:
        Optional[str]: The content of the web page as a string, or None if an error occurs.
    """
    prefix_url: str = "https://r.jina.ai/"
    full_url: str = prefix_url + url # concatenate the prefix URL with the main url 
    try:
        response = requests.get(full_url) # perform a GET request
        if response.status_code == 200: # check if the request was successful
            return response.content.decode('utf-8') # return the content of the response
        else:
            print(f"Failed to fetch content. Status code: {response.status_code}")
            return None
    except requests.RequestException as e:
        print(f"An error occurred while fetching the content: {e}")
        return None

In [5]:
# Repleace the url with the specific endpoint that you want to fetch
url: str = "https://www.freechildrenstories.com/gemma"
content: Optional[str] = fetch_url_content(url)

if content is not None:
    print("Content retrieved successfully.")
else:
    print("Failed to retrieve the content from URL")

Content retrieved successfully.


In [6]:
texts = []
metadatas = []
texts.append(content)
metadatas.append({"url": url})

assert len(metadatas) == len(texts)

In [7]:
metadatas

[{'url': 'https://www.freechildrenstories.com/gemma'}]

#### Splitting the texts

In [8]:
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from litellm import completion

In [9]:
token_size = 150
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-3.5-turbo",
    chunk_size=token_size,
    chunk_overlap=0
)

In [10]:
def clean_text(text):
    # remove all the newline characters
    text = text.replace("\n", " ").replace('\r', ' ')
    
    #replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    
    # strip leading and trailing spaces
    text = text.strip()
    
    return text

In [11]:
text_chunks = text_splitter.split_text(content)
print(f"Total Chunks: {len(text_chunks)}")


Total Chunks: 229


In [12]:
import json
import requests
import requests
def get_embeddings(texts, model="text-embedding-3-small", api_key = os.getenv("OPENAI_API_KEY")):
    # define the api url
    url = "https://api.openai.com/v1/embeddings"
    
    # Prepare headers with the API key
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    
    # Prepare the data payload
    data = {
        "input": texts,
        "model": model
    }
    
    # send a post request to the openai api 
    response = requests.post(url, headers=headers, data=json.dumps(data))
    if response.status_code == 200: # check if the request was successful
        return response.json()['data']
    else:
        print(f"Error: {response.status_code}")
        return None

In [13]:
openai_api_key = os.getenv("OPENAI_API_KEY")

embeddings_objects = get_embeddings(text_chunks, api_key = os.getenv("OPENAI_API_KEY"))
assert len(embeddings_objects) == len(text_chunks), "Number of embeddings does not match number of text chunks"

In [14]:
embeddings = [obj["embedding"] for obj in embeddings_objects]
len(embeddings[0])

1536

In [15]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

# Create in-memory Qdrant instance
qdrant = QdrantClient(":memory:")

In [16]:
VECTOR_SIZE = 1536
collection_name = "agentic_rag_base"
# Create collection to store books
qdrant.recreate_collection(
    collection_name= collection_name,
    vectors_config = VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
)

print(f"Collection '{collection_name}' created successfully in Qdrant Cloud.")

Collection 'agentic_rag_base' created successfully in Qdrant Cloud.


  qdrant.recreate_collection(


In [17]:
ids = []
payload = []
for id, text in enumerate(text_chunks):
    ids.append(id)
    payload.append({"url":url, "content": text})

len(payload)

229

In [18]:
qdrant.upload_collection(
    collection_name = collection_name,
    vectors = embeddings, 
    payload = payload,
    ids = ids,
    batch_size = 256,
)

In [19]:
qdrant.count(collection_name)

CountResult(count=229)

In [20]:
def search(text: str, top_k: int):
    query_embedding = get_embeddings(text, api_key=openai_api_key)[0]["embedding"]
    
    search_result = qdrant.search(
        collection_name=collection_name,
        query_vector = query_embedding,
        query_filter = None,
        limit = top_k
    )
    return search_result

def format_docs(docs):
    return "\n\n".join(doc.payload['content'] for doc in docs)

#### Prompts
1. First prompt will check to see if the retrieved context can answer the user question
2. Second prompt will get the context and question and generate the response

##### First Prompt

In [21]:
decision_system_prompt = """
Your job is to decide if a given question can be answered with a given context. 
If context can answer the question, return 1. 
If not, return 0.
Context: {context}
"""

user_prompt = """
Question: {question}
Answer:
"""

##### Second Prompt

In [22]:
system_prompt = """
You are an expert for answering questions. Answer the questions according only to the question that is being given
If the question cannot be answered using the context, simply say I don't know. Please do not make stuff up.
Your answer MUST be informative, concise, and action driven. Your answer must be in markdown.
Context: {context}
"""

user_prompt = """
Question: {question}
Answer: 
"""

##### Ask Questions

In [42]:
question = "What was the riddle?"
results = search(question, top_k = 3)
context = format_docs(results)

In [43]:
response = completion(
    model="gpt-4o-mini",
    messages=[{"content": decision_system_prompt.format(context=context),"role": "system"}, {"content": user_prompt.format(question=question),"role": "user"}],
    max_tokens=500,
    # format="json"
    
)
has_answer = response.choices[0].message.content
has_answer

'0'

In [41]:
from IPython.display import Markdown, display
from duckduckgo_search import DDGS

In [38]:
def format_search_results(results):
    return "\n\n".join(doc["body"] for doc in results)
    

print(f"Question: {question}")
if has_answer == '1':
    print("Context can answer the question")
    response = completion(
        model="gpt-4o-mini",
        messages=[{"content": system_prompt.format(context=context),"role": "system"}, {"content": user_prompt.format(question=question),"role": "user"}],
        max_tokens=500
    )
    print("Answer:")
    display(Markdown(response.choices[0].message.content))
else:
    print("Context is NOT relevant. Searching online...")
    results = DDGS().text(question, max_results=5)
    context = format_search_results(results)
    print("Found online sources. Generating the response...")
    response = completion(
        model="gpt-4o-mini",
        messages=[{"content": system_prompt.format(context=context),"role": "system"}, {"content": user_prompt.format(question=question),"role": "user"}],
        max_tokens=500
    )
    print("Answer:")
    display(Markdown(response.choices[0].message.content))

Question: What does Gemma do in the story?
Context can answer the question
Answer:


Gemma settles down and reflects on a riddle while expressing her frustration about not knowing certain things. She paces and recites what she remembers of the riddle, indicating her desire to solve it.