In [None]:
# Assessment submitted by Fiaz S Mohammad
# Candidate assessment for building a Retrieval-Augmented Generation (RAG) application


# OBJECTIVE: The goal of this assessment is to build a retrieval-augmented generation (RAG) application. This application should be able to read and comprehend the content of a provided PDF document (in this case Onepane.pdf) and accurately answer questions based on the document's content.
''' REQUIREMENTS :
The RAG application should :
1. Provide answers to questions based on the document's content.
2. Print the retrieved content from the document that was used to generate these answers.
'''

" REQUIREMENTS :\nThe RAG application should :\n1. Provide answers to questions based on the document's content.\n2. Print the retrieved content from the document that was used to generate\xa0these\xa0answers.\n"

In [None]:
!pip install PyMuPDF scikit-learn openai==0.28 #Installing the required libraries : 1.PyMuPDF - For data extraction from the pdf 2. scikit-learn - For TF-IDF Vectorization and cosine similarity. 3. Openai - To use LLM- ChatGPT 3.5


Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.35.14
    Uninstalling openai-1.35.14:
      Successfully uninstalled openai-1.35.14
Successfully installed openai-0.28.0


In [None]:
#Code snippet for RAG :

#Importing the required libraries.....
import os
from getpass import getpass

import fitz  # PyMuPDF
import openai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF document.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list of strings, one per page, in page order.
    """
    # Open the document in a context manager so the underlying file handle is
    # released even if loading or reading a page raises.
    with fitz.open(pdf_path) as doc:
        return [doc.load_page(page_num).get_text() for page_num in range(doc.page_count)]

def retrieve_relevant_section(text, query, top_n=1):
    """Return the sections of ``text`` most similar to ``query``.

    Similarity is the cosine similarity between TF-IDF vectors of the query
    and of each section.

    Args:
        text: List of strings to search (here, one string per PDF page).
        query: The question to match against the sections.
        top_n: Maximum number of sections to return, best match first.

    Returns:
        A list of at most ``top_n`` strings taken from ``text``, ordered from
        most to least similar. Returns an empty list when ``top_n`` is not
        positive or when ``text`` contains no non-blank section.
    """
    # Nothing to rank: avoid fitting a vectorizer on an empty corpus, which
    # scikit-learn rejects with an "empty vocabulary" ValueError.
    if top_n <= 0 or not text or not any(section.strip() for section in text):
        return []

    # Fit once and reuse the same vocabulary/IDF weights for the query so both
    # sides live in the same vector space without a redundant second fit.
    vectorizer = TfidfVectorizer()
    sections_tfidf = vectorizer.fit_transform(text)
    query_tfidf = vectorizer.transform([query])

    cosine_similarities = cosine_similarity(query_tfidf, sections_tfidf).flatten()
    # argsort is ascending; reverse it and keep the first top_n indices.
    related_docs_indices = cosine_similarities.argsort()[::-1][:top_n]
    return [text[i] for i in related_docs_indices]

def generate_answer(openai_api_key, query, context):
    """Ask the gpt-3.5-turbo chat model to answer ``query`` using ``context``.

    Args:
        openai_api_key: API key assigned to ``openai.api_key`` before the call.
        query: The user's question.
        context: Retrieved text placed ahead of the question in the prompt.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    openai.api_key = openai_api_key

    user_prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_prompt},
    ]

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=chat_messages,
        max_tokens=150,  # upper bound on the length of the generated reply
    )
    first_choice = completion.choices[0]
    return first_choice.message['content'].strip()

def answer_query(pdf_path, openai_api_key, query):
    """Run the full pipeline: extract, retrieve, then generate.

    Args:
        pdf_path: Path of the PDF to read.
        openai_api_key: API key forwarded to ``generate_answer``.
        query: The user's question.

    Returns:
        A ``(context, answer)`` tuple: the retrieved text joined with single
        spaces, and the answer generated from it.
    """
    pages = extract_text_from_pdf(pdf_path)
    context = " ".join(retrieve_relevant_section(pages, query))
    return context, generate_answer(openai_api_key, query, context)


pdf_path = "/content/Onepane[27].pdf"
# Take the key from the OPENAI_API_KEY environment variable, or prompt for it
# without echoing, so the key is always defined and never saved in the notebook.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
query = "why onepane?"  #User input
context, answer = answer_query(pdf_path, openai_api_key, query)

print(f"Retrieved Content: {context}") #Prints the retrieved content from the pdf
print(f"Answer: {answer}") #Prints the generated answer using the LLM model (GPT3.5) and pdf.




Retrieved Content:  
 
 
   Onepane.ai 
About us 
Welcome to Onepane, your trusted partner in revolutionizing the way you 
manage and optimize your cloud resources. Onepane is a cutting-edge 
SaaS software product designed to empower businesses across various 
cloud platforms. Our mission is to provide you with the tools and insights 
necessary to harness the full potential of your cloud infrastructure. 
What is onepane? 
Onepane employs intelligent algorithms to suggest and enforce 
standardized naming conventions for your cloud resources. It also allows 
you to set up smart tags based on specific criteria, making resource 
management and tracking more efficient. 
By unifying incidents across cloud environments, Onepane provides a 
centralized platform to manage and resolve incidents more efficiently. The 
enhanced visibility into resources helps in diagnosing issues and reducing 
downtime. 
Onepane offers a comprehensive dashboard that provides real-time 
insights into your cloud res