# **Q&A system (Using LangChain)**

## **Setup**

### Importing libraries

In [None]:
!pip install pypdf -q

#wrt Vector db
!pip install docarray -q

#wrt OpenAI
!pip install python-dotenv -q
!pip install openai -q #for embeddings
!pip install tiktoken -q #for embeddings

#wrt LangChain
!pip install langchain -q

#wrt Gradio
!pip install gradio -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.5/254.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.0/137.0 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.1/49.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import urllib.request

from langchain.document_loaders import PyPDFLoader #for loading .pdf file
from langchain.vectorstores import DocArrayInMemorySearch

import openai

from IPython.display import display, Markdown

#wrt UI
import time
import gradio as gr

### Config.

#### API key

In [None]:
import os

from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its entries into the process environment.
# Presumably this is where the OpenAI API key comes from -- TODO confirm.
# The result is bound to `_` so the notebook cell does not echo it.
_ = load_dotenv(find_dotenv()) #read local .env file

### Utilities

In [None]:
def download_pdf(url, output_path):
    """
    Fetch the resource at `url` and store it on disk at `output_path`.

    The transfer is delegated to `urllib.request.urlretrieve`. Nothing is
    returned, and any error raised by the transfer propagates to the caller.
    """
    # Notebook shell alternative:
    #   !curl -o paper.pdf https://arxiv.org/pdf/2303.13519.pdf
    urllib.request.urlretrieve(url=url, filename=output_path)

## **UI**

In [None]:
# LangChain pieces used by the retrieval pipeline, imported up front.
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

# Module-level objects shared by every call to get_ans().
embeddings = OpenAIEmbeddings()
llm = ChatOpenAI(temperature=0)



def get_ans(filename, question, model_to_use):
    """
    Answer `question` from the PDF at `filename` and report the passages used.

    The PDF is loaded and split with `PyPDFLoader`, indexed into a
    `DocArrayInMemorySearch` store using the module-level `embeddings`, and
    queried through a "stuff" `RetrievalQA` chain driven by the module-level
    `llm`. The index is rebuilt on every call.

    Args:
        filename: path of the PDF file to query.
        question: the question handed to the chain.
        model_to_use: accepted so the signature matches the UI inputs, but
            currently ignored -- the module-level `llm` is always used.

    Returns:
        A tuple `(answer, context)`: the chain's "result" value and a
        formatted text block that lists, for each returned source document,
        its source path, its page metadata and its content.
    """
    docs = PyPDFLoader(filename).load_and_split()
    db = DocArrayInMemorySearch.from_documents(docs, embeddings)

    qa_stuff = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(),
        return_source_documents=True,
        verbose=True,
    )

    # Response keys: 'query', 'result', 'source_documents'.
    response = qa_stuff(question)
    answer = response["result"]

    # Build one section per source document and join once at the end.
    # NOTE(review): the "page" metadata looks like a zero-based index coming
    # from the loader -- confirm before presenting it as a printed page number.
    separator = "#" * 50
    sections = []
    for doc in response["source_documents"]:
        source_document_path = doc.metadata["source"]
        page_number = str(doc.metadata["page"])
        sections.append(
            "\n" + separator + "\n"
            + f"Relevant source text: {source_document_path}, page {page_number}"
            + "\n" + separator + "\n"
            + doc.page_content + "\n"
        )
    context = "".join(sections)

    return (answer, context)



def question_answer(url, file, question, model_to_use):
    """
    Validate the form inputs, obtain the PDF and answer the question.

    Exactly one of `url` and `file` must be provided. A URL is downloaded to
    "document.pdf" via `download_pdf`; an uploaded file is read from
    `file.name`. The question is then answered by `get_ans`.

    Args:
        url: URL of a PDF, or an empty string / None when a file is uploaded.
        file: uploaded file object exposing a `.name` path, or None.
        question: the question to ask about the document.
        model_to_use: forwarded unchanged to `get_ans`.

    Returns:
        A tuple `(answer, context, exec_time)` where `exec_time` is the
        elapsed time in seconds. When validation fails the same 3-tuple shape
        is returned, with the "[ERROR]: ..." message in the answer slot and an
        empty context, so that a caller expecting three values always gets
        three.
    """
    start_time = time.perf_counter()

    def _error(message):
        # Keep the error result the same shape as the success result.
        return (message, "", time.perf_counter() - start_time)

    # Treat a missing (None) field like an empty one instead of crashing on
    # .strip().
    url_text = (url or "").strip()
    question_text = (question or "").strip()

    if url_text == "" and file is None:
        return _error("[ERROR]: Both URL and PDF is empty. Provide atleast one.")

    if url_text != "" and file is not None:
        return _error("[ERROR]: Both URL and PDF is provided. Please provide only one (eiter URL or PDF).")

    if question_text == "":
        return _error("[ERROR]: Question field is empty")

    if url_text != "":
        filename = "document.pdf"
        # Surrounding whitespace is not part of the URL, so pass the
        # stripped value on.
        download_pdf(url_text, filename)
    else:
        filename = file.name

    answer, context = get_ans(filename, question, model_to_use)

    exec_time = time.perf_counter() - start_time  # seconds

    return (answer, context, exec_time)




In [None]:
# Static text shown at the top of the page.
title = "Q&A System"
description = "This Q&A System allows you to input an entire document & ask questions about its contents. This system has ability to add reference to the specific page number from where the information was found. This adds credibility to the answers generated & also helps you locate the relevant information in the document."

with gr.Blocks() as demo:

  gr.Markdown(f"<center><h1>{title}</h1></center>")
  gr.Markdown(description)

  # Input side: document source (URL or uploaded PDF), question and model choice.
  with gr.Row():

    with gr.Group():
      url = gr.Textbox(value="https://clinicaltrials.gov/ProvidedDocs/00/NCT02415400/Prot_000.pdf", label='URL')
      # The heading tag is now closed properly (was "<h6>or<h6>").
      gr.Markdown("<center><h6>or</h6></center>")
      file = gr.File(label='PDF', file_types=['.pdf'])
      question = gr.Textbox(value="When to perform randomization", label="question (FLAN-T5: Eg- 'question: What is inclusion criteria', 'Summarize: Study Design', 'on full input text: summarize')")
      # NOTE(review): get_ans() ignores the selected model in this file, so
      # every choice here currently behaves the same.
      model_to_use = gr.Dropdown(["OpenAI", "T5", "aitextgen", "BART", "FLAN-T5"], value="OpenAI", label='model_to_use')
      btn = gr.Button(value='Submit')
      # NOTE(review): `.style()` may be deprecated or removed in newer Gradio
      # releases -- confirm against the installed version.
      btn.style(full_width=True)

  # Output side: this group sits below the row above, not inside it.
  with gr.Group():
      exec_time = gr.Textbox(label='execution time (s)')
      answer = gr.Textbox(label='answer')
      context = gr.Textbox(label='Relevant chunks within document (Context)')

  # question_answer's three return values map onto the three outputs, in order.
  btn.click(question_answer, inputs=[url, file, question, model_to_use], outputs=[answer, context, exec_time])

demo.queue()

# share=True asks Gradio to also create a shareable link for the app.
demo.launch(share=True)