# SmartDoc Q&A


Install Libraries

In [None]:
!pip install langchain chromadb beautifulsoup4 git+https://github.com/julian-r/python-magic.git unstructured detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2 tiktoken pytesseract sentence_transformers pypdf faiss-cpu transformers
print("done")

Import Libraries

In [None]:
!pip install langchain_community accelerate ctransformers unstructured[pdf]
print("done")

In [None]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain import OpenAI, VectorDBQA
from langchain.document_loaders import DirectoryLoader
import magic
import os
import nltk
import pytesseract
import csv
import torch
from langchain_community.llms import CTransformers
from langchain.chains import QAGenerationChain, StuffDocumentsChain, LLMChain, RetrievalQA
from langchain.docstore.document import Document
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceInferenceAPIEmbeddings, HuggingFaceHubEmbeddings
from langchain_community.vectorstores import FAISS
from accelerate import Accelerator

print("done")

Download the Llama 2 LLM

In [None]:
!huggingface-cli download TheBloke/Llama-2-7B-GGUF llama-2-7b.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False
print("done")

Generate the question list below.

Change the document path below.

In [None]:
# file_path = "/content/embedded-linux-primer-29-50.pdf"
file_path = "/kaggle/input/sample/sample.pdf"
print("done")

In [None]:
min_ques=5
print("done")

In [None]:
def file_processing(file_path):
    """Load a PDF and split its text into single-document chunks.

    The text of every page is concatenated without a separator, split with
    a ``RecursiveCharacterTextSplitter`` (chunk_size=500, chunk_overlap=50),
    and each resulting piece is wrapped in a one-element list holding a
    ``Document``.

    Args:
        file_path: Path handed to ``PyPDFLoader``.

    Returns:
        A list of ``[Document]`` lists, one per text chunk.
    """
    pages = PyPDFLoader(file_path).load()
    full_text = "".join(page.page_content for page in pages)
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return [[Document(page_content=piece)] for piece in splitter.split_text(full_text)]
document_chunks=file_processing(file_path)
print("done")

In [None]:
# Default GGUF model location (Kaggle dataset path this notebook was written against).
_DEFAULT_MODEL_PATH = "/kaggle/input/llama-2-7b-chat/gguf/test/1/llama-2-7b-chat.Q4_K_M.gguf"


def load_llm(model_path=_DEFAULT_MODEL_PATH):
    """Build the CTransformers Llama model used by the chains in this notebook.

    Args:
        model_path: Location of the GGUF model file. Defaults to the Kaggle
            input path, so existing ``load_llm()`` calls behave as before;
            pass another path to use a different GGUF file (for example one
            fetched with ``huggingface-cli download``).

    Returns:
        The ``CTransformers`` LLM after it has been passed through
        ``Accelerator.prepare``.
    """
    accelerator = Accelerator()

    # Generation settings forwarded to the model via ``config``.
    config = {
        'max_new_tokens': 1048,
        'repetition_penalty': 1.1,
        'context_length': 8000,
        'temperature': 0.3,
        'gpu_layers': 50,
    }

    llm = CTransformers(
        model=model_path,
        model_type="llama",
        gpu_layers=50,
        config=config,
    )

    # NOTE(review): Accelerator.prepare is documented for torch modules,
    # optimizers and dataloaders; presumably it returns this LLM wrapper and
    # the plain dict unchanged -- confirm whether the call is needed at all.
    llm, _ = accelerator.prepare(llm, config)
    return llm

import re
def q_format(text):
    """Strip a leading ``N.`` enumeration from a generated question line.

    Args:
        text: One line of model output, e.g. ``"3. What is X?"``.

    Returns:
        The remainder of the line with surrounding whitespace removed, or
        ``None`` when the line does not start with (optional whitespace,)
        digits and a period.
    """
    numbered = re.match(r'^\s*\d+\.\s*(.*)$', text)
    if numbered is None:
        # Not an enumerated line: the caller skips these.
        return None
    return numbered.group(1).strip()

def llm_pipeline(file_path, min_ques, max_passes=10):
    """Generate unique questions from a PDF with the local LLM.

    The document at ``file_path`` is chunked with ``file_processing`` and
    every chunk is sent through a "stuff" chain built around the question
    prompt below. Only output lines that end in ``?`` and carry an ``N.``
    enumeration (see ``q_format``) are kept. Full passes over the document
    are repeated until at least ``min_ques`` unique questions exist.

    Args:
        file_path: Path of the PDF to generate questions from.
        min_ques: Minimum number of unique questions to collect.
        max_passes: Upper bound on full passes over the document. Prevents an
            endless loop when the model never yields enough parsable
            questions; whatever was collected so far is returned.

    Returns:
        A tuple ``(questions, dic, chunk_list)`` where ``questions`` is the
        list of unique questions, ``dic`` maps the 1-based chunk number to
        the questions first seen for that chunk, and ``chunk_list`` is the
        list of ``[Document]`` chunks that were processed.
    """
    llm_ques_gen_pipeline = load_llm()
    stuff_template = """
    You are tasked with generating as many interrogative questions as possible based on the provided technical text, which may include code snippets. Your objective is to create a comprehensive set of questions that prompt the reader to reflect on key information and deepen their understanding of the content while ensuring no important details are overlooked.
    Below is an excerpt from the text:
    ----------------
    {text}
    ----------------
    Your task is to formulate a series of clear and concise questions that inquire about specific details, concepts, technical processes, and implications presented in the text. Focus on extracting relevant information and formulating questions that encourage critical thinking and engagement with the material.
    Consider the following guidelines when crafting your questions:
    - Ensure that each question is an interrogative sentence.
    - Cover a range of topics and levels of complexity to comprehensively explore the content.
    - Aim for a balance between factual questions and questions that require interpretation, analysis, or application of technical knowledge.
    - Provide context or background information if necessary to frame the questions effectively.
    - Be mindful not to lose any important information from the text while formulating questions.
    - Include questions related to any code snippets, their functionality, purpose, and potential use cases.
    - Generate different questions each time you are prompted.
    - Generate as many questions as possible.
    - Refrain from generating any irrelevant outputs or non-interrogative sentences. Do not include statements or comments like "I am generating questions for you."
    QUESTIONS:
    """
    prompt_questions = PromptTemplate(template=stuff_template, input_variables=["text"])
    llm_chain = LLMChain(llm=llm_ques_gen_pipeline, prompt=prompt_questions)
    stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")

    # Chunk the document named by the argument instead of relying on a
    # module-level variable that may have been built from a different file.
    chunk_list = list(file_processing(file_path))

    ques_set = set()
    dic = {}
    passes = 0
    # An empty document can never produce questions, so it ends the loop too.
    while chunk_list and len(ques_set) < min_ques and passes < max_passes:
        passes += 1
        for c, chunk in enumerate(chunk_list, start=1):
            ques = stuff_chain.run(chunk)
            for line in ques.splitlines():
                # Strip first so trailing whitespace does not hide the '?'.
                line = line.strip()
                if not line.endswith('?'):
                    continue
                q = q_format(line)
                if q is None or q in ques_set:
                    continue
                ques_set.add(q)
                dic.setdefault(c, []).append(q)
                print("Question: ", q)
    return list(ques_set), dic, chunk_list
  # Specify the path to your PDF file
# Run question generation. The call returns the unique questions, a mapping
# from 1-based chunk number to the questions first seen for that chunk, and
# the list of chunks.
questions, dic, chunk_list = llm_pipeline(file_path, min_ques)  # min_ques is the question-count threshold that ends generation
print("done")

<!-- Here we generate answers for the list of questions. -->

In [None]:
print(len(questions))
# print(questions)
import csv

# Save every generated question next to the chunk it was produced from.
csv_file_path = '/kaggle/working/questions.csv'
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['chunk','Question'])  # Header row
    # Keys of `dic` are 1-based chunk numbers, hence the -1 when indexing.
    for chunk, chunk_questions in dic.items():
        source_chunk = chunk_list[chunk-1]
        for q in chunk_questions:
            writer.writerow([source_chunk, q])
print("done")

Load the embedding model used for the document chunks

In [None]:
# The model below is a plain sentence-transformers checkpoint, not a BGE model.
# HuggingFaceBgeEmbeddings prepends a BGE-specific instruction to every query
# by default; it is disabled here so that queries and document chunks are
# embedded the same way.
embeddings = HuggingFaceBgeEmbeddings(
    model_name="roberta-base-nli-stsb-mean-tokens",
    query_instruction="",
)
print("done")

Create vector store

In [None]:
!pip install unstructured[pdf]
print("done")

In [None]:
loader = PyPDFLoader("/kaggle/input/sample/sample.pdf")

docs = loader.load()
print("done")

In [None]:
# Split the loaded pages (chunk_size=1000, chunk_overlap=100).
char_text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
doc_texts = char_text_splitter.split_documents(docs)

# Wrap every chunk in literal sentinel markers. The start marker is the string
# the answer post-processing later splits on.
for chunk in doc_texts:
    chunk.page_content = "".join(
        ("{{{{CHUNK_STARTING}}}}", chunk.page_content, "{{{{CHUNK_ENDING}}}}")
    )
print("done")

In [None]:
vStore = FAISS.from_documents(doc_texts, embeddings)
print("done")

Load LLM

In [None]:
from accelerate import Accelerator

# Default GGUF model location (Kaggle dataset path this notebook was written against).
_DEFAULT_MODEL_PATH = "/kaggle/input/llama-2-7b-chat/gguf/test/1/llama-2-7b-chat.Q4_K_M.gguf"


def load_llm(model_path=_DEFAULT_MODEL_PATH):
    """Build the CTransformers Llama model used by the question-answering chain.

    Args:
        model_path: Location of the GGUF model file. Defaults to the Kaggle
            input path, so existing ``load_llm()`` calls behave as before;
            pass another path to use a different GGUF file.

    Returns:
        The ``CTransformers`` LLM after it has been passed through
        ``Accelerator.prepare``.
    """
    accelerator = Accelerator()

    # Generation settings forwarded to the model via ``config``.
    config = {
        'max_new_tokens': 1048,
        'repetition_penalty': 1.1,
        'context_length': 8000,
        'temperature': 0.3,
        'gpu_layers': 50,
    }
    llm = CTransformers(
        model=model_path,
        model_type="llama",
        gpu_layers=50,
        config=config,
    )
    # NOTE(review): Accelerator.prepare is documented for torch modules,
    # optimizers and dataloaders; presumably it returns this LLM wrapper and
    # the plain dict unchanged -- confirm whether the call is needed at all.
    llm, _ = accelerator.prepare(llm, config)
    return llm
print("done")

Initialize VectorDBQA Chain from LangChain

In [None]:
!pip install accelerate
print("done")

In [None]:
!pip install langchain --upgrade
print("done")

In [None]:
# Combine the LLM from load_llm() with the `vStore` vector store in a "stuff"
# question-answering chain. `k=2` is forwarded to the chain (presumably the
# number of chunks retrieved per query -- confirm).
# NOTE(review): VectorDBQA appears to be deprecated in recent LangChain
# releases in favour of RetrievalQA (already imported above); verify against
# the installed version before upgrading.
model = VectorDBQA.from_chain_type(llm=load_llm(), chain_type="stuff", vectorstore=vStore, k=2)
print("done")

Question Answering

In [None]:
# !huggingface-cli login
import os
from huggingface_hub import login

# Store the Hugging Face token in an environment variable for this session.
# NOTE(review): the value below is a placeholder and must be replaced before
# running; avoid saving or sharing the notebook with a real token in it.
os.environ['HUGGINGFACE_TOKEN'] = 'use your hugging face token here'

# Authenticate with the Hugging Face Hub using the token set above.
login(token=os.environ['HUGGINGFACE_TOKEN'])

print("done")

In [None]:
!mkdir static
print("done")

In [None]:
base_folder = '/kaggle/working/static/out/'
def format(s):
    """Split ``s`` on the chunk-start sentinel and return the list of pieces."""
    marker = "{{{{CHUNK_STARTING}}}}"
    pieces = s.split(marker)
    return pieces
# base_folder = 'static/output/'
# Create the output directory together with any missing parent directories;
# nothing happens when it already exists.
os.makedirs(base_folder, exist_ok=True)
# CSV file that receives the generated question/answer pairs.
output_file = os.path.join(base_folder, "QA.csv")
def answer_generator():
    """Answer every fourth generated question and save the pairs as CSV.

    Reads the module-level ``questions``, ``model`` and ``output_file``.
    For each selected question the chain output is cut at the first
    chunk-start sentinel (via ``format``), written to the CSV under the
    ``Question``/``Answer`` header, and echoed to stdout.
    """
    with open(output_file, "w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(["Question", "Answer"])
        for position, question in enumerate(questions, start=1):
            # Only the 4th, 8th, 12th, ... question is answered.
            if position % 4 != 0:
                continue
            answer = format(model.run(question))[0]
            out.writerow([question, answer])
            print("Question:", question)
            print("Answer:", answer)
answer_generator()
print("done")

In [None]:
question = "How does 5G ensure flexibility and programmability?"
response = model.run(question)
print(response)
print("done")

In [None]:
question = "How does the 5G system address service continuity?"
response = model.run(question)
print(response)
print("done")