***Input: a PDF file and a list of questions. Output: answers to those questions, based on the PDF file.***

**Step 1: Ingest the PDF File**
Use LangChain to read and extract text from the PDF.

In [None]:
from langchain.document_loaders import PyPDFLoader

# Point the loader at the PDF to ingest (the file name looks like a
# placeholder — replace it with the real path before running).
loader = PyPDFLoader("/content/your_pdf.pdf")
# `docs` holds whatever the loader returns; per the LangChain docs this is a
# list of Document objects, one per PDF page.
docs = loader.load()

**Step 2: Preprocess the Text**
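Clean and preprocess the extracted text (e.g., remove unnecessary characters, format sections), then segment it into meaningful chunks. In the cell below, the text is split into chunks of roughly 1,000 characters with a 100-character overlap, and each chunk is wrapped in start and end markers.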

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Split the loaded documents with a target chunk size of 1000 characters
# and 100 characters of overlap between neighbouring chunks.
char_text_splitter = CharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
doc_texts = char_text_splitter.split_documents(docs)

# Wrap every chunk's text in literal start/end markers. The start marker is
# the same string the answer-formatting helper later splits on.
for chunk in doc_texts:
    chunk.page_content = "".join(
        ("{{{{CHUNK_STARTING}}}}", chunk.page_content, "{{{{CHUNK_ENDING}}}}")
    )

**Step 3: Create Vector Store**
Generate embeddings for the text segments and store them in a vector store for efficient similarity search and retrieval.

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Embedding model used to turn each chunk into a vector.
embeddings = HuggingFaceEmbeddings(model_name="roberta-base-nli-stsb-mean-tokens")
# Build a FAISS index over the marker-wrapped chunks; `vStore` is what the
# QA chain queries for similar chunks.
vStore = FAISS.from_documents(doc_texts, embeddings)

**Step 4: Initialize and Configure the LLM**
Initialize and configure the LLM within LangChain using the pre-trained model, then combine it with the vector store into a question-answering chain. Apply appropriate hyperparameters and settings.

In [None]:
from accelerate import Accelerator
from langchain.llms import CTransformers
from langchain.chains import VectorDBQA

def load_llm():
    """Create the CTransformers LLM from the local GGUF model file and return it.

    The model is loaded from ``/content/llama-2-7b.Q4_K_M.gguf`` with the
    generation settings defined below, passed through
    ``Accelerator.prepare`` together with those settings, and returned.

    NOTE(review): ``gpu_layers`` is supplied both as a keyword argument and
    inside ``config`` — confirm which one the wrapper actually honours.
    NOTE(review): ``context_length`` is set to 8000 — confirm the model
    supports a window that large.
    """
    accel = Accelerator()
    generation_settings = {
        'max_new_tokens': 1048,
        'repetition_penalty': 1.1,
        'context_length': 8000,
        'temperature': 0.3,
        'gpu_layers': 50,
    }
    quantized_llm = CTransformers(
        model="/content/llama-2-7b.Q4_K_M.gguf",
        model_type="llama",
        gpu_layers=50,
        config=generation_settings,
    )
    # prepare() hands back one object per argument; only the LLM is needed.
    prepared_llm, _ = accel.prepare(quantized_llm, generation_settings)
    return prepared_llm
# Build the question-answering chain: the LLM from load_llm() answers using
# chunks retrieved from `vStore`. Per the LangChain docs, chain_type="stuff"
# places all retrieved chunks into a single prompt and k=2 retrieves the two
# most similar chunks per question.
# NOTE(review): newer LangChain releases mark VectorDBQA as deprecated in
# favour of RetrievalQA — confirm against the installed version.
model = VectorDBQA.from_chain_type(llm=load_llm(), chain_type="stuff", vectorstore=vStore, k=2)

**Step 5: Generate Q&A Pairs**
Use the initialized question-answering chain to answer each input question based on the retrieved text segments. Format the answers and save the resulting question–answer pairs to a CSV file.

In [None]:
import os
import csv

def format(s):
    """Split *s* on the chunk-start marker and return the list of pieces.

    The first element is the text that precedes the first marker; when the
    marker does not occur, the list contains *s* unchanged as its only item.

    NOTE: this name shadows the builtin ``format`` within this module.
    """
    chunk_start_marker = "{{{{CHUNK_STARTING}}}}"
    pieces = s.split(chunk_start_marker)
    return pieces

# Directory that receives the generated question/answer CSV.
base_folder = 'static/output/'
# The path is nested, so use makedirs: os.mkdir raises FileNotFoundError when
# the parent directory ("static") does not exist yet. exist_ok=True makes the
# call a no-op when the directory is already there.
os.makedirs(base_folder, exist_ok=True)
# Full path of the CSV file written by the answer generator.
output_file = base_folder + "QA.csv"

def answer_generator(max_questions=15):
    """Answer the questions and save the question/answer pairs as CSV.

    Reads the module-level ``questions`` iterable, runs each question
    through ``model``, keeps the part of the reply that precedes the first
    chunk-start marker (via ``format``), writes the pair to ``output_file``
    and echoes it to stdout. The file is overwritten on every call and
    starts with a ``Question,Answer`` header row.

    Args:
        max_questions: Upper bound on how many questions are processed.
            Defaults to 15, the limit that was previously hard-coded.
            ``None`` processes every question.

    Raises:
        ValueError: If ``max_questions`` is negative.
    """
    # Function-scope import keeps this cell runnable on its own.
    from itertools import islice

    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["Question", "Answer"])
        # islice replaces the manual counter + break and stops after the
        # requested number of questions without pulling any extra item.
        for q in islice(questions, max_questions):
            # Keep only the text before the first chunk-start marker.
            a = format(model.run(q))[0]
            csv_writer.writerow([q, a])
            print("Question:", q)
            print("Answer:", a)
# Run the pipeline: answer the questions and write the results to the CSV file.
answer_generator()