In [None]:
!pip install PyPDF2 langchain langchain_community langchain_google_genai google-generativeai transformers faiss-gpu

In [2]:
import os
from getpass import getpass

from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from transformers import BartForConditionalGeneration, BartTokenizer

In [3]:
# Extract the text of every page of the PDF into a single string, `text`.
pdf_path = '/content/Darrin_CV.pdf'
with open(pdf_path, "rb") as file:
    pdf_reader = PdfReader(file)
    # Build the string with one join instead of repeated `+=`; `or ""` guards
    # against a page for which the extractor yields no text (None).
    text = "".join((page.extract_text() or "") for page in pdf_reader.pages)

# Fail here with a clear message rather than later, when the chunking and
# indexing cells are handed nothing to work with.
if not text.strip():
    raise ValueError(f"No extractable text found in {pdf_path!r}")

In [4]:
# Cut the extracted text into overlapping chunks for embedding.
# chunk_size / chunk_overlap are passed straight to the splitter.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=1000,
    chunk_size=10000,
)
text_chunks = text_splitter.split_text(text=text)

In [5]:
# Credentials must not live in the notebook: read the Gemini API key from the
# GOOGLE_API_KEY environment variable and fall back to an interactive prompt
# (input is not echoed) when it is unset.
# NOTE(review): the key that was previously hard-coded here has been exposed
# and should be revoked.
google_api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")

# Embed every chunk and build a FAISS index over the embeddings, then persist
# the index to the local "faiss_index" directory for the query cell to reload.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=google_api_key)
vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
vector_store.save_local("faiss_index")

In [None]:
# Load the BART checkpoint (weights and matching tokenizer) that the
# question-answering cell uses to generate a summary of the retrieved text.
model_name = 'facebook/bart-large-cnn'
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

In [7]:
# Prompt for the QA chain: the LLM must answer only from the supplied context
# and reply "Answer is not available" when the context does not contain it.
prompt_template = """
Answer the question in a detailed way and include all the related details, if the answer is not in
provided context just say, "Answer is not available", don't generate random responses\n\n
Context:\n {context}\n
Question: \n{question}\n

Answer:
"""
# The chat model gets its own name (`llm`). Binding it to `model` would
# overwrite the BART summariser loaded earlier, which the question-answering
# cell still calls through `model.generate(...)`.
llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.4, google_api_key=google_api_key)
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
# "stuff" chain type: the retrieved documents are placed into {context}.
chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)

In [8]:
# Ask one question against the persisted index and print the answer. When the
# LLM reports that the context holds no answer, append a BART-generated
# summary of the retrieved documents instead.
question = 'Where is he doing his studies?'
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# serialized data from disk; only safe because "faiss_index" was written by
# this notebook. Do not point this at an index from an untrusted source.
db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
docs = db.similarity_search(question)

if not docs:
    print("No relevant documents found.")
else:
    response = chain.invoke({"input_documents": docs, "question": question})
    answer = response.get("output_text", "No answer generated.")

    # The prompt tells the LLM to reply exactly "Answer is not available", so
    # match that phrase; the longer "... in the context" variant never appears
    # in the prompt and left this fallback unreachable.
    if "answer is not available" in answer.lower():
        # `model` may have been rebound to the chat LLM by the time this cell
        # runs; only reuse it when it really is the BART summariser, otherwise
        # load the checkpoint named by `model_name`.
        if isinstance(model, BartForConditionalGeneration):
            summarizer = model
        else:
            summarizer = BartForConditionalGeneration.from_pretrained(model_name)

        concatenated_text = " ".join(doc.page_content for doc in docs)
        # Truncate to 1024 tokens before handing the text to the summariser.
        inputs = tokenizer(concatenated_text, max_length=1024, return_tensors='pt', truncation=True)
        summary_ids = summarizer.generate(inputs['input_ids'], num_beams=4, max_length=150, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        answer += "\n\nGenerated Summary/Insight: " + summary

    print("Answer:", answer)

Answer: Vellore Institute of Technology (VIT) Vellore, India
