In [32]:
import faiss
import fitz  # PyMuPDF
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline

In [35]:
def extract_text_from_pdf(path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        path: Location of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        with no separator.
    """
    # Open the document as a context manager so the underlying file handle
    # is released even if text extraction fails part-way through.
    with fitz.open(path) as doc:
        # One join instead of repeated ``+=`` avoids re-copying the growing string.
        return "".join(page.get_text() for page in doc)

def split_into_passages(text, chunk_size=10):
    """Group the sentences of ``text`` into passages of bounded length.

    The text is split on ``". "`` and the pieces are packed greedily: a piece
    joins the current passage while the combined length stays below
    ``chunk_size``; otherwise the current passage is emitted and a new one
    starts with that piece.

    Args:
        text: The text to split.
        chunk_size: Soft limit on passage length, measured in characters
            (not words or tokens). A single sentence longer than the limit
            still becomes a passage on its own, so with the small default
            most passages end up holding one sentence each.

    Returns:
        A list of non-empty, whitespace-stripped passages. Empty or
        whitespace-only input yields an empty list.
    """
    pieces = text.split(". ")
    last_index = len(pieces) - 1

    passages = []
    chunk = ""
    for position, piece in enumerate(pieces):
        # Whitespace-only pieces (e.g. the empty tail left when the text ends
        # with ". ") carry no content and must not produce a stray ".".
        if not piece.strip():
            continue

        # Restore the separator that ``split`` consumed. The final piece was
        # not followed by a separator, so it is kept as-is; appending ". "
        # there would turn a trailing "." into "..".
        sentence = piece if position == last_index else piece + ". "

        # Only flush when there is something to flush, so an over-long first
        # sentence does not emit an empty passage.
        if chunk and len(chunk) + len(piece) >= chunk_size:
            passages.append(chunk.strip())
            chunk = sentence
        else:
            chunk += sentence

    if chunk:
        passages.append(chunk.strip())
    return passages

# Read the source PDF and chunk its text into the passages used for retrieval.
pdf_text = extract_text_from_pdf("./data/document.pdf")
documents = split_into_passages(pdf_text)

# Quick sanity check on the chunking: passage count plus a small preview.
print(
    f'Extracted {len(documents)} passages from the PDF.',
    f'First 3 passages: {documents[:3]}',
    sep="\n",
)

Extracted 28 passages from the PDF.
First 3 passages: ['Dr.', 'Kokossis, FIChemE, FIEE, FRSA, and FIET, is Professor of Process Systems Engineering \nat the National Technical University of Athens.', 'He is a Chartered Engineer with IChemE \n(UK).']


In [40]:
# 2. Load embedding model and encode documents
if not documents:
    # Fail early with a clear message; otherwise the failure surfaces later
    # as an opaque IndexError on ``document_embeddings.shape[1]``.
    raise ValueError("No passages to index: `documents` is empty.")

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # much better than raw BERT
# FAISS only accepts C-contiguous float32 matrices, so make that explicit
# instead of relying on whatever dtype/layout the encoder happens to return.
document_embeddings = np.ascontiguousarray(
    embedder.encode(documents), dtype=np.float32
)

# 3. Create FAISS index (exact L2 search over all passage vectors)
dimension = document_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(document_embeddings)

# 4. Define your query and embed it
query = "Is Konstantinos an electrical engineer or a chemical engineer?"
query_embedding = np.ascontiguousarray(
    embedder.encode([query]), dtype=np.float32
)

# 5. Search for top relevant docs
# Raise top_k to hand the generator more than one passage of context.
top_k = 1
# Never ask for more neighbours than there are passages: FAISS pads missing
# results with -1, which would silently index the *last* passage in Python.
distances, indices = index.search(query_embedding, min(top_k, len(documents)))
retrieved_texts = [documents[i] for i in indices[0] if i >= 0]

# 6. Use a local language model to generate the answer
generator = pipeline("text2text-generation", model="google/flan-t5-small")
# Build the context from every retrieved passage so that top_k is honoured;
# with top_k = 1 this is exactly the single best-matching passage.
context = "\n".join(retrieved_texts)
input_text = f"Context: {context} \n\nQuestion: {query}"
result = generator(input_text)

print(result[0]['generated_text'])


Device set to use cuda:0


electrical engineer
