In [1]:
# Cell 1: Import Libraries and Define Functions
import os
import fitz
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

def load_pdf_to_documents(pdf_path: str) -> list:
    """
    Read a PDF page by page and wrap every non-blank page in a Document.

    Args:
        pdf_path: Path handed to ``fitz.open``.

    Returns:
        A list of Document objects, one per page whose extracted text is not
        empty or whitespace-only. Each Document carries ``{"page": i}`` in its
        metadata, where ``i`` is the zero-based position of the page in the PDF.
    """
    with fitz.open(pdf_path) as pdf:
        # Lazily pair each page index with its extracted text.
        extracted = ((index, page.get_text()) for index, page in enumerate(pdf))
        # Blank pages are dropped, but the surviving pages keep their original index.
        return [
            Document(page_content=body, metadata={"page": index})
            for index, body in extracted
            if body.strip()
        ]

def build_vector_store(documents: list, chunk_size: int = 500, chunk_overlap: int = 100,
                       model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
                       persist_directory: str = "../her2_faiss_db") -> FAISS:
    """
    Split documents into chunks, embed them, and build a FAISS vector store saved locally.

    Args:
        documents: Document objects to index (e.g. the output of load_pdf_to_documents).
        chunk_size: Maximum chunk size passed to RecursiveCharacterTextSplitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.
        model_name: Name of the HuggingFace embedding model to load.
        persist_directory: Directory the finished store is written to via ``save_local``.

    Returns:
        The FAISS vector store that was built and saved.

    Raises:
        ValueError: If splitting ``documents`` produces no chunks (for example an
            empty list, or a PDF from which no text could be extracted).
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(documents)

    # Fail early with an actionable message: building a FAISS index from zero
    # chunks otherwise errors deep inside the library, and only after the
    # embedding model has already been loaded.
    if not chunks:
        raise ValueError(
            "No text chunks to index: the input documents are empty or contain no extractable text."
        )

    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    vectorstore = FAISS.from_documents(chunks, embedding_model)
    vectorstore.save_local(persist_directory)
    return vectorstore

def load_vector_store(model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
                      persist_directory: str = "../her2_faiss_db",
                      allow_dangerous_deserialization: bool = True) -> FAISS:
    """
    Load an existing FAISS vector store from a local directory.

    Args:
        model_name: Name of the HuggingFace embedding model; it should match the
            model used when the store was built so query vectors are comparable.
        persist_directory: Directory previously written by ``save_local``.
        allow_dangerous_deserialization: Forwarded to ``FAISS.load_local``.
            Recent ``langchain_community`` releases refuse to load a store unless
            this is True, because the store's metadata is unpickled on load.

    Returns:
        The FAISS vector store read from ``persist_directory``.

    SECURITY NOTE: loading unpickles data from ``persist_directory``. Only point
    this at an index you created yourself (as this notebook does); pass
    ``allow_dangerous_deserialization=False`` to refuse loading otherwise.
    """
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.load_local(
        persist_directory,
        embedding_model,
        allow_dangerous_deserialization=allow_dangerous_deserialization,
    )

def query_vector_store(vectorstore: FAISS, query: str, k: int = 3):
    """
    Run a similarity search against the vector store.

    Args:
        vectorstore: Store to search; its ``similarity_search`` method is called.
        query: Query string to search for.
        k: Number of results to request (defaults to 3).

    Returns:
        Whatever ``vectorstore.similarity_search(query, k=k)`` returns.
    """
    top_matches = vectorstore.similarity_search(query, k=k)
    return top_matches


In [3]:
# Cell 2: Process the PDF and Build/Load the Vector Store
pdf_path = "../data/her2_paper.pdf"
persist_directory = "../her2_faiss_db"

# FAISS.save_local writes "index.faiss" (plus "index.pkl") into the directory
# when the default index name is used. Checking for that file rather than the
# bare directory means an empty or half-written folder triggers a rebuild
# instead of a failed load.
index_file = os.path.join(persist_directory, "index.faiss")

# If the vector store already exists, load it; otherwise, process the PDF and create it.
if os.path.isfile(index_file):
    print("Loading existing vector store...")
    vectorstore = load_vector_store(persist_directory=persist_directory)
else:
    print("Processing PDF and building vector store...")
    documents = load_pdf_to_documents(pdf_path)
    vectorstore = build_vector_store(documents, persist_directory=persist_directory)


Processing PDF and building vector store...


In [8]:
# Report the number of embedded chunks held by the FAISS index
chunk_count = vectorstore.index.ntotal
print("Number of chunks in the vector store:", chunk_count)


Number of chunks in the vector store: 99


In [6]:
# Cell 3: Query the Vector Store
user_query = "What is the role of HER2 in breast cancer?"
results = query_vector_store(vectorstore, user_query, k=3)

# Show each hit under a header naming the page it came from.
# The page number is the zero-based index stored in the chunk's metadata.
print("\nQuery Results:")
for doc in results:
    page = doc.metadata.get("page", "Unknown")
    print(f"\n--- Page {page} ---", doc.page_content, sep="\n")



Query Results:

--- Page 6 ---
additional group in which HER-2/neu measurements will have an
impact in predicting biologic behavior ofthe tumor, and as a result,
in design of treatment strategy. Finally, if the HER-2/neu gene
product functions as a growth factor receptor that plays a role in the
pathogenesis of breast cancer, identification of its ligand and
development ofspecific antagonists could have important therapeu-
tic implications.
REFERENCES AND NOTES
1. J. M. Bishop, Annu. Rep. Biochm. 52, 301 (1983).

--- Page 2 ---
In the current study, alterations of the gene in 189
primary human breast cancers were instigated HER-2/
neu was found to be amplified frm 2- to
eater than 20-
fold in 30% ofthe tumors. Correlation ofgene amplifica-
tion with several disease parameters was evaluated Am-
plification of the HER-2/neu gene was a significant pre-
dictor of both overall survival and time to relapse in
patients with breast cancer. It retained its significance
even when adjustments we