In [9]:
import fitz
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [10]:
# Load the PDF and wrap each page's text in a Document tagged with its 0-based page index.
# The context manager closes the PDF file handle as soon as extraction finishes, so
# `doc` is a closed document after this cell; reopen the file if pages are needed again.
with fitz.open("../data/her2_paper.pdf") as doc:
    text_pages = [
        Document(page_content=page.get_text(), metadata={"page": page_index})
        for page_index, page in enumerate(doc)
    ]

In [11]:
# Break the per-page documents into overlapping chunks.
# Sizes are passed straight to the splitter; overlap keeps context across chunk boundaries.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = splitter.split_documents(text_pages)

In [12]:
# Sanity check: display how many chunks the splitter produced
len(chunks)

99

In [13]:
# Embed every chunk, build a FAISS index over the embeddings, and persist it to disk.
# NOTE: whatever loads this index later should use the same embedding model.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
FAISS_INDEX_DIR = "../her2_faiss_db"

embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
vectorstore = FAISS.from_documents(chunks, embedding_model)
vectorstore.save_local(FAISS_INDEX_DIR)