In [None]:
!pip install langchain
!pip install langchain-huggingface
!pip install langchain_community

In [None]:
!pip install transformers
!pip install torch
!pip install accelerate


In [2]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Folder scanned for input PDFs. It is a relative path, so it resolves against
# the kernel's current working directory.
DATA_PATH = "data/"

In [4]:
def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Builds a ``DirectoryLoader`` over ``data`` that matches ``*.pdf`` and
    hands each matching file to ``PyPDFLoader``.

    Args:
        data: Path of the directory to scan.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns; the notebook treats it
        as a list of per-page documents.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [5]:

# Read every PDF under DATA_PATH and report how many documents came back.
documents = load_pdf_files(DATA_PATH)
print("Length of PDF pages: ", len(documents))

Length of PDF pages:  4505


In [6]:
def create_chunks(extracted_data, chunk_size=500, chunk_overlap=50):
    """Split loaded documents into smaller overlapping chunks.

    Args:
        extracted_data: Documents to split (for example the output of
            ``load_pdf_files``).
        chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value previously hard-coded.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 50, the value previously hard-coded.

    Returns:
        The result of ``split_documents(extracted_data)``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

# Split the loaded documents into chunks and report how many were produced.
text_chunks = create_chunks(documents)
print("Length of TEXT CHUNKS: ", len(text_chunks))

Length of TEXT CHUNKS:  41255


In [7]:
from langchain_huggingface import HuggingFaceEmbeddings
def get_embedding_model(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the Hugging Face embedding model used to vectorise text.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook previously hard-coded.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Instantiate the embedding model once. The (misspelled) name is kept because
# the FAISS-building cell below reads `embeding_model`.
embeding_model = get_embedding_model()

In [8]:
from langchain_community.vectorstores import FAISS 

# Embed every chunk into a FAISS vector index, then persist the index to disk
# so it can be reopened later with FAISS.load_local instead of re-embedding.
DB_FAISS_PATH = "vectorstore/db_faiss"
db = FAISS.from_documents(documents=text_chunks, embedding=embeding_model)
db.save_local(folder_path=DB_FAISS_PATH)


In [10]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Recreate the embedding model with the same model name that was used when
# the index was built, so queries are embedded the same way as the chunks.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Reopen the index that was saved under "vectorstore/db_faiss".
# NOTE(review): allow_dangerous_deserialization=True opts in to LangChain's
# pickle-based loading. That is acceptable here only because this notebook
# wrote the folder itself; never point it at an index from an untrusted source.
db = FAISS.load_local("vectorstore/db_faiss", embeddings=embedding_model, allow_dangerous_deserialization=True)

# Smoke-test retrieval: fetch the 3 closest chunks for a sample question and
# print each chunk's text and metadata.
query = "What is cancer?"
docs = db.similarity_search(query, k=3)
for i, doc in enumerate(docs):
    print(f"Document {i+1}:", doc.page_content, sep="\n")
    print("Metadata:", doc.metadata)
    print("-" * 50)

Document 1:
cells and tissues develop.
Cancer— A process where abnormal cells within
the body begin to grow out of control, acquire the
ability to invade nearby structures, and travel
through the bloodstream in order to invade distant
structures.
Malignant— Refers to cancer or cancer cells.
Sarcoma— A type of cancer that originates from
connective tissue such as bone or muscle.
Stromal— A type of tissue that is associated with the
support of an organ.
Tubule— Tissues and cells associated with the
Metadata: {'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'creator': 'Adobe Acrobat 6.0', 'creationdate': '2006-10-16T20:19:33+02:00', 'moddate': '2006-10-16T22:03:45+02:00', 'source': 'data\\MedicalBook.pdf', 'total_pages': 4505, 'page': 4025, 'page_label': '3996'}
--------------------------------------------------
Document 2:
David Kaminstein, MD
Cancer
Definition
Cancer is not just one disease, but a large group of
almost 100 diseases. Its two main characteristics are
uncontrolled growth of the ce