In [10]:
import fitz  # PyMuPDF

def extract_text_pymupdf(pdf_path):
    """Extracts text from a PDF using PyMuPDF while preserving structure."""
    doc = fitz.open(pdf_path)
    extracted_text = ""

    for page in doc:
        extracted_text += page.get_text("text") + "\n"  # Extracts structured text

    return extracted_text

# Example usage
pdf_files = ["physiology.pdf", "pathology.pdf", "pharmacology.pdf"]
extracted_texts = {pdf: extract_text_pymupdf(pdf) for pdf in pdf_files}

# Save extracted content as text files
for pdf, text in extracted_texts.items():
    with open(f"{pdf}.txt", "w", encoding="utf-8") as f:
        f.write(text)

print("Text extraction completed using PyMuPDF.")


Text extraction completed using PyMuPDF.


In [11]:
import json  # Import json for saving the hierarchy

def create_hierarchy(text):
    """Converts textbook text into a hierarchical structure."""
    chapters = text.split("\n\nChapter ")  # Assuming "Chapter" marks sections
    tree = {"root": []}

    for i, chapter in enumerate(chapters):
        sections = chapter.split("\n\nSection ")  # Assuming "Section" marks subsections
        chapter_node = {"id": f"chapter_{i}", "sections": []}

        for j, section in enumerate(sections):
            paragraphs = section.split("\n\n")  # Splitting paragraphs
            section_node = {"id": f"section_{i}_{j}", "content": paragraphs}
            chapter_node["sections"].append(section_node)

        tree["root"].append(chapter_node)

    return tree

# Ensure 'extracted_texts' contains the processed textbook content
extracted_texts = {"physiology.pdf": "Chapter 1\n\nSection 1.1\n\nParagraph 1...\n\nSection 1.2\n\nParagraph 2..."} 

# Create hierarchy
hierarchy = create_hierarchy(extracted_texts["physiology.pdf"])

# Save the hierarchy to a JSON file
with open("hierarchy.json", "w") as f:
    json.dump(hierarchy, f, indent=2)

print("Hierarchy saved successfully!")


Hierarchy saved successfully!


In [12]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

# Tokenize the corpus
corpus = [" ".join(sec["content"]) for ch in hierarchy["root"] for sec in ch["sections"]]
tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]

# Train BM25 model
bm25 = BM25Okapi(tokenized_corpus)

def search_bm25(query):
    query_tokens = word_tokenize(query.lower())
    scores = bm25.get_scores(query_tokens)
    return sorted(zip(corpus, scores), key=lambda x: x[1], reverse=True)[:5]

# Example search for a Physiology-related question
print(search_bm25("Explain the mechanism of muscle contraction"))


[('Chapter 1', 0.0), ('1.1 Paragraph 1...', 0.0), ('1.2 Paragraph 2...', 0.0)]


In [13]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load Sentence-BERT model
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(corpus)

# Create FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))

def search_dense(query):
    query_embedding = model.encode([query])
    _, indices = index.search(np.array(query_embedding), 5)
    return [corpus[i] for i in indices[0]]

# Example search
print(search_dense("diabetes management"))


['Chapter 1', '1.2 Paragraph 2...', '1.1 Paragraph 1...', '1.2 Paragraph 2...', '1.2 Paragraph 2...']


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Use a very small model for better performance on CPU
model_name = "facebook/opt-350m"

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model with lower precision for efficiency
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto" if device == "cuda" else "cpu",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32
).to(device)

# Function to generate answers
def generate_answer_local(query):
    input_text = f"Question: {query}\nAnswer:"
    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    output = model.generate(
        **inputs, 
        max_length=50,  # Reduce length to avoid infinite loops
        temperature=0.7,  
        top_p=0.9,  
        do_sample=True
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
print(generate_answer_local("What are the symptoms of pneumonia?"))

# Clear CUDA memory (optional)
if device == "cuda":
    torch.cuda.empty_cache()


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Question: What are the symptoms of pneumonia?
Answer: This is a long-term care facility. The residents are generally well, but they are still getting treated for pneumonia.
Q: What does pneumonia mean?
Answer: It is


model.safetensors:   0%|          | 0.00/662M [00:00<?, ?B/s]

In [14]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

# Load Sentence-BERT model
model = SentenceTransformer("all-MiniLM-L6-v2")

# ✅ Ensure corpus is loaded
corpus_file = "corpus.txt"
if not os.path.exists(corpus_file):
    print("Error: corpus.txt not found! Creating a sample corpus...")
    corpus = ["Diabetes management strategies", "Symptoms of pneumonia", "Treatment for hypertension"]
    
    with open(corpus_file, "w", encoding="utf-8") as f:
        f.write("\n".join(corpus))
    print("Sample corpus.txt created.")
else:
    with open(corpus_file, "r", encoding="utf-8") as f:
        corpus = [line.strip() for line in f.readlines()]

# ✅ Now encode corpus
embeddings = model.encode(corpus)

# Create FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))

# ✅ Save FAISS index
faiss.write_index(index, "medical_index.faiss")
print("FAISS index saved as 'medical_index.faiss'")


FAISS index saved as 'medical_index.faiss'


In [15]:
# ✅ Load FAISS index
# index = faiss.read_index("medical_index.faiss")

# def search_dense(query):
#     query_embedding = model.encode([query])
#     _, indices = index.search(np.array(query_embedding), 5)
#     return [corpus[i] for i in indices[0]]

# # ✅ Example search
# print(search_dense("diabetes management"))
# Ensure corpus is clean and normalized
corpus = [line.strip().lower() for line in open("corpus.txt", "r", encoding="utf-8").readlines()]

# Encode corpus
embeddings = model.encode(corpus, convert_to_numpy=True)

# Create FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)



In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Use a lightweight model
model_name = "facebook/opt-350m"

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto" if device == "cuda" else "cpu",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32
).to(device)

# Load sentence transformer for embeddings
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Load FAISS index (assuming `index` and `corpus` are already built)
index = faiss.read_index("medical_index.faiss")  # Load FAISS index
with open("corpus.txt", "r", encoding="utf-8") as f:
    corpus = f.readlines()  # Load the text data

# Search FAISS for relevant context
def search_dense(query):
    query_embedding = embedding_model.encode([query])
    _, indices = index.search(np.array(query_embedding), 3)  # Get top 3 docs
    return [corpus[i] for i in indices[0]]

# Function to generate answers using retrieved context
def generate_answer_local(query):
    retrieved_docs = search_dense(query)  # Get relevant docs
    context = " ".join(retrieved_docs)  # Merge context

    # Improve prompt formatting
    input_text = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    output = model.generate(
        **inputs, 
        max_length=100,  
        temperature=0.7,  
        top_p=0.9,  
        do_sample=True
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
print(generate_answer_local("What are the symptoms of pneumonia?"))

# Clear CUDA memory (optional)
if device == "cuda":
    torch.cuda.empty_cache()


Context: Symptoms of pneumonia
 Treatment for hypertension Diabetes management strategies


Question: What are the symptoms of pneumonia?
Answer:

Symptoms of pneumonia can range from mild to severe. Some symptoms may include a fever, difficulty breathing, loss of appetite, or a loss of sense of taste or smell. Symptoms of pneumonia are most commonly associated with a cough, fever, and loss of appetite.

Symptoms of pneumonia may not appear until after you are in the hospital. Some


In [16]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = []
    for page in doc:
        text.append(page.get_text("text"))
    return "\n".join(text)

# Extract from all PDFs
pdf_paths = ["pathology.pdf", "physiology.pdf", "pharmacology.pdf"]
full_corpus = []
for pdf in pdf_paths:
    full_corpus.append(extract_text_from_pdf(pdf))

# Save extracted text
with open("corpus.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(full_corpus))

print("✅ Corpus updated with real medical content.")


✅ Corpus updated with real medical content.


In [17]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# ✅ Load updated corpus
with open("corpus.txt", "r", encoding="utf-8") as f:
    corpus = [line.strip().lower() for line in f.readlines() if line.strip()]

# ✅ Check corpus size
print(f"📚 Corpus now contains {len(corpus)} entries.")

# ✅ Encode full-text corpus
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = embedding_model.encode(corpus, convert_to_numpy=True)

# ✅ Rebuild FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

print("✅ FAISS index rebuilt successfully.")


📚 Corpus now contains 118499 entries.
✅ FAISS index rebuilt successfully.


In [20]:
print(search_dense("What are the symptoms of pneumonia?"))
print(search_dense("How to manage diabetes?"))
print(search_dense("Best treatment for hypertension?"))


['developing pneumonia.', 'pneumonias', 'pneumonias', 'bacterial pneumonias.', 'pneumonia']
['diabetes', 'diabetes', 'diet, exercise, oral drugs, insulin', 'require insulin therapy to control hyperglycaemia or to', 'chronic debilitating conditions like uncontrolled diabetes,']
['drugs for the combined therapy of hypertension are selected to minimize', 'severe hypertension must be treated with the combination of drugs', 'orally for the treatment of mild to moderate chronic hypertension.', 'drugs for hypertension emergency', 'with systemic hypertension has been suggested by some']


In [21]:
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import faiss
import numpy as np

# ✅ Load Sentence-BERT model for dense embeddings
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# ✅ Load Corpus
with open("corpus.txt", "r", encoding="utf-8") as f:
    corpus = [line.strip().lower() for line in f.readlines()]

# ✅ Encode corpus with Sentence-BERT
embeddings = embedding_model.encode(corpus, convert_to_numpy=True)

# ✅ Build FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# ✅ Tokenize corpus for BM25
tokenized_corpus = [doc.split() for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

# ✅ FAISS Semantic Search
def search_dense(query, top_k=5, min_score=0.5):
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for i, idx in enumerate(indices[0]):
        if distances[0][i] > min_score:  # ✅ Filter low-score matches
            results.append(corpus[idx])

    return results if results else ["No relevant context found."]

# ✅ BM25 Keyword-Based Search
def search_bm25(query, top_k=5):
    tokenized_query = query.split()
    scores = bm25.get_scores(tokenized_query)
    top_n = np.argsort(scores)[::-1][:top_k]
    return [corpus[i] for i in top_n]

# ✅ Hybrid Search: FAISS + BM25 Reranking
def hybrid_search(query, top_k=5):
    faiss_results = search_dense(query, top_k)
    bm25_results = search_bm25(query, top_k)

    # ✅ Merge & Deduplicate Results
    final_results = list(set(faiss_results + bm25_results))[:top_k]
    return final_results

# ✅ Example Usage
query = "What are the symptoms of pneumonia?"
results = hybrid_search(query)
print("🔍 Search Results:", results)


🔍 Search Results: ['pneumonia', 'of gestation. symptoms of cytomegalovirus infection are', 'symptoms', 'bacterial pneumonias.', 'developing pneumonia.']


In [38]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# ✅ Use a smaller & faster model
model_name = "facebook/opt-125m"  # Switch to "mistral-7B-instruct-v0.1" if GPU available
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model efficiently
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16 if device == "cuda" else torch.float32).to(device)

def generate_answer(query):
    # Retrieve relevant contexts
    context = search_dense(query, top_k=3)  # Reduce retrieved passages
    context_text = " ".join(context)

    # Format input for LLM
    input_text = f"Context: {context_text}\nQuestion: {query}\nAnswer:"
    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    # ✅ Use a lower max_length & increase top_k/top_p for speed
    output = model.generate(
        **inputs, 
        max_length=50,  # Reduce token count
        temperature=0.7,  
        top_p=0.85,  
        do_sample=True
    )
    
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example
print(generate_answer("What are the symptoms of pneumonia?"))


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Context: pneumonias pneumonias developing pneumonia.
Question: What are the symptoms of pneumonia?
Answer:
I have been told that I have pneumonia and my lungs are starting to dry up.
I have been told that I


In [23]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from sentence_transformers import SentenceTransformer

# ✅ Load Sentence-BERT for retrieval
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# ✅ FAISS index should be already built & loaded
# Ensure `corpus` and `index` exist in your environment

def search_dense(query, top_k=3):
    """Retrieve relevant contexts using FAISS."""
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)

    # ✅ Fix: Avoid out-of-bounds error
    results = [
        corpus[idx] for i, idx in enumerate(indices[0]) 
        if idx < len(corpus) and distances[0][i] < 0.8  # Valid index + threshold
    ]

    return results if results else ["No relevant context found."]

# ✅ Load lightweight FLAN-T5 for generation
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

def generate_answer(query):
    """Generate answers using FLAN-T5 with retrieved context."""
    context = search_dense(query, top_k=3)  
    context_text = " ".join(context)

    # Format input for factual QA
    input_text = f"""
    You are a medical assistant answering user queries based on the given context.

    Context:
    {context_text}

    Question: {query}
    Provide a short and precise answer.
    Answer:
    """

    # Tokenize and generate response
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(device)
    output = model.generate(
        **inputs, 
        max_length=50,  
        temperature=0.7,  
        top_p=0.85,  
        do_sample=True
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# ✅ Example Queries
print(generate_answer("What are the symptoms of pneumonia?"))
print(generate_answer("How to manage diabetes?"))
print(generate_answer("Best treatment for hypertension?"))


pneumonias
diabetes
combination of drugs orally


In [24]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
import faiss
import torch

# ✅ Load sentence embedding model (FAISS-compatible)
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# ✅ Load corpus (Ensure it has meaningful sentences)
with open("corpus.txt", "r", encoding="utf-8") as f:
    corpus = [line.strip().lower() for line in f.readlines()]

# ✅ Encode corpus & Build FAISS Index
embeddings = embedding_model.encode(corpus, convert_to_numpy=True)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# ✅ FAISS Search Function with Improved Filtering
def search_dense(query, top_k=3):
    """Retrieve relevant contexts using FAISS with meaningful text filtering."""
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)

    # ✅ Filter out short, irrelevant text
    results = [
        corpus[idx] for i, idx in enumerate(indices[0]) 
        if idx < len(corpus) and distances[0][i] < 0.8 and len(corpus[idx].split()) > 5
    ]
    return results if results else ["No relevant context found."]

# ✅ Load lightweight FLAN-T5 model for fast generation
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# ✅ Generate medical answers with FAISS-based retrieval
def generate_answer(query):
    """Generate medical answers using FLAN-T5 with FAISS-retrieved context."""
    context = search_dense(query, top_k=3)
    context_text = " ".join(context)

    if "No relevant context found." in context_text:
        return "I'm sorry, but I couldn't find relevant information."

    # ✅ Improve prompt structure for better answers
    input_text = f"""
    You are a medical assistant. Answer based on the given context.

    Context:
    {context_text}

    Question: {query}
    
    Provide a short, precise medical answer.
    Answer:
    """

    # Tokenize & Generate response
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(device)
    output = model.generate(**inputs, max_length=100, temperature=0.7, top_p=0.9, do_sample=True)

    return tokenizer.decode(output[0], skip_special_tokens=True)

# ✅ Test with sample queries
print(generate_answer("What are the symptoms of pneumonia?"))
print(generate_answer("How is diabetes managed?"))
print(generate_answer("What is the best treatment for hypertension?"))


I'm sorry, but I couldn't find relevant information.
mellitus
combination of drugs orally
