In [6]:
import fitz  # PyMuPDF

def extract_text_pymupdf(pdf_path):
    """Extracts text from a PDF using PyMuPDF while preserving structure."""
    doc = fitz.open(pdf_path)
    extracted_text = ""

    for page in doc:
        extracted_text += page.get_text("text") + "\n"  # Extracts structured text

    return extracted_text

# Example usage
pdf_files = ["physiology.pdf", "pathology.pdf", "pharmacology.pdf"]
extracted_texts = {pdf: extract_text_pymupdf(pdf) for pdf in pdf_files}

# Save extracted content as text files
for pdf, text in extracted_texts.items():
    with open(f"{pdf}.txt", "w", encoding="utf-8") as f:
        f.write(text)

print("Text extraction completed using PyMuPDF.")


Text extraction completed using PyMuPDF.


In [7]:
import json  # Import json for saving the hierarchy

def create_hierarchy(text):
    """Converts textbook text into a hierarchical structure."""
    chapters = text.split("\n\nChapter ")  # Assuming "Chapter" marks sections
    tree = {"root": []}

    for i, chapter in enumerate(chapters):
        sections = chapter.split("\n\nSection ")  # Assuming "Section" marks subsections
        chapter_node = {"id": f"chapter_{i}", "sections": []}

        for j, section in enumerate(sections):
            paragraphs = section.split("\n\n")  # Splitting paragraphs
            section_node = {"id": f"section_{i}_{j}", "content": paragraphs}
            chapter_node["sections"].append(section_node)

        tree["root"].append(chapter_node)

    return tree

# Ensure 'extracted_texts' contains the processed textbook content
extracted_texts = {"physiology.pdf": "Chapter 1\n\nSection 1.1\n\nParagraph 1...\n\nSection 1.2\n\nParagraph 2..."} 

# Create hierarchy
hierarchy = create_hierarchy(extracted_texts["physiology.pdf"])

# Save the hierarchy to a JSON file
with open("hierarchy.json", "w") as f:
    json.dump(hierarchy, f, indent=2)

print("Hierarchy saved successfully!")


Hierarchy saved successfully!


In [8]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

# Tokenize the corpus
corpus = [" ".join(sec["content"]) for ch in hierarchy["root"] for sec in ch["sections"]]
tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]

# Train BM25 model
bm25 = BM25Okapi(tokenized_corpus)

def search_bm25(query):
    query_tokens = word_tokenize(query.lower())
    scores = bm25.get_scores(query_tokens)
    return sorted(zip(corpus, scores), key=lambda x: x[1], reverse=True)[:5]

# Example search for a Physiology-related question
print(search_bm25("Explain the mechanism of muscle contraction"))


[('Chapter 1', 0.0), ('1.1 Paragraph 1...', 0.0), ('1.2 Paragraph 2...', 0.0)]


In [9]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load Sentence-BERT model
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(corpus)

# Create FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))

def search_dense(query):
    query_embedding = model.encode([query])
    _, indices = index.search(np.array(query_embedding), 5)
    return [corpus[i] for i in indices[0]]

# Example search
print(search_dense("diabetes management"))


['Chapter 1', '1.2 Paragraph 2...', '1.1 Paragraph 1...', '1.2 Paragraph 2...', '1.2 Paragraph 2...']


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Use a very small model for better performance on CPU
model_name = "facebook/opt-350m"

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model with lower precision for efficiency
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto" if device == "cuda" else "cpu",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32
).to(device)

# Function to generate answers
def generate_answer_local(query):
    input_text = f"Question: {query}\nAnswer:"
    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    output = model.generate(
        **inputs, 
        max_length=50,  # Reduce length to avoid infinite loops
        temperature=0.7,  
        top_p=0.9,  
        do_sample=True
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
print(generate_answer_local("What are the symptoms of pneumonia?"))

# Clear CUDA memory (optional)
if device == "cuda":
    torch.cuda.empty_cache()


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Question: What are the symptoms of pneumonia?
Answer: This is a long-term care facility. The residents are generally well, but they are still getting treated for pneumonia.
Q: What does pneumonia mean?
Answer: It is


model.safetensors:   0%|          | 0.00/662M [00:00<?, ?B/s]

In [10]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

# Load Sentence-BERT model
model = SentenceTransformer("all-MiniLM-L6-v2")

# ✅ Ensure corpus is loaded
corpus_file = "corpus.txt"
if not os.path.exists(corpus_file):
    print("Error: corpus.txt not found! Creating a sample corpus...")
    corpus = ["Diabetes management strategies", "Symptoms of pneumonia", "Treatment for hypertension"]
    
    with open(corpus_file, "w", encoding="utf-8") as f:
        f.write("\n".join(corpus))
    print("Sample corpus.txt created.")
else:
    with open(corpus_file, "r", encoding="utf-8") as f:
        corpus = [line.strip() for line in f.readlines()]

# ✅ Now encode corpus
embeddings = model.encode(corpus)

# Create FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))

# ✅ Save FAISS index
faiss.write_index(index, "medical_index.faiss")
print("FAISS index saved as 'medical_index.faiss'")


Error: corpus.txt not found! Creating a sample corpus...
Sample corpus.txt created.
FAISS index saved as 'medical_index.faiss'


In [11]:
# ✅ Load FAISS index
index = faiss.read_index("medical_index.faiss")

def search_dense(query):
    query_embedding = model.encode([query])
    _, indices = index.search(np.array(query_embedding), 5)
    return [corpus[i] for i in indices[0]]

# ✅ Example search
print(search_dense("diabetes management"))


['Diabetes management strategies', 'Treatment for hypertension', 'Symptoms of pneumonia', 'Treatment for hypertension', 'Treatment for hypertension']


In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Use a lightweight model
model_name = "facebook/opt-350m"

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto" if device == "cuda" else "cpu",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32
).to(device)

# Load sentence transformer for embeddings
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Load FAISS index (assuming `index` and `corpus` are already built)
index = faiss.read_index("medical_index.faiss")  # Load FAISS index
with open("corpus.txt", "r", encoding="utf-8") as f:
    corpus = f.readlines()  # Load the text data

# Search FAISS for relevant context
def search_dense(query):
    query_embedding = embedding_model.encode([query])
    _, indices = index.search(np.array(query_embedding), 3)  # Get top 3 docs
    return [corpus[i] for i in indices[0]]

# Function to generate answers using retrieved context
def generate_answer_local(query):
    retrieved_docs = search_dense(query)  # Get relevant docs
    context = " ".join(retrieved_docs)  # Merge context

    # Improve prompt formatting
    input_text = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    output = model.generate(
        **inputs, 
        max_length=100,  
        temperature=0.7,  
        top_p=0.9,  
        do_sample=True
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
print(generate_answer_local("What are the symptoms of pneumonia?"))

# Clear CUDA memory (optional)
if device == "cuda":
    torch.cuda.empty_cache()


Context: Symptoms of pneumonia
 Treatment for hypertension Diabetes management strategies


Question: What are the symptoms of pneumonia?
Answer:

Symptoms of pneumonia can range from mild to severe. Some symptoms may include a fever, difficulty breathing, loss of appetite, or a loss of sense of taste or smell. Symptoms of pneumonia are most commonly associated with a cough, fever, and loss of appetite.

Symptoms of pneumonia may not appear until after you are in the hospital. Some
