In [3]:
import os
import xml.etree.ElementTree as ET

DATA_DIR = "MedQuAD/1_CancerGov_QA"   # example folder
all_docs = []

# Loop through the XML files directly inside DATA_DIR (sub-folders are not visited)
for file in os.listdir(DATA_DIR):
    if not file.endswith(".xml"):
        continue
    file_path = os.path.join(DATA_DIR, file)

    # Parse XML. A single unreadable or malformed file is reported and skipped
    # instead of aborting the whole load.
    try:
        root = ET.parse(file_path).getroot()
    except (ET.ParseError, OSError) as e:
        print(f"Error parsing {file_path}: {e}")
        continue

    # Document-level metadata, attached to every QA pair found in this file
    source = root.attrib.get("source", "")
    focus = root.findtext("Focus", "")

    # Extract all QAPairs, keeping only those with both a question and an answer
    for qa in root.findall(".//QAPair"):
        question = qa.findtext("Question", "").strip()
        answer = qa.findtext("Answer", "").strip()

        if question and answer:
            all_docs.append({
                "text": f"Q: {question}\nA: {answer}",
                "source": source,
                "focus": focus
            })

print("Total QA pairs loaded:", len(all_docs))
# Show one sample record; guarded so an empty result does not raise IndexError
if all_docs:
    print(all_docs[0])

Total QA pairs loaded: 729
{'text': "Q: What is (are) Non-Small Cell Lung Cancer ?\nA: Key Points\n                    - Non-small cell lung cancer is a disease in which malignant (cancer) cells form in the tissues of the lung.     - There are several types of non-small cell lung cancer.    - Smoking is the major risk factor for non-small cell lung cancer.    - Signs of non-small cell lung cancer include a cough that doesn't go away and shortness of breath.    - Tests that examine the lungs are used to detect (find), diagnose, and stage non-small cell lung cancer.    - Certain factors affect prognosis (chance of recovery) and treatment options.    - For most patients with non-small cell lung cancer, current treatments do not cure the cancer.\n                \n                \n                    Non-small cell lung cancer is a disease in which malignant (cancer) cells form in the tissues of the lung.\n                    The lungs are a pair of cone-shaped breathing organs in the che

In [4]:
import os
import json
import xml.etree.ElementTree as ET

DATA_DIR = "MedQuAD"   # Root folder


def load_medquad_qa(data_dir):
    """Collect the question/answer pairs from every ``.xml`` file under ``data_dir``.

    The folder tree is walked recursively with ``os.walk``. For each XML file the
    root element's ``source`` and ``url`` attributes and its ``Focus`` child text
    are read once and copied onto every ``QAPair`` found in that file. Pairs whose
    question or answer is empty after stripping whitespace are dropped.

    Args:
        data_dir: Path of the folder to scan.

    Returns:
        A list of dicts with the keys ``question``, ``answer``, ``focus``,
        ``source``, ``url`` and ``text`` (``"Q: <question>\\nA: <answer>"``).
        The list is empty when nothing usable is found, including when
        ``data_dir`` does not exist (``os.walk`` then yields nothing).

    Files that fail to parse are reported with ``print`` and skipped.
    """
    docs = []

    # Walk through all subfolders
    for root_dir, _dirs, files in os.walk(data_dir):
        for file in files:
            if not file.endswith(".xml"):
                continue
            file_path = os.path.join(root_dir, file)

            try:
                root = ET.parse(file_path).getroot()

                # Document metadata
                source = root.attrib.get("source", "")
                url = root.attrib.get("url", "")
                focus = root.findtext("Focus", "")

                # Extract QA pairs
                for qa in root.findall(".//QAPair"):
                    question = qa.findtext("Question", "").strip()
                    answer = qa.findtext("Answer", "").strip()

                    if question and answer:
                        docs.append({
                            "question": question,
                            "answer": answer,
                            "focus": focus,
                            "source": source,
                            "url": url,
                            "text": f"Q: {question}\nA: {answer}"
                        })
            except Exception as e:
                # Best effort: one bad file must not abort the whole extraction
                print(f"Error parsing {file_path}: {e}")

    return docs


all_docs = load_medquad_qa(DATA_DIR)

if all_docs:
    # Save as JSON
    with open("medquad_qa.json", "w", encoding="utf-8") as f:
        json.dump(all_docs, f, indent=2, ensure_ascii=False)

    print("✅ Extraction complete!")
    print("Total QA pairs:", len(all_docs))
    print("Sample:", all_docs[0])
else:
    # Nothing was extracted (wrong DATA_DIR or no usable XML). Do not overwrite an
    # existing medquad_qa.json with an empty list, and do not index into [].
    print(f"⚠️ No QA pairs found under {DATA_DIR!r}; medquad_qa.json was not written.")

✅ Extraction complete!
Total QA pairs: 16407
Sample: {'question': 'What is (are) keratoderma with woolly hair ?', 'answer': 'Keratoderma with woolly hair is a group of related conditions that affect the skin and hair and in many cases increase the risk of potentially life-threatening heart problems. People with these conditions have hair that is unusually coarse, dry, fine, and tightly curled. In some cases, the hair is also sparse. The woolly hair texture typically affects only scalp hair and is present from birth. Starting early in life, affected individuals also develop palmoplantar keratoderma, a condition that causes skin on the palms of the hands and the soles of the feet to become thick, scaly, and calloused.  Cardiomyopathy, which is a disease of the heart muscle, is a life-threatening health problem that can develop in people with keratoderma with woolly hair. Unlike the other features of this condition, signs and symptoms of cardiomyopathy may not appear until adolescence or 

In [7]:
import json
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Read the extracted QA records back from disk
with open("medquad_qa.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print("Total records loaded:", len(data))

# Sentence-embedding model, referenced by its Hugging Face model id
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# `texts` is what gets embedded; `metadata` keeps the remaining fields of each
# record, in the same order, so position i in one matches position i in the other
texts = [record["text"] for record in data]
metadata = [
    {field: record[field] for field in ("question", "answer", "source", "url", "focus")}
    for record in data
]

# Embed every text, then scale each row to unit length so that the inner
# product used by the index below equals cosine similarity
embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / row_norms

# Flat (exhaustive) inner-product index sized to the embedding width
dimension = embeddings.shape[1]  # 384 for MiniLM
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

print("✅ FAISS index built!")
print("Index size:", index.ntotal)

# Persist the index and the row-aligned metadata for reuse
faiss.write_index(index, "medquad_index.faiss")

with open("medquad_metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)

print("✅ Saved FAISS index and metadata")

ModuleNotFoundError: No module named 'sentence_transformers'

In [10]:
# Pinned to the version shown in this cell's recorded output; the explicit %pip
# magic installs into the environment of the running kernel.
%pip install sentence-transformers==5.1.0

Collecting sentence-transformers
  Using cached sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
Collecting tqdm (from sentence-transformers)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.8.0-cp311-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.7.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting Pillow (from sentence-transformers)
  Downloading pillow-11.3.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (9

In [14]:
import os
import json
import xml.etree.ElementTree as ET
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# -----------------------------
# Step 1: Extract MedQuAD XMLs
# -----------------------------
DATA_DIR = "MedQuAD"   # root folder with all subfolders
all_docs = []

for root_dir, _subdirs, filenames in os.walk(DATA_DIR):
    for filename in filenames:
        # Only XML files carry QA documents
        if not filename.endswith(".xml"):
            continue
        xml_path = os.path.join(root_dir, filename)

        try:
            doc_root = ET.parse(xml_path).getroot()

            # Fields read once per document and copied onto each of its QA pairs
            doc_fields = {
                "focus": doc_root.findtext("Focus", ""),
                "source": doc_root.attrib.get("source", ""),
                "url": doc_root.attrib.get("url", ""),
            }

            for pair in doc_root.findall(".//QAPair"):
                question = pair.findtext("Question", "").strip()
                answer = pair.findtext("Answer", "").strip()

                # Skip pairs that lack either side after whitespace stripping
                if not (question and answer):
                    continue

                all_docs.append({
                    "question": question,
                    "answer": answer,
                    **doc_fields,
                    "text": f"Q: {question}\nA: {answer}",
                })
        except Exception as err:
            # Report the failing file and carry on with the rest
            print(f"Error parsing {xml_path}: {err}")

# Write the extracted dataset to disk
with open("medquad_qa.json", "w", encoding="utf-8") as f:
    json.dump(all_docs, f, indent=2, ensure_ascii=False)

print("✅ Extraction complete! Total QA pairs:", len(all_docs))


# -----------------------------
# Step 2: Load / Save Embedding Model
# -----------------------------
MODEL_PATH = "models/all-MiniLM-L6-v2"

# First run: fetch the model by name and keep a copy at MODEL_PATH.
# Later runs: MODEL_PATH already exists, so load from there instead.
if not os.path.exists(MODEL_PATH):
    print("⬇️ Downloading model from HuggingFace...")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    model.save(MODEL_PATH)
    print("✅ Model saved locally at:", MODEL_PATH)
else:
    print("📂 Loading model from local...")
    model = SentenceTransformer(MODEL_PATH)


# -----------------------------
# Step 3: Build FAISS Index
# -----------------------------
# One embedding per record, in all_docs order, so FAISS row ids can be used
# directly as positions in all_docs
documents = [doc["text"] for doc in all_docs]
embeddings = model.encode(documents, convert_to_numpy=True)

# Flat (exhaustive) L2 index sized to the embedding width
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)
print(f"✅ FAISS index built with {index.ntotal} documents")

# Persist the index to disk
faiss.write_index(index, "medquad_index.faiss")
print("📂 FAISS index saved as medquad_index.faiss")


# -----------------------------
# Step 4: Search Function
# -----------------------------
def search(query, top_k=3):
    """Return up to ``top_k`` indexed QA pairs for ``query``, best match first.

    Uses three notebook-level objects that must exist before the call:
    ``model`` (embeds the query), ``index`` (the FAISS index that is searched)
    and ``all_docs`` (the records, positioned so that a FAISS row id is a valid
    position in the list).

    Args:
        query: Free-text question to look up.
        top_k: Maximum number of hits to return. It is capped at the number of
            vectors in the index; a value below 1 yields an empty list.

    Returns:
        A list of dicts in the order reported by ``index.search``. Each dict has
        ``rank`` (1-based), ``question``, ``answer``, ``focus``, ``source``,
        ``url`` and ``distance`` (the value ``index.search`` reported for the
        hit, as a Python float).
    """
    # Never ask the index for more neighbours than it holds
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_emb = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_emb, k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        # FAISS fills missing neighbours with id -1. Used as a Python list
        # index, -1 would silently return the last document as a bogus hit.
        if idx < 0:
            continue
        doc = all_docs[int(idx)]
        results.append({
            "rank": len(results) + 1,
            "question": doc["question"],
            "answer": doc["answer"],
            "focus": doc["focus"],
            "source": doc["source"],
            "url": doc["url"],
            "distance": float(dist),
        })
    return results


# -----------------------------
# Step 5: Test Query
# -----------------------------
query = "What are the symptoms of leukemia?"
results = search(query, top_k=2)

# Print each hit; answers are cut to their first 300 characters
for hit in results:
    print("\n---")
    print(f"Rank {hit['rank']}")
    print("Q:", hit["question"])
    answer_preview = hit["answer"][:300]
    print("A:", answer_preview, "...")
    print("Source:", hit["source"])
    print("URL:", hit["url"])

✅ Extraction complete! Total QA pairs: 16407
⬇️ Downloading model from HuggingFace...
✅ Model saved locally at: models/all-MiniLM-L6-v2
✅ FAISS index built with 16407 documents
📂 FAISS index saved as medquad_index.faiss

---
Rank 1
Q: What are the symptoms of Leukemia ?
A: Common symptoms of leukemia may include -  fevers  - frequent infections  - feeling weak or tired  -  headache  - bleeding and bruising easily  - pain in the bones or joints  -  swelling or discomfort in the abdomen (from an enlarged spleen)  -  swollen lymph nodes, especially in the neck or armpit  ...
Source: NIHSeniorHealth
URL: http://nihseniorhealth.gov/leukemia/toc.html

---
Rank 2
Q: What are the symptoms of Adult Acute Lymphoblastic Leukemia ?
A: Signs and symptoms of adult ALL include fever, feeling tired, and easy bruising or bleeding. The early signs and symptoms of ALL may be like the flu or other common diseases. Check with your doctor if you have any of the following:         - Weakness or feeling tired

In [13]:
# Pinned to the version this cell's recorded output reports as installed; the
# explicit %pip magic installs into the environment of the running kernel.
%pip install faiss-cpu==1.12.0

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-macosx_14_0_arm64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp311-cp311-macosx_14_0_arm64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m3.8 MB/s[0m  [33m0:00:01[0m eta [36m0:00:01[0mm
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
from rag_pipeline import generate_answer


ModuleNotFoundError: No module named 'groq'