In [10]:
!pip install requests beautifulsoup4

import requests
from bs4 import BeautifulSoup
import json

# Healthline condition overview pages to scrape. Each URL is passed to
# scrape_healthline below and contributes at most one article record.
urls = [
    "https://www.healthline.com/health/diabetes",
    "https://www.healthline.com/health/high-blood-pressure-hypertension",
    "https://www.healthline.com/health/asthma",
    "https://www.healthline.com/health/anemia",
    "https://www.healthline.com/health/hypothyroidism",
    "https://www.healthline.com/health/dengue-fever",
    "https://www.healthline.com/health/influenza",
    "https://www.healthline.com/health/pneumonia"
]

def scrape_healthline(url, timeout=15):
    """Download one article page and extract its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        A dict with the keys ``"title"``, ``"url"`` and ``"text"``. ``"text"``
        holds every non-empty ``<p>`` element joined by newlines. When the
        request fails or the server answers with an HTTP error status,
        ``"text"`` is the empty string so the caller can skip the page
        instead of indexing an error page as medical content.
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures rather than parsing their HTML.
        r.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {url}: {exc}")
        return {"title": "No Title", "url": url, "text": ""}

    soup = BeautifulSoup(r.text, 'html.parser')

    # Extract each paragraph's text once and drop the ones that are empty.
    paragraph_texts = (p.get_text(strip=True) for p in soup.find_all('p'))
    text = "\n".join(t for t in paragraph_texts if t)

    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else "No Title"
    return {
        "title": title,
        "url": url,
        "text": text
    }

# Scrape every URL in order and keep only the pages that yielded some text.
knowledge_base = [
    article
    for article in map(scrape_healthline, urls)
    if article["text"]
]

# Persist the records as indented UTF-8 JSON for the indexing step.
with open("medical_kb.json", "w", encoding='utf-8') as kb_file:
    json.dump(knowledge_base, kb_file, ensure_ascii=False, indent=2)

print(f"Scraped and saved {len(knowledge_base)} articles to medical_kb.json")


Scraped and saved 8 articles to medical_kb.json


In [11]:
!pip install sentence-transformers faiss-cpu

import json
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Read the scraped article records back from disk.
with open('medical_kb.json', mode='r', encoding='utf-8') as kb_file:
    articles = json.loads(kb_file.read())

def chunk_text_by_paragraph(text, min_length=40):
    """Split ``text`` on newlines into stripped paragraphs worth indexing.

    Args:
        text: Article body with one paragraph per line.
        min_length: A paragraph is kept only when its stripped length is
            strictly greater than this value. The default of 40 filters out
            headings, captions and other fragments too short to be useful.

    Returns:
        List of stripped paragraphs, in their original order.
    """
    stripped = (line.strip() for line in text.split('\n'))
    return [paragraph for paragraph in stripped if len(paragraph) > min_length]

def is_valid_chunk(chunk):
    """Return True unless ``chunk`` contains a known boilerplate phrase.

    The comparison is case-insensitive and matches substrings, so a phrase
    appearing anywhere in the passage rejects the whole passage.
    """
    boilerplate_markers = (
        "Bezzy communities", "mobile app", "Join Bezzy", "Connect with us",
        "advertiser", "collaboration", "Learn more", "peer-reviewed", "advertisement",
    )
    haystack = chunk.lower()
    return not any(marker.lower() in haystack for marker in boilerplate_markers)

# Flatten the articles into retrievable passages. `metadata[i]` records which
# article `kb_chunks[i]` came from, so the two lists stay index-aligned.
kb_chunks = []
metadata = []
for article in articles:
    usable_passages = [
        passage
        for passage in chunk_text_by_paragraph(article['text'])
        if is_valid_chunk(passage)
    ]
    kb_chunks.extend(usable_passages)
    metadata.extend(
        {'title': article['title'], 'url': article['url']}
        for _ in usable_passages
    )

print(f"Chunked KB to {len(kb_chunks)} passages after filtering.")

# Embed every passage and convert the result to float32 for FAISS.
model = SentenceTransformer('all-MiniLM-L6-v2')
raw_embeddings = model.encode(kb_chunks, show_progress_bar=True)
embeddings = np.array(raw_embeddings).astype('float32')

# Build a flat L2 index sized to the embedding dimension.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)
print(f"FAISS index created with {index.ntotal} chunks.")

# Save the index and its aligned metadata for later cells.
faiss.write_index(index, "medical_kb.faiss")
with open("kb_metadata.json", "w", encoding="utf-8") as meta_file:
    json.dump(metadata, meta_file, ensure_ascii=False, indent=2)


Chunked KB to 516 passages after filtering.


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

FAISS index created with 516 chunks.


In [18]:
!pip install transformers

from transformers import pipeline
# Load FLAN-T5 for answer generation. The original line ended in a bare
# `max_length=` (a syntax error); 512 matches the value used when the same
# pipeline is rebuilt inside load_models further down.
# NOTE(review): max_length is presumably the generation length cap - confirm
# against the installed transformers version.
generator = pipeline('text2text-generation', model='google/flan-t5-base', max_length=512)

def rag_response(query, top_k=1):
    """Answer ``query`` with the generator, grounded on retrieved passages.

    The query is embedded with the module-level ``model``, the ``top_k``
    nearest passages are looked up in ``index``, and each one is placed in
    the prompt together with the title of the article it came from.

    Args:
        query: The user's question.
        top_k: Number of passages to retrieve and include in the prompt.
            With the default of 1 the prompt is the single best passage.

    Returns:
        The generated answer, or a fallback message when nothing could be
        retrieved or the generator produced no text.
    """
    query_emb = model.encode([query]).astype('float32')
    _distances, neighbours = index.search(query_emb, top_k)

    # Drop negative labels: a search that finds fewer than top_k vectors
    # reports the missing slots as -1, which would otherwise silently index
    # the last passage.
    hits = [int(i) for i in neighbours[0] if int(i) >= 0]
    if not hits:
        return "No relevant passage found in the knowledge base."

    # One "Refer to ..." section per retrieved passage, best match first.
    facts = "".join(
        f"Refer to this medical fact from '{metadata[i]['title']}':\n{kb_chunks[i]}\n"
        for i in hits
    )
    prompt = (
        f"{facts}"
        f"Answer concisely: {query}\n"
        "For information only, not diagnostic advice."
    )
    print("PROMPT LENGTH:", len(prompt), "CONTENT:", prompt)
    gen_output = generator(prompt)
    if gen_output and 'generated_text' in gen_output[0]:
        return gen_output[0]['generated_text']
    else:
        return "No response generated. Try a shorter prompt or different model."

# Smoke-test retrieval and generation with one sample question.
user_query = "What are the main symptoms of diabetes?"
sample_answer = rag_response(user_query)
print(sample_answer)




Device set to use cpu


PROMPT LENGTH: 222 CONTENT: Refer to this medical fact from 'Everything You Need to Know About Diabetes':
The general symptoms of diabetes include:
Answer concisely: What are the main symptoms of diabetes?
For information only, not diagnostic advice.
blood sugar levels


In [16]:
!pip install gradio




In [23]:
import gradio as gr
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline

# ---- Load RAG and KB Models ----
def load_models():
    """Rebuild everything the chatbot needs from the artefacts saved on disk.

    Reads ``medical_kb.json``, ``kb_metadata.json`` and ``medical_kb.faiss``
    from the working directory, re-derives the passage list from the article
    text, and loads the sentence-embedding and text-generation models.

    The passage list is not stored on disk; it is recomputed here with the
    same length threshold and phrase blacklist, so that position ``i`` in the
    passages corresponds to vector ``i`` in the index and entry ``i`` in the
    metadata.

    Returns:
        Tuple ``(model, generator, index, metadata, kb_chunks)``.

    Raises:
        ValueError: If the number of passages, metadata entries and indexed
            vectors disagree, which means the files on disk are out of sync
            and retrieval would return the wrong passage.
    """
    with open('medical_kb.json', 'r', encoding='utf-8') as f:
        articles = json.load(f)
    with open("kb_metadata.json", "r", encoding="utf-8") as f:
        metadata = json.load(f)
    index = faiss.read_index("medical_kb.faiss")

    def chunk_text_by_paragraph(text):
        # Keep only paragraphs longer than 40 characters after stripping.
        paragraphs = [p.strip() for p in text.split('\n') if len(p.strip()) > 40]
        return paragraphs

    def is_valid_chunk(chunk):
        # Reject passages containing site boilerplate (case-insensitive).
        blacklist = [
            "Bezzy communities", "mobile app", "Join Bezzy", "Connect with us",
            "advertiser", "collaboration", "Learn more", "peer-reviewed", "advertisement"
        ]
        lowered = chunk.lower()
        return not any(token.lower() in lowered for token in blacklist)

    kb_chunks = [
        c
        for article in articles
        for c in chunk_text_by_paragraph(article['text'])
        if is_valid_chunk(c)
    ]

    # Retrieval maps an index position straight onto kb_chunks and metadata,
    # so a size mismatch must fail loudly instead of answering from the
    # wrong passage.
    if not (len(kb_chunks) == len(metadata) == index.ntotal):
        raise ValueError(
            "Knowledge base files are out of sync: "
            f"{len(kb_chunks)} passages, {len(metadata)} metadata entries, "
            f"{index.ntotal} indexed vectors. Rebuild the index."
        )

    model = SentenceTransformer('all-MiniLM-L6-v2')
    generator = pipeline('text2text-generation', model='google/flan-t5-base', max_length=512)
    return model, generator, index, metadata, kb_chunks

# Load everything once at module level; rag_response below reads these names
# as globals on every call.
model, generator, index, metadata, kb_chunks = load_models()

# ---- Five Patient Profiles ----
# Simulated patient records keyed by display name. The keys populate the
# Gradio dropdown, each record is rendered to text by format_profile, and the
# 'Conditions' list is what gradio_rag compares against the question text.
# The lab/vital fields differ from profile to profile.
patient_profiles = {
    "Samarth": {
        'Name': 'Samarth',
        'Gender': 'Male',
        'Age': 36,
        'Height': '171 cm',
        'Weight': '74 kg',
        'Conditions': ['Hypertension', 'Diabetes'],
        'Symptoms': ['Headache', 'Fatigue'],
        'Last BP Check': '142/90 mmHg',
        'Last Fasting Glucose': '123 mg/dL'
    },
    "Aditi": {
        'Name': 'Aditi',
        'Gender': 'Female',
        'Age': 58,
        'Height': '160 cm',
        'Weight': '60 kg',
        'Conditions': ['Asthma', 'Anemia'],
        'Symptoms': ['Cough', 'Breathlessness'],
        'Last BP Check': '128/80 mmHg',
        'Last Hemoglobin': '10.5 g/dL'
    },
    "Ravi": {
        'Name': 'Ravi',
        'Gender': 'Male',
        'Age': 45,
        'Height': '165 cm',
        'Weight': '85 kg',
        'Conditions': ['Dengue'],
        'Symptoms': ['Fever', 'Joint pain', 'Headache'],
        'Last Platelet Count': '125,000 /μL'
    },
    "Meera": {
        'Name': 'Meera',
        'Gender': 'Female',
        'Age': 70,
        'Height': '150 cm',
        'Weight': '54 kg',
        'Conditions': ['Pneumonia'],
        'Symptoms': ['Cough', 'Shortness of breath'],
        'Last Temp Check': '101.2 F'
    },
    "Ahmed": {
        'Name': 'Ahmed',
        'Gender': 'Male',
        'Age': 62,
        'Height': '167 cm',
        'Weight': '68 kg',
        'Conditions': ['Influenza', 'Hypothyroidism'],
        'Symptoms': ['Fever', 'Chills', 'Fatigue', 'Weight gain'],
        'Last TSH Level': '6.5 μIU/mL'
    }
}

def format_profile(profile):
    """Render a profile dict as text, one ``key: value`` line per field.

    List values are flattened into a comma-separated string; every other
    value is formatted as-is. Fields appear in the dict's own order.
    """
    rendered_lines = []
    for field, value in profile.items():
        if isinstance(value, list):
            value = ', '.join(value)
        rendered_lines.append(f"{field}: {value}")
    return "\n".join(rendered_lines)

# ---- RAG Chatbot Function ----
def rag_response(query, top_k=1, profile_str=None):
    """Answer ``query`` for one patient, grounded on retrieved passages.

    Args:
        query: The user's question.
        top_k: Number of passages to retrieve and include in the prompt.
        profile_str: The patient's profile already rendered as text. Passing
            it explicitly keeps concurrent requests from seeing each other's
            profile. When omitted, a module-level ``profile_str`` variable is
            used if one exists (the older calling convention), otherwise the
            profile section of the prompt is left blank.

    Returns:
        The generated answer, or a fallback message when nothing could be
        retrieved or the generator produced no text.
    """
    if profile_str is None:
        # Backward-compatible fallback for callers that still publish the
        # profile through a global instead of the argument.
        profile_str = globals().get("profile_str", "")

    query_emb = model.encode([query]).astype('float32')
    _distances, neighbours = index.search(query_emb, top_k)

    # Drop negative labels: a search that finds fewer than top_k vectors
    # reports the missing slots as -1, which would otherwise silently index
    # the last passage.
    hits = [int(i) for i in neighbours[0] if int(i) >= 0]
    if not hits:
        return "No relevant passage found in the knowledge base."

    # One "Refer to ..." section per retrieved passage, best match first.
    facts = "".join(
        f"Refer to this medical fact from '{metadata[i]['title']}':\n{kb_chunks[i]}\n"
        for i in hits
    )
    prompt = (
        f"Patient profile:\n{profile_str}\n"
        f"{facts}"
        f"Based ONLY on the patient's profile and the fact above, answer: {query}\n"
        "For information only, not diagnostic advice."
    )
    gen_output = generator(prompt)
    if gen_output and 'generated_text' in gen_output[0]:
        return gen_output[0]['generated_text']
    else:
        return "No response generated. Try a shorter prompt or different model."

# ---- Personalized Q&A Function ----
def gradio_rag(query, selected_patient):
    """Gradio callback: answer ``query`` in the context of one patient profile.

    Args:
        query: Question typed by the user.
        selected_patient: Key into ``patient_profiles`` chosen in the
            dropdown; ``None`` when nothing has been selected yet.

    Returns:
        Markdown text showing the patient profile followed by the answer.
        When the question names a known condition that is not among the
        patient's own conditions, the answer is prefixed with a note saying
        so. If no profile is selected or the question is blank, a short
        instruction is returned instead and no model is invoked.
    """
    profile = patient_profiles.get(selected_patient)
    if profile is None:
        return "Please select a patient profile before asking a question."
    if not query or not query.strip():
        return "Please enter a medical question."

    profile_text = format_profile(profile)
    question = query.lower()

    # Conditions covered by the knowledge base, matched as lowercase substrings.
    condition_keywords = ['diabetes', 'hypertension', 'asthma', 'anemia', 'hypothyroidism', 'dengue', 'influenza', 'pneumonia']
    mentions_own_condition = any(
        cond.lower() in question for cond in profile['Conditions']
    )
    matched_cond = [key for key in condition_keywords if key in question]

    # The profile travels as an argument, not a global, so simultaneous
    # users cannot leak into each other's prompts.
    answer = rag_response(query, profile_str=profile_text)

    if not mentions_own_condition and matched_cond:
        return (
            f"### Patient Profile\n{profile_text}\n\n"
            f"### Chatbot Answer\n"
            f"Note: This patient does not have {' or '.join(matched_cond)}—showing general medical information.\n"
            f"{answer}"
        )
    # General questions, or questions about the patient's own conditions.
    return f"### Patient Profile\n{profile_text}\n\n### Chatbot Answer\n{answer}"

# ---- Build Gradio Interface ----
# Two inputs (free-text question, patient selector) feed gradio_rag; its
# Markdown result is shown in a single output panel.
question_box = gr.Textbox(lines=2, label="Enter your medical question")
patient_picker = gr.Dropdown(list(patient_profiles.keys()), label="Select Patient Profile")
answer_panel = gr.Markdown(label="Digital Twin & Chatbot Answer")

interface = gr.Interface(
    fn=gradio_rag,
    inputs=[question_box, patient_picker],
    outputs=answer_panel,
    title="Medical RAG Chatbot with Multiple Digital Twins",
    description="Select a simulated patient profile and then ask any factual health question.",
    allow_flagging="never",
)

# share=True requests a temporary public URL in addition to the local one.
interface.launch(share=True)


Device set to use cpu


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://04bdc4c2600a153d7e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


