In [None]:
!pip install datasets sentence-transformers faiss-gpu-cu12 transformers torch datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting faiss-gpu-cu12
  Downloading faiss_gpu_cu12-1.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting numpy>=1.17 (from datasets)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
C

In [None]:
from datasets import load_dataset

# Fetch the medical question/answer dataset from the Hugging Face Hub
ds = load_dataset(path="DSWF/ai_medical_chatbot_train")

In [None]:
# Summarise the available splits, then display the first training record
print(ds)
ds["train"][0]

DatasetDict({
    train: Dataset({
        features: ['Question', 'Answer', '__index_level_0__'],
        num_rows: 205532
    })
    test: Dataset({
        features: ['Question', 'Answer', '__index_level_0__'],
        num_rows: 51384
    })
})


{'Question': 'Hello, I have been on birth control pills for years and finally decided to stop them back in June. It s now September 15th and I am 8 days late from having a period. I have symptoms of being pregnant but when I took a test i t was negative. Can it be my cycle is changing?',
 'Answer': 'Hello dear,I understand your concern.In my opinion repetition of pregnancy test might be needed.If the repeat test is negative then the chance of pregnancy is rare.And the delayed period might be due to hormonal imbalance.It takes 6 months after stopping the hormonal pills for regular ovulation to occur.So nothing to worry.Relax.Avoid stress.Best regards...',
 '__index_level_0__': 52523}

In [None]:
# Merge one row's question and answer into a single retrievable text
def create_chunks(example):
    """Build the retrieval text for one dataset row.

    Args:
        example: Mapping that provides 'Question' and 'Answer' entries.

    Returns:
        A dict with a single 'chunk' key whose value is the question line
        followed by the answer line.
    """
    question = example['Question']
    reply = example['Answer']
    combined = "Question: {}\nAnswer: {}".format(question, reply)
    return {"chunk": combined}

# Add the combined "chunk" column to every split
ds = ds.map(create_chunks)

# One flat list of chunk strings covering both splits. list() keeps the
# concatenation valid even if the column is not returned as a plain list
# (NOTE(review): newer `datasets` releases may return a column object; confirm).
chunks = list(ds['train']['chunk']) + list(ds['test']['chunk'])

# Show the size and one sample instead of echoing every chunk into the cell output
len(chunks), chunks[0]

(['Question: Hello, I have been on birth control pills for years and finally decided to stop them back in June. It s now September 15th and I am 8 days late from having a period. I have symptoms of being pregnant but when I took a test i t was negative. Can it be my cycle is changing?\nAnswer: Hello dear,I understand your concern.In my opinion repetition of pregnancy test might be needed.If the repeat test is negative then the chance of pregnancy is rare.And the delayed period might be due to hormonal imbalance.It takes 6 months after stopping the hormonal pills for regular ovulation to occur.So nothing to worry.Relax.Avoid stress.Best regards...',
  'Question: Hello doctor, I have a two month old son born with a birth weight of 2.3 kg in august. At present his weight is 4 kg. Due to family circumstances I will have to join back work in the third month. In my absence from home how do I feed my son... I wish to breastfeed him as long as can...heard about pumping but I am not sure how to

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model that turns each chunk into a vector
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every chunk in large batches and get the result back as a NumPy array
embeddings = model.encode(
    chunks,
    batch_size=1024,
    show_progress_bar=True,
    convert_to_numpy=True,
)
embeddings.shape

Batches:   0%|          | 0/251 [00:00<?, ?it/s]

(256916, 384)

In [None]:
# import gc
# import torch

# gc.collect()
# torch.cuda.empty_cache()


In [None]:
import faiss
import numpy as np

# FAISS works on float32 vectors, so normalise the dtype up front
embeddings = np.array(embeddings, dtype=np.float32)

# Exact L2 (flat) index sized to the embedding width
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Copy the (still empty) index onto GPU 0
res = faiss.StandardGpuResources()
gpu_index = faiss.index_cpu_to_gpu(res, 0, index)

# Insert all vectors; row i of `embeddings` was computed from chunks[i]
gpu_index.add(embeddings)

In [None]:
# from transformers import pipeline
# from sentence_transformers import SentenceTransformer
# import numpy as np

# # Load the generative model (e.g., GPT-2 or a medical-specific model)
# generator = pipeline('text-generation', model='gpt2')  # Replace with a better model if needed

# # Define the retriever function
# def retrieve_chunks(query, top_k=5):
#     # Encode the query
#     query_embedding = model.encode([query], show_progress_bar=False).astype('float32')

#     # Search the FAISS index
#     distances, indices = gpu_index.search(query_embedding, top_k)

#     # Retrieve the corresponding chunks
#     retrieved_chunks = [chunks[idx] for idx in indices[0]]
#     return retrieved_chunks

# # Define the RAG function
# def rag_generate(query, top_k=5):
#     # Retrieve relevant chunks
#     retrieved_chunks = retrieve_chunks(query, top_k)

#     # Combine chunks into a context
#     context = "\n\n".join(retrieved_chunks)

#     # Create the prompt for the generative model
#     prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

#     # Generate the response
#     response = generator(prompt, max_new_tokens=100, num_return_sequences=1, truncation=True)[0]['generated_text']
#     # Extract the answer part (remove prompt)
#     answer = response[len(prompt):].strip()
#     return retrieved_chunks ,answer

# # Example usage
# query = "What causes high blood pressure?"
# retrieved_chunks, answer = rag_generate(query)

# print(f"Query: {query}\nAnswer: {answer}")
# print()
# retrieved_chunks, len(retrieved_chunks)

In [None]:
import pickle

# Bring the GPU index back to the CPU side so it can be written to disk
cpu_index = faiss.index_gpu_to_cpu(gpu_index)
faiss.write_index(cpu_index, "medical_chatbot_faiss.index")

# Store the chunk texts next to the index so search hits can be mapped back to text
with open("chunks.pkl", "wb") as chunk_file:
    pickle.dump(chunks, chunk_file)

In [None]:
# # Load the index
# cpu_index = faiss.read_index("medical_chatbot_faiss.index")
# gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)

# # Load the chunks
# with open("chunks.pkl", "rb") as f:
#     chunks = pickle.load(f)

# Uploading the FAISS index to Google Drive

In [None]:
# Mount Google Drive at /content/drive so the saved artifacts can be copied onto it
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil
from google.colab import drive

# Make sure Drive is mounted before touching anything under it
drive.mount('/content/drive')

# Drive directory that will hold the index artifacts
folder_path = '/content/drive/MyDrive/faiss_index2'

# Start from a clean slate: wipe any previous copy of the folder
folder_already_there = os.path.exists(folder_path)
if folder_already_there:
    shutil.rmtree(folder_path)
status = "Deleted existing folder" if folder_already_there else "No existing folder found at"
print(f"{status}: {folder_path}")

# exist_ok=False makes this raise if the folder is still present after the removal above
os.makedirs(folder_path, exist_ok=False)
print(f"Created new folder: {folder_path}")

# Sanity check: a freshly created folder should list no entries
folder_contents = os.listdir(folder_path)
print("Contents of the new folder:", folder_contents)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Deleted existing folder: /content/drive/MyDrive/faiss_index2
Created new folder: /content/drive/MyDrive/faiss_index2
Contents of the new folder: []


In [None]:
import shutil

# Relocate both artifacts from the Colab working directory into the Drive folder
drive_folder = '/content/drive/MyDrive/faiss_index2'
for artifact_name in ('medical_chatbot_faiss.index', 'chunks.pkl'):
    shutil.move(f'/content/{artifact_name}', f'{drive_folder}/{artifact_name}')

print("Files moved to /content/drive/MyDrive/faiss_index2")

Files moved to /content/drive/MyDrive/faiss_index2


# Testing the RAG

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import faiss
import pickle

# Locations of the two artifacts stored on Drive
index_path = '/content/drive/MyDrive/faiss_index2/medical_chatbot_faiss.index'
chunks_path = '/content/drive/MyDrive/faiss_index2/chunks.pkl'

# Read the FAISS index back from disk
index = faiss.read_index(index_path)

# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself
with open(chunks_path, 'rb') as chunk_file:
    chunks = pickle.load(chunk_file)

print("FAISS index and chunks loaded successfully")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
FAISS index and chunks loaded successfully


In [None]:
# Try to place the index on GPU 0; on any failure keep searching on the CPU index
try:
    res = faiss.StandardGpuResources()
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
except Exception as gpu_error:
    print(f"Failed to move index to GPU: {gpu_error}. Using CPU index.")
    gpu_index = index  # `gpu_index` is the name used for searching in either case
else:
    print("FAISS index moved to GPU")

FAISS index moved to GPU


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Prefer the GPU when torch can see one
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and weights for Qwen2-1.5B-Instruct
model_name = "Qwen/Qwen2-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# float16 weights to save GPU memory; device_map="auto" chooses the placement
load_options = {"torch_dtype": torch.float16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

# Use the end-of-sequence token id as the padding token id
model.config.pad_token_id = model.config.eos_token_id
print("Qwen2-1.5B-Instruct loaded successfully")

Qwen2-1.5B-Instruct loaded successfully


In [None]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Enable synchronous CUDA errors

from google.colab import drive
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import faiss
import pickle
import numpy as np
import torch

# Make the saved artifacts on Google Drive reachable
drive.mount('/content/drive')

# Where the FAISS index and the pickled chunk list are stored
index_path = '/content/drive/MyDrive/faiss_index2/medical_chatbot_faiss.index'
chunks_path = '/content/drive/MyDrive/faiss_index2/chunks.pkl'

# Stop early with a clear message when either artifact is missing
if not all(os.path.exists(artifact) for artifact in (index_path, chunks_path)):
    raise FileNotFoundError("FAISS index or chunks file not found. Recreate them first.")

index = faiss.read_index(index_path)
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself
with open(chunks_path, 'rb') as chunk_file:
    chunks = pickle.load(chunk_file)

# Try to place the index on GPU 0; on any failure keep searching on the CPU index
try:
    res = faiss.StandardGpuResources()
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
except Exception as gpu_error:
    print(f"Failed to move index to GPU: {gpu_error}. Using CPU index.")
    gpu_index = index  # `gpu_index` is the name used for searching in either case
else:
    print("FAISS index moved to GPU")

# Set up models
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Embedding model for encoding queries; it is the same checkpoint that was used
# to embed the chunks when the index was built
embed_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)


### LLM Model part
# Chat model used for answer generation. Change the id below to try another
# checkpoint, e.g. "Qwen/Qwen2-1.5B-Instruct".
model_name = "Qwen/Qwen1.5-4B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Use float16 to save GPU memory
    device_map="auto"
)
# Use the end-of-sequence token id as the padding token id
model.config.pad_token_id = model.config.eos_token_id
# Report the checkpoint that was actually loaded instead of a hard-coded name
print(f"{model_name} loaded successfully")
##########################################


# Retrieve chunks function
def retrieve_chunks(query, top_k=5):
    """Return the stored chunks whose embeddings are nearest to ``query``.

    Args:
        query: Question text to embed with ``embed_model``.
        top_k: Number of neighbours requested from the index.

    Returns:
        A list of at most ``top_k`` chunk strings, in the order the index
        returned them. Labels that do not point at a stored chunk are skipped.
    """
    query_embedding = embed_model.encode([query], show_progress_bar=False).astype('float32')
    _, indices = gpu_index.search(query_embedding, top_k)
    # FAISS fills missing results with the label -1. An upper-bound-only check
    # would let -1 through and silently return chunks[-1], so both bounds are checked.
    return [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]

# RAG generate function with Qwen
def _build_chat_prompt(context, query):
    """Render the system and user messages through the tokenizer's chat template."""
    messages = [
        {"role": "system", "content": "You are a helpful medical assistant."},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"}
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)


def rag_generate(query, top_k=5, max_context_tokens=800, max_prompt_tokens=1000):
    """Answer ``query`` with the chat model, grounded on retrieved chunks.

    Args:
        query: The user's question.
        top_k: Number of chunks to request from ``retrieve_chunks``.
        max_context_tokens: Upper bound on the tokens kept from the joined chunks.
        max_prompt_tokens: Target upper bound for the full rendered prompt
            (kept small to limit GPU memory use).

    Returns:
        A ``(answer, retrieved_chunks)`` tuple. If generation raises, the error
        is printed and ``answer`` is "Error generating response.".
    """
    retrieved_chunks = retrieve_chunks(query, top_k)

    # Combine chunks into one context string and cap its token count
    context = "\n\n".join(retrieved_chunks)
    context_tokens = tokenizer.encode(context, add_special_tokens=False)
    if len(context_tokens) > max_context_tokens:
        context_tokens = context_tokens[:max_context_tokens]
        context = tokenizer.decode(context_tokens, skip_special_tokens=True)

    text = _build_chat_prompt(context, query)

    # If the rendered prompt is over budget, shrink the *context* and render again.
    # Cutting the rendered prompt itself would strip the chat-template markers and
    # the trailing assistant generation prompt.
    overflow = len(tokenizer.encode(text, add_special_tokens=False)) - max_prompt_tokens
    if overflow > 0:
        keep = max(len(context_tokens) - overflow, 0)
        context = tokenizer.decode(context_tokens[:keep], skip_special_tokens=True)
        # A very long query can still exceed the budget once the context is empty;
        # the prompt is then left intact rather than corrupted.
        text = _build_chat_prompt(context, query)

    # Generate response
    try:
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
        output_ids = model.generate(
            **model_inputs,  # passes attention_mask along with input_ids
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )
        # Drop the echoed prompt tokens so only the newly generated part is decoded
        new_token_ids = [
            full_ids[len(prompt_ids):]
            for prompt_ids, full_ids in zip(model_inputs["input_ids"], output_ids)
        ]
        answer = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]
    except Exception as e:
        # Best effort: report the failure and return a placeholder answer
        print(f"Generation failed for query '{query}': {e}")
        answer = "Error generating response."

    return answer, retrieved_chunks

# Release cached GPU memory before running the sample questions
torch.cuda.empty_cache()

# Sample questions used to smoke-test the whole pipeline
queries = [
    "What causes high blood pressure?",
    "Why is high blood pressure so common with people with diabetes?",
]

# Run each question through retrieval + generation and print the answer
for query in queries:
    answer, retrieved_chunks = rag_generate(query)
    print(f"Query: {query}\nAnswer: {answer}\n")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
FAISS index moved to GPU
Using device: cuda


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.91G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

Qwen2-1.5B-Instruct loaded successfully


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Query: What causes high blood pressure?
Answer: There are many factors that can contribute to high blood pressure. Some common causes include:

  * Unhealthy eating habits: Consuming too much sodium, sugar, and saturated fat can increase blood pressure.
  * Lack of physical activity: Regular exercise can help lower blood pressure by reducing stress and improving cardiovascular health.
  * Smoking and alcohol consumption: These substances can damage blood vessels and raise blood pressure.
  * Genetic factors: Some people may be more likely to develop high blood pressure due to their genetics.
  * Hormonal changes: Changes in hormones such as pregnancy, menopause, and certain medications can affect blood pressure.

It's important to note that high blood pressure is often referred to as "the silent killer" because it can go unnoticed until it becomes severe enough to cause complications such as heart attack or stroke. If you're concerned about your blood pressure, it's always best to spea

In [None]:
# Ad-hoc single question: only the answer is printed, the retrieved chunks are discarded
query = " I'm experiencing frequent headaches and blurred vision. Should I be worried?"
answer, _ = rag_generate(query)
print(f"Query: {query}\nAnswer: {answer}\n")

Query:  I'm experiencing frequent headaches and blurred vision. Should I be worried?
Answer: If you are experiencing frequent headaches and blurred vision, it's important to see a healthcare provider for an evaluation. Blurred vision can be caused by various conditions such as an eye disorder, inner ear problem, neurological condition, or eye injury. Your healthcare provider will conduct a thorough examination and may recommend additional tests to determine the underlying cause of your symptoms. It's important to address any issues with your vision promptly to prevent complications and ensure proper treatment.



In [None]:
# !pip install gradio

In [None]:
import gradio as gr
def medical_chatbot(query):
    """Gradio callback: answer ``query`` and list the passages used as context.

    Args:
        query: Free-text question typed by the user.

    Returns:
        A Markdown string with the generated answer followed by the retrieved
        chunks as a numbered list, or a short hint when the input is blank.
    """
    if not query or not query.strip():
        # Skip retrieval and generation entirely for empty submissions
        return "Please enter a medical question."

    answer, retrieved = rag_generate(query)
    # Render each chunk as its own list item instead of dumping the Python list repr
    if retrieved:
        details = "\n\n".join(f"{i}. {chunk}" for i, chunk in enumerate(retrieved, 1))
    else:
        details = "_No supporting passages were retrieved._"
    return f"**Answer:**\n{answer}\n\n**Retrieval Details:**\n{details}"

# Sample questions offered as one-click examples in the UI
example_queries = [
    ["I'm experiencing frequent headaches and blurred vision. Should I be worried?"],
    ["What causes high blood pressure?"],
    ["Why is high blood pressure so common with people with diabetes?"],
]

# Single text input for the user's question
query_box = gr.Textbox(
    label="Enter your medical query",
    placeholder="E.g., I'm experiencing frequent headaches and blurred vision. Should I be worried?",
)

# Wire the chatbot callback into a simple text-in / Markdown-out interface
iface = gr.Interface(
    fn=medical_chatbot,
    inputs=query_box,
    outputs=gr.Markdown(label="Response"),
    title="Medical Chatbot",
    description="Ask medical questions, and the chatbot will provide answers based on a medical knowledge base.",
    examples=example_queries,
)

iface.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8b85a1c70f415d84aa.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


