# 🧠 RAG Workshop Assignment - Meghana
**Student ID:** 24156991  
**Topic:** AI in Healthcare  

In [None]:
# -------------------------------------------------------------
# Import required libraries
# -------------------------------------------------------------
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

In [None]:
# -------------------------------------------------------------
# 1️⃣ Load Data from Excel
# -------------------------------------------------------------
# Workbook that holds the patient records; only its "Data" sheet is read.
file_path = "AI_Medical_Diagnosis_Data.xlsx"
df = pd.read_excel(file_path, sheet_name="Data")

# Render each spreadsheet row as one comma-separated line of labelled fields,
# so every record becomes a self-contained piece of text.
data_texts = [
    (
        f"Patient ID: {record['Patient ID']}, "
        f"Age: {record['Age']}, "
        f"Symptoms: {record['Symptoms']}, "
        f"AI Diagnosis: {record['AI Diagnosis']}, "
        f"Accuracy: {record['Accuracy (%)']}%"
    )
    for _, record in df.iterrows()
]

# One record per line; this single string is what gets chunked afterwards.
full_text = "\n".join(data_texts)
print("✅ Excel data loaded and converted into text format.")

In [None]:
# -------------------------------------------------------------
# 2️⃣ Text Chunking (Modified Parameters)
# -------------------------------------------------------------
# The separators are listed from the coarsest boundary (blank line) down to
# the empty string, so the splitter only falls back to finer cuts when a
# piece is still larger than chunk_size. Neighbouring chunks share up to
# chunk_overlap units of text.
# NOTE(review): sizes are presumably counted in characters (the splitter's
# default length function) — confirm against the installed LangChain version.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""],
    chunk_overlap=200,
    chunk_size=1200,
)
chunks = text_splitter.split_text(full_text)
chunk_total = len(chunks)
print(f"✅ Created {chunk_total} text chunks for embedding.")

In [None]:
# -------------------------------------------------------------
# 3️⃣ Embedding and Vectorstore
# -------------------------------------------------------------
# Embedding model with library defaults.
# NOTE(review): presumably picks up the OpenAI API key from the environment
# (OPENAI_API_KEY) — confirm before running.
embeddings = OpenAIEmbeddings()

# Embed every chunk and index the resulting vectors in a FAISS store.
vectorstore = FAISS.from_texts(
    texts=chunks,
    embedding=embeddings,
)
print("✅ FAISS vectorstore created successfully.")

In [None]:
# -------------------------------------------------------------
# 4️⃣ Improved Retriever Settings
# -------------------------------------------------------------
# Settings handed to the vector store's MMR (maximal marginal relevance) search.
# NOTE(review): per the LangChain docs, "k" is the number of chunks returned,
# "fetch_k" the size of the candidate pool MMR re-ranks, and "lambda_mult"
# trades relevance against diversity (1 = least diverse, 0 = most diverse)
# — confirm against the installed version.
mmr_search_kwargs = {
    "k": 5,
    "fetch_k": 10,
    "lambda_mult": 0.8,
}
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs=mmr_search_kwargs,
)
print("✅ Retriever configured successfully.")

In [None]:
# -------------------------------------------------------------
# 5️⃣ Create RAG QA System
# -------------------------------------------------------------
# Completion model used to write the final answer; temperature is pinned to 0.
# NOTE(review): temperature=0 is presumably meant to keep answers as
# repeatable as possible between runs — confirm the intent.
llm = OpenAI(temperature=0)

# Wire the retriever and the model into one question-answering chain
# (chain type left at the library default).
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
)
print("✅ Retrieval-Augmented Generation (RAG) chain initialized.")

In [None]:
# -------------------------------------------------------------
# 6️⃣ Test Questions
# -------------------------------------------------------------
# Sample questions about the dataset used to smoke-test the chain.
test_questions = [
    "Which diseases have the highest AI diagnostic accuracy?",
    "How does AI help identify lung-related diseases?",
    "What are the common symptoms among patients with high AI accuracy?",
    "Which diagnosis has the lowest accuracy score?",
]

# Visual separator printed after every answer.
divider = "-" * 80

print("\n🔍 Testing the RAG System on the Excel dataset:\n")
for question in test_questions:
    # Show the question before querying so the prompt is visible even if
    # the chain call is slow or raises.
    print(f"🧩 Question: {question}")
    answer = qa_chain.run(question)
    print("💡 Answer:", answer)
    print(divider)

print("✅ RAG System Execution Complete — Using Excel Dataset: AI in Healthcare")