In [None]:
import os
import pandas as pd
import re
import pickle
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from typing import List, Dict
from langchain.embeddings import HuggingFaceEmbeddings
#from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter


In [4]:
# --- Pipeline configuration -------------------------------------------------
# Source dataset; each CSV row is flattened to text and split into chunks.
CSV_PATH = "data/medical_text_data.csv"
# Folder that receives the FAISS index and the pickled documents.
INDEX_DIR = "faiss_index"
# Sentence-transformers checkpoint (multilingual, compact).
EMBED_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
# Embedding width for the MiniLM family; kept for reference only, since the
# index-building code in the visible cells reads the width from the embeddings.
EMBED_DIM = 384
# Text splitter settings: chunk size and overlap between neighbouring chunks.
CHUNK_SIZE, CHUNK_OVERLAP = 400, 50

In [5]:
# Create the index folder up front so later cells can write their files into it.
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(INDEX_DIR, exist_ok=True)

In [6]:
def read_and_prepare(csv_path: str) -> List[Document]:
    """
    Load a CSV and turn it into chunked LangChain documents.

    Every row is flattened into a single line of the form
    ``"col_a: value | col_b: value"`` (missing values are skipped), runs of
    whitespace are collapsed to one space, and the result is split with a
    space-separated ``CharacterTextSplitter`` using CHUNK_SIZE / CHUNK_OVERLAP.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        One Document per chunk. ``metadata["source_row"]`` is the 0-based
        position of the originating row in file order.
    """
    frame = pd.read_csv(csv_path)
    column_names = list(frame.columns)

    def flatten(record) -> str:
        # "name: value" for every populated cell, joined with " | ".
        parts = [
            f"{name}: {record[name]}"
            for name in column_names
            if not pd.isna(record[name])
        ]
        # Collapse newlines/tabs/multiple blanks so the splitter sees plain spaces.
        return re.sub(r"\s+", " ", " | ".join(parts)).strip()

    row_texts = [flatten(record) for _, record in frame.iterrows()]

    chunker = CharacterTextSplitter(
        separator=" ",
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    return [
        Document(page_content=piece, metadata={"source_row": row_no})
        for row_no, text in enumerate(row_texts)
        for piece in chunker.split_text(text)
    ]

In [7]:
def build_embeddings_and_faiss(docs: List[Document], model_name: str = EMBED_MODEL_NAME):
    """
    Embed the documents with sentence-transformers and store them in a flat L2 FAISS index.

    The index is written to ``INDEX_DIR/index.faiss`` and the documents are
    pickled to ``INDEX_DIR/docs.pkl``; row ``i`` of the index corresponds to
    ``docs[i]``.

    Args:
        docs: Chunked documents to embed (their ``page_content`` is encoded).
        model_name: Sentence-transformers model to load.

    Returns:
        Tuple ``(index, docs, st_model)``: the FAISS index, the same document
        list that was passed in, and the loaded SentenceTransformer.

    Raises:
        ValueError: If ``docs`` is empty (there is nothing to index).
    """
    # Fail early with a clear message; an empty batch would otherwise surface
    # as an IndexError on embeddings.shape[1] further down.
    if not docs:
        raise ValueError("No documents to embed; the input produced zero chunks.")

    print("Loading embedding model:", model_name)
    # Use sentence-transformers directly (fast and compact)
    st_model = SentenceTransformer(model_name)
    texts = [d.page_content for d in docs]
    print("Embedding", len(texts), "chunks ...")
    embeddings = st_model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    # FAISS consumes float32 matrices; convert once and reuse the result.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    # build FAISS index (Flat L2). For bigger scale consider HNSW/IVF+PQ
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    print("Adding embeddings to FAISS index...")
    index.add(embeddings)

    # Save the index and metadata. Create the folder here so the function does
    # not depend on another cell having been run first.
    os.makedirs(INDEX_DIR, exist_ok=True)
    faiss.write_index(index, os.path.join(INDEX_DIR, "index.faiss"))
    with open(os.path.join(INDEX_DIR, "docs.pkl"), "wb") as f:
        pickle.dump(docs, f)
    print("Saved FAISS index and docs.")
    return index, docs, st_model

In [8]:
def load_faiss_and_docs():
    """
    Read the raw FAISS index and the pickled documents back from INDEX_DIR.

    Returns:
        Tuple ``(index, docs)`` loaded from ``index.faiss`` and ``docs.pkl``.

    Raises:
        FileNotFoundError: If either file is missing.
    """
    index_file, docs_file = (
        os.path.join(INDEX_DIR, name) for name in ("index.faiss", "docs.pkl")
    )
    if not (os.path.exists(index_file) and os.path.exists(docs_file)):
        raise FileNotFoundError("Index or docs not found. Run build first.")

    loaded_index = faiss.read_index(index_file)
    # NOTE: unpickling runs code embedded in the file; only load a docs.pkl
    # that this notebook produced.
    with open(docs_file, "rb") as handle:
        return loaded_index, pickle.load(handle)

In [9]:

def build_langchain_faiss_vectorstore(docs: List[Document], hf_model_name=EMBED_MODEL_NAME):
    """
    Build a LangChain FAISS vector store from the documents and save it to INDEX_DIR.

    The texts are embedded through ``HuggingFaceEmbeddings`` and each document's
    metadata is attached to its vector.

    Args:
        docs: Documents whose ``page_content`` / ``metadata`` populate the store.
        hf_model_name: Model name handed to ``HuggingFaceEmbeddings``.

    Returns:
        The LangChain FAISS vector store that was saved.
    """
    embedder = HuggingFaceEmbeddings(model_name=hf_model_name)

    # Split the documents into the parallel lists that from_texts expects.
    contents, metas = [], []
    for doc in docs:
        contents.append(doc.page_content)
        metas.append(doc.metadata)

    store = FAISS.from_texts(contents, embedder, metadatas=metas)
    # NOTE(review): this targets the same folder that holds the raw
    # index.faiss; presumably that file is replaced here. Confirm.
    store.save_local(INDEX_DIR)
    return store

In [10]:
def run_rag_query(vect_store):
    """
    Run an interactive retrieval-QA loop on top of a LangChain vector store.

    Uses a local Ollama model as the LLM, so the Ollama server must be running
    (default host: http://localhost:11434) and the model tag must be pulled.
    The loop reads questions from stdin until the user types 'exit' or 'quit'.

    Args:
        vect_store: LangChain vector store; its ``as_retriever`` supplies the
            top-4 similar chunks for each question.

    Returns:
        None. Answers are printed.
    """
    llm = Ollama(model="llama3:instruct")  # adjust the model tag you pulled in ollama
    retriever = vect_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})
    # The "stuff" chain fills {context} with the retrieved chunks and
    # {question} with the user query, so the template must expose both.
    prompt = PromptTemplate.from_template(
        "You are a medical assistant that helps identify dental/tooth-related issues and gives safe, brief treatment guidance. "
        "Use retrieved context to answer. If not confident, say you are not sure and suggest seeing a dentist.\n\n"
        "Context:\n{context}\n\n"
        "User question: {question}\n\n"
        "Answer:"
    )
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        # Without this the custom prompt above is never used by the chain.
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=True,
    )
    print("You can now ask questions. Type 'exit' to quit.")
    while True:
        q = input("\nUser query> ").strip()
        if q.lower() in ("exit", "quit"):
            break
        if not q:
            # Nothing to ask; prompt again instead of querying with an empty string.
            continue
        # return_source_documents=True gives the chain two output keys, which
        # qa.run() rejects; invoke() returns the full result dict instead.
        res = qa.invoke({"query": q})
        print("\nAnswer:\n", res["result"])

In [None]:
if __name__ == "__main__":
    # Step 1: Build docs & index (if not already built).
    # The load branch goes through LangChain's load_local, which needs
    # index.pkl as well as index.faiss, so both must exist to skip the build.
    store_files = [os.path.join(INDEX_DIR, name) for name in ("index.faiss", "index.pkl")]
    if not all(os.path.exists(path) for path in store_files):
        print("Reading CSV and building documents...")
        documents = read_and_prepare(CSV_PATH)
        index, docs, st_model = build_embeddings_and_faiss(documents)
        # Build the LangChain FAISS wrapper used for querying.
        vect = build_langchain_faiss_vectorstore(docs)
    else:
        print("FAISS index exists. Loading...")
        # index.pkl is a pickle, so LangChain requires an explicit opt-in to
        # load it (same flag as the later loading cells). Only do this for an
        # index produced by this notebook.
        vect = FAISS.load_local(
            INDEX_DIR,
            HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME),
            allow_dangerous_deserialization=True,
        )

    # Step 2: Run RAG interactive loop (Ollama must be running)
    run_rag_query(vect)

In [14]:
import pickle
import faiss
from langchain.docstore import InMemoryDocstore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

INDEX_DIR = "faiss_index"
EMBED_MODEL = "paraphrase-multilingual-MiniLM-L12-v2"

# 1. Load old index + docs
print("📂 Loading FAISS index and docs.pkl ...")
faiss_index = faiss.read_index(f"{INDEX_DIR}/index.faiss")
# NOTE: unpickling executes code stored in the file; only load a docs.pkl that
# was written by this notebook.
with open(f"{INDEX_DIR}/docs.pkl", "rb") as f:
    docs = pickle.load(f)

print(f"✅ Loaded {len(docs)} documents")

# The wrapper below maps vector i to docs[i] purely by position, so a count
# mismatch would silently attach the wrong text to search hits.
if faiss_index.ntotal != len(docs):
    raise ValueError(
        f"index.faiss holds {faiss_index.ntotal} vectors but docs.pkl holds "
        f"{len(docs)} documents; rebuild the index before converting."
    )

# 2. Wrap existing index + docs (NO re-embedding)
embed = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
docstore = InMemoryDocstore({str(i): d for i, d in enumerate(docs)})
index_to_docstore_id = {i: str(i) for i in range(len(docs))}

vect = FAISS(
    embedding_function=embed,
    index=faiss_index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
)

# 3. Save properly (creates index.pkl alongside index.faiss)
vect.save_local(INDEX_DIR)
print("🎉 Done! Converted to LangChain format (index.pkl created)")


📂 Loading FAISS index and docs.pkl ...
✅ Loaded 246945 documents
🎉 Done! Converted to LangChain format (index.pkl created)


In [15]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Folder of the saved LangChain FAISS store and the embedding model name.
INDEX_DIR = "faiss_index"
EMBED_MODEL = "paraphrase-multilingual-MiniLM-L12-v2"

# Embedding wrapper handed to the vector store when it is loaded.
embed = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

# The store is persisted with pickle, so loading needs the explicit opt-in flag.
vect = FAISS.load_local(
    folder_path=INDEX_DIR,
    embeddings=embed,
    allow_dangerous_deserialization=True,
)

print("✅ FAISS index loaded successfully")


✅ FAISS index loaded successfully


In [None]:
from langdetect import detect
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# -------------------------
# Config
# -------------------------
# Folder of the saved LangChain FAISS store, and the embedding model name used when loading it.
INDEX_DIR = "faiss_index"
EMBED_MODEL = "paraphrase-multilingual-MiniLM-L12-v2"

# Per-language "answer only from the context" instruction, keyed by language code.
# build_prompt() looks the code up here and falls back to the "en" entry.
# NOTE(review): the codes come from langdetect.detect(); confirm that langdetect
# can actually return "or" (Odia). If it cannot, that entry is never selected.
instructions = {
    "en": "Answer ONLY using the medical context below. If the answer is not present, reply 'I don’t know'.",
    "or": "କୃପୟା ନିମ୍ନୋଲିଖିତ ପ୍ରସଙ୍ଗରୁ ମାତ୍ର ଉତ୍ତର ଦିଅନ୍ତୁ | ଉତ୍ତର ନଥିଲେ 'ମୁଁ ଜାଣିନି' କୁହନ୍ତୁ |",  # Odia
    "mr": "फक्त खालील संदर्भ वापरून उत्तर द्या. उत्तर उपलब्ध नसेल तर 'मला माहित नाही' असे म्हणा.",  # Marathi
    "ur": "صرف نیچے دیے گئے سیاق و سباق کا استعمال کرتے ہوئے جواب دیں۔ اگر جواب موجود نہیں ہے تو 'مجھے نہیں معلوم' کہیں۔",  # Urdu
    "ta": "கீழே உள்ள சூழலை மட்டுமே பயன்படுத்தி பதிலளிக்கவும். பதில் இல்லாவிட்டால் 'எனக்கு தெரியவில்லை' என்று சொல்லவும்.",  # Tamil
    "te": "క్రింద ఇచ్చిన సందర్భం ఆధారంగా మాత్రమే సమాధానం ఇవ్వండి. సమాధానం లేకపోతే 'నాకు తెలియదు' అని చెప్పండి.",  # Telugu
}

# -------------------------
# Build RAG pipeline
# -------------------------
def load_vectorstore():
    """
    Load the saved LangChain FAISS store from INDEX_DIR.

    Returns:
        The FAISS vector store, wired to a HuggingFaceEmbeddings instance for
        EMBED_MODEL. Loading opts in to pickle deserialization.
    """
    return FAISS.load_local(
        INDEX_DIR,
        HuggingFaceEmbeddings(model_name=EMBED_MODEL),
        allow_dangerous_deserialization=True,
    )

def build_prompt(query, lang):
    """
    Build the prompt template text for one question.

    The returned string is meant to be parsed by ``PromptTemplate.from_template``:
    it contains a ``{context}`` placeholder and the question already filled in.

    Args:
        query: The user's question.
        lang: Language code used to pick the instruction line; unknown codes
            fall back to the English instruction.

    Returns:
        The template string.
    """
    instruction_text = instructions.get(lang, instructions["en"])
    # The result is parsed as a format-style template, so literal braces typed
    # by the user must be doubled. Otherwise they are read as template
    # variables and break (or hijack) the prompt.
    safe_query = str(query).replace("{", "{{").replace("}", "}}")
    return f"""
You are a multilingual medical assistant.
{instruction_text}
Always answer in the SAME language as the question.

Context:
{{context}}

Question: {safe_query}

Answer:
"""

def build_chain(vect, query, lang):
    """
    Assemble a RetrievalQA "stuff" chain for a single question.

    Args:
        vect: Vector store providing the retriever (top-5 similarity search).
        query: The user's question, baked into the prompt by build_prompt.
        lang: Language code forwarded to build_prompt.

    Returns:
        A RetrievalQA chain that also returns its source documents.
    """
    return RetrievalQA.from_chain_type(
        # or smaller llama3 variant if GPU is limited
        llm=Ollama(model="phi3:mini"),
        chain_type="stuff",
        retriever=vect.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
        chain_type_kwargs={
            "prompt": PromptTemplate.from_template(build_prompt(query, lang)),
        },
        return_source_documents=True,
    )

# -------------------------
# Interactive Loop
# -------------------------
if __name__ == "__main__":
    # Console chat loop: detect the question's language, build a chain for it,
    # and print the answer together with the metadata of the retrieved chunks.
    vect = load_vectorstore()
    print("✅ Chatbot ready. Type 'exit' to quit.")

    while True:
        query = input("\n👤 You: ").strip()
        if query.lower() in ["exit", "quit"]:
            break
        if not query:
            # Blank line: ask again rather than running retrieval on nothing.
            continue

        try:
            lang = detect(query)
        except Exception:
            # Detection can fail on very short or non-text input; default to
            # English. A bare except here would also trap Ctrl-C.
            lang = "en"

        qa = build_chain(vect, query, lang)
        result = qa.invoke({"query": query})

        print("\n🤖 Bot:", result["result"])
        print("📎 Sources:", [d.metadata for d in result["source_documents"]])


In [None]:
import streamlit as st
from langdetect import detect
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# -------------------------
# Config
# -------------------------
# Folder of the saved LangChain FAISS store, and the embedding model name used when loading it.
INDEX_DIR = "faiss_index"
EMBED_MODEL = "paraphrase-multilingual-MiniLM-L12-v2"

# Per-language "answer only from the context" instruction, keyed by language code.
# build_prompt() looks the code up here and falls back to the "en" entry.
# NOTE(review): the codes come from langdetect.detect(); confirm that langdetect
# can actually return "or" (Odia). If it cannot, that entry is never selected.
instructions = {
    "en": "Answer ONLY using the medical context below. If the answer is not present, reply 'I don’t know'.",
    "or": "କୃପୟା ନିମ୍ନୋଲିଖିତ ପ୍ରସଙ୍ଗରୁ ମାତ୍ର ଉତ୍ତର ଦିଅନ୍ତୁ | ଉତ୍ତର ନଥିଲେ 'ମୁଁ ଜାଣିନି' କୁହନ୍ତୁ |",  # Odia
    "mr": "फक्त खालील संदर्भ वापरून उत्तर द्या. उत्तर उपलब्ध नसेल तर 'मला माहित नाही' असे म्हणा.",  # Marathi
    "ur": "صرف نیچے دیے گئے سیاق و سباق کا استعمال کرتے ہوئے جواب دیں۔ اگر جواب موجود نہیں ہے تو 'مجھے نہیں معلوم' کہیں۔",  # Urdu
    "ta": "கீழே உள்ள சூழலை மட்டுமே பயன்படுத்தி பதிலளிக்கவும். பதில் இல்லாவிட்டால் 'எனக்கு தெரியவில்லை' என்று சொல்லவும்.",  # Tamil
    "te": "క్రింద ఇచ్చిన సందర్భం ఆధారంగా మాత్రమే సమాధానం ఇవ్వండి. సమాధానం లేకపోతే 'నాకు తెలియదు' అని చెప్పండి.",  # Telugu
}

# -------------------------
# Helper functions
# -------------------------
@st.cache_resource
def load_vectorstore():
    """
    Load the saved LangChain FAISS store from INDEX_DIR.

    Streamlit re-executes the whole script on every widget interaction, so the
    result is cached with ``st.cache_resource``: the embedding model and the
    index are loaded once and reused on later reruns.

    Returns:
        The FAISS vector store bound to a HuggingFaceEmbeddings instance for
        EMBED_MODEL.
    """
    embed = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
    # The store is persisted with pickle; the flag is the explicit opt-in.
    vect = FAISS.load_local(INDEX_DIR, embed, allow_dangerous_deserialization=True)
    return vect

def build_prompt(query, lang):
    """
    Build the prompt template text for one question.

    The returned string is meant to be parsed by ``PromptTemplate.from_template``:
    it contains a ``{context}`` placeholder and the question already filled in.

    Args:
        query: The user's question.
        lang: Language code used to pick the instruction line; unknown codes
            fall back to the English instruction.

    Returns:
        The template string.
    """
    instruction_text = instructions.get(lang, instructions["en"])
    # The result is parsed as a format-style template, so literal braces typed
    # by the user must be doubled. Otherwise they are read as template
    # variables and break (or hijack) the prompt.
    safe_query = str(query).replace("{", "{{").replace("}", "}}")
    return f"""
You are a multilingual medical assistant.
{instruction_text}
Always answer in the SAME language as the question.

Context:
{{context}}

Question: {safe_query}

Answer:
"""

def get_answer(query, vect):
    """
    Answer one question with a RetrievalQA "stuff" chain over the vector store.

    Args:
        query: The user's question.
        vect: Vector store providing the retriever (top-5 similarity search).

    Returns:
        The chain's result mapping; callers read ``"result"`` (answer text)
        and ``"source_documents"`` (retrieved chunks).
    """
    try:
        lang = detect(query)
    except Exception:
        # Language detection can fail on very short or non-text input; fall
        # back to English. A bare except would also trap KeyboardInterrupt
        # and SystemExit.
        lang = "en"

    llm = Ollama(model="phi3:mini")  # ✅ lightweight model for frontend
    retriever = vect.as_retriever(search_type="similarity", search_kwargs={"k": 5})
    prompt = PromptTemplate.from_template(build_prompt(query, lang))
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=True,
    )
    return qa.invoke({"query": query})

# -------------------------
# Streamlit UI
# -------------------------
# Page chrome: browser title, heading, and the medical disclaimer.
st.set_page_config(page_title="🩺 Medical Assistant Chatbot", layout="wide")
st.title("🩺 Multilingual Medical Chatbot")
st.markdown("⚠️ **Disclaimer:** This chatbot is for informational purposes only. Please consult a doctor for real medical advice.")

# Vector store backing the retriever, then the question box.
vect = load_vectorstore()
user_query = st.text_input("Enter your symptoms or question:")

# Answer only when the button was pressed and the box is not blank.
ask_clicked = st.button("Ask")
if ask_clicked and user_query.strip():
    with st.spinner("Thinking..."):
        result = get_answer(user_query, vect)
        st.markdown(f"**🤖 Bot:** {result['result']}")
        # Metadata of each retrieved chunk, folded away by default.
        sources_panel = st.expander("📎 Sources")
        with sources_panel:
            for doc in result["source_documents"]:
                st.write(doc.metadata)
