In [7]:
%pwd

'/Users/subhasis/Projects/med-chat/research'

In [9]:
import os

# Step up from research/ to the project root, but only while we are actually inside
# research/. An unconditional chdir("../") climbs one more level every time this cell
# is re-run, silently changing what every relative path below resolves to.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [10]:
%pwd

'/Users/subhasis/Projects/med-chat'

In [16]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 

In [88]:
# extract text from pdf file
def load_pdf_files(data):
    """Load the files matching ``*.pdf`` under ``data``.

    Args:
        data: Directory path passed to ``DirectoryLoader`` as ``path``.

    Returns:
        The result of ``DirectoryLoader.load()``, with ``PyPDFLoader`` used as
        the per-file loader class.
    """
    pdf_loader = DirectoryLoader(path=data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [90]:
# Load the PDFs under "Data"; the path is relative to the current working directory.
extract_data = load_pdf_files("Data")

In [92]:
# Sanity check: how many Document objects the loader returned.
len(extract_data)

637

In [93]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> list[Document]:
    """Rebuild each document with its text and a metadata dict reduced to ``source``.

    Args:
        docs: Documents whose ``page_content`` and ``metadata["source"]`` are read.

    Returns:
        A new list of ``Document`` objects, in the same order as ``docs``. Each
        keeps the original ``page_content``; its metadata contains only the
        ``"source"`` key (``None`` when the original metadata had no such key).
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [94]:
# Strip every metadata key except "source" before chunking.
minimal_doc = filter_to_minimal_docs(extract_data)

In [96]:
# split the docs into smaller chunks

def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        minimal_docs: Documents to split; passed straight to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to 500,
            the value this notebook previously hard-coded.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``. Defaults
            to 20, the value this notebook previously hard-coded.

    Returns:
        The chunked documents returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [97]:
# Chunk the source-only documents; texts_chunk feeds the embedding/insert cell below.
texts_chunk = text_split(minimal_doc)

In [107]:
import chromadb
from langchain_community.embeddings import HuggingFaceEmbeddings
import uuid

# `texts_chunk` is the list of chunked Documents produced by text_split(); keep only the text.
doc_texts = [doc.page_content for doc in texts_chunk]

# 1️⃣ Create embedding function
embedding_fn = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={"batch_size": 32}
    )

# 2️⃣ Embeddings are NOT computed here: the insert loop below embeds each batch itself,
#    so embedding the whole corpus up front only doubled the runtime and memory use.

# 3️⃣ Create Chroma client
# NOTE(review): this path is relative to the current working directory, and other cells
# in this notebook open "./chroma_data" — confirm which location is intended.
client = chromadb.PersistentClient(path="../chroma_data")
collection_name = "med"

# 4️⃣ Get or create collection
# A single call replaces the manual list_collections() scan, which depended on each
# listed entry exposing a `.name` attribute.
collection = client.get_or_create_collection(
    name=collection_name,
    metadata={"hnsw:space": "cosine"},
)

# === 4. Helper: Batch Processing ===
def batch_list(lst, batch_size):
    """Yield consecutive slices of ``lst`` containing at most ``batch_size`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of
    ``batch_size``; an empty ``lst`` yields nothing.
    """
    yield from (
        lst[start:start + batch_size]
        for start in range(0, len(lst), batch_size)
    )

# === 5. Embed & Insert Incrementally ===
for batch in batch_list(doc_texts, 32):
    # Drop repeated texts inside the batch (order preserved): the IDs below are derived
    # from the text, and duplicate IDs within a single call are rejected by Chroma.
    unique_texts = list(dict.fromkeys(batch))

    # Content-derived IDs make this cell idempotent. With random uuid4 IDs every re-run
    # stored a fresh copy of each chunk, which surfaces as identical hits at query time.
    # Rows written by earlier runs under random IDs are not removed by this change;
    # recreate the collection to clear them.
    ids = [str(uuid.uuid5(uuid.NAMESPACE_URL, text)) for text in unique_texts]
    batch_embeddings = embedding_fn.embed_documents(unique_texts)

    # 6️⃣ Upsert documents & embeddings (insert new IDs, overwrite existing ones)
    collection.upsert(
        ids=ids,
        documents=unique_texts,
        embeddings=batch_embeddings,
    )
    print(f"✅ Added batch of {len(unique_texts)} chunks")

print("✅ Added documents to Chroma")


✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅ Added batch of 32 chunks
✅

In [108]:
import chromadb
from langchain_community.embeddings import HuggingFaceEmbeddings

# === CONFIG ===
CHROMA_PATH = "../chroma_data"
COLLECTION_NAME = "med"
QUERY_TEXT = "abuse"  # Your search query
TOP_K = 3  # Number of results to return

# === 1. Load embedding function (must match what you used during insert) ===
embedding_fn = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={"batch_size": 32}
    )

# === 2. Connect to Chroma and get collection ===
client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_collection(COLLECTION_NAME)

# === 3. Embed the query ===
query_embedding = embedding_fn.embed_query(QUERY_TEXT)

# === 4. Search in Chroma ===
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=TOP_K,
    include=["documents", "distances"]
)

# === 5. Show results ===
# One query embedding was sent, so index [0] selects its (only) result list.
# The numbers are distances, not similarity scores: smaller means a closer match.
# NOTE(review): several identical hits at the same distance suggest the same chunk was
# inserted more than once — verify the insert cell was not run repeatedly.
print(f"\n🔍 Query: {QUERY_TEXT}\n")
for i, (doc, distance) in enumerate(zip(results["documents"][0], results["distances"][0]), start=1):
    print(f"{i}. Distance: {distance:.4f} | Text: {doc[:100]}...")


🔍 Query: abuse

1. Score: 0.3593 | Text: Abuse
Definition
Abuse is defined as any thing that is harmful, injuri-
ous, or offensive. Abuse als...
2. Score: 0.3593 | Text: Abuse
Definition
Abuse is defined as any thing that is harmful, injuri-
ous, or offensive. Abuse als...
3. Score: 0.3593 | Text: Abuse
Definition
Abuse is defined as any thing that is harmful, injuri-
ous, or offensive. Abuse als...


In [None]:
import chromadb
from langchain_community.embeddings import HuggingFaceEmbeddings
from groq import Groq
from dotenv import load_dotenv

# Retrieval-augmented answer: fetch the TOP_K nearest chunks for QUERY_TEXT from the
# Chroma collection, paste them into a prompt, and ask a Groq chat model to answer.

# ==== CONFIG ====
CHROMA_PATH = "../chroma_data"
COLLECTION_NAME = "med"
QUERY_TEXT = "any finance advice on dollar"
TOP_K = 5
load_dotenv()
# `os` is not imported in this cell; it relies on the `import os` cell near the top of
# the notebook. A missing GROQ_API_KEY raises KeyError here.
GROQ_API_KEY = os.environ["GROQ_API_KEY"]

# ==== 1. Connect to Chroma ====
# Same model name as the insert cell, so query and document vectors are comparable.
embedding_fn = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={"batch_size": 32}
)

client = chromadb.PersistentClient(path=CHROMA_PATH)
# get_collection does not create the collection; the recorded run of this cell stopped
# here with "NotFoundError: Collection [med] does not exists".
collection = client.get_collection(COLLECTION_NAME)

# ==== 2. Get relevant documents ====
query_embedding = embedding_fn.embed_query(QUERY_TEXT)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=TOP_K,
    include=["documents"]
)

# One query embedding was sent, so index [0] selects its (only) list of documents.
retrieved_docs = results["documents"][0]
# Chunks are separated by a blank line inside the prompt.
context_text = "\n\n".join(retrieved_docs)

# ==== 3. Send to Groq LLM ====
groq_client = Groq(api_key=GROQ_API_KEY)

# The prompt carries the query, the retrieved context, and an instruction to admit
# when the context does not contain the answer.
prompt = f"""
You are a medical expert. Using the context below, answer the query clearly and concisely.

Query: {QUERY_TEXT}

Context:
{context_text}

If the answer is not in the context, say "I don’t have enough information."
"""

chat_completion = groq_client.chat.completions.create(
    model="llama3-8b-8192", 
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ],
    temperature=0.2
)

# Print the text of the first returned choice.
print("\n📄 LLM Answer:\n")
print(chat_completion.choices[0].message.content)


NotFoundError: Collection [med] does not exists

In [None]:
import streamlit as st
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from groq import Groq
import os
from dotenv import load_dotenv

# 🔹 Page config goes first so it precedes every other Streamlit call in the script
st.set_page_config(page_title="Groq + Chroma Chat", page_icon="💬")

# 🔹 Load Groq API key from environment variable
load_dotenv()
GROQ_API_KEY = os.environ["GROQ_API_KEY"]

# Groq client used for the chat completion below. It was referenced further down but
# never created in this script, which raised NameError on the first question.
groq_client = Groq(api_key=GROQ_API_KEY)

# 🔹 ChromaDB setup
# NOTE(review): the insert/query cells of this notebook open "../chroma_data" —
# confirm this path points at the store that actually holds the "med" collection.
client = chromadb.PersistentClient(path="./chroma_data")
collection_name = "med"


@st.cache_resource
def _load_embedding_model():
    """Load the sentence-transformer once and reuse it across Streamlit reruns."""
    return SentenceTransformer("all-MiniLM-L6-v2")


# Load embedding model (cached: Streamlit re-executes the script on every interaction)
embedding_model = _load_embedding_model()

# Retrieve collection
try:
    collection = client.get_collection(name=collection_name)
except Exception:
    st.error(f"Collection '{collection_name}' not found.")
    st.stop()

# 🔹 Streamlit UI
st.title("💬 Chat with Your Chroma Data via Groq LLM")

# Chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat messages
for msg in st.session_state.messages:
    role = "user" if msg["role"] == "user" else "assistant"
    with st.chat_message(role):
        st.markdown(msg["content"])

# Input box
if prompt := st.chat_input("Ask me something..."):
    # Save user message
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    # Step 1: Embed query & search Chroma
    # encode() receives a one-item list, so the result is a list holding one vector,
    # which is the shape query_embeddings expects.
    query_embedding = embedding_model.encode([prompt]).tolist()
    results = collection.query(query_embeddings=query_embedding, n_results=5)

    # Index [0] selects the result list of the single query sent above.
    retrieved_docs = results["documents"][0]
    context_text = "\n".join(retrieved_docs)

    # Step 2: Send to Groq
    # NOTE(review): the model id differs from the "llama3-8b-8192" used in the
    # notebook's RAG cell — confirm which model is intended and still served.
    full_prompt = f"Answer the question using the context below.\n\nContext:\n{context_text}\n\nQuestion: {prompt}"
    response = groq_client.chat.completions.create(
        model="mixtral-8x7b-32768",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": full_prompt}
        ],
        temperature=0.2
    )

    answer = response.choices[0].message.content

    # Step 3: Display assistant message
    st.session_state.messages.append({"role": "assistant", "content": answer})
    with st.chat_message("assistant"):
        st.markdown(answer)




In [87]:
import chromadb

# Connect to your persistent DB
# NOTE(review): this opens "./chroma_data", whereas the insert and query cells in this
# notebook open "../chroma_data" — confirm this targets the intended store.
client = chromadb.PersistentClient(path="./chroma_data")

# Delete collection by name
# Destructive, and it sits at the end of the notebook: a "Run All" removes the "med"
# collection from this store. The success message prints only if the call did not raise.
client.delete_collection(name="med")
print("Collection 'med' deleted.")


Collection 'med' deleted.
