# 🧠 Semantic Document Search with ChromaDB (Local + Cloud-ready)
Supports TXT, PDF, DOCX, CSV, and JSON files as well as website URLs, with text chunks embedded and stored in a ChromaDB vector store.

## 📦 Install Dependencies
Ensure all required Python packages are installed, including `chromadb`, `sentence-transformers`, `pdfplumber`, and `gradio`.

In [None]:
!pip install transformers sentence-transformers torch chromadb gradio pdfplumber python-docx pandas beautifulsoup4 requests

## 🧠 Load Embedding Model
We use `sentence-transformers/all-MiniLM-L6-v2`, a lightweight transformer-based model ideal for semantic search tasks.

In [None]:
import gradio as gr
import pdfplumber, docx, pandas as pd, json as js, requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model once at module level; the functions below
# call `model.encode` both when indexing chunks and when embedding a query.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

## 📂 Extract Text from Files and Websites
This utility supports common document formats and extracts raw text for embedding. It also includes a helper that fetches a website URL with `requests` and extracts its text with BeautifulSoup.

In [None]:
# Extract text from various formats and websites
def extract_text(file):
    """Return the plain-text content of an uploaded file.

    The format is chosen from the case-insensitive file extension:
    ``.pdf`` (pdfplumber), ``.docx`` (python-docx), ``.csv`` (pandas),
    ``.json`` (re-serialised with two-space indentation); anything else is
    decoded as UTF-8 with undecodable bytes dropped.

    Every branch reads from the path in ``file.name`` rather than from the
    upload object's own stream, so the function does not depend on the object
    being an open, unread, binary file handle.

    Args:
        file: Upload object whose ``name`` attribute is the path of the file
            on disk.

    Returns:
        The extracted text as one string.
    """
    path = file.name
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        with pdfplumber.open(path) as pdf:
            # extract_text() may return None for pages without a text layer.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)

    if lowered.endswith(".docx"):
        document = docx.Document(path)
        return "\n".join(para.text for para in document.paragraphs)

    if lowered.endswith(".csv"):
        return pd.read_csv(path).to_string(index=False)

    if lowered.endswith(".json"):
        # Binary mode lets the json module detect the UTF-8/16/32 encoding.
        with open(path, "rb") as handle:
            return js.dumps(js.load(handle), indent=2)

    # Fallback: treat the file as UTF-8 text, ignoring bytes that don't decode.
    with open(path, "rb") as handle:
        return handle.read().decode("utf-8", errors="ignore")

def extract_website_text(url, timeout=10):
    """Fetch a web page and return the text content of its HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The text extracted from the page by BeautifulSoup, or a string
        beginning with ``"Error fetching website: "`` when the request fails
        (bad URL, connection problem, timeout, or HTTP error status).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx responses as failures instead of indexing error pages.
        response.raise_for_status()
    except requests.RequestException as e:
        return f"Error fetching website: {e}"

    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()

## 🧱 Text Chunking
To improve retrieval quality, long documents are broken into overlapping chunks using a sliding window technique.

In [None]:
# Text chunking utility
def chunk_text(text, chunk_size=300, overlap=50):
    """Split text into overlapping, whitespace-delimited word chunks.

    A window of ``chunk_size`` words slides over the text in steps of
    ``chunk_size - overlap`` words. Sliding stops as soon as a window reaches
    the end of the text, so no trailing chunk is emitted that is already
    fully contained in the previous one.

    Args:
        text: Input text; it is split on any whitespace.
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared by consecutive chunks
            (``0 <= overlap < chunk_size``).

    Returns:
        A list of chunk strings with words joined by single spaces; empty
        when the text contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # This window already covers the tail; further windows would only
        # repeat words that are in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

## 💾 ChromaDB Setup
Chroma is initialized with a local, in-process client by default. To connect to a remote or cloud-hosted Chroma server instead, uncomment the `HttpClient` line and fill in your host, port, and API key.

In [None]:
# ChromaDB setup: create a local in-process client and the collection that
# holds the document chunks. The commented-out lines show the optional
# remote/cloud client.
import chromadb
from chromadb.utils import embedding_functions
# from chromadb.config import Settings
# chroma_client = chromadb.HttpClient(host='your-host', port=your-port, headers={"Authorization": "Bearer YOUR_API_KEY"})
chroma_client = chromadb.Client()  # NOTE(review): the default client appears to be in-memory only; chromadb.PersistentClient(path=...) is the on-disk variant — confirm for the installed chromadb version
# Reuse the collection if it already exists so re-running this cell is harmless.
collection = chroma_client.get_or_create_collection(name="document_chunks")

## 📤 Upload Files and Store in ChromaDB
Text chunks are embedded and stored in the Chroma collection, along with unique IDs and metadata.

In [None]:
# Upload and embed using ChromaDB
import uuid

def process_files_chroma(file_objs):
    """Extract, chunk, embed and store the uploaded files in the collection.

    Each file is converted to text with ``extract_text`` and split with
    ``chunk_text``; every chunk gets a random UUID and a ``{"source": path}``
    metadata entry, and all chunks are embedded in one ``model.encode`` call.

    The module-level ``doc_texts`` list is replaced with the chunks of this
    upload (it is left untouched when no files are given).

    Args:
        file_objs: Iterable of upload objects exposing ``.name``; may be
            ``None`` or empty when the button is clicked without a selection.

    Returns:
        A status message for the UI.
    """
    global doc_texts

    # Nothing selected: avoid iterating None / encoding an empty batch.
    if not file_objs:
        return "⚠️ No files provided."

    doc_texts = []
    ids = []
    metadatas = []
    for file in file_objs:
        chunks = chunk_text(extract_text(file))
        doc_texts.extend(chunks)
        ids.extend(str(uuid.uuid4()) for _ in chunks)
        # One dict per chunk so metadata entries are not shared objects.
        metadatas.extend({"source": file.name} for _ in chunks)

    # Files without extractable text yield no chunks; adding an empty batch
    # to the collection would fail, so report it instead.
    if not doc_texts:
        return "⚠️ No text could be extracted from the uploaded files."

    embeddings = model.encode(doc_texts).tolist()
    collection.add(documents=doc_texts, embeddings=embeddings, metadatas=metadatas, ids=ids)
    return f"✅ Uploaded {len(doc_texts)} chunks to ChromaDB."

## 🔍 Semantic Search from ChromaDB
The query is encoded with the same embedding model and compared against the vectors stored in Chroma. The single most relevant chunk is returned.

In [None]:
# Search ChromaDB
def search_document_chroma(query):
    """Return the stored chunk most similar to the query.

    The query is embedded with ``model`` and the collection is asked for its
    single nearest neighbour.

    Args:
        query: Free-text search string; ``None`` or a blank string is treated
            as "nothing to search for".

    Returns:
        The text of the best-matching chunk, or ``'No match found.'`` when
        the query is blank or the collection returns no documents.
    """
    if not query or not query.strip():
        return 'No match found.'

    embedding = model.encode([query]).tolist()
    results = collection.query(query_embeddings=embedding, n_results=1)

    # 'documents' holds one inner list per query embedding. With no hits the
    # outer list is still truthy ([[]]), so the inner list must be checked too.
    documents = results['documents'] or []
    if documents and documents[0]:
        return documents[0][0]
    return 'No match found.'

In [None]:
# Gradio UI with ChromaDB support: one row for uploading files, one row for
# searching, and a button for each action.
# NOTE(review): `clear_uploaded` (defined further down in this file) is not
# wired to any button in this layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 ChromaDB-Powered Document Search")

    # Upload row: multi-file picker plus a box for the upload status message.
    with gr.Row():
        file_input = gr.File(file_types=['.txt', '.pdf', '.docx', '.csv', '.json'], file_count="multiple", label="📄 Upload Files")
        upload_output = gr.Textbox(label="Upload Result")

    # Search row: query box plus a box for the best-matching chunk.
    with gr.Row():
        query_input = gr.Textbox(label="🔍 Enter your search query")
        search_output = gr.Textbox(label="Best Matching Document")

    # Clicking passes the selected files to process_files_chroma and shows its
    # returned status string.
    upload_button = gr.Button("Upload to Chroma")
    upload_button.click(process_files_chroma, inputs=file_input, outputs=upload_output)

    # Clicking passes the query text to search_document_chroma and shows the
    # returned chunk.
    search_button = gr.Button("Search in Chroma")
    search_button.click(search_document_chroma, inputs=query_input, outputs=search_output)

# Start the app once the layout and event handlers are defined.
demo.launch()

In [None]:
# Clear uploaded documents and reset the collection (in-memory only)
def clear_uploaded():
    """Drop all stored chunks and embeddings and start with a fresh collection.

    Deletes the ``document_chunks`` collection, then recreates it and rebinds
    the module-level ``collection`` so later uploads and searches do not use
    a handle to the deleted collection. The module-level ``doc_texts`` list
    is reset to empty; rebinding (rather than ``.clear()``) also works when
    nothing has been uploaded yet and the name does not exist.

    Returns:
        A status message for the UI.
    """
    global doc_texts, collection
    doc_texts = []
    chroma_client.delete_collection(name="document_chunks")
    collection = chroma_client.get_or_create_collection(name="document_chunks")
    return "🗑️ Cleared all uploaded documents and embeddings."


# NOTE(review): to expose this in the UI, add the button and then its handler
# inside the `with gr.Blocks() as demo:` block, before `demo.launch()`:
#     clear_button = gr.Button("🗑️ Clear Uploads")
#     clear_button.click(clear_uploaded, outputs=upload_output)

## 🖥️ Interactive Gradio UI
This user interface allows you to upload documents, embed them to Chroma, and search semantically with ease.