# AI Personal Knowledge Worker

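A local chatbot that ingests your documents, vectorizes them on your machine, and lets you have a conversation with your own knowledge base. The document store and embeddings stay on your machine; your questions and the retrieved excerpts are sent to the Anthropic API to generate answers.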

In [None]:
# Install Dependencies
!pip install gradio chromadb sentence-transformers anthropic pypdf python-docx pandas

In [None]:
# Imports
import os
import gradio as gr
import chromadb
from chromadb.utils import embedding_functions
from anthropic import Anthropic
from pypdf import PdfReader
from docx import Document
import pandas as pd
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.discovery import build
import pickle
import shutil



In [None]:
# Configuration
# Secrets come from the environment; os.environ.get returns None when a variable is unset.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
# NOTE(review): HF_TOKEN is read here but not referenced anywhere else in the visible code.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Local folders: uploaded source files and the persistent Chroma store.
DOCS_FOLDER = "docs"
CHROMA_FOLDER = "chroma_db"
# Model name passed to client.messages.create().
MODEL = "claude-opus-4-6"

# Google OAuth client settings.
GOOGLE_CLIENT_ID = os.environ.get("GOOGLE_CLIENT_ID")
GOOGLE_CLIENT_SECRET = os.environ.get("GOOGLE_CLIENT_SECRET")
# NOTE(review): GOOGLE_REDIRECT_URI is not used below; CLIENT_CONFIG hard-codes "http://localhost".
GOOGLE_REDIRECT_URI = os.environ.get("GOOGLE_REDIRECT_URI")
# File where the pickled Google credentials are cached between runs.
TOKEN_FILE = "google_token.pickle"


# Read-only scopes requested for Drive and Docs.
SCOPES = [
    "https://www.googleapis.com/auth/drive.readonly",
    "https://www.googleapis.com/auth/documents.readonly"
]

# Client configuration handed to InstalledAppFlow.from_client_config().
CLIENT_CONFIG = {
    "installed": {
        "client_id": GOOGLE_CLIENT_ID,
        "client_secret": GOOGLE_CLIENT_SECRET,
        "redirect_uris": ["http://localhost"],
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "token_uri": "https://oauth2.googleapis.com/token"
    }
}

# Make sure both working folders exist before anything reads or writes them.
os.makedirs(DOCS_FOLDER, exist_ok=True)
os.makedirs(CHROMA_FOLDER, exist_ok=True)

# Shared Anthropic client used by the Q&A functions.
client = Anthropic(api_key=ANTHROPIC_API_KEY)


In [None]:
# Vector Database Setup
# Embedding function backed by the "all-MiniLM-L6-v2" sentence-transformers model.
embed_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# Chroma client persisting its data under CHROMA_FOLDER.
chroma_client = chromadb.PersistentClient(path=CHROMA_FOLDER)

# The single collection used for every indexed chunk (local uploads and Google files alike);
# it is created on first run and reopened afterwards.
collection = chroma_client.get_or_create_collection(
    name="knowledge_base",
    embedding_function=embed_fn
)



In [None]:
#  File Parsers
def parse_pdf(filepath):
    """Return the text of a PDF, one page per line group.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        The extracted text of all pages joined with newlines. Pages for
        which ``extract_text()`` returns nothing are left out.
    """
    reader = PdfReader(filepath)
    # Extract each page exactly once: text extraction is the expensive step,
    # so filtering and collecting must not each trigger their own call.
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(text for text in page_texts if text)

def parse_docx(filepath):
    """Return the non-blank paragraphs of a .docx file, joined with newlines."""
    paragraphs = Document(filepath).paragraphs
    kept = [p.text for p in paragraphs if p.text.strip()]
    return "\n".join(kept)

def parse_txt(filepath):
    """Read a plain-text (or Markdown) file and return its contents.

    The file is decoded as UTF-8. A leading byte-order mark is dropped, and
    bytes that are not valid UTF-8 are replaced with U+FFFD instead of
    raising, so one badly encoded file cannot abort an indexing run.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The decoded file contents as a string.
    """
    with open(filepath, "r", encoding="utf-8-sig", errors="replace") as f:
        return f.read()

def parse_csv(filepath):
    """Load a CSV with pandas and render it as a text table without the index column."""
    return pd.read_csv(filepath).to_string(index=False)

def parse_file(filepath):
    """Parse a file with the parser that matches its extension.

    The extension is compared case-insensitively. Supported: .pdf, .docx,
    .txt, .md and .csv. Returns the parser's result, or None when the
    extension is not supported.
    """
    suffix = os.path.splitext(filepath)[1].lower()
    if suffix == ".pdf":
        return parse_pdf(filepath)
    if suffix == ".docx":
        return parse_docx(filepath)
    if suffix in (".txt", ".md"):
        return parse_txt(filepath)
    if suffix == ".csv":
        return parse_csv(filepath)
    return None



In [None]:
#  Document Indexing
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, each with its words rejoined by single
        spaces. Empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap that is not smaller than the chunk size would never
            advance the window.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would only
        # repeat the overlap tail of this one, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks

def index_documents():
    """Index every not-yet-indexed file in DOCS_FOLDER into the collection.

    A file counts as already indexed when the collection holds at least one
    chunk whose ``source`` metadata equals the file name. Files that are
    already indexed, unsupported, unreadable, or empty are counted as
    skipped.

    Returns:
        A human-readable status string with the indexed/skipped counts and
        the total number of chunks in the collection.
    """
    files = os.listdir(DOCS_FOLDER)
    if not files:
        return "No files found in /docs folder."

    indexed = 0
    skipped = 0

    for filename in files:
        # Check for an existing entry first so already-indexed files are not
        # parsed again on every upload.
        existing = collection.get(where={"source": filename})
        if existing["ids"]:
            skipped += 1
            continue

        filepath = os.path.join(DOCS_FOLDER, filename)
        try:
            text = parse_file(filepath)
        except Exception as e:
            # One corrupt or unreadable file must not abort the whole run;
            # report it and move on, as the Google sync path does.
            print(f"Skipping {filename}: {str(e)}")
            skipped += 1
            continue

        if not text:
            skipped += 1
            continue

        chunks = chunk_text(text)
        if not chunks:
            # Whitespace-only text yields no chunks; there is nothing to add.
            skipped += 1
            continue

        ids = [f"{filename}_chunk_{i}" for i in range(len(chunks))]
        metadatas = [{"source": filename} for _ in chunks]

        collection.add(documents=chunks, ids=ids, metadatas=metadatas)
        indexed += 1

    return f"Indexing complete ✓ | Indexed: {indexed} files | Skipped: {skipped} files | Total chunks: {collection.count()}"



In [None]:
#  Retrieval & Q&A
def retrieve_context(query, n_results=5):
    """Fetch the chunks most similar to ``query`` from the collection.

    Args:
        query: The user's question.
        n_results: Number of chunks to request from the collection.

    Returns:
        A ``(context, sources)`` tuple: the retrieved chunks joined by blank
        lines, and the distinct ``source`` names of those chunks in the
        order they were returned by the query.
    """
    results = collection.query(query_texts=[query], n_results=n_results)
    docs = results["documents"][0]
    sources = [m["source"] for m in results["metadatas"][0]]
    context = "\n\n".join(docs)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the cited
    # sources are stable between runs and follow the query's result order.
    unique_sources = list(dict.fromkeys(sources))
    return context, unique_sources

def ask_claude(query, chat_history):
    """Answer ``query`` from retrieved document context.

    ``chat_history`` is iterated as (user message, assistant message) pairs,
    which are replayed to the model before the new question. The reply text
    is returned with a "Sources" footer appended when any sources were
    retrieved.
    """
    context, sources = retrieve_context(query)

    # Built from explicit pieces so the exact whitespace of the prompt is visible.
    system_prompt = (
        "You are a helpful personal knowledge assistant. \n"
        "Answer the user's question using only the context provided from their personal documents.\n"
        "If the answer is not in the context, say so clearly.\n"
        "\n"
        "Context from personal documents:\n"
        f"{context}"
    )

    # Flatten each past exchange into a user turn followed by an assistant turn.
    messages = [
        {"role": role, "content": content}
        for user_msg, assistant_msg in chat_history
        for role, content in (("user", user_msg), ("assistant", assistant_msg))
    ]
    messages.append({"role": "user", "content": query})

    response = client.messages.create(
        model=MODEL,
        max_tokens=1024,
        system=system_prompt,
        messages=messages
    )

    reply = response.content[0].text
    footer = f"\n\n*Sources: {', '.join(sources)}*" if sources else ""
    return reply + footer



In [None]:
#  Knowledge Base Management


def list_indexed_documents():
    """Return a display string listing the distinct indexed sources.

    Returns:
        "No documents indexed yet." when the collection is empty; otherwise
        a header with the number of sources followed by one bullet per
        source name, sorted alphabetically.
    """
    # Only the metadata is needed here; asking for it explicitly avoids
    # loading the text of every chunk each time the list is refreshed.
    results = collection.get(include=["metadatas"])
    if not results["ids"]:
        return "No documents indexed yet."
    sources = sorted({m["source"] for m in results["metadatas"]})
    doc_list = "\n".join(f"• {s}" for s in sources)
    return f"Indexed documents ({len(sources)}):\n\n{doc_list}"

def clear_all():
    """Drop every indexed chunk and delete the uploaded files.

    Rebinds the module-level ``collection`` to a fresh, empty
    "knowledge_base" collection and removes all regular files from
    DOCS_FOLDER.

    Returns:
        A ``(status, document_list)`` tuple for the two UI text boxes.
    """
    global collection

    # Deleting the collection through the client is what empties the store.
    # The on-disk Chroma directory is deliberately left in place: removing it
    # while chroma_client still has it open pulls the database out from under
    # the live client, and the re-create call below then runs against
    # missing files.
    chroma_client.delete_collection("knowledge_base")

    collection = chroma_client.get_or_create_collection(
        name="knowledge_base",
        embedding_function=embed_fn
    )

    for entry in os.listdir(DOCS_FOLDER):
        path = os.path.join(DOCS_FOLDER, entry)
        # os.remove raises on directories; only regular files are uploads.
        if os.path.isfile(path):
            os.remove(path)

    return "All documents cleared.", list_indexed_documents()



In [None]:
#  Google Auth Setup



def get_google_services():
    """Return ``(drive_service, docs_service)`` built from cached or fresh credentials.

    Credentials are loaded from TOKEN_FILE when it exists. If they are
    missing or not valid they are refreshed (when expired and a refresh
    token is present) or obtained through the local-server OAuth flow, and
    then written back to TOKEN_FILE.
    """
    creds = None
    if os.path.exists(TOKEN_FILE):
        # NOTE(review): pickle.load runs arbitrary code from the file it reads;
        # this is only acceptable while TOKEN_FILE is written solely by this function.
        with open(TOKEN_FILE, "rb") as token_in:
            creds = pickle.load(token_in)

    needs_auth = not creds or not creds.valid
    if needs_auth:
        refreshable = creds and creds.expired and creds.refresh_token
        if refreshable:
            creds.refresh(Request())
        else:
            oauth_flow = InstalledAppFlow.from_client_config(CLIENT_CONFIG, SCOPES)
            creds = oauth_flow.run_local_server(port=0)
        # Persist whichever credentials were just obtained.
        with open(TOKEN_FILE, "wb") as token_out:
            pickle.dump(creds, token_out)

    return (
        build("drive", "v3", credentials=creds),
        build("docs", "v1", credentials=creds),
    )

def disconnect_google():
    """Delete the cached Google token file, if any, and return a status message."""
    if not os.path.exists(TOKEN_FILE):
        return "No active Google session found."
    os.remove(TOKEN_FILE)
    return "Disconnected from Google Workspace ✓ Re-sync to reconnect."



In [None]:
#  Fetch & Index Google Workspace Files
GOOGLE_MIME_EXPORT = {
    "application/vnd.google-apps.document": "text/plain",
    "application/vnd.google-apps.spreadsheet": "text/csv",
    "application/vnd.google-apps.presentation": "text/plain"
}

google_page_token = None
total_files_indexed = 0

def fetch_google_doc_text(docs_service, file_id):
    """Fetch a Google Doc and return the concatenated text of its paragraph text runs.

    Only ``paragraph`` elements directly under the document body are read;
    within them, only elements carrying a ``textRun``. The result is
    stripped of leading and trailing whitespace.
    """
    document = docs_service.documents().get(documentId=file_id).execute()
    body_items = document.get("body", {}).get("content", [])

    pieces = []
    for item in body_items:
        para = item.get("paragraph")
        if not para:
            continue
        for segment in para.get("elements", []):
            run = segment.get("textRun")
            if not run:
                continue
            pieces.append(run.get("content", ""))

    return "".join(pieces).strip()

def fetch_exported_text(drive_service, file_id, mime_type):
    """Export a Drive file in the format mapped for its MIME type and return stripped text.

    A bytes payload is decoded as UTF-8 before stripping. Raises KeyError
    when ``mime_type`` has no entry in GOOGLE_MIME_EXPORT.
    """
    target_mime = GOOGLE_MIME_EXPORT[mime_type]
    export_request = drive_service.files().export(fileId=file_id, mimeType=target_mime)
    payload = export_request.execute()
    if isinstance(payload, bytes):
        payload = payload.decode("utf-8")
    return payload.strip()

def fetch_and_index_google_workspace(reset=False):
    """Fetch one page (up to 20 files) of Google Workspace files and index them.

    Args:
        reset: When True, pagination restarts from the first page and the
            running indexed-file counter is zeroed before fetching.

    Returns:
        A ``(status, document_list)`` tuple for the UI. Authentication and
        Drive listing failures are reported in the status string rather
        than raised.
    """
    global google_page_token, total_files_indexed

    if reset:
        google_page_token = None
        total_files_indexed = 0

    try:
        drive_service, docs_service = get_google_services()
    except Exception as e:
        return f"Auth failed: {str(e)}", list_indexed_documents()

    # Group the MIME-type alternatives in parentheses so that trashed=false
    # applies to every type, not just to the term it happens to sit next to.
    mime_filter = " or ".join(f"mimeType='{m}'" for m in GOOGLE_MIME_EXPORT)
    query = f"({mime_filter}) and trashed=false"

    try:
        response = drive_service.files().list(
            q=query,
            corpora="allDrives",
            includeItemsFromAllDrives=True,
            supportsAllDrives=True,
            spaces="drive",
            fields="nextPageToken, files(id, name, mimeType)",
            pageSize=20,
            pageToken=google_page_token
        ).execute()
    except Exception as e:
        return f"Drive API error: {str(e)}", list_indexed_documents()

    files = response.get("files", [])
    # Remember where the next "Load More" should continue; None once the
    # last page has been fetched.
    google_page_token = response.get("nextPageToken")

    if not files:
        return "No Google Workspace files found.", list_indexed_documents()

    indexed, skipped = 0, 0

    for file in files:
        file_id = file["id"]
        mime_type = file["mimeType"]
        # Last dotted component of the MIME type, e.g. "document".
        file_type = mime_type.split(".")[-1]
        # NOTE(review): the source key is built from the file name only, so two
        # Drive files of the same type and name are treated as one document.
        doc_name = f"gdrive_{file_type}_{file['name']}"

        existing = collection.get(where={"source": doc_name})
        if existing["ids"]:
            skipped += 1
            continue

        try:
            if mime_type == "application/vnd.google-apps.document":
                text = fetch_google_doc_text(docs_service, file_id)
            else:
                text = fetch_exported_text(drive_service, file_id, mime_type)
        except Exception as e:
            # Best effort: a file that cannot be fetched is reported and skipped.
            print(f"Skipping {file['name']}: {str(e)}")
            skipped += 1
            continue

        if not text:
            skipped += 1
            continue

        chunks = chunk_text(text)
        ids = [f"{doc_name}_chunk_{i}" for i in range(len(chunks))]
        metadatas = [{"source": doc_name} for _ in chunks]

        collection.add(documents=chunks, ids=ids, metadatas=metadatas)
        indexed += 1

    total_files_indexed += indexed
    more = "More files available — click Load More." if google_page_token else "All files synced ✓"
    status = f"Batch complete | This batch: {indexed} indexed, {skipped} skipped | Total indexed: {total_files_indexed} | {more}"

    return status, list_indexed_documents()

def sync_google_fresh():
    """Start a Google sync from the first page, resetting the pagination token and counter."""
    return fetch_and_index_google_workspace(reset=True)

def load_more_google():
    """Fetch the next batch of Google files using the stored page token.

    When no page token is stored (nothing synced yet, or the last page was
    already reached), the listing starts again from the first page.
    """
    return fetch_and_index_google_workspace(reset=False)



In [None]:
#  Gradio UI
def upload_and_index(files):
    """Copy uploaded files into DOCS_FOLDER and index the folder.

    Args:
        files: The uploads from the file component. Each item may be an
            object exposing the temp-file path as ``.name`` or a plain path
            string. May be empty or None.

    Returns:
        A ``(status, document_list)`` tuple for the UI.
    """
    if not files:
        return "No files uploaded.", list_indexed_documents()
    for file in files:
        # Accept both upload objects with a .name path and bare path strings.
        src_path = getattr(file, "name", file)
        dest = os.path.join(DOCS_FOLDER, os.path.basename(src_path))
        try:
            # Streams the copy instead of reading the whole upload into memory.
            shutil.copyfile(src_path, dest)
        except shutil.SameFileError:
            # The upload already is the file in DOCS_FOLDER; opening it for
            # writing would truncate it, so leave it untouched.
            continue
    status = index_documents()
    return status, list_indexed_documents()

def chat(user_message, history):
    """Handle one chat turn: retrieve context, ask the model, extend the history.

    Args:
        user_message: The text submitted by the user.
        history: Prior conversation as a list of ``{"role", "content"}``
            dicts, or None.

    Returns:
        A ``("", history)`` tuple: an empty string to clear the input box
        and the history with the new user and assistant messages appended.
        A blank ``user_message`` returns the history unchanged.
    """
    if not user_message.strip():
        return "", history

    # NOTE(review): depending on the Gradio version, a Chatbot that was just
    # cleared may hand over None instead of an empty list — treat it as empty.
    if history is None:
        history = []

    context, sources = retrieve_context(user_message)

    # Forward only role and content; any other keys on the history entries are dropped.
    messages = [{"role": msg["role"], "content": msg["content"]} for msg in history]
    messages.append({"role": "user", "content": user_message})

    system_prompt = f"""You are a helpful personal knowledge assistant.
Answer the user's question using only the context provided from their personal documents.
If the answer is not in the context, say so clearly.

Context from personal documents:
{context}"""

    response = client.messages.create(
        model=MODEL,
        max_tokens=1024,
        system=system_prompt,
        messages=messages
    )

    answer = response.content[0].text
    if sources:
        answer += f"\n\n*Sources: {', '.join(sources)}*"

    history.append({"role": "user", "content": user_message})
    history.append({"role": "assistant", "content": answer})

    return "", history

# Two-tab interface. Both tabs share the same knowledge-base collection and the
# same `chat` handler; they differ only in how documents get indexed.
with gr.Blocks(title="Personal Knowledge Worker") as app:
    gr.Markdown("# Personal Knowledge Worker")

    with gr.Tabs():
        with gr.Tab("Local Documents"):
            with gr.Row():
                # Left column: upload/index controls and the indexed-document list.
                with gr.Column(scale=1):
                    gr.Markdown("### Upload Documents")
                    file_input = gr.File(file_count="multiple", label="Select Files")
                    index_btn = gr.Button("Index Documents", variant="primary")
                    index_status = gr.Textbox(label="Status", interactive=False)
                    gr.Markdown("### Knowledge Base")
                    # Initial value is computed once, when the UI is built.
                    doc_list = gr.Textbox(label="Indexed Documents", lines=6, interactive=False,
                                          value=list_indexed_documents())
                    clear_btn = gr.Button("Clear All Documents", variant="stop")

                # Right column: chat panel.
                with gr.Column(scale=2):
                    gr.Markdown("### Chat with your Knowledge Base")
                    chatbot = gr.Chatbot(height=450)
                    msg_input = gr.Textbox(placeholder="Ask a question about your documents...", label="Your Question")
                    clear_chat_btn = gr.ClearButton([msg_input, chatbot])

        with gr.Tab("Google Workspace"):
            with gr.Row():
                # Left column: Google sync controls and the indexed-document list.
                with gr.Column(scale=1):
                    gr.Markdown("### Google Workspace Sync")
                    gr.Markdown("Fetches 20 files at a time from your Google Drive.")
                    with gr.Row():
                        google_sync_btn = gr.Button("Sync First 20", variant="primary")
                        google_more_btn = gr.Button("Load More", variant="secondary")
                    google_disconnect_btn = gr.Button("Disconnect Google", variant="secondary")
                    google_status = gr.Textbox(label="Status", interactive=False)
                    gr.Markdown("### Knowledge Base")
                    # Initial value is computed once, when the UI is built.
                    google_doc_list = gr.Textbox(label="Indexed Documents", lines=6, interactive=False,
                                                  value=list_indexed_documents())
                    google_clear_btn = gr.Button("Clear All Documents", variant="stop")

                # Right column: chat panel with its own history.
                with gr.Column(scale=2):
                    gr.Markdown("### Chat with your Knowledge Base")
                    chatbot2 = gr.Chatbot(height=450)
                    msg_input2 = gr.Textbox(placeholder="Ask a question about your Google Workspace...", label="Your Question")
                    clear_chat_btn2 = gr.ClearButton([msg_input2, chatbot2])

    # Local tab events
    index_btn.click(upload_and_index, inputs=file_input, outputs=[index_status, doc_list])
    clear_btn.click(clear_all, outputs=[index_status, doc_list])
    msg_input.submit(chat, inputs=[msg_input, chatbot], outputs=[msg_input, chatbot])

    # Google tab events
    # Each handler refreshes only its own tab's document list; the other tab's
    # list keeps its previous text until one of its own handlers runs.
    google_sync_btn.click(sync_google_fresh, outputs=[google_status, google_doc_list])
    google_more_btn.click(load_more_google, outputs=[google_status, google_doc_list])
    google_disconnect_btn.click(disconnect_google, outputs=[google_status])
    google_clear_btn.click(clear_all, outputs=[google_status, google_doc_list])
    msg_input2.submit(chat, inputs=[msg_input2, chatbot2], outputs=[msg_input2, chatbot2])

# Start the Gradio server with default settings.
app.launch()