# Personal Knowledge Worker

Search through your exported Notion workspace with Gemini models using retrieval-augmented generation (RAG).

How to export the content from Notion: https://www.notion.com/help/export-your-content

## Imports and Setup

In [None]:
!pip install -U -q langchain-google-genai

In [None]:
import os
import re
import glob
from dotenv import load_dotenv
import gradio as gr
import numpy as np

In [None]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [None]:
# Directory where the Chroma vector store is persisted
db_name = "vector_db"

# Gemini model identifiers: one for embeddings, one for chat
EMBEDDINGS_MODEL = "models/gemini-embedding-001"
LLM_MODEL = "gemini-2.5-flash-lite"

In [None]:
# Read variables from a local .env file (if present) into the environment
load_dotenv()

# Keep an existing GOOGLE_API_KEY; otherwise fall back to the placeholder,
# which you should replace with your own key when not using a .env file
os.environ.setdefault('GOOGLE_API_KEY', 'your-key-if-not-using-env')

## Vector DB Setup

### Clean up and Load Documents

In [None]:
# Clean up the Notion directory: strip the hex ID suffix that the export
# appends to file and directory names

# Root directory of your export
root_dir = "notion_export"

# Regex to match the ID suffix: one whitespace char + 16-32 lowercase hex
# chars, optionally followed by "_all"
hash_pattern = re.compile(r"\s[0-9a-f]{16,32}(_all)?")


def remove_notion_hashes(root, pattern=hash_pattern):
    """Strip ID suffixes from every file and directory name under *root*.

    Args:
        root: Path of the export directory to clean up.
        pattern: Compiled regex matching the suffix to remove from names.

    Returns:
        The number of entries that were actually renamed.

    An entry is skipped (and reported) when its cleaned name already exists
    in the same folder, so two pages that differ only by their ID never
    overwrite each other.
    """
    renamed = 0

    if not os.path.isdir(root):
        print(f"Export directory not found: {root}")
        return renamed

    # Walk bottom-up so a directory's contents are renamed before the
    # directory itself; otherwise the paths being walked would go stale.
    for dirpath, dirnames, filenames in os.walk(root, topdown=False):
        for kind, names in (("file", filenames), ("dir", dirnames)):
            for name in names:
                new_name = pattern.sub("", name)
                if new_name == name:
                    continue

                old_path = os.path.join(dirpath, name)
                new_path = os.path.join(dirpath, new_name)

                # Never clobber an existing entry with the same cleaned name
                if os.path.lexists(new_path):
                    print(f"Skipping {kind} (target exists): {old_path} -> {new_path}")
                    continue

                print(f"Renaming {kind}: {old_path} -> {new_path}")
                os.rename(old_path, new_path)
                renamed += 1

    return renamed


renamed_count = remove_notion_hashes(root_dir)


In [None]:
# Read in documents using LangChain's loaders

documents = []
for dirpath, _dirnames, _filenames in os.walk(root_dir):
    # Define doc_type relative to root_dir
    doc_type = os.path.relpath(dirpath, root_dir)

    # for main pages in Notion
    if doc_type == ".":
        doc_type = "Main"

    # Load only the Markdown files directly inside this folder. os.walk
    # already visits every subfolder, so a recursive glob here would load
    # each nested file once per ancestor folder, producing duplicate
    # documents tagged with the wrong doc_type.
    loader = DirectoryLoader(
        dirpath,
        glob="*.md",
        loader_cls=TextLoader,
        # Read files as UTF-8 instead of relying on the platform default
        loader_kwargs={"encoding": "utf-8"},
    )

    for doc in loader.load():
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

### Create chunks

In [None]:
# Split the loaded documents into overlapping chunks ready for embedding
text_splitter = CharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

In [None]:
# Number of chunks produced by the splitter (shown as the cell output)
len(chunks)

In [None]:
# Collect the distinct doc_type labels attached to the chunks
doc_types = {piece.metadata['doc_type'] for piece in chunks}
doc_type_list = ', '.join(doc_types)
print(f"Document types found: {doc_type_list}")

### Create Embeddings

In [None]:
# Embedding client for the model named in EMBEDDINGS_MODEL
embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDINGS_MODEL)

In [None]:
# If you don't want to recreate the collection: open the vector store that is
# already persisted in db_name instead of running the rebuild cell below

vectorstore = Chroma(embedding_function=embeddings, persist_directory=db_name)

In [None]:
# Start from scratch: if a persisted Chroma datastore is already on disk,
# drop its collection before rebuilding
if os.path.exists(db_name):
    stale_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    stale_store.delete_collection()

# Embed the chunks and persist them in a fresh Chroma vectorstore
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")

In [None]:
# Fetch a single stored record with its embedding and report the vector length
collection = vectorstore._collection
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

## RAG pipeline using LangChain

In [None]:
# Chat model: the Gemini model named in LLM_MODEL
llm = ChatGoogleGenerativeAI(model=LLM_MODEL, temperature=0.7)

# Retriever: an abstraction over the vector store, queried during RAG
retriever = vectorstore.as_retriever()

# Conversation memory, exposed to the chain under the 'chat_history' key
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
)

# Putting it together: the conversation chain combines the LLM, the
# retriever and the memory
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

## Gradio User Interface

In [None]:
def chat(message, history):
    """Answer one user message through the conversation chain.

    Args:
        message: The user's latest message, sent to the chain as the question.
        history: Prior turns supplied by the chat UI; not used here because
            the chain is built with its own memory object.

    Returns:
        The value stored under "answer" in the chain's result.
    """
    return conversation_chain.invoke({"question": message})["answer"]

In [None]:
# Build the chat UI around chat() and launch it, opening it in the browser
chat_ui = gr.ChatInterface(chat, type="messages")
view = chat_ui.launch(inbrowser=True)