# Indexing Documents in ChromaDB

We'll use LangChain to load the text files and split them into chunks, a Hugging Face sentence-transformers model to create the embeddings locally, and ChromaDB to store and persist the index.

In [1]:
import os
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

## Load the documents from your folder

In [2]:
# Load every file in ./support_docs that matches the glob, reading each one with TextLoader
loader = DirectoryLoader(
    './support_docs',
    glob="./*.txt",
    loader_cls=TextLoader,
)
raw_documents = loader.load()
# Bare last expression so the notebook displays the loaded documents
raw_documents

[Document(metadata={'source': 'support_docs/billing_policy.txt'}, page_content='Customers can request a full refund within 30 days of purchase if the service has not been fully utilized.\n\nRefunds take 5-10 business days to process.\n\nWe accept all major credit cards and PayPal.'),
 Document(metadata={'source': 'support_docs/technical_troubleshooting.txt'}, page_content='If the application fails to launch, first ensure your operating system is up to date.\n\nFor Windows 11 users, verify that Windows Defender is not blocking the executable.\n\nClearing the application cache in the AppData/Local folder often resolves sync issues.'),
 Document(metadata={'source': 'support_docs/account_security.txt'}, page_content="To reset your password, click 'Forgot Password' on the login screen.\n\nTwo-factor authentication (2FA) is mandatory for all administrative accounts.\n\nNever share your temporary login codes with support agents.")]

In [3]:
# Inspect the attributes stored on the first loaded document
vars(raw_documents[0])

{'id': None,
 'metadata': {'source': 'support_docs/billing_policy.txt'},
 'page_content': 'Customers can request a full refund within 30 days of purchase if the service has not been fully utilized.\n\nRefunds take 5-10 business days to process.\n\nWe accept all major credit cards and PayPal.',
 'type': 'Document'}

## Split documents into smaller chunks

In [4]:

# Create a text splitter that produces one chunk per paragraph.
# "\n\n" is the only separator, so there is nothing finer to fall back on: a
# paragraph longer than chunk_size is emitted whole. chunk_size therefore does
# not cap chunk length here; it only stops short neighbouring paragraphs from
# being merged into one chunk (two adjacent paragraphs are merged only if
# together they fit within chunk_size).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,  # Merge threshold in characters (measured by length_function)
    chunk_overlap=0,  # No overlap between chunks
    separators=["\n\n"],  # Split on double newlines (paragraphs) only
    keep_separator=False,  # Drop the "\n\n" so long paragraphs don't start with blank lines
    length_function=len,
)

# Split the loaded documents into chunks
docs = text_splitter.split_documents(raw_documents)

In [5]:
# Preview the first five chunks: a numbered label, the chunk text, then a divider line
for i, doc in enumerate(docs[:5], start=1):
    print(f"doc {i}:", doc.page_content, "-" * 50, sep="\n")

doc 1:
Customers can request a full refund within 30 days of purchase if the service has not been fully utilized.
--------------------------------------------------
doc 2:
Refunds take 5-10 business days to process.
--------------------------------------------------
doc 3:
We accept all major credit cards and PayPal.
--------------------------------------------------
doc 4:
If the application fails to launch, first ensure your operating system is up to date.
--------------------------------------------------
doc 5:


For Windows 11 users, verify that Windows Defender is not blocking the executable.
--------------------------------------------------


## Initialize the Embedding Model (Local HuggingFace)

In [8]:
# Embedding model used to vectorize the chunks (sentence-transformers MiniLM)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

## Create and Save the Vector Database

In [None]:
# 'persist_directory' saves the index so you don't have to re-index every time.
# Because the index lives on disk, re-running this cell with auto-generated ids
# would store every chunk a second time. Give each chunk a stable id of the form
# "<source path>:<position within that source>" so that a re-run overwrites the
# existing entries instead of duplicating them.
# NOTE: entries written by earlier runs under random ids (or for chunks that no
# longer exist) are not removed; delete ./chroma_db to rebuild from scratch.
chunk_ids = []
chunks_per_source = {}
for doc in docs:
    source = doc.metadata.get("source", "unknown")
    position = chunks_per_source.get(source, 0)
    chunks_per_source[source] = position + 1
    chunk_ids.append(f"{source}:{position}")

vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./chroma_db"
)

print(f"Successfully indexed {len(docs)} chunks into ChromaDB.")

Successfully indexed 9 chunks into ChromaDB.
