In [6]:
import os
from dotenv import load_dotenv
from IPython.display import Markdown, display
from openai import OpenAI
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma

# Notebook-wide settings: the chat model identifier and the directory name
# later used as the vector store's persist directory.
MODEL = "gpt-4o-mini"
DB_NAME = "vector_db"

# Read the book from disk as UTF-8 text, then cut it into overlapping chunks
# (target size 1700 characters, 300 characters of overlap) so that each piece
# can be embedded on its own.
loader = TextLoader(file_path='books/moby_dick.txt', encoding='utf-8')
documents = loader.load()
text_splitter = CharacterTextSplitter(
    chunk_overlap=300,
    chunk_size=1700,
)
chunks = text_splitter.split_documents(documents)

# Load environment variables from the parent directory's .env file before the
# embeddings client is built -- presumably this supplies the OpenAI API key;
# verify against your .env.
load_dotenv(dotenv_path='../.env')

# OpenAI embedding client created with library defaults. The original note
# names 'text-embedding-ada-002' as the model in use -- NOTE(review): confirm
# that this is still the default in the installed langchain_openai version.
embeddings = OpenAIEmbeddings()

In [7]:
# Chroma is a popular open-source vector database based on SQLite.
# If a persist directory from an earlier run is already on disk, open it and
# drop its collection first -- presumably so a re-run does not pile the same
# chunks on top of the old ones.
if os.path.exists(DB_NAME):
    Chroma(
        embedding_function=embeddings,
        persist_directory=DB_NAME,
    ).delete_collection()

# Embed every chunk and store the result in a Chroma vector store persisted
# under DB_NAME.
vectorstore = Chroma.from_documents(
    persist_directory=DB_NAME,
    embedding=embeddings,
    documents=chunks,
)
# NOTE: `_collection` is a private attribute of the Chroma wrapper; it is used
# here only to report how many entries were stored.
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 939 documents
