In [1]:
# Install all the required libraries as needed.
# !pip install langchain
# !pip install unstructured
# !pip install sentence_transformers
# !pip install huggingface_hub
# !pip install "unstructured[pdf]"
# !pip install chromadb

In [1]:
# Import necessary libraries.
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [2]:
# Create directory loader object to load the data from the directory.
pdfLoader = DirectoryLoader('./data/')

In [3]:
# Initial the document loader to load the data from the directory.
documents = []
documents.extend(pdfLoader.load())

In [4]:
# Display the number of documents in the directory.
print(f'You have {len(documents)} document(s) in your data folder.')

You have 1 document(s) in your data folder.


In [5]:
# Split the documents into chunks of 1000 characters with 200 characters overlap.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(documents)
print(len(documents))

Created a chunk of size 1164, which is longer than the specified 1000


202


In [6]:
# Define to variables to use "sentence-transformers/all-MiniLM-L6-v2" embedding model from HuggingFace.
# https://huggingface.co/blog/getting-started-with-embeddings
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [7]:
# Generate the embeddings for the documents and store them in the Chroma Vector Store.
db = Chroma.from_documents(documents, embeddings, persist_directory="./chroma_db/")
db.persist()