In [None]:
# Install this notebook's dependencies into the running kernel's environment.
# NOTE(review): ragstack-ai presumably supplies the langchain packages imported
# in later cells, and pdfminer.six is presumably what PDFMinerLoader needs — confirm.
%pip install ragstack-ai pdfminer.six

In [None]:
import getpass

# Collect credentials interactively so nothing secret is stored in the notebook.
# Secrets are read with getpass (no echo); the endpoint is not secret, so plain
# input() is used. Values are stripped because pasted tokens/URLs frequently
# carry stray leading/trailing whitespace or a newline, which would otherwise
# be passed verbatim to the Astra DB and OpenAI clients.
astra_token = getpass.getpass("Astra token:").strip()
astra_endpoint = input("Astra db endpoint:").strip()
openai_api_key = getpass.getpass("OpenAI API Key:").strip()

## Load all the documents into memory

In [None]:
from langchain.document_loaders import BSHTMLLoader, DirectoryLoader, TextLoader, PDFMinerLoader, UnstructuredMarkdownLoader

# For every supported file extension: the loader class that DirectoryLoader
# should instantiate per file, and the keyword arguments passed to it.
data_loaders = {
    "html": {"loader": BSHTMLLoader, "kwargs": {}},
    "md": {"loader": UnstructuredMarkdownLoader, "kwargs": {}},
    "pdf": {"loader": PDFMinerLoader, "kwargs": {"concatenate_pages": True}},
    "txt": {"loader": TextLoader, "kwargs": {}},
}

# Accumulates the documents loaded for all extensions; reset on every run of
# this cell so re-executing it does not duplicate entries.
docs = []

for extension, loader_spec in data_loaders.items():
    print(f"Loading {extension} files...")
    loader_cls = loader_spec["loader"]
    loader_kwargs = loader_spec["kwargs"]
    # Matches files with this extension under data/<any dir>/source_files/.
    loader = DirectoryLoader(
        "data/",
        glob=f"*/source_files/*.{extension}",
        show_progress=True,
        loader_cls=loader_cls,
        loader_kwargs=loader_kwargs,
    )
    docs += loader.load()

print("\nProcessing done.")
# Last expression of the cell: displays how many documents were loaded.
len(docs)

## Chunk, embed, and store the docs in an Astra DB vector store

Chunk the data at several different sizes (see `chunk_sizes` below) so that the RAG results can be compared across chunk sizes.

In [None]:
# setup the things
from langchain.vectorstores.astradb import AstraDB
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter

# One shared embedding client for every collection.
embedding = OpenAIEmbeddings(openai_api_key=openai_api_key, max_retries=20)

# Token chunk sizes to compare. The original plan was
# [128, 256, 512, 1024, 2048], but a chunk size of 1024 is too big for the
# astraPy vector store currently (max of 5000 bytes per entry).
chunk_sizes = [128, 256, 512, 768]

# Collection names, one per chunk size, in the same order as chunk_sizes.
names = [f"open_ai_{size}" for size in chunk_sizes]

# Both dicts are keyed by collection name.
vstores = {}
splitters = {}

for name, size in zip(names, chunk_sizes):
    # Vector store backed by the collection named after this chunk size.
    vstores[name] = AstraDB(
        collection_name=name,
        embedding=embedding,
        token=astra_token,
        api_endpoint=astra_endpoint,
    )
    # Splitter producing non-overlapping chunks of `size` tokens.
    splitters[name] = TokenTextSplitter(chunk_size=size, chunk_overlap=0)

In [None]:
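# Cleanup helper, left disabled: uncomment to delete a collection.
# NOTE: "open_ai_896" is not one of the names built from chunk_sizes, so this
# lookup would raise KeyError as written; use an existing key of vstores.
#vstores["open_ai_896"].delete_collection()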

In [None]:
# Chunk the loaded documents with each configured splitter and write the
# resulting chunks to the vector store of the same name.
# NOTE(review): add_documents is called unconditionally, so re-running this
# cell presumably inserts the same chunks again — confirm before re-running.
for name in names:
    print(f"embedding docs for: {name}...")
    splitter = splitters[name]
    chunked_docs = splitter.split_documents(docs)
    store = vstores[name]
    store.add_documents(chunked_docs)