In [None]:
%pip install ragstack-ai pdfminer.six

In [None]:
from dotenv import load_dotenv

# Read the local .env file so the variables listed below are available to os.getenv.
load_dotenv()

# This notebook assumes the following env vars exist in a .env file:
#
# ASTRA_DB_ENDPOINT      - read via os.getenv when the Astra DB stores are created
# ASTRA_DB_TOKEN         - read via os.getenv when the Astra DB stores are created
# AZURE_OPENAI_ENDPOINT  - never read explicitly in this notebook; presumably picked up
#                          from the environment by the Azure OpenAI clients (TODO confirm)
# AZURE_OPENAI_API_KEY   - read via os.getenv for the LlamaIndex Azure clients
# OPENAI_API_VERSION     - never read explicitly in this notebook (API versions are
#                          hard-coded in the client constructors); TODO confirm it is still needed

# LangChain

## Load all the documents into memory

In [None]:
from langchain.document_loaders import BSHTMLLoader, DirectoryLoader, TextLoader, PDFMinerLoader, UnstructuredMarkdownLoader

# Maps a file extension to the LangChain loader class that parses it ("loader")
# and the keyword arguments DirectoryLoader forwards to that loader ("kwargs").
data_loaders = {
    "html": {"loader": BSHTMLLoader, "kwargs": {}},
    "md": {"loader": UnstructuredMarkdownLoader, "kwargs": {}},
    # concatenate_pages=True presumably yields one Document per PDF rather than
    # one per page -- TODO confirm against the PDFMinerLoader docs.
    "pdf": {"loader": PDFMinerLoader, "kwargs": {"concatenate_pages": True}},
    "txt": {"loader": TextLoader, "kwargs": {}},
}

# Reset on every run so re-executing this cell does not accumulate duplicates.
docs = []

# One DirectoryLoader pass per extension; only files directly inside a
# data/<something>/source_files/ directory match the glob.
for extension, spec in data_loaders.items():
    print(f"Loading {extension} files...")
    loader = DirectoryLoader(
        "data/",
        glob=f"*/source_files/*.{extension}",
        show_progress=True,
        loader_cls=spec["loader"],
        loader_kwargs=spec["kwargs"],
    )
    docs.extend(loader.load())

print("\nProcessing done.")
# Last expression: the notebook displays how many documents were loaded.
len(docs)

## Chunk, embed, and store the docs in an AstraDB vector store

Chunk the data at 4 different sizes (128, 256, 512, and 768 tokens) and compare the RAG results.

In [None]:
# Build one Astra DB collection and one token splitter per chunk size.
import os

from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter
from langchain.vectorstores.astradb import AstraDB

# A single embedding client is shared by every collection.
embedding = AzureOpenAIEmbeddings(
    azure_deployment="text-embedding-ada-002",
    openai_api_version="2023-05-15",
    max_retries=20,
)

# Originally planned sizes were [128, 256, 512, 1024, 2048]; a chunk size of 1024
# is currently too big for the astraPy vector store (max of 5000 bytes per entry),
# so the list stops at 768.
chunk_sizes = [128, 256, 512, 768]

# Look the Astra credentials up once instead of once per collection.
astra_connection = {
    "token": os.getenv("ASTRA_DB_TOKEN"),
    "api_endpoint": os.getenv("ASTRA_DB_ENDPOINT"),
}

# Collection names double as the keys of both lookup tables below.
names = [f"open_ai_{size}" for size in chunk_sizes]
vstores = {}
splitters = {}

for name, size in zip(names, chunk_sizes):
    vstores[name] = AstraDB(collection_name=name, embedding=embedding, **astra_connection)
    splitters[name] = TokenTextSplitter(chunk_size=size, chunk_overlap=0)

In [None]:
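# Optional cleanup: uncomment to drop a collection before re-ingesting.
# NOTE: "open_ai_896" is not among the names built from chunk_sizes in the setup cell,
# so this lookup would raise KeyError as written; substitute an existing name such as "open_ai_768".
#vstores["open_ai_896"].delete_collection()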

In [None]:
# Split the loaded docs with each collection's splitter and write the chunks to
# that collection. `vstores` was filled in the same order as `names`.
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# inserts the same chunks again -- confirm before re-executing.
for name, store in vstores.items():
    print(f"embedding docs for: {name}...")
    chunked_docs = splitters[name].split_documents(docs)
    store.add_documents(chunked_docs)

# LlamaIndex

## Init the vector store and the Azure OpenAI models

In [None]:
from llama_index.vector_stores import AstraDBVectorStore
from llama_index.llms import AzureOpenAI as AzureChatOpenAI
from llama_index.embeddings import AzureOpenAIEmbedding
import os

# Astra DB collection that holds the LlamaIndex nodes. The "512" in the name
# mirrors the chunk size used when the documents are split into nodes.
astra_db_vstore = AstraDBVectorStore(
    collection_name="llama_512",
    api_endpoint=os.getenv("ASTRA_DB_ENDPOINT"),
    token=os.getenv("ASTRA_DB_TOKEN"),
    # Must equal the vector length produced by the embedding model
    # (1536 for text-embedding-ada-002).
    embedding_dimension=1536,
)

# Sampling temperature for the chat model only.
temperature = 0.0

# Chat model. No endpoint is passed here; it is presumably resolved from the
# AZURE_OPENAI_ENDPOINT environment variable (TODO confirm).
gpt_35_turbo = AzureChatOpenAI(
    deployment_name="gpt-35-turbo",
    model="gpt-35-turbo",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
    model_version="0613",
    temperature=temperature,
)

# Embedding model. `temperature` is intentionally not passed: embeddings are
# not sampled, so the argument has no meaning for this client.
embed_model = AzureOpenAIEmbedding(
    deployment_name="text-embedding-ada-002",
    model="text-embedding-ada-002",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
)

## Load Docs into Memory

**TODO:** This code splits PDFs by page. Update it to load each PDF as a single document, so the results are directly comparable to the LangChain pipeline.

In [None]:
from llama_index import SimpleDirectoryReader

# Only these file types are read.
supported_exts = [".pdf", ".md", ".html", ".txt"]

# recursive=True walks every sub-directory of data/.
# NOTE(review): the LangChain cell only reads */source_files/*, so the two
# pipelines may not load the same set of files -- confirm the data layout.
reader = SimpleDirectoryReader(input_dir="data", recursive=True, required_exts=supported_exts)

documents = reader.load_data()
# Last expression: the notebook displays the number of loaded documents.
len(documents)

## Split the docs into nodes and load into vector store

In [None]:
from llama_index.ingestion import IngestionPipeline
# NOTE: this rebinds the name TokenTextSplitter, which the LangChain section
# imported from langchain.text_splitter earlier in the notebook.
from llama_index.node_parser import TokenTextSplitter

# 512-token chunks with no overlap; the splitter is the pipeline's only step.
splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=0,
)
pipeline = IngestionPipeline(
    transformations=[splitter],
)

# Turn the loaded documents into nodes and display how many were produced.
nodes = pipeline.run(documents=documents)
len(nodes)

In [None]:
from llama_index import ServiceContext, StorageContext, VectorStoreIndex

# Models the index uses: the Azure chat LLM and the Azure embedding model.
service_context = ServiceContext.from_defaults(llm=gpt_35_turbo, embed_model=embed_model)

# Back the index with the Astra DB vector store.
storage_context = StorageContext.from_defaults(vector_store=astra_db_vstore)

# Build the index from the pre-split nodes using both contexts.
index = VectorStoreIndex(
    nodes=nodes,
    service_context=service_context,
    storage_context=storage_context,
)