In [None]:
# Load IPython's autoreload extension and set it to mode 2, so edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Cache prototype

from llamabot.doc_processor import split_document, magic_load_doc

from pyprojroot import here

# Sample PDF used by this prototype; here() is presumably the project root
# (pyprojroot) — confirm if the notebook is moved.
fpath = here() / "data/return_label.pdf"

# The loader's result is indexable; only its first element is used below.
document = magic_load_doc(fpath)

# Split that first loaded document into smaller pieces.
split_docs = split_document(document[0])

In [None]:
# Number of pieces the sample document was split into.
len(split_docs)

In [None]:
# Calculate hash of the file.
import hashlib
from pathlib import Path


def compute_file_hash(fpath: Path, chunk_size: int = 1 << 20) -> str:
    """Return the SHA-256 hex digest of a file's contents.

    The file is read and hashed in fixed-size chunks so that large files do
    not have to be held in memory all at once.

    :param fpath: Path of the file to hash. A plain string path is accepted
        as well and converted to ``Path``.
    :param chunk_size: Number of bytes read per iteration (default 1 MiB).
    :returns: The hexadecimal SHA-256 digest of the file's bytes.
    :raises ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bytes")

    digest = hashlib.sha256()
    with Path(fpath).open("rb") as handle:
        # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


# SHA-256 hex digest of the sample PDF's bytes.
file_hash = compute_file_hash(fpath)

In [None]:
# Display the digest computed for the sample PDF.
file_hash

In [None]:
# Create one GPTVectorStoreIndex per file.
from llama_index import GPTVectorStoreIndex, LLMPredictor, ServiceContext
from langchain.chat_models import ChatOpenAI
from langchain.callbacks.base import BaseCallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler


def make_service_context(model_name: str = "gpt-4", temperature: float = 0.0):
    """Build the llama_index ``ServiceContext`` backed by a streaming chat model.

    The chat model is created with ``streaming=True`` and a callback manager
    holding a ``StreamingStdOutCallbackHandler``, so generated output is
    presumably echoed to stdout as it arrives (confirm against the installed
    langchain version).

    :param model_name: Chat model name passed to ``ChatOpenAI``. Defaults to
        ``"gpt-4"``, the value this notebook previously hard-coded.
    :param temperature: Sampling temperature passed to ``ChatOpenAI``.
        Defaults to ``0.0``.
    :returns: A ``ServiceContext`` whose LLM predictor wraps the chat model.
    """
    stdout_callbacks = BaseCallbackManager([StreamingStdOutCallbackHandler()])
    chat = ChatOpenAI(
        model_name=model_name,
        temperature=temperature,
        streaming=True,
        verbose=True,
        callback_manager=stdout_callbacks,
    )
    llm_predictor = LLMPredictor(llm=chat)
    return ServiceContext.from_defaults(llm_predictor=llm_predictor)


# Notebook-level service context, passed to load_index in a later cell.
service_context = make_service_context()

In [None]:
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.storage.index_store import SimpleIndexStore
from llama_index.vector_stores import SimpleVectorStore
from llama_index.node_parser import SimpleNodeParser
from llama_index.storage.storage_context import StorageContext
from pathlib import Path
import hashlib

In [None]:
from llama_index import load_index_from_storage


def get_persist_dir(file_hash: str) -> Path:
    """Return the cache directory for a file hash: ``~/.llamabot/cache/<file_hash>``.

    Only the path is built; nothing is created on disk.

    :param file_hash: Identifier used as the final path component.
    :returns: The path under the current user's home directory.
    """
    return Path.home().joinpath(".llamabot", "cache", file_hash)


def load_index(persist_dir, service_context):
    """Load a previously persisted index from ``persist_dir``.

    The document store, vector store and index store are each read back from
    ``persist_dir`` and combined into a storage context, from which the index
    is loaded. If the loaded index has no summary yet, one is generated with
    a query and the storage context is persisted again.

    :param persist_dir: Directory the index was persisted to.
    :param service_context: Service context handed to the loaded index.
    :returns: The loaded index, with ``summary`` populated.
    """
    storage_context = StorageContext.from_defaults(
        docstore=SimpleDocumentStore.from_persist_dir(persist_dir=persist_dir),
        vector_store=SimpleVectorStore.from_persist_dir(persist_dir=persist_dir),
        index_store=SimpleIndexStore.from_persist_dir(persist_dir=persist_dir),
    )
    index = load_index_from_storage(storage_context, service_context=service_context)

    # A missing summary is compared against the literal string "None";
    # presumably the summary property stringifies its value — TODO confirm.
    if index.summary == "None":
        response = index.as_query_engine().query("Summarize this document.")
        # Store plain text rather than the query's response object, so the
        # value that gets persisted is a string like the one compared above.
        index.summary = str(response)
        index.storage_context.persist(persist_dir=persist_dir)
    return index

In [None]:
# TODO(review): placeholder for a build_storage_context() helper; not implemented yet.

In [None]:
def make_index(docs, persist_dir, service_context, index_id=None):
    """Build a vector index over ``docs``, summarize it, and persist it.

    :param docs: Documents to parse into nodes and index.
    :param persist_dir: Directory the storage context is persisted to.
    :param service_context: Service context handed to the index.
    :param index_id: Identifier assigned to the index. When omitted, the
        final component of ``persist_dir`` is used; for directories produced
        by ``get_persist_dir`` that component is the file hash. This replaces
        a lookup of the notebook-global ``file_hash``, which could belong to
        a different file than the one being indexed.
    :returns: The newly built index, with ``summary`` populated.
    """
    if index_id is None:
        index_id = Path(persist_dir).name

    # Parse the documents into nodes.
    parser = SimpleNodeParser()
    nodes = parser.get_nodes_from_documents(docs)

    # Start from fresh, empty stores and register the nodes in the docstore.
    storage_context = StorageContext.from_defaults(
        docstore=SimpleDocumentStore(),
        vector_store=SimpleVectorStore(),
        index_store=SimpleIndexStore(),
    )
    storage_context.docstore.add_documents(nodes)

    index = GPTVectorStoreIndex(
        nodes,
        storage_context=storage_context,
        index_id=index_id,
        service_context=service_context,
    )
    response = index.as_query_engine().query("Summarize this document.")
    # Store plain text rather than the query's response object, so that the
    # persisted summary is a string.
    index.summary = str(response)
    index.storage_context.persist(persist_dir=persist_dir)
    return index


def make_or_load_index(fpath):
    """Return the index for ``fpath``, loading it from cache when available.

    The cache directory is derived from the SHA-256 hash of the file. When
    that directory already holds persisted data the index is loaded from it;
    otherwise the file is loaded, split, indexed and persisted there.

    :param fpath: Path of the document to index.
    :returns: The loaded or newly built index.
    """
    file_hash = compute_file_hash(fpath)
    service_context = make_service_context()
    persist_dir = get_persist_dir(file_hash)

    # An empty directory (e.g. left by an earlier build that failed after
    # mkdir) is treated as a cache miss instead of being loaded.
    if persist_dir.is_dir() and any(persist_dir.iterdir()):
        return load_index(persist_dir, service_context=service_context)

    # Load and split the requested file itself, not the notebook-global
    # ``split_docs``, which belongs to the sample PDF only.
    # Only the first loaded document is used, as in the prototype cell above.
    # TODO(review): confirm whether further elements should be indexed too.
    document = magic_load_doc(fpath)
    docs = split_document(document[0])

    persist_dir.mkdir(exist_ok=True, parents=True)
    return make_index(
        docs,
        persist_dir,
        service_context=service_context,
        index_id=file_hash,
    )

In [None]:
from llama_index import load_index_from_storage, ComposableGraph

# Documents whose cached indexes are loaded in this cell.
fpaths = [
    here() / relative_path
    for relative_path in ("data/return_label.pdf", "data/dshiring.pdf")
]

# Content hash of each file, then the cache directory derived from each hash.
hashes = [compute_file_hash(path) for path in fpaths]
persist_dirs = [get_persist_dir(digest) for digest in hashes]

# One index per cache directory, in the same order as ``fpaths``.
# Assumes each directory was already populated by an earlier run.
indexes = [
    load_index(cache_dir, service_context=service_context)
    for cache_dir in persist_dirs
]

In [None]:
# Summary of the second index (built from data/dshiring.pdf).
indexes[1].summary

In [None]:
# Summary of the first index (built from data/return_label.pdf).
indexes[0].summary