In [None]:
%load_ext autoreload
%autoreload 2

My goal with this notebook is to create the functionality needed to enable QueryBot to accept not just a collection of text files, but a collection of arbitrary documents that can be loaded by LlamaHub.

From all of the prototyping down below, it looks like what I need are the following components:

1. A mapper for file extension to LlamaHub data loader.
2. A function that takes in a file path and returns a list of Document objects.
3. 

In [None]:
from pathlib import Path
from llama_index import download_loader
from pyprojroot import here

# Fetch the Markdown loader class from LlamaHub and parse the project's docs index page.
MarkdownReader = download_loader("MarkdownReader")
index_md_path = here() / Path("docs/index.md")

loader = MarkdownReader()
documents = loader.load_data(file=index_md_path)
documents

In [None]:
# Same experiment with the PDF loader; the result is kept in `loaded_docs`.
PDFReader = download_loader("PDFReader")
pdf_path = here() / "data/dshiring.pdf"

loader = PDFReader()
loaded_docs = loader.load_data(file=pdf_path)
loaded_docs

In [None]:



# Split every loaded document and flatten the resulting pieces into one list.
final_docs = [piece for doc in loaded_docs for piece in split_document(doc)]
final_docs

In [None]:
# Here's a function that takes in a path and a file extension
from llamabot.file_finder import recursive_find
from pyprojroot import here
from llama_index import download_loader
from pathlib import Path

# Collect candidate files under the project root, one list per extension.
project_root = here()
python_files = recursive_find(project_root, ".py")
markdown_files = recursive_find(project_root, ".md")
# jupyter_files = recursive_find(project_root, ".ipynb")
pdf_files = recursive_find(project_root, ".pdf")

# Only the Markdown and PDF files are loaded below.
wanted_files = [*markdown_files, *pdf_files]

# Step 0: file extension -> name of the LlamaHub loader that parses it.
# Extensions that are not listed here are read as plain text.
extension_loader_mapping = dict(
    [
        (".pdf", "PDFReader"),
        (".docx", "DocxReader"),
        (".pptx", "PptxReader"),
        (".xlsx", "PandasExcelReader"),
    ]
)

# Step 1: Use the appropriate document loader to load the document.
# The results are collected in `raw_docs` below: the documents exactly as the loaders return them.
# They still need to be split up further into chunks of at most 2,000 tokens,
# which will be done later to give us `final_docs`.


def magic_load_doc(file_path) -> "List[Document]":
    """Load a single file into a list of llama_index ``Document`` objects.

    Files whose extension is a key of ``extension_loader_mapping`` are parsed
    with the corresponding LlamaHub loader. Every other file is read as plain
    text and wrapped in a single ``Document``.

    :param file_path: Location of the file to load (``str`` or ``pathlib.Path``).
    :returns: The documents returned by the LlamaHub loader, or a one-element
        list holding the plain-text ``Document``.
    """
    # The return annotation is quoted and `Document` is imported here so that
    # this function can be defined and called on a fresh kernel, without relying
    # on imports that only happen in some other cell.
    from llama_index import Document

    path = Path(file_path)
    # Lower-case the suffix so that e.g. "REPORT.PDF" is still routed to PDFReader
    # instead of being read as text.
    loader_name = extension_loader_mapping.get(path.suffix.lower())

    if loader_name is not None:
        # Treat this as a document that needs special processing.
        Loader = download_loader(loader_name)
        return Loader().load_data(path)

    # Treat this as a plain text file. Open read-only: nothing is written, and
    # "r+" would fail on files we are not allowed to modify.
    with open(path, "r") as f:
        # NOTE(review): the path is prepended directly to the contents with no
        # separator -- confirm whether a newline between them is wanted.
        return [Document(text=str(file_path) + f.read())]


# Load every wanted file into llama_index documents.
raw_docs = []
for file in wanted_files:
    raw_docs.extend(magic_load_doc(file))

# Step 2: Ensure each doc is 2000 tokens long maximum.
# Iterate over `raw_docs` (everything loaded just above), not `loaded_docs`,
# which only holds the single PDF from the earlier experiment cell.
final_docs = []
for doc in raw_docs:
    final_docs.extend(split_document(doc))
final_docs

In [None]:
# Step 3: Combine all of the documents into a single GPTIndex
from pathlib import Path
from typing import List, Union

from langchain.chat_models import ChatOpenAI
from llama_index import Document, GPTSimpleVectorIndex, LLMPredictor, ServiceContext
from llama_index.response.schema import Response
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.base import CallbackManager

# Echo model tokens to stdout while they stream in.
stdout_callbacks = CallbackManager([StreamingStdOutCallbackHandler()])

# GPT-4 chat model at temperature 0 with streaming enabled.
chat = ChatOpenAI(
    callback_manager=stdout_callbacks,
    verbose=True,
    streaming=True,
    temperature=0.0,
    model_name="gpt-4",
)

# Wrap the chat model for llama_index and build the vector index over the chunks.
llm_predictor = LLMPredictor(llm=chat)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
index = GPTSimpleVectorIndex.from_documents(
    final_docs,
    service_context=service_context,
)
index

In [None]:
# Ask the index a question about the loaded hiring document.
index.query("What technical skills do we need to cover when hiring data scientists?")

In [None]:
from llamabot import ChatBot


# A ChatBot primed with a Python-programmer system prompt.
programmer = ChatBot("You are a highly skilled Python programmer.")


# First request: list the Python source files inside a repository.
programmer(
    "Write me a function that takes in a path to a source code repository, and returns a list of the paths to Python source files.",
)

In [None]:
# Follow-up prompt: "the code" refers to the function requested in the previous cell,
# so this presumably relies on ChatBot keeping the conversation history -- TODO confirm.
programmer(
    "Can you improve the code such that it errors out if the path to the repository is not an actual git repo?"
)

In [None]:
# Number of entries in the global `documents` list (assigned in the MarkdownReader cell).
len(documents)