In [None]:
# Reload imported modules automatically before each cell runs (mode 2 covers all
# modules), so edits to source files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from llamabot import ChatBot  # just to enable openai key.


## Module 1: Read in a collection of files and pass them to the correct loader.

In [None]:
from llama_index import SimpleDirectoryReader
import glob 
from pyprojroot import here 

In [None]:
# Recursively collect every ``.lr`` file under data/blog. ``**`` only recurses
# when it is its own path component and ``recursive=True`` is passed. Sorting
# makes later slices such as ``files_to_read[0:50]`` reproducible across runs,
# because glob itself returns matches in arbitrary order.
files_to_read = sorted(
    glob.glob(str(here() / "data" / "blog" / "**" / "*.lr"), recursive=True)
)

In [None]:
from pathlib import Path
from typing import List, Optional, Union

from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from langchain.text_splitter import TokenTextSplitter
from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader
from llama_index import LLMPredictor, ServiceContext, Document

class QueryBot:
    """Stateless question-answering bot backed by a ``GPTSimpleVectorIndex``.

    The bot either builds a new index from plain-text files or loads a
    previously saved one. Each call prepends the system message to the query
    and forwards it to the index; nothing is remembered between calls.
    """

    def __init__(
        self,
        system_message: str,
        doc_paths: Optional[List[Union[str, Path]]] = None,
        saved_index_path: Optional[Union[str, Path]] = None,
        chunk_size: int = 2000,
        chunk_overlap: int = 0,
        model_name: str = "gpt-4",
        temperature: float = 0.5,
    ):
        """Initialize QueryBot.

        Pass in either the doc_paths or saved_index_path to initialize the QueryBot.
        If both are given, the saved index takes precedence and the documents
        are not re-read.

        QueryBot is not designed to have memory.

        Under the hood, each document is read as text, split into token-bounded
        chunks with ``TokenTextSplitter``, wrapped in ``Document`` objects and
        indexed with ``GPTSimpleVectorIndex``. Queries are answered through a
        ``ChatOpenAI`` model wired in via a ``ServiceContext``.

        :param system_message: The system message to send to the chatbot.
        :param doc_paths: A list of paths to the documents to use for the chatbot.
            These are assumed to be plain text files (read as UTF-8).
        :param saved_index_path: The path to the saved index to use for the chatbot.
        :param chunk_size: Maximum number of tokens per document chunk.
        :param chunk_overlap: Number of tokens shared between consecutive chunks.
        :param model_name: Name of the OpenAI chat model used to answer queries.
        :param temperature: Sampling temperature passed to the chat model.
        :raises ValueError: If neither doc_paths nor saved_index_path is given.
        """
        # Validate first so a misconfigured bot fails with a clear message
        # before any model or index object is constructed.
        if saved_index_path is None and doc_paths is None:
            raise ValueError(
                "QueryBot requires either `doc_paths` or `saved_index_path`."
            )

        self.system_message = system_message
        # Always defined, so the attribute exists for bots loaded from disk too.
        self.doc_paths = doc_paths

        # The same service context is used for both branches so that a loaded
        # index answers with the same chat model as a freshly built one.
        chat = ChatOpenAI(model_name=model_name, temperature=temperature)
        llm_predictor = LLMPredictor(llm=chat)
        service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

        if saved_index_path is not None:
            self.index = GPTSimpleVectorIndex.load_from_disk(
                str(saved_index_path), service_context=service_context
            )
        else:
            documents = self._load_documents(doc_paths, chunk_size, chunk_overlap)
            self.index = GPTSimpleVectorIndex.from_documents(
                documents, service_context=service_context
            )

    @staticmethod
    def _load_documents(
        doc_paths: List[Union[str, Path]], chunk_size: int, chunk_overlap: int
    ) -> list:
        """Read each file and split its text into ``Document`` chunks."""
        splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        documents = []
        for fname in doc_paths:
            # Explicit encoding keeps results independent of the platform default.
            text = Path(fname).read_text(encoding="utf-8")
            documents.extend(Document(chunk) for chunk in splitter.split_text(text))
        return documents

    def __call__(self, query: str, **kwargs):
        """Query the index with the system message prepended to ``query``.

        :param query: The user's question.
        :param kwargs: Extra keyword arguments forwarded to ``self.index.query``
            (for example ``similarity_top_k``).
        :returns: Whatever ``self.index.query`` returns. This is a response
            object rather than a plain string; callers read ``.response`` and
            ``.source_nodes`` from it.
        """
        prompt = f"{self.system_message}\n\n{query}\n\n"
        return self.index.query(prompt, **kwargs)

    def save(self, path: Union[str, Path]):
        """Persist the index to ``path`` so it can be reloaded via ``saved_index_path``.

        :param path: Destination file for the serialized index.
        """
        self.index.save_to_disk(str(path))


In [None]:
# Index only the first 50 collected files.
bot = QueryBot(
    system_message="You are a Q&A bot.",
    doc_paths=files_to_read[:50],
)


In [None]:
# Ask a sample question; similarity_top_k is forwarded to the index query.
question = "Do you have any advice for me on career development?"
result = bot(question, similarity_top_k=5)


In [None]:
from IPython.display import display, Markdown

In [None]:
# Render the answer text as Markdown rather than as a plain string.
answer_markdown = Markdown(result.response)
display(answer_markdown)

In [None]:
# Inspect the source nodes attached to the response (presumably the retrieved
# chunks the answer was built from; confirm against the llama_index docs).
result.source_nodes

In [None]:
# Persist the index next to the source data in the project root.
index_path = here() / "data" / "blog_index.json"
bot.save(index_path)

In [None]:
# Chunk the full corpus with 200-token windows overlapping by 50 tokens,
# wrapping every chunk in a Document.
doc_paths = files_to_read
splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=50)

documents = [
    Document(chunk)
    for path in doc_paths
    for chunk in splitter.split_text(Path(path).read_text())
]

In [None]:
# Number of chunks produced by the 200-token / 50-overlap split.
len(documents)

In [None]:
# Rebuild the bot from the first 50 collected files.
bot = QueryBot(
    system_message="You are a Q&A bot.",
    doc_paths=files_to_read[:50],
)
# bot("Do you have any advice for me on career development?", similarity_top_k=3)

In [None]:
from langchain.text_splitter import TokenTextSplitter
# Sanity-check the gpt2-encoded splitter. ``split_text`` needs the text to
# split as its argument; a short literal keeps this cell independent of the
# blog files on disk.
splitter = TokenTextSplitter(encoding_name="gpt2")
sample_text = "QueryBot splits each document into token-bounded chunks before indexing."
splitter.split_text(sample_text)