# In [None]:
import json
import logging
import sys
import os
import langchain
from langchain import HuggingFaceHub
from langchain.cache import SQLiteCache

# Send DEBUG-level log records to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# basicConfig() is a no-op when the root logger already has handlers, so a
# stdout handler is attached explicitly in that case -- but only if none is
# present yet. Adding one unconditionally printed every record twice and
# stacked one more handler each time this cell was re-run.
_root_logger = logging.getLogger()
if not any(
    isinstance(handler, logging.StreamHandler)
    and getattr(handler, "stream", None) is sys.stdout
    for handler in _root_logger.handlers
):
    _root_logger.addHandler(logging.StreamHandler(stream=sys.stdout))

# Cache LLM calls in a local SQLite database file.
langchain.llm_cache = SQLiteCache(database_path=".langchain.db")

# The Hugging Face Hub token is typed in at run time and kept only in the
# process environment. Never paste a token into this file, not even in a comment.
from getpass import getpass
os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face Hub API token: ")

from llama_index import (
    GPTKeywordTableIndex,
    GPTVectorStoreIndex,
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext
)
from langchain import OpenAI

class Chatbot:
    """Conversational wrapper around a query engine that records the dialogue.

    Every turn is stored in ``chat_history`` as ``{"role": ..., "content": ...}``
    dicts (roles ``"user"`` and ``"assistant"``). The history is a transcript
    that can be saved to / restored from a JSON file; it is not sent to the
    query engine, which only ever receives the current user message.
    """

    def __init__(self, query_engine):
        """Store the engine and start with an empty history.

        Args:
            query_engine: Any object exposing ``query(text)``.
        """
        self.query_engine = query_engine
        self.chat_history = []

    def generate_response(self, user_input):
        """Answer ``user_input`` and append both sides of the turn to the history.

        Args:
            user_input: The user's message, passed to the query engine verbatim.

        Returns:
            The answer text: the result's ``response`` attribute when it has
            one, otherwise ``str(result)``.
        """
        # Query first so that a failing query leaves the history untouched.
        result = self.query_engine.query(user_input)

        # Some result objects carry no ``response`` attribute (presumably the
        # case for llama_index streaming results -- TODO confirm); fall back to
        # their string form instead of raising AttributeError.
        if hasattr(result, "response"):
            answer = result.response
        else:
            answer = str(result)

        self.chat_history.append({"role": "user", "content": user_input})
        self.chat_history.append({"role": "assistant", "content": answer})
        return answer

    def load_chat_history(self, filename):
        """Replace the history with the JSON content of ``filename``.

        A missing file is not an error: the current history is kept as is,
        which makes the first run (no history file yet) work.

        Args:
            filename: Path of the JSON history file.
        """
        try:
            with open(filename, 'r', encoding="utf-8") as f:
                self.chat_history = json.load(f)
        except FileNotFoundError:
            # Deliberate best effort: nothing has been saved yet.
            pass

    def save_chat_history(self, filename):
        """Write the history to ``filename`` as JSON, overwriting the file.

        Args:
            filename: Path of the JSON history file.
        """
        with open(filename, 'w', encoding="utf-8") as f:
            json.dump(self.chat_history, f)

# Read the source documents from the 'data' directory.
documents = SimpleDirectoryReader('data').load_data()

# define LLM
# Alternative predictor using a model hosted on the Hugging Face Hub. The
# token comes from the environment variable set at start-up; never hard-code
# it here.
#flanllm_predictor = LLMPredictor(llm=
# HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature":.5, "max_length":64},
#   huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"]))

from llama_index.prompts.prompts import SimpleInputPrompt
system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""
# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

from llama_index.llm_predictor import HuggingFaceLLMPredictor
import torch

# Half precision is a CUDA memory optimisation, so it is requested only when a
# CUDA device is actually available; otherwise the model's default dtype is used.
stablelm_model_kwargs = {"torch_dtype": torch.float16} if torch.cuda.is_available() else {}

stablelm_predictor = HuggingFaceLLMPredictor(
    max_input_size=4096,
    max_new_tokens=256,
    # NOTE(review): with do_sample=False the temperature presumably has no
    # effect on generation -- confirm which of the two settings is intended.
    temperature=0.7,
    do_sample=False,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="StabilityAI/stablelm-tuned-alpha-3b",
    model_name="StabilityAI/stablelm-tuned-alpha-3b",
    device_map="auto",
    stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={"max_length": 4096},
    model_kwargs=stablelm_model_kwargs,
)

service_context = ServiceContext.from_defaults(chunk_size_limit=1024, llm_predictor=stablelm_predictor)

# To reuse a previously persisted index instead of rebuilding it:
# rebuild storage context
# from llama_index import StorageContext, load_index_from_storage
# storage_context = StorageContext.from_defaults(persist_dir="./storage")
# load index
# index = load_index_from_storage(storage_context)

# build index (first time)
# index = GPTKeywordTableIndex.from_documents(documents, service_context=service_context)
index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)
index.storage_context.persist()
# Non-streaming on purpose: Chatbot.generate_response() reads the finished
# answer text from the query result and returns it as a plain string, which is
# also what gets stored in the chat history.
query_engine = index.as_query_engine()

# get response from query
# response = query_engine.query("What did the author do after his time at Y Combinator?")

HISTORY_FILE = "chat_history.json"

bot = Chatbot(query_engine=query_engine)
bot.load_chat_history(HISTORY_FILE)

# Simple REPL. The history is saved in ``finally`` so that the session is kept
# however the loop ends (farewell, Ctrl-C, Ctrl-D, or an error while answering).
try:
    while True:
        try:
            user_input = input("You: ")
        except (EOFError, KeyboardInterrupt):
            # End of input / interrupt at the prompt: leave the chat quietly.
            print()
            break
        command = user_input.strip().lower()
        if command in ("bye", "goodbye"):
            print("Bot: Goodbye!")
            break
        if not command:
            # Nothing was typed; do not spend an LLM call on an empty query.
            continue
        # generate_response() returns the answer text, so print it directly.
        print(f"Bot: {bot.generate_response(user_input)}")
finally:
    bot.save_chat_history(HISTORY_FILE)