In [69]:
import html
import os
from datetime import datetime

from IPython.display import HTML, display
from ipywidgets import widgets



In [None]:
# TODO: pull the branch from the private GPT demo.

In [70]:
%%html
<!-- Bootstrap 4.5.3 stylesheet from jsDelivr; the chat markup built by the
     notebook uses its utility classes (rounded-circle, bg-light, py-2, ...). -->
<link crossorigin="anonymous"
      integrity="sha384-TX8t27EcRE3e/ihU7zmQxVncDAy5uIKz4rEkgIXeMed4M0jlfIDPvg6uqKI2xXr2"
      href="https://cdn.jsdelivr.net/npm/bootstrap@4.5.3/dist/css/bootstrap.min.css"
      rel="stylesheet">
<style>
    /* Applies to the whole page body, not only to the chat area. */
    body {
        margin-top: 20px;
    }

    /* Shared layout for both kinds of chat bubble. */
    .chat-message-left,
    .chat-message-right {
        flex-shrink: 0;
        display: flex;
    }

    /* Left-aligned bubble: the auto margin pushes it to the left edge. */
    .chat-message-left {
        margin-right: auto;
    }

    /* Right-aligned bubble: mirrored row order, pushed to the right edge. */
    .chat-message-right {
        margin-left: auto;
        flex-direction: row-reverse;
    }
</style>




In [71]:
from dotenv import load_dotenv
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.embeddings import LlamaCppEmbeddings
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.llms import GPT4All, LlamaCpp
import os

load_dotenv()

llama_embeddings_model = os.environ.get("LLAMA_EMBEDDINGS_MODEL")
persist_directory = os.environ.get('PERSIST_DIRECTORY')

model_type = os.environ.get('MODEL_TYPE')
model_path = os.environ.get('MODEL_PATH')
model_n_ctx = os.environ.get('MODEL_N_CTX')

from constants import CHROMA_SETTINGS


llama = LlamaCppEmbeddings(model_path=llama_embeddings_model, n_ctx=model_n_ctx)
db = Chroma(persist_directory=persist_directory, embedding_function=llama, client_settings=CHROMA_SETTINGS)
retriever = db.as_retriever()
# Prepare the LLM
callbacks = [StreamingStdOutCallbackHandler()]
match model_type:
    case "LlamaCpp":
        llm = LlamaCpp(model_path=model_path, n_ctx=model_n_ctx, callbacks=callbacks, verbose=False)
    case "GPT4All":
        llm = GPT4All(model=model_path, n_ctx=model_n_ctx, backend='gptj', callbacks=callbacks, verbose=False)
    case _default:
        print(f"Model {model_type} not supported!")
        exit;

# Create the chain and retrieve relevant text chunks, only use the relevant text chunks in the language model
# qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
qa = ConversationalRetrievalChain.from_llm(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)






llama.cpp: loading model from models/ggml-model-q4_0.bin
llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this
llama_model_load_internal: format     = 'ggml' (old version with low tokenizer quality and no mmap support)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 1000
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size = 4113748.20 KB
llama_model_load_internal: mem required  = 5809.33 MB (+ 2052.00 MB per state)
...................................................................................................
.
llama

 done
gptj_model_load: model size =  3609.38 MB / num tensors = 285
gptj_model_load: loading model from 'models/ggml-gpt4all-j-v1.3-groovy.bin' - please wait ...
gptj_model_load: n_vocab = 50400
gptj_model_load: n_ctx   = 2048
gptj_model_load: n_embd  = 4096
gptj_model_load: n_head  = 16
gptj_model_load: n_layer = 28
gptj_model_load: n_rot   = 64
gptj_model_load: f16     = 2
gptj_model_load: ggml ctx size = 4505.45 MB
gptj_model_load: memory_size =   896.00 MB, n_mem = 57344
gptj_model_load: ...................................

In [72]:
# Conversation so far as (question, answer) tuples; passed to the chain on
# every call so follow-up questions can refer to earlier turns.
chat_history = []


def _to_safe_html(text):
    """Escape ``text`` so it can be embedded in the chat markup.

    HTML special characters are escaped so the text cannot inject markup, and
    every ``$`` is backslash-escaped because MathJax would otherwise format
    text between dollar signs in a strange way.
    """
    return html.escape(text).replace("$", r"\$")


def _render_chat_message(side, avatar, sender, body_html):
    """Build the HTML for one chat bubble.

    Args:
        side: ``"left"`` or ``"right"``; selects the ``chat-message-<side>``
            CSS class.
        avatar: Path of the avatar image shown next to the bubble.
        sender: Name printed in bold above the message body.
        body_html: Message body, already safe to embed as HTML.

    Returns:
        The bubble markup as a string, stamped with the current local time
        formatted as HH:MM:SS.
    """
    timestamp = datetime.now().strftime("%H:%M:%S")
    return (
        f'<div class="chat-message-{side} pb-4"><div>'
        f'<img src="{avatar}" class="rounded-circle mr-1" width="40" height="40">'
        f'<div class="text-muted small text-nowrap mt-2">{timestamp}</div></div>'
        '<div class="flex-shrink-1 bg-light rounded py-2 px-3 ml-3">'
        f'<div class="font-weight-bold mb-1">{sender}</div>{body_html}</div>'
        # Close the outer chat-message container.
        '</div>'
    )


def text_eventhandler(*args):
    """Handle a submitted question: show it, ask the chain, show the answer.

    Args:
        *args: The first positional argument is the change dictionary of the
            observed widget; its ``"new"`` entry holds the submitted text and
            its ``"owner"`` entry the widget whose value is reset.

    Side effects:
        Appends the question and the answer (or an error message) to
        ``output``, toggles ``loading_bar`` while the chain runs, and records
        each successful exchange in ``chat_history``.
    """
    change = args[0]
    submitted = change["new"]

    # Needed because resetting the text input below immediately fires another
    # event, since its value "changed" to "".
    if submitted == "":
        return

    # Reset text field
    change["owner"].value = ""

    # Nothing to ask if the input was only whitespace.
    question = submitted.strip()
    if not question:
        return

    # Show loading animation
    loading_bar.layout.display = "block"

    # Display the question; it is user input, so escape it before embedding.
    output.append_display_data(
        HTML(_render_chat_message("right", "images/bear.png", "You", _to_safe_html(question)))
    )

    try:
        response = qa({"question": question, "chat_history": chat_history})
        answer = response["answer"]
        answer_html = _to_safe_html(answer)
        chat_history.append((question, answer))
    except Exception as e:
        # Surface any failure in the chat itself instead of losing it inside
        # the widget callback; only the label is trusted markup.
        answer_html = "<b>Error:</b> " + _to_safe_html(str(e))

    # Turn off loading animation
    loading_bar.layout.display = "none"

    output.append_display_data(
        HTML(_render_chat_message("left", "images/cat.png", "LLM", answer_html))
    )

In [73]:
# Question input. With continuous_update disabled the observer is notified
# when the value is committed rather than while the user is still typing.
in_text = widgets.Text(continuous_update=False)
in_text.observe(text_eventhandler, "value")

# Output area the chat messages are appended to.
output = widgets.Output()

# Read the spinner image and close the file handle right away.
with open("images/loading.gif", "rb") as file:
    image = file.read()

# Loading indicator; starts hidden and is toggled by text_eventhandler.
loading_bar = widgets.Image(
    value=image, format="gif", width="20", height="20", layout={"display": "None"}
)

In [74]:
# Render the chat UI: the conversation pane first, the input row after it.
display(
    # Conversation pane wrapping the shared Output widget.
    widgets.HBox(
        children=[output],
        layout=widgets.Layout(
            display="inline-flex",
            flex_flow="column-reverse",
            max_height="500px",
            width="100%",
        ),
    ),
    # Input row: loading indicator followed by the question field.
    widgets.Box(
        [loading_bar, in_text],
        layout=widgets.Layout(flex_flow="row", display="flex"),
    ),
)

HBox(children=(Output(),), layout=Layout(display='inline-flex', flex_flow='column-reverse', max_height='500px'…

Box(children=(Image(value=b'GIF89a\xc8\x00\xc8\x00\xf7\x00\x00;Ch\x83\x90\xb7\xcf\xdc\xe8\xda\xec\xf1\xf1\xf2\…


llama_print_timings:        load time =  1447.87 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  1447.77 ms /     2 tokens (  723.88 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time =  1449.56 ms


In [76]:
# Ad-hoc query sent straight to the chain instead of through the chat widget.
# The raw result is kept in `r`; the exchange is not appended to chat_history.
r = qa(
    {
        "question": "what are main risks with 3b5cb84f-ea13-48be-971c-92d76276ab83?",
        "chat_history": chat_history,
    }
)


llama_print_timings:        load time =  1447.87 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  5009.66 ms /    40 tokens (  125.24 ms per token)
llama_print_timings:        eval time =   115.18 ms /     1 runs   (  115.18 ms per run)
llama_print_timings:       total time =  5138.63 ms


NoIndexException: Index not found, please create an instance before querying

NameError: name 'r' is not defined