### Retriever

In [1]:
from langchain_community.vectorstores import FAISS as VectorStore
from langchain_community.embeddings import GPT4AllEmbeddings

# Location of the persisted FAISS index, relative to this notebook, and the
# number of documents the retriever returns per query.
VECTOR_STORE_DIR = "../../retrieve/vector_store"
TOP_K = 4

# The same embedding object is handed to the store so that queries are embedded
# by the store with it at search time.
embeddings = GPT4AllEmbeddings()

# SECURITY: allow_dangerous_deserialization=True lets load_local unpickle the
# saved index. Unpickling can execute arbitrary code, so only point this at a
# vector store that was produced locally / by a trusted source.
store = VectorStore.load_local(
    VECTOR_STORE_DIR,
    embeddings,
    allow_dangerous_deserialization=True,
)
retriever = store.as_retriever(search_kwargs={"k": TOP_K})

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


### Model

In [2]:
# Optional: uncomment to load environment variables from a local .env file.
# import dotenv
# dotenv.load_dotenv()

In [3]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# Wrap a local Hugging Face text-generation pipeline as a LangChain LLM.
# The first run fetches the tokenizer and model files for the given model id.
generation_kwargs = dict(max_new_tokens=10)  # cap on tokens generated per call

llm = HuggingFacePipeline.from_model_id(
    task="text-generation",
    model_id="bigcode/starcoder2-3b",
    pipeline_kwargs=generation_kwargs,
)

2024-04-16 12:20:40.713967: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


tokenizer_config.json:   0%|          | 0.00/7.88k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/442k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/12.1G [00:00<?, ?B/s]

### Chains
Based on the LangChain guide on adding memory to a chain with multiple inputs: https://python.langchain.com/docs/modules/memory/adding_memory_chain_multiple_inputs/

In [4]:
from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferMemory
from langchain_core.prompts import PromptTemplate

# Prompt text for the QA chain. It exposes three placeholders: {context},
# {chat_history} and {human_input}.
template = (
    "You are a chatbot having a conversation with a human.\n"
    "\n"
    "Given the following extracted parts of a long document and a question, create a final answer.\n"
    "\n"
    "{context}\n"
    "\n"
    "{chat_history}\n"
    "Human: {human_input}\n"
    "Chatbot:"
)

# The chain receives more than one input, so the memory is told explicitly
# which input holds the user's message (input_key) and under which prompt
# variable it should expose the stored conversation (memory_key).
memory = ConversationBufferMemory(
    input_key="human_input",
    memory_key="chat_history",
)

prompt = PromptTemplate(
    template=template,
    input_variables=["chat_history", "human_input", "context"],
)

# "stuff" chain type with the custom prompt and the conversation memory attached.
chain = load_qa_chain(
    llm=llm,
    chain_type="stuff",
    prompt=prompt,
    memory=memory,
)

In [5]:
%time
query = "How install the chatbots"
docs = retriever.invoke(query)

chain.invoke({"input_documents": docs, "human_input": query}) #, return_only_outputs=True) #17m 22.9s

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 18.1 µs


{'input_documents': [Document(page_content="The chatbots are in their respective folders with an application launcher `autorun.sh` to install each of them without specific knowledge.  \nOn Mac, open the terminal and type:\n```shell\ncd\n```\nDrag the **`folder`** containing the file `autorun.sh`, then press the Enter key (↩︎).  \n_If you have done it correctly, the **`~`** between your machine's name (`name@MacBook-Pro-of-Name`) and the **`%`** sign should display the name of the `folder` instead._  \nExecute the following line of code by pressing the Enter key (↩︎):\n```shell\nsh autorun.sh\n```\nWait a moment, the model should open in your default web browser.", metadata={'Header 1': 'Models', 'Header 2': 'Installation', 'Header 3': 'Streamlit & FastAPI'}),
  Document(page_content='The subfolders listed below contain key steps in our research for creating a functional, convenient, and maintainable chatbot.', metadata={'Header 1': 'Onboarding Bot Model', 'Header 2': 'Structure'}),
  D