# Lab

```text
- Create a chatbot
```

In [1]:
# Built-in library
import itertools
import re
import json
from typing import Any, Dict, List, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
from pprint import pprint
import pandas as pd

# Visualization
import matplotlib.pyplot as plt


# pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

### Steps

```text
- Load data
- Split data
- Store the data (i.e. embeddings).
- Retrieve the data.
- Add memory
- Engineer prompt template
```

<br>


```sh
pip install jq
```

In [3]:
fp = "../../data/res.json"

with open(fp, "r") as f:
    json_data = json.load(f)

In [4]:
pprint(json_data)

{'data': {'behaviouralAnalysis': {'accountSweep': 'No',
                                  'gamblingRate': 0,
                                  'inflowOutflowRate': 'Negative Cash Flow',
                                  'loanAmount': 74975,
                                  'loanInflowRate': 0.29,
                                  'loanRepaymentInflowRate': 0.22,
                                  'loanRepayments': 58073.03,
                                  'topIncomingTransferAccount': '08165850038',
                                  'topTransferRecipientAccount': 'Faith '
                                                                 'Oluwaseyi'},
          'cashFlowAnalysis': {'accountActivity': 0.44,
                               'averageBalance': 8954.79,
                               'averageCredits': 10469.456400000001,
                               'averageDebits': 3224.3595121951216,
                               'averageMonthlyCredit': [{'amount': 0.71,
                

In [18]:
from langchain.document_loaders import JSONLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)

CHUNK_SIZE = 2_000
CHUNK_OVERLAP = 50

# Load JSON data
loader = JSONLoader(
    file_path=fp,
    jq_schema=".data",
    text_content=False,
)

data = loader.load()

# Embedding model
embedding = OpenAIEmbeddings()
persist_directory = "../../data/doc_db/"

# Split the doc(s)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", r"\{\}", ""],
)
splits = text_splitter.split_documents(documents=data)
len(splits)

2

In [19]:
# Create vectorized db
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory,
)

# Remove previously stored data
!rm -rf ../../data/doc_db/

# Save data
vectordb.persist()

# Number of docs
print(vectordb._collection.count())

2


In [20]:
from langchain.callbacks import get_openai_callback


def count_tokens(chain: Any, query:str) -> Any:
    """This is used to count the number of tokens sent to the LLM."""
    with get_openai_callback() as cb:
        result = chain.run(query)
        print(f"Spent a total of {cb.total_tokens!r} tokens")

    return result

In [21]:
from langchain.memory import (
    ConversationBufferWindowMemory,
    ConversationTokenBufferMemory,
)
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI


llm = ChatOpenAI(temperature=0)

# Create memory
memory = ConversationBufferWindowMemory(
    k=5,  # window size
    input_key="question",
    memory_key="chat_history",
)

retriever = vectordb.as_retriever()

In [22]:
# Init Chatbot
decide_bot = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,  # retrieve data from data source
    verbose=False,
    chain_type_kwargs={
        "document_separator": "<<<<>>>>>",
        "memory": memory,
    },
)


question = "What is the maxEMIEligibility of the customer?"
# result = count_tokens(qa_bot, {"query": question})
result = decide_bot({"query": question})

result

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


{'query': 'What is the maxEMIEligibility of the customer?',
 'result': 'The maximum EMI eligibility of the customer is 2898.'}

In [25]:
question = "What is the average monthly debit?"
result = decide_bot({"query": question})

result

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


{'query': 'What is the average monthly debit?',
 'result': 'The average monthly debit is as follows:\n\n- February: ₦3,065.48\n- March: ₦1,599.84\n- April: ₦5,705.34\n- May: ₦5,401.95\n\nPlease note that these values are in Nigerian Naira (NGN).'}

In [26]:
question = "What is the averageMonthlyDebit?"  # It's better to use the exact variable names
result = decide_bot({"query": question})

result


Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


{'query': 'What is the averageMonthlyDebit?',
 'result': 'The averageMonthlyDebit is [{"month": "2022-04", "amount": 5705.34}, {"month": "2022-05", "amount": 5401.95}, {"month": "2022-02", "amount": 383.19}, {"month": "2022-03", "amount": 1599.84}].'}

In [27]:
question = "What is the customer ID and is the customer a salary earner?"
result = decide_bot({"query": question})

result

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


{'query': 'What is the customer ID and is the customer a salary earner?',
 'result': 'The customer ID is not provided in the given context. However, based on the information provided, the customer is a salary earner.'}

In [28]:
question = "Are there any recurringExpenses in the transaction data? If yes, list them"
result = decide_bot({"query": question})

result

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


{'query': 'Are there any recurringExpenses in the transaction data? If yes, list them',
 'result': 'Yes, there are recurring expenses in the transaction data. The recurring expenses are as follows:\n\n1. Amount: 10026.88, Description: NEXTGENUSR TRF/Payment of money/FRM FAITH OLUWASEYI OLODU TO AMEENAH OYINKANSOLA OYESOJI- 033\n2. Amount: 2010.75, Description: NEXTGENUSR TRF/From Benson/FRM FAITH OLUWASEYI OLODU TO CHINENYE JENNIFER MADU- 057\n3. Amount: 2500, Description: FLEXSWITCH WT|KENNETH UGOCHI UNACHI ABAKALIKI NG'}

In [29]:
question = "Which month had the highest totalMonthlyDebit and what was the amount?"
result = decide_bot({"query": question})

result

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


{'query': 'Which month had the highest totalMonthlyDebit and what was the amount?',
 'result': 'The month with the highest totalMonthlyDebit was April, and the amount was 114106.87 NGN.'}

In [30]:
# Most recent chat history.
pprint(memory.load_memory_variables({}))

{'chat_history': 'Human: What is the average monthly debit?\n'
                 'AI: The average monthly debit is as follows:\n'
                 '\n'
                 '- February: ₦3,065.48\n'
                 '- March: ₦1,599.84\n'
                 '- April: ₦5,705.34\n'
                 '- May: ₦5,401.95\n'
                 '\n'
                 'Please note that these values are in Nigerian Naira (NGN).\n'
                 'Human: What is the averageMonthlyDebit?\n'
                 'AI: The averageMonthlyDebit is [{"month": "2022-04", '
                 '"amount": 5705.34}, {"month": "2022-05", "amount": 5401.95}, '
                 '{"month": "2022-02", "amount": 383.19}, {"month": "2022-03", '
                 '"amount": 1599.84}].\n'
                 'Human: What is the customer ID and is the customer a salary '
                 'earner?\n'
                 'AI: The customer ID is not provided in the given context. '
                 'However, based on the information provided,