# Install libraries

In [18]:
# Install the notebook's third-party dependencies into the kernel's environment (-q keeps pip quiet).
# NOTE(review): versions are unpinned — consider pinning them for reproducible runs.
%pip install llama-index llama-index-core llama-parse openai llama-index-llms-openai nest-asyncio python-dotenv "giskard[llm]" langchain_core ragas pyarrow -q

Note: you may need to restart the kernel to use updated packages.


# Specify API Keys

In [None]:
# Upgrade pip itself in the kernel's environment.
%pip install --upgrade pip

In [1]:
import nest_asyncio
import warnings
import os
from dotenv import load_dotenv

# Patch asyncio so that nested event loops are allowed inside the notebook.
nest_asyncio.apply()

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Load variables from a .env file (if one is found) into the process environment.
_ = load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Keep the notebook's original variable name, but also accept LLAMA_CLOUD_API_KEY,
# the name LlamaParse documents, so a standard .env file works unchanged.
LLAMAPARSE_API_KEY = os.getenv('LLAMACLOUD_API_KEY') or os.getenv('LLAMA_CLOUD_API_KEY')

# Fail fast with a readable message instead of an obscure error several cells later.
_missing_keys = [
    name
    for name, value in (
        ('OPENAI_API_KEY', OPENAI_API_KEY),
        ('LLAMACLOUD_API_KEY / LLAMA_CLOUD_API_KEY', LLAMAPARSE_API_KEY),
    )
    if not value
]
if _missing_keys:
    raise EnvironmentError(
        "Missing API key(s): " + ", ".join(_missing_keys)
        + ". Define them in the environment or in a .env file."
    )

# Loading & parsing the Insurance Policy File

In [2]:
from pathlib import Path

from llama_parse import LlamaParse

# Policy handbook to parse, relative to the notebook's working directory.
pdf_name = "../docs/pb116349-business-health-select-handbook-1024-pdfa.pdf"

# Check locally before starting a remote parsing job for a file that is not there.
if not Path(pdf_name).is_file():
    raise FileNotFoundError(f"Policy PDF not found: {pdf_name}")

# Request markdown output from LlamaParse.
parser = LlamaParse(api_key=LLAMAPARSE_API_KEY, result_type="markdown")
documents = parser.load_data(pdf_name)

# An empty result would otherwise only surface later (e.g. nodes[0] raising IndexError).
if not documents:
    raise RuntimeError(f"LlamaParse returned no documents for {pdf_name!r}")

Started parsing the file under job_id 2a1a3934-cea5-4203-a52e-f2313904ea01


In [3]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex

# Split the parsed documents into chunks of at most 1024 tokens.
splitter = SentenceSplitter(chunk_size=1024)
nodes = splitter.get_nodes_from_documents(documents)

# Pass the key explicitly, as is done for the LLM below, so both OpenAI clients
# are configured the same way instead of one relying on the environment.
embed_model = OpenAIEmbedding(model="text-embedding-3-small", api_key=OPENAI_API_KEY)
vector_index = VectorStoreIndex(nodes, embed_model=embed_model)

# NOTE: despite the "gpt4o" variable names, the model used is gpt-4o-mini.
llm_gpt4o = OpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
# Retrieve the 3 most similar chunks for every query.
query_engine_gpt4o = vector_index.as_query_engine(similarity_top_k=3, llm=llm_gpt4o)

In [4]:
# Print the text of the first chunk as a quick sanity check of the parsing/splitting.
print(nodes[0].text)

# Business

# Health Select

# Membership handbook

# October 2024


## Store documents with embeddings for later retrieval

In [5]:
# Directory (relative to the working directory) that receives the persisted index.
path = "vector_index"
# Write the index's storage context to disk under `path`.
vector_index.storage_context.persist(persist_dir=path)

## Chatting with the LLMs: GPT-4o-mini

In [6]:
# Smoke-test the query engine with a single question about the policy.
query1 = "Whats the cashback amount for dental appointments?"
resp = query_engine_gpt4o.query(query1)

# Label on the first line, answer on the next one.
print("GPT-4o-mini:", str(resp), sep="\n")

GPT-4o-mini:
The cashback amount for dental appointments is 80% of your dentist’s fees, up to £400 a year.


# Giskard AI: 1- Generating testset

## Generate a test set for the policy doc

In [7]:
import pandas as pd

In [8]:
from giskard.rag import KnowledgeBase, generate_testset, QATestset

# One row per chunk, with the chunk text in a single "text" column.
knowledge_base_df = pd.DataFrame({"text": [node.text for node in nodes]})

In [14]:
# Number of clusters (kept commented out; nothing else in the notebook uses it)
# import numpy as np
# round(2 + np.log(len(knowledge_base_df)))

7

In [9]:
import os

import giskard
from giskard.llm.client.openai import OpenAIClient

# OpenAIClient is created below without an explicit key, so expose the key
# through the environment.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Use gpt-4o-mini as Giskard's default LLM.
giskard.llm.set_llm_api("openai")
gpt4o_mini = OpenAIClient(model="gpt-4o-mini")
giskard.llm.set_default_client(gpt4o_mini)

# Pass the client object itself. The previous code passed the return value of
# set_default_client(...), which is not the client, so the model choice was only
# honoured through the global default.
knowledge_base = KnowledgeBase(knowledge_base_df, llm_client=gpt4o_mini)

In [10]:
%%time
testset = generate_testset(knowledge_base,
                           num_questions=12,
                           agent_description="A chatbot answering questions insurence policy document",)

2024-11-05 21:47:19,616 pid:43026 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-11-05 21:47:23,836 pid:43026 MainThread giskard.rag  INFO     Found 1 topics in the knowledge base.


Generating questions: 100%|██████████| 12/12 [00:23<00:00,  1.93s/it]

CPU times: user 2.51 s, sys: 103 ms, total: 2.61 s
Wall time: 27.4 s





In [11]:
# Show the first five generated samples (head() defaults to 5 rows).
testset.to_pandas().head()

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
50df1144-c2f8-4f84-a23d-934e6447c3f4,What procedures are used for monitoring cancer...,Some cancer patients need procedures to check ...,Document 28: # Follow up procedures that\n\nar...,[],"{'question_type': 'simple', 'seed_document_id'..."
92d41619-37fc-46d5-8995-91e224b9decb,Under what circumstances will you pay for Prot...,We will pay for PBT for cancer when it is in l...,Document 27: # Proton beam therapy\n\n|9 Yes|9...,[],"{'question_type': 'simple', 'seed_document_id'..."
0c65c935-2945-4e5b-b59e-b5c680adae93,If I find myself experiencing menopausal sympt...,Call us on 0800 206 1808 and we will check you...,Document 32: # 4.20 Menopausal symptoms – that...,[],"{'question_type': 'complex', 'seed_document_id..."
dc2f4173-43a8-4f83-9b09-cd7e4e42d335,Could you clarify the maximum annual coverage ...,Up to £500 a year.,Document 8: # Extra cover for Fee limited\n\nP...,[],"{'question_type': 'complex', 'seed_document_id..."
51387627-4886-4bf3-8f88-f8ca1a80ba21,What consequences might you face if you violat...,If you break any terms of your plan that we re...,Document 40: We reserve the right to deduct fr...,[],"{'question_type': 'distracting element', 'seed..."


In [12]:
# Keep the generated test set as a DataFrame for the analysis cells below.
df_testset = testset.to_pandas()

## Different types of questions: 6

In [13]:
# Lift the question type out of each row's metadata dict into its own column.
df_testset['question_type'] = df_testset['metadata'].map(lambda meta: meta['question_type'])

In [14]:
# Distinct question types, in order of first appearance.
pd.unique(df_testset['question_type'])

array(['simple', 'complex', 'distracting element', 'situational',
       'double', 'conversational'], dtype=object)

In [15]:
df_testset.groupby(['question_type'])['question'].count() # reminder: num_questions=12 ==> 6 question types * 2 questions each

question_type
complex                2
conversational         2
distracting element    2
double                 2
simple                 2
situational            2
Name: question, dtype: int64

# Giskard AI: 2- Evaluation of the RAG pipeline

https://docs.giskard.ai/en/stable/open_source/testset_generation/rag_evaluation/index.html

In [16]:
from giskard.rag import evaluate, RAGReport
from giskard.rag.metrics.ragas_metrics import ragas_context_recall, ragas_context_precision, ragas_faithfulness, ragas_answer_relevancy

In [17]:
from giskard.rag import AgentAnswer


def answer_fn(question):
    """Answer one test-set question with the RAG query engine.

    Returns an ``AgentAnswer`` holding the generated answer and the text of the
    retrieved chunks. The RAGAS metrics requested below are computed from the
    retrieved context; a plain string answer carries none.
    """
    response = query_engine_gpt4o.query(question)
    # Text of the chunks the retriever selected for this question.
    retrieved_chunks = [source.get_content() for source in response.source_nodes]
    return AgentAnswer(message=str(response), documents=retrieved_chunks)


# Run every test-set question through the agent and score the answers.
report = evaluate(
    answer_fn,
    testset=testset,
    knowledge_base=knowledge_base,
    metrics=[ragas_context_recall, ragas_context_precision, ragas_faithfulness, ragas_answer_relevancy],
)

Asking questions to the agent: 100%|██████████| 12/12 [00:21<00:00,  1.77s/it]
CorrectnessMetric evaluation: 100%|██████████| 12/12 [00:09<00:00,  1.26it/s]
RagasMetric evaluation:   0%|          | 0/12 [00:00<?, ?it/s]



RagasMetric evaluation: 100%|██████████| 12/12 [00:00<00:00, 1700.91it/s]
RagasMetric evaluation:   0%|          | 0/12 [00:00<?, ?it/s]



RagasMetric evaluation: 100%|██████████| 12/12 [00:00<00:00, 1502.80it/s]
RagasMetric evaluation:   0%|          | 0/12 [00:00<?, ?it/s]



RagasMetric evaluation: 100%|██████████| 12/12 [00:00<00:00, 1098.30it/s]
RagasMetric evaluation:   0%|          | 0/12 [00:00<?, ?it/s]



RagasMetric evaluation: 100%|██████████| 12/12 [00:00<00:00, 1418.95it/s]


In [18]:
# Display the report object itself so the notebook renders it through its rich
# HTML representation; to_html() returns a string, which display() would show
# as raw text.
display(report)

# Key Takeaways

**1-** You can specify the LLM you want to use for test set generation (see the Notebook example where I used GPT-4o-mini). If no model is specified, the default LLM is GPT-4, so be mindful of the associated costs.

**2-** Using different types of questions is an interesting approach. However, I would also find it valuable to generate the reference answers according to the question type, to obtain more precise responses. For example, I've noticed that some 'simple' and 'complex' questions result in the same answer; I would prefer to see more elaborate answers for the complex questions. In the prompt associated with the "complex" type, only the question is reformulated based on the context.

**3-** The topics extracted from the Amazon financial report are not relevant; I would prefer, for example, to find topics that match the different parts of the report, such as "Sales", "Liquidity and Capital Resources", and "Segments".


- It's a really interesting framework to consider!
