In [94]:
# Modules to Import
import re
from io import BytesIO
from typing import List, Union

import pandas as pd
from datasets import Dataset, Features, Sequence, Value
from giskard.rag import generate_testset
from giskard.rag import KnowledgeBase
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pypdf import PdfReader
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision
)

from EvaluateChatbot import ChatBot

In [95]:

def parse_pdf(file: BytesIO) -> List[str]:
    """
    Read a PDF and return its text, one cleaned string per page.

    Args:
        file: The PDF to read. It is handed directly to ``PdfReader``;
            this notebook calls the function with a path string.

    Returns:
        A list with one entry per page, in page order. Each entry has had
        hyphenated line breaks re-joined, single line breaks inside a
        sentence turned into spaces, and blank-line runs collapsed to
        exactly one empty line.
    """

    def _normalise(raw_text: str) -> str:
        # Re-join words that were split across lines with a hyphen ("exam-\nple").
        dehyphenated = re.sub(r"(\w+)-\n(\w+)", r"\1\2", raw_text)
        # A lone newline is a wrapped line, not a paragraph break: make it a space.
        unwrapped = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", dehyphenated.strip())
        # Whatever paragraph breaks remain are normalised to one blank line.
        return re.sub(r"\n\s*\n", "\n\n", unwrapped)

    reader = PdfReader(file)
    return [_normalise(page.extract_text()) for page in reader.pages]

def text_to_docs(text: Union[str, List[str]], chunk_size: int = 7000) -> List[Document]:
    """
    Convert page text into a flat list of chunk-level Documents with metadata.

    Args:
        text: Either a single string (treated as one page) or a list of
            strings, one per page, in page order.
        chunk_size: Chunk size passed to ``RecursiveCharacterTextSplitter``.
            The default of 7000 is the value the notebook author settled on
            after experimenting and evaluating.

    Returns:
        One Document per chunk, ordered by page and then by chunk. Each
        Document's metadata holds ``page`` (1-based page number), ``chunk``
        (0-based index within that page) and ``source``
        (``"<page>-<chunk>"``).
    """
    # A bare string is treated as a single page so the loop below always
    # iterates over pages. The original author flagged this normalisation as
    # important for the quality of the model's responses.
    pages = [text] if isinstance(text, str) else text

    # The splitter configuration is identical for every page: build it once.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )

    doc_chunks = []
    for page_number, page_text in enumerate(pages, start=1):
        # Distinct names for page and chunk keep the page number from being
        # read off a previously built chunk Document.
        for chunk_index, chunk in enumerate(text_splitter.split_text(page_text)):
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page": page_number,
                        "chunk": chunk_index,
                        "source": f"{page_number}-{chunk_index}",
                    },
                )
            )
    return doc_chunks

#! --------------------- generate test set ----------------------------------------
# Parse the source PDF into cleaned page strings, then split those into chunks.
uploaded_file = "../../pdfData/Cells and Chemistry of Life.pdf"
doc = parse_pdf(uploaded_file)
documentList = text_to_docs(doc)

# The knowledge base is built from a one-column frame: one chunk of text per row.
df = pd.DataFrame(
    data=[chunk.page_content for chunk in documentList],
    columns=["text"],
)
knowledge_base = KnowledgeBase(df)

# Generate the question / reference-answer test set from the knowledge base.
testset = generate_testset(
    knowledge_base,
    num_questions=50,
    agent_description="A chatbot answer all questions related to data",  #! can be tuned for better performance
)

# Tabular view of the generated test set, used by the cells that follow.
test_set_df = testset.to_pandas()

2024-09-07 22:56:40,783 pid:4720 MainThread giskard.rag  INFO     Finding topics in the knowledge base.


  warn(


2024-09-07 22:56:46,480 pid:4720 MainThread giskard.rag  INFO     Found 3 topics in the knowledge base.


Generating questions:   0%|          | 0/50 [00:00<?, ?it/s]

In [96]:
# Preview only the first rows instead of rendering the whole test set.
test_set_df.head(10)

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
703ef61c-e632-4729-976d-50d7980734b1,What is the impact of the surface area-to-volu...,The rate of movement of a substance across a c...,Document 11: 24Chapter 2 Cells and the Chemist...,[],"{'question_type': 'simple', 'seed_document_id'..."
858c3410-d59b-40fe-b7c4-b2f80a4ada6a,What is the optimum pH for the activity of pro...,The optimum pH for the activity of protease in...,Document 34: 70 Chapter 4 Cells and the Chemis...,[],"{'question_type': 'simple', 'seed_document_id'..."
275eae20-5e02-457e-84ed-af3f16e60352,What is the impact of a cell's surface area-to...,The rate of movement of a substance across a c...,Document 11: 24Chapter 2 Cells and the Chemist...,[],"{'question_type': 'simple', 'seed_document_id'..."
c26c7fa8-bab4-42b1-a9dc-535b251dcd2a,What is the difference between diffusion and a...,Diffusion is the net movement of particles fro...,Document 9: 20 Chapter 2 Cells and the Chemist...,[],"{'question_type': 'simple', 'seed_document_id'..."
78e80489-7821-44fe-8f82-23970942b8a3,What is the difference between glucose and fru...,Glucose and fructose are both single sugars wi...,Document 20: 42 Chapter 3 Cells and the Chemis...,[],"{'question_type': 'simple', 'seed_document_id'..."
c9de0e1f-cbed-4513-ab88-91664ee2456c,Why is it recommended to use lukewarm water in...,The recommendation to use lukewarm water inste...,Document 30: 62 Chapter 4 Cells and the Chemis...,[],"{'question_type': 'simple', 'seed_document_id'..."
596636cf-fba3-46fb-983d-aaf990d02da7,What is the current system for naming enzymes?,"Nowadays, enzymes are named according to a sci...",Document 30: 62 Chapter 4 Cells and the Chemis...,[],"{'question_type': 'simple', 'seed_document_id'..."
518bfa70-5b7d-46ce-82ce-4c3661b88786,What is the effect of placing a plant cell in ...,When a plant cell is placed in a solution with...,Document 14: 30 Chapter 2 Cells and the Chemis...,[],"{'question_type': 'simple', 'seed_document_id'..."
fab581b8-f1f8-40a0-992a-55637bc6643d,What are enzymes and where can they be found?,Enzymes are biological catalysts. They can be ...,Document 27: 56 Chapter 3 Cells and the Chemis...,[],"{'question_type': 'simple', 'seed_document_id'..."
598f3bf0-a02b-4d03-b87f-a8cba0b51138,What scientific term is used to describe the p...,The process is called osmosis.,Document 13: 28 Chapter 2 Cells and the Chemis...,[],"{'question_type': 'complex', 'seed_document_id..."


In [97]:
# Pull the columns needed for evaluation out of the test set as plain lists.
questionList = test_set_df["question"].tolist()
ground_truthList = test_set_df["reference_answer"].tolist()
ListStringContexts = test_set_df["reference_context"].tolist()

In [98]:

# Ask the chatbot every generated question and keep the answers in question order.
uploaded_file = "../../pdfData/Cells and Chemistry of Life.pdf"
answer_list = [ChatBot(uploaded_file, question) for question in questionList]

# `doc` already holds the parsed pages of this same PDF from the test-set
# generation cell, so it is reused here instead of parsing the file again.
contexts = doc



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to use the Personalized QA Chat System to answer this question.
Action: Personalized QA Chat System
Action Input: What is the impact of the surface area-to-volume ratio on a cell's intake of nutrients and oxygen?[0m
Observation: [36;1m[1;3m The surface area-to-volume ratio greatly impacts a cell's ability to intake nutrients and oxygen, with a higher ratio leading to more efficient exchange and a lower ratio resulting in a slower intake of these essential substances.[0m
Thought:[32;1m[1;3mI now know the final answer
Final Answer: The surface area-to-volume ratio greatly impacts a cell's ability to intake nutrients and oxygen, with a higher ratio leading to more efficient exchange and a lower ratio resulting in a slower intake of these essential substances.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find the optimum pH for the activity of

In [99]:
# Show a small sample only: every reference context is a long passage and
# printing the whole list floods the cell output.
ListStringContexts[:3]

['Document 11: 24Chapter 2 Cells and the Chemistry of LifeMovement of SubstancesChapter 2 Let’s Investigate 2.2Calculating Surface Area-to-volume Ratios1Consider three cubes of sides 1 cm, 2 cm, and 3 cm respectively. We use three cubes to represent cells of different sizes.2Complete Table 2.1. Table 2.1 Surface area-to-volume ratio of three cubes representing cells of different sizesCubesides 1 cm sides 2 cmsides 3 cmSurface areaArea of one face= 1 cm × 1 cm = 1 cm2Surface area= number of faces ×area of one face= 6 × 1 cm2= 6 cm2VolumeVolume= 1 cm × 1 cm × 1 cm= 1 cm3Surface area-to-volume ratio6 cm2: 1 cm3 Helpful Note A cube has six square faces. To find the surface area of a cube, find the area of one face and multiply by six. From our calculations, we can observe that as the cube becomes bigger, the surface area does not increase in the same proportion as the volume. The cube of sides 1 cm has 6 cm2of surface area to 1 cm3volume. But the cube of sides 3 cm has only 2 cm2of surface

In [100]:
# The dataset's `contexts` column is a sequence of strings per question, so
# each reference context string is wrapped in a single-element list.
# A comprehension keeps the loop variable from overwriting the notebook-level
# `contexts` name.
contextsList = [[context] for context in ListStringContexts]

In [101]:


# Every column needs exactly one entry per generated question; fail early
# with a readable message if the lists have drifted apart.
assert (
    len(questionList) == len(answer_list) == len(contextsList) == len(ground_truthList)
), "question, answer, contexts and ground_truth must all have the same length"

# Columns consumed by the evaluation. `contexts` must be a list of strings per row.
# NOTE(review): `contexts` holds the test set's reference contexts, not
# passages retrieved by the chatbot, so the context_* scores may not reflect
# the chatbot's own retrieval. Confirm this is intended.
data = {
    "question": questionList,          # questions from the generated test set
    "answer": answer_list,             # responses returned by ChatBot
    "contexts": contextsList,          # one single-item list of reference context per question
    "ground_truth": ground_truthList,  # reference answers from the generated test set
}

# Define the features explicitly to ensure correct data types
features = Features({
    "question": Value("string"),
    "answer": Value("string"),
    "contexts": Sequence(Value("string")),  # contexts is a sequence of strings
    "ground_truth": Value("string")
})

# Convert the dictionary to a Dataset with the specified features
dataset = Dataset.from_dict(data, features=features)

# Score the chatbot's answers with the four ragas metrics
result = evaluate(
    dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall
    ],
)

print(result)


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

{'context_precision': 0.9800, 'faithfulness': 0.5343, 'answer_relevancy': 0.8824, 'context_recall': 1.0000}
