In [25]:
# Modules to Import
import re
from io import BytesIO
from typing import List, Union

import pandas as pd
from datasets import Dataset, Features, Sequence, Value
from giskard.rag import generate_testset
from giskard.rag import KnowledgeBase
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pypdf import PdfReader
from ragas import evaluate

from ChatBot import ChatBot
from EvaluateChatBot import EvalChatBot

In [26]:

def parse_pdf(file: BytesIO) -> List[str]:
    """
    Read a PDF and return its text, one cleaned-up string per page.

    Args:
        file: whatever is handed straight to ``PdfReader``. The annotation
            says ``BytesIO``; this notebook passes a path string instead.

    Returns:
        A list with one entry per page, in page order. Each entry is the
        page's extracted text after three regex clean-up passes.
    """
    reader = PdfReader(file)
    cleaned_pages = []
    for pdf_page in reader.pages:
        raw_text = pdf_page.extract_text()
        # Re-join words that were hyphenated across a line break ("exam-\nple").
        dehyphenated = re.sub(r"(\w+)-\n(\w+)", r"\1\2", raw_text)
        # Turn a lone newline into a space, unless it sits next to a
        # newline+whitespace pair (i.e. looks like a paragraph break).
        unwrapped = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", dehyphenated.strip())
        # Normalise any newline / optional whitespace / newline run to "\n\n".
        cleaned_pages.append(re.sub(r"\n\s*\n", "\n\n", unwrapped))
    return cleaned_pages

def text_to_docs(text: Union[str, List[str]]) -> List[Document]:
    """
    Convert a string, or a list of page strings, into chunked Documents.

    Each input string is treated as one page; pages are numbered from 1 in
    input order. Every page is split with a ``RecursiveCharacterTextSplitter``
    and each resulting chunk becomes a ``Document`` with this metadata:

    - ``"page"``:   1-based number of the page the chunk came from
    - ``"chunk"``:  0-based index of the chunk within its page
    - ``"source"``: ``"<page>-<chunk>"``

    Args:
        text: a single string (taken as one page) or a list of page strings.

    Returns:
        A flat list of chunk Documents, ordered by page and then by chunk.
    """
    if isinstance(text, str):
        # A bare string is wrapped so it is processed as a single page.
        # (Original author's note: this branch matters for answer accuracy.)
        text = [text]

    # The splitter configuration does not depend on the page, so build it once
    # instead of once per page.
    text_splitter = RecursiveCharacterTextSplitter(
        # 7000 was settled on by the original author after experimenting with
        # and evaluating different chunk sizes.
        chunk_size=7000,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )

    doc_chunks = []
    for page_number, page_text in enumerate(text, start=1):
        chunks = text_splitter.split_text(page_text)
        for chunk_index, chunk in enumerate(chunks):
            # Build the metadata from the loop counters directly, rather than
            # reading it back from a Document that shadows the loop variable.
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page": page_number,
                        "chunk": chunk_index,
                        "source": f"{page_number}-{chunk_index}",
                    },
                )
            )
    return doc_chunks

#! --------------------- generate test set ----------------------------------------
# Parse the source PDF into per-page text, then into chunk Documents.
uploaded_file = "../../pdfData/Cells and Chemistry of Life.pdf"
doc = parse_pdf(uploaded_file)
documentList = text_to_docs(doc)

# The knowledge base is built from a single-column frame holding each chunk's text.
chunk_texts = [chunk_doc.page_content for chunk_doc in documentList]
df = pd.DataFrame(chunk_texts, columns=["text"])
knowledge_base = KnowledgeBase(df)

# Ask giskard to generate 30 questions from the knowledge base.
# The agent description can be tuned for better question quality.
testset = generate_testset(
    knowledge_base,
    num_questions=30,
    agent_description="A chatbot answer all questions related to data",
)

# Later cells read the test set through this DataFrame.
test_set_df = testset.to_pandas()

2024-09-08 18:50:33,158 pid:18884 MainThread giskard.rag  INFO     Finding topics in the knowledge base.


  warn(


2024-09-08 18:50:39,746 pid:18884 MainThread giskard.rag  INFO     Found 4 topics in the knowledge base.


Generating questions:   0%|          | 0/30 [00:00<?, ?it/s]

In [27]:
# Display the generated test set (the DataFrame produced by testset.to_pandas()).
test_set_df

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bb73e4b0-7e9d-4035-b950-68ef7e20adae,What is the relationship between the surface a...,The greater the surface area-to-volume ratio o...,Document 11: 24Chapter 2 Cells and the Chemist...,[],"{'question_type': 'simple', 'seed_document_id'..."
d7a421fd-f55b-41ad-8ce4-05a08f38215c,What is cellular totipotency and how is it use...,Cellular totipotency is the amazing ability of...,Document 6: 14 Chapter 1 Cells and the Chemist...,[],"{'question_type': 'simple', 'seed_document_id'..."
67c84a3f-9334-4961-91bb-6075f01735d2,What is the process by which a cell becomes sp...,The process by which a cell becomes specialise...,Document 5: 12 Chapter 1 Cells and the Chemist...,[],"{'question_type': 'simple', 'seed_document_id'..."
82842691-5e16-4fcc-ad85-d8b5da1a28bb,What is the process by which water molecules m...,The process is called osmosis.,Document 13: 28 Chapter 2 Cells and the Chemis...,[],"{'question_type': 'simple', 'seed_document_id'..."
42768204-8b71-4ada-aa71-dac201491913,What is the term for the diffusion of water mo...,The diffusion of water molecules across a part...,Document 13: 28 Chapter 2 Cells and the Chemis...,[],"{'question_type': 'simple', 'seed_document_id'..."
9ea7b8ca-a60f-47e6-abc2-84d4d4f34384,Could you elaborate on the key distinctions be...,Active transport is the process in which energ...,Document 17: 36 Chapter 2 Cells and the Chemis...,[],"{'question_type': 'complex', 'seed_document_id..."
74ebfbef-4769-4bd2-850a-76e52a39e13f,Could you explain the distinct characteristics...,Glucose and fructose both have the same chemic...,Document 20: 42 Chapter 3 Cells and the Chemis...,[],"{'question_type': 'complex', 'seed_document_id..."
249e11a2-3c58-4d15-b75d-440ddb8e91b5,What is the impact on the functionality of an ...,"When an enzyme is denatured, it loses or alter...",Document 31: 64 Chapter 4 Cells and the Chemis...,[],"{'question_type': 'complex', 'seed_document_id..."
bd3948b1-bfbf-45e6-8d2e-45f20192acfd,What is the impact on the functionality of an ...,Denaturation results in the loss or alteration...,Document 31: 64 Chapter 4 Cells and the Chemis...,[],"{'question_type': 'complex', 'seed_document_id..."
02937847-fe25-41d4-b362-a060904dd472,What are the consequences on an enzyme's funct...,Increasing the temperature above the optimum c...,Document 32: 66 Chapter 4 Cells and the Chemis...,[],"{'question_type': 'complex', 'seed_document_id..."


In [28]:
# Persist the generated test set next to the notebook.
# index=False: the DataFrame index is not written to the CSV.
df = pd.DataFrame(data=test_set_df)
df.to_csv(path_or_buf="./test.csv", index=False)

In [29]:
# Pull the questions and their reference answers out of the test set as plain
# Python lists; both come from the same frame, so they stay aligned by row.
questionList = test_set_df["question"].tolist()
ground_truthList = test_set_df["reference_answer"].tolist()
# ListStringContexts = list(test_set_df["reference_context"])

In [31]:
import time
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision
)
# Run every generated question through both entry points and collect, per
# question, the chatbot's answer and the contexts reported by EvalChatBot.
pdf_path = "../../pdfData/Cells and Chemistry of Life.pdf"

answer_list = []
context_list = []
for question in questionList:
    # Named eval_output rather than `dict`, so the builtin is not shadowed
    # for the rest of the kernel session.
    eval_output = EvalChatBot(pdf_path, question)
    time.sleep(1)  # pause between calls, as in the original cell
    answer = ChatBot(pdf_path, question)
    time.sleep(1)
    answer_list.append(answer)
    context_list.append(eval_output['contexts'])

# NOTE(review): the three names below are rebound to keep the notebook's
# namespace as it was, but no later cell visible here reads `contexts`.
# The re-parse looks like a candidate for removal - confirm before deleting.
uploaded_file = pdf_path
doc = parse_pdf(uploaded_file)

contexts = doc



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to provide information on the relationship between the surface area-to-volume ratio of a cell and the rate at which substances move in and out of it.
Action: Personalized QA Chat System
Action Input: What is the relationship between the surface area-to-volume ratio of a cell and the rate at which substances move in and out of it?[0m
Observation: [36;1m[1;3m The greater the surface area-to-volume ratio of a cell, the higher the rate at which substances move in and out of it due to the increased surface area available for absorption.[0m
Thought:[32;1m[1;3mI now know the final answer
Final Answer: The greater the surface area-to-volume ratio of a cell, the higher the rate at which substances move in and out of it due to the increased surface area available for absorption.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to provide information on cel

In [32]:
# ListStringContexts

In [33]:
# contextsList = []
# for contexts in ListStringContexts:
#     contextsList.append([''.join(contexts)])

In [34]:


# Assemble the evaluation table: one row per generated question.
# Each entry of "contexts" must itself be a list of strings.
data = {
    "question": questionList,          # questions from the generated test set
    "answer": answer_list,             # answers returned by ChatBot
    "contexts": context_list,          # 'contexts' entries returned by EvalChatBot
    "ground_truth": ground_truthList,  # reference answers from the test set
}

# Spell out the schema so "contexts" is typed as a sequence of strings
# rather than left to type inference.
features = Features(
    {
        "question": Value("string"),
        "answer": Value("string"),
        "contexts": Sequence(Value("string")),
        "ground_truth": Value("string"),
    }
)

# Build the typed Dataset and score it with the four ragas metrics.
dataset = Dataset.from_dict(data, features=features)

result = evaluate(
    dataset,
    metrics=[context_precision, faithfulness, answer_relevancy, context_recall],
)

print(result)


Evaluating:   0%|          | 0/120 [00:00<?, ?it/s]

{'context_precision': 0.8241, 'faithfulness': 0.7647, 'answer_relevancy': 0.9182, 'context_recall': 0.9333}
