In [1]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
from ragas import evaluate
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
import time
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import TokenTextSplitter
import pandas as pd
from ragas.langchain.evalchain import RagasEvaluatorChain


# Evaluating Dataset

In [2]:
# Ragas uses gpt3.5 by default - it's possible to change LLM for metrics 
faithfulness.llm.langchain_llm = ChatOpenAI(model="gpt-3.5-turbo", request_timeout=120)
# context_precision.llm.langchain_llm = ChatOpenAI(model="gpt-3.5-turbo")
# answer_relevancy.llm.langchain_llm = ChatOpenAI(model="gpt-3.5-turbo")
context_recall.llm.langchain_llm = ChatOpenAI(model="gpt-3.5-turbo", request_timeout=120)

# Testing RagasEvaluatorChain

In [3]:
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", request_timeout=120)

loader = PyPDFLoader("../data/Batman_wiki.pdf")
pages = loader.load()

# Chunk and Embeddings
text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=0)

# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=600,
#     chunk_overlap=300,
#     separators=["\n\n", "\n", " ", ""],  # adjust these as necessary
# )

texts = text_splitter.split_documents(pages)

embeddings = OpenAIEmbeddings()

# Vector Store
db = Chroma.from_documents(documents=texts, embedding=embeddings)

# Initialise RetrievalQA Chain
chain = RetrievalQA.from_chain_type(
    llm,
    retriever=db.as_retriever(
        search_kwargs={"k": 2}
    ),  # search_type="mmr"),#search_kwargs={"k":3}),
    return_source_documents=True,
    # chain_type_kwargs={"prompt": rag_prompt_selector.get_prompt(llm)},
)

In [4]:
df = pd.read_csv("../data/batman_eval_simple.csv")
df = df.head(2)
eval_questions = df["question"].values.tolist()
eval_answers = df['answer'].values.tolist()

In [5]:
eval_answers

["Catwoman is Batman's most enduring romance throughout the years.",
 'The character Batman was created by artist Bob Kane and writer Bill Finger. He first appeared in Detective Comics on March 30, 1939.']

In [6]:
# #In order to evaluate the qa system we generated a few relevant questions and answers
# eval_questions = [
#     "I have persistent back pain since 4 weeks,I workouut but havent had any sports injury.What might be the cause of the back pain?",
#     "I have shortness of breath and frequently feel nauseated and tired.What can be the possible cause?",
#     "My 12 year old son has Poor coordination Unsteady walk and a tendency to stumble while walking and poor coordination between two hands.What might be the possible cuase?",
#     "What is Baby acne ?",
#     "What is Botulism ?",
# ]

# eval_answers = [
#     "From the symptoms mentioned you might have a disloacted disk",  # incorrect answer
#     "You might have asthama.",  # incorrect answer
#     " Movement and coordination problems associated with cerebral palsy.Please consult a doctor for better diagnosis.",
#     "Baby acne is small, inflamed bumps on a baby's face, neck, back or chest.",
#     "Botulism is a rare and potentially fatal illness caused by a toxin produced by the bacterium Clostridium botulinum.",
# ]

examples = [
    {"query": q, "ground_truths": [eval_answers[i]]}
    for i, q in enumerate(eval_questions)
]
print(examples)

[{'query': "Who is Batman's most enduring romance throughout the years?", 'ground_truths': ["Catwoman is Batman's most enduring romance throughout the years."]}, {'query': 'Who created the character Batman and when did he first appear in Detective Comics?', 'ground_truths': ['The character Batman was created by artist Bob Kane and writer Bill Finger. He first appeared in Detective Comics on March 30, 1939.']}]


In [7]:
# create evaluation chains
faithfulness_chain = RagasEvaluatorChain(metric=faithfulness)
answer_relevancy_chain = RagasEvaluatorChain(metric=answer_relevancy)
context_precision_chain = RagasEvaluatorChain(metric=context_precision)
context_recall_chain = RagasEvaluatorChain(metric=context_recall)

In [8]:
predictions = chain.batch(examples)
predictions

[{'query': "Who is Batman's most enduring romance throughout the years?",
  'ground_truths': ["Catwoman is Batman's most enduring romance throughout the years."],
  'result': "Batman's most enduring romance throughout the years is with Catwoman, also known as Selina Kyle.",
  'source_documents': [Document(page_content="Main article: Catwoman While most of Batman's romantic relationships tend to be short in duration, Catwoman has been his most enduring romance throughout the years.[112] The attraction between Batman and Catwoman, whose real name is Selina Kyle, is present in nearly every version and medium in which the characters appear, including a love story between their two secret identities as early as in the 1966 film Batman. Although Catwoman is typically portrayed as a villain, Batman and Catwoman have worked together in achieving", metadata={'page': 6, 'source': '../data/Batman_wiki.pdf'}),
   Document(page_content=' reputation as a manufactured illusion to support his mission 

In [9]:
time_start = time.time()

In [10]:
faithfulness_scores = faithfulness_chain.evaluate(examples, predictions)
faithfulness_scores

100%|██████████| 1/1 [00:31<00:00, 31.34s/it]


[{'faithfulness_score': 1.0}, {'faithfulness_score': 1.0}]

In [11]:
for i, score in enumerate(faithfulness_scores):
    predictions[i].update(score)

In [12]:
context_precision_scores = context_precision_chain.evaluate(examples, predictions)
context_precision_scores

100%|██████████| 1/1 [00:05<00:00,  5.92s/it]


[{'context_precision_score': 0.14285714285714285},
 {'context_precision_score': 0.125}]

In [13]:
for i, score in enumerate(context_precision_scores):
    predictions[i].update(score)

In [14]:
answer_relevancy_scores = answer_relevancy_chain.evaluate(examples, predictions)
answer_relevancy_scores

100%|██████████| 1/1 [00:04<00:00,  4.52s/it]


[{'answer_relevancy_score': 0.952090879451938},
 {'answer_relevancy_score': 0.9846561788440309}]

In [15]:
for i, score in enumerate(answer_relevancy_scores):
    predictions[i].update(score)

In [16]:
context_recall_scores = context_recall_chain.evaluate(examples, predictions)
context_recall_scores

100%|██████████| 1/1 [00:09<00:00,  9.01s/it]


[{'context_recall_score': 1.0}, {'context_recall_score': 1.0}]

In [17]:
for i, score in enumerate(context_recall_scores):
    predictions[i].update(score)

In [18]:
df_scores = pd.DataFrame(predictions)
df_scores

Unnamed: 0,query,ground_truths,result,source_documents,faithfulness_score,context_precision_score,answer_relevancy_score,context_recall_score
0,Who is Batman's most enduring romance througho...,[Catwoman is Batman's most enduring romance th...,Batman's most enduring romance throughout the ...,"[page_content=""Main article: Catwoman While mo...",1.0,0.142857,0.952091,1.0
1,Who created the character Batman and when did ...,[The character Batman was created by artist Bo...,The character Batman was created by artist Bob...,"[page_content=""Batman[a] is a superhero appear...",1.0,0.125,0.984656,1.0


In [19]:
# Display average scores
mean_faithfulness = df_scores['faithfulness_score'].mean()
mean_context_precision = df_scores['context_precision_score'].mean()
mean_answer_relevancy = df_scores['answer_relevancy_score'].mean()
mean_context_recall = df_scores['context_recall_score'].mean()

print(f"mean_faithfulness: {mean_faithfulness}")
print(f"mean_context_precision: {mean_context_precision}")
print(f"mean_answer_relevancy: {mean_answer_relevancy}")
print(f"mean_context_recall: {mean_context_recall}")

mean_faithfulness: 1.0
mean_context_precision: 0.13392857142857142
mean_answer_relevancy: 0.9683735291479845
mean_context_recall: 1.0


In [20]:
time_to_evaluate = time.time() - time_start
time_to_evaluate

51.124900102615356

# Notes
- Multiple options for evaluation
    1) Run RAG in a loop over each example question.
    - store questions and ground truths (from testsetGenerator), and contexts and answers in separate lists
    - Create Dataset from_dict like dataset = Dataset.from_dict(data_samples)
    - run Ragas Evaluate(Dataset) 
    2) Langchain Integration: Use RagasEvaluatorChain directly on results of QA Chain 

# Scratch

In [None]:
from datasets import load_dataset
dataset = load_dataset("csv", data_files="../data/batman_eval_simple.csv")
dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'context', 'answer', 'question_type', 'episode_done'],
        num_rows: 10
    })
})

In [None]:
fiqa_eval = load_dataset("explodinggradients/fiqa", "ragas_eval")
fiqa_eval

DatasetDict({
    baseline: Dataset({
        features: ['question', 'ground_truths', 'answer', 'contexts'],
        num_rows: 30
    })
})

In [None]:
result = evaluate(
    dataset['train'], # fiqa_eval["baseline"].select(range(3)), # selecting only 3
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        # context_recall,
    ],
)

result

ValueError: Dataset feature "contexts" should be of type Sequence[string], got <class 'datasets.features.features.Value'>

In [None]:
df = result.to_pandas()
df.head()

Unnamed: 0,question,contexts,answer,ground_truths,context_precision,faithfulness,answer_relevancy,context_recall
0,How to deposit a cheque issued to an associate...,[Just have the associate sign the back and the...,\nThe best way to deposit a cheque issued to a...,[Have the check reissued to the proper payee.J...,0.411765,0.666667,0.977489,0.111111
1,Can I send a money order from USPS as a business?,[Sure you can. You can fill in whatever you w...,"\nYes, you can send a money order from USPS as...",[Sure you can. You can fill in whatever you w...,0.285714,1.0,0.884017,0.8
2,1 EIN doing business under multiple business n...,[You're confusing a lot of things here. Compan...,"\nYes, it is possible to have one EIN doing bu...",[You're confusing a lot of things here. Compan...,0.25,1.0,0.927708,1.0
