In [1]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
from ragas import evaluate
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
import time
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import TokenTextSplitter
import pandas as pd
from ragas.langchain.evalchain import RagasEvaluatorChain
from mlflow import MlflowClient
from pprint import pprint
from langchain.llms import LlamaCpp


# Configure MLflow Tracking Client

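Before running the cells below, start the MLflow tracking server from a terminal with the command `mlflow server`. The cells below connect to it at `http://127.0.0.1:5000`.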

In [2]:
# Low-level MLflow client bound to the local tracking server; the server must
# already be running at this URI.
client = MlflowClient(tracking_uri="http://127.0.0.1:5000")

In [3]:
# Metadata for the MLflow experiment. The "mlflow.note.content" tag carries the
# free-text description of the experiment.
experiment_description = (
    "This is the Local RAG project for Oral Care's Innovisor platform. "
    "This experiment contains RAG architectures different components and parameters."
)

experiment_tags = {
    "project_name": "Local RAG",
    "mlflow.note.content": experiment_description,
}

# Create the tagged experiment only if it does not exist yet. create_experiment
# fails when the name is already taken, so the lookup keeps this cell safe to
# re-run; without this step the description and tags above are never applied.
experiment_name = "RAG_Experiments"
if client.get_experiment_by_name(experiment_name) is None:
    client.create_experiment(name=experiment_name, tags=experiment_tags)

In [4]:
import mlflow

# Point the fluent API at the same local tracking server that `client` uses.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Make "RAG_Experiments" the active experiment and keep the returned Experiment metadata.
my_experiment = mlflow.set_experiment("RAG_Experiments")

# Name of the MLflow run that records this evaluation.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "first_test"

# NOTE(review): not referenced by any other cell shown here, and the value
# looks left over from an MLflow tutorial — confirm before relying on it.
artifact_path = "rf_apples"

# Evaluating Dataset

In [5]:
# Ragas judges with gpt3.5 by default; the judge LLM can be swapped per metric,
# so every metric is given its own explicitly configured ChatOpenAI client.
for ragas_metric in (faithfulness, context_precision, answer_relevancy, context_recall):
    ragas_metric.llm.langchain_llm = ChatOpenAI(model="gpt-3.5-turbo", request_timeout=120)

# Testing RagasEvaluatorChain

In [6]:
# --- Generator LLM used by the QA chain ---
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", request_timeout=120)

# --- Ingest: load the PDF page by page ---
loader = PyPDFLoader("../data/Batman_wiki.pdf")
pages = loader.load()

# --- Chunk: 100-token pieces without overlap ---
# Alternative splitter configuration: RecursiveCharacterTextSplitter with
# chunk_size=600, chunk_overlap=300 and separators ["\n\n", "\n", " ", ""].
text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=0)
texts = text_splitter.split_documents(pages)

# --- Index: embed the chunks into a Chroma vector store ---
embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(documents=texts, embedding=embeddings)

# --- Answer: RetrievalQA over the 2 best-matching chunks ---
# Unused options: search_type="mmr" on the retriever, or a custom prompt
# passed through chain_type_kwargs.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    # Keep the retrieved documents in every result dict.
    return_source_documents=True,
)

In [7]:
# Read the evaluation set, keep only its first two rows, and split them into
# parallel lists of questions and reference answers.
df = pd.read_csv("../data/batman_eval_simple.csv").head(2)
eval_questions = df["question"].tolist()
eval_answers = df["answer"].tolist()

In [8]:
# Show the reference answers read from the evaluation CSV.
eval_answers

["Catwoman is Batman's most enduring romance throughout the years.",
 'The character Batman was created by artist Bob Kane and writer Bill Finger. He first appeared in Detective Comics on March 30, 1939.']

In [9]:
# #In order to evaluate the qa system we generated a few relevant questions and answers
# eval_questions = [
#     "I have persistent back pain since 4 weeks,I workouut but havent had any sports injury.What might be the cause of the back pain?",
#     "I have shortness of breath and frequently feel nauseated and tired.What can be the possible cause?",
#     "My 12 year old son has Poor coordination Unsteady walk and a tendency to stumble while walking and poor coordination between two hands.What might be the possible cuase?",
#     "What is Baby acne ?",
#     "What is Botulism ?",
# ]

# eval_answers = [
#     "From the symptoms mentioned you might have a disloacted disk",  # incorrect answer
#     "You might have asthama.",  # incorrect answer
#     " Movement and coordination problems associated with cerebral palsy.Please consult a doctor for better diagnosis.",
#     "Baby acne is small, inflamed bumps on a baby's face, neck, back or chest.",
#     "Botulism is a rare and potentially fatal illness caused by a toxin produced by the bacterium Clostridium botulinum.",
# ]

# One record per evaluation question: the question goes under "query" and its
# reference answer is wrapped in a single-element "ground_truths" list.
examples = [
    dict(query=question, ground_truths=[eval_answers[idx]])
    for idx, question in enumerate(eval_questions)
]
print(examples)

[{'query': "Who is Batman's most enduring romance throughout the years?", 'ground_truths': ["Catwoman is Batman's most enduring romance throughout the years."]}, {'query': 'Who created the character Batman and when did he first appear in Detective Comics?', 'ground_truths': ['The character Batman was created by artist Bob Kane and writer Bill Finger. He first appeared in Detective Comics on March 30, 1939.']}]


In [10]:
# create evaluation chains
faithfulness_chain = RagasEvaluatorChain(metric=faithfulness)
answer_relevancy_chain = RagasEvaluatorChain(metric=answer_relevancy)
context_precision_chain = RagasEvaluatorChain(metric=context_precision)
context_recall_chain = RagasEvaluatorChain(metric=context_recall)

In [11]:
# Run the QA chain over all examples. Each returned dict holds the query, the
# ground truths, the generated answer ("result") and the retrieved
# "source_documents".
predictions = chain.batch(examples)
predictions

[{'query': "Who is Batman's most enduring romance throughout the years?",
  'ground_truths': ["Catwoman is Batman's most enduring romance throughout the years."],
  'result': "Batman's most enduring romance throughout the years is with Catwoman, also known as Selina Kyle.",
  'source_documents': [Document(page_content="Main article: Catwoman While most of Batman's romantic relationships tend to be short in duration, Catwoman has been his most enduring romance throughout the years.[112] The attraction between Batman and Catwoman, whose real name is Selina Kyle, is present in nearly every version and medium in which the characters appear, including a love story between their two secret identities as early as in the 1966 film Batman. Although Catwoman is typically portrayed as a villain, Batman and Catwoman have worked together in achieving", metadata={'page': 6, 'source': '../data/Batman_wiki.pdf'}),
   Document(page_content=' reputation as a manufactured illusion to support his mission 

In [None]:
# Start of the wall-clock measurement; a later cell subtracts this value to
# obtain time_to_evaluate.
time_start = time.time()

In [None]:
# Score every prediction with the faithfulness chain; the results are merged
# into `predictions` in the following cell.
faithfulness_scores = faithfulness_chain.evaluate(examples, predictions)
faithfulness_scores

In [None]:
# Fold each faithfulness result into the prediction dict at the same position.
for position, metric_result in enumerate(faithfulness_scores):
    predictions[position].update(metric_result)

In [None]:
# Score every prediction with the context-precision chain; the results are
# merged into `predictions` in the following cell.
context_precision_scores = context_precision_chain.evaluate(examples, predictions)
context_precision_scores

In [None]:
# Fold each context-precision result into the prediction dict at the same position.
for position, metric_result in enumerate(context_precision_scores):
    predictions[position].update(metric_result)

In [None]:
# Score every prediction with the answer-relevancy chain; the results are
# merged into `predictions` in the following cell.
answer_relevancy_scores = answer_relevancy_chain.evaluate(examples, predictions)
answer_relevancy_scores

In [None]:
# Fold each answer-relevancy result into the prediction dict at the same position.
for position, metric_result in enumerate(answer_relevancy_scores):
    predictions[position].update(metric_result)

In [None]:
# Score every prediction with the context-recall chain; the results are merged
# into `predictions` in the following cell.
context_recall_scores = context_recall_chain.evaluate(examples, predictions)
context_recall_scores

In [None]:
# Fold each context-recall result into the prediction dict at the same position.
for position, metric_result in enumerate(context_recall_scores):
    predictions[position].update(metric_result)

In [None]:
# One row per prediction: the QA chain output plus the score entries merged in
# by the cells above.
df_scores = pd.DataFrame(predictions)
df_scores

In [None]:
# Average every Ragas score column over the evaluated examples; the order of
# the column names matches the order of the variables being assigned.
(
    mean_faithfulness,
    mean_context_precision,
    mean_answer_relevancy,
    mean_context_recall,
) = (
    df_scores[score_column].mean()
    for score_column in (
        "faithfulness_score",
        "context_precision_score",
        "answer_relevancy_score",
        "context_recall_score",
    )
)

# Report the four averages, one per line.
print(
    f"mean_faithfulness: {mean_faithfulness}",
    f"mean_context_precision: {mean_context_precision}",
    f"mean_answer_relevancy: {mean_answer_relevancy}",
    f"mean_context_recall: {mean_context_recall}",
    sep="\n",
)

In [None]:
# Seconds elapsed since time_start was taken, i.e. across the scoring and
# aggregation cells above.
time_to_evaluate = time.time() - time_start
time_to_evaluate

In [None]:
# Run configuration to be logged as MLflow params.
params = {
    "Input document": "Batman_wiki.pdf",
    "Evaluation questions": "batman_eval_simple.csv",
    "summarization_llm": "gpt-3.5-turbo",
}

# Aggregated scores and timing to be logged as MLflow metrics.
metrics = dict(
    mean_faithfulness=mean_faithfulness,
    mean_context_precision=mean_context_precision,
    mean_answer_relevancy=mean_answer_relevancy,
    mean_context_recall=mean_context_recall,
    time_to_evaluate=time_to_evaluate,
)

In [None]:
# Open an MLflow run named by `run_name`; the run is ended when the `with`
# block exits.
with mlflow.start_run(run_name=run_name) as run:
    # Log the run configuration (input document, question set, LLM name).
    mlflow.log_params(params)

    # Log the mean Ragas scores and the evaluation time.
    mlflow.log_metrics(metrics)

    # Disabled example of logging an output file as an artifact.
    # NOTE(review): `os` is not imported in the cells shown here, and the
    # log_artifacts() call is given no directory — fix both before enabling.
    # if not os.path.exists("outputs"):
    #     os.makedirs("outputs")
    # with open("outputs/test.txt", "w") as f:
    #     f.write("hello world!")
    # mlflow.log_artifacts()

# Notes
- There are two options for running the evaluation:
    1) Run the RAG chain in a loop over each example question.
    - Store the questions and ground truths (from the TestsetGenerator), together with the retrieved contexts and generated answers, in separate lists.
    - Create a dataset from them with `Dataset.from_dict`, e.g. `dataset = Dataset.from_dict(data_samples)`.
    - Run Ragas `evaluate(dataset)`.
    2) LangChain integration: use `RagasEvaluatorChain` directly on the results of the QA chain.

# Scratch

In [None]:
from datasets import load_dataset
# Load the evaluation CSV through the `datasets` library; the evaluate cell
# below reads its "train" split.
dataset = load_dataset("csv", data_files="../data/batman_eval_simple.csv")
dataset

In [None]:
# Load the "ragas_eval" configuration of the explodinggradients/fiqa dataset.
# It is only referenced from a commented-out argument in the evaluate cell.
fiqa_eval = load_dataset("explodinggradients/fiqa", "ragas_eval")
fiqa_eval

In [None]:
# Score the CSV-backed dataset with three Ragas metrics (context_recall is
# left disabled).
# NOTE(review): evaluate() presumably expects answer and contexts columns in
# the dataset — confirm that the CSV provides them.
result = evaluate(
    dataset['train'], # fiqa_eval["baseline"].select(range(3)), # selecting only 3
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        # context_recall,
    ],
)

result

In [None]:
# Convert the evaluation result to a DataFrame for inspection.
# NOTE: this rebinds `df`, which earlier held the evaluation questions CSV.
df = result.to_pandas()
df.head()