# Step 3: Define metrics to use
In general, we want to evaluate two things: whether the retrieved context is good, and whether the overall answer is good.

For this example, we will assess only the **answer** itself and keep the retrieval constant.


## Assessing the answer

Let's use the following metrics:
- `rouge`
- `levenshtein_ratio`
- `faithfulness` (with `ragas`)
- `answer_correctness` (with `ragas`)
- `toxicity`



In [9]:
import functools

import evaluate
from Levenshtein import ratio as levenshtein_ratio
from datasets import Dataset
from ragas import evaluate as ragas_evaluate
from ragas.metrics import faithfulness, answer_correctness
from langchain_openai import ChatOpenAI


@functools.lru_cache(maxsize=None)
def _rouge_metric():
    """Load the 'rouge' metric once and reuse it on subsequent calls."""
    return evaluate.load('rouge')


def rouge(ground_truth, answer):
    """Return the ROUGE-L score between ground_truth and answer.

    Args:
        ground_truth: Reference text; passed as the single reference.
        answer: Generated text; passed as the single prediction.

    Returns:
        The "rougeL" entry of the metric's result.
    """
    # Reuse the cached metric instead of reloading it on every call; this
    # also avoids shadowing this function's own name with a local variable.
    scores = _rouge_metric().compute(predictions=[answer], references=[ground_truth])
    return scores["rougeL"]


def levenshtein(ground_truth, answer):
    """Compute the Levenshtein ratio of ground_truth against answer."""
    similarity = levenshtein_ratio(ground_truth, answer)
    return similarity


def evaluate_w_ragas(item, metrics=None):
    """Evaluate a single example with ragas using the given metrics.

    Args:
        item: One example exposing `.to_frame()` (presumably a pandas Series
            holding one row of the evaluation data; verify against caller).
        metrics: Ragas metrics to compute. Defaults to
            `[faithfulness, answer_correctness]` when omitted or None.

    Returns:
        Whatever `ragas_evaluate` returns for the one-row dataset.
    """
    # Build the default per call rather than sharing one mutable list object
    # across every invocation (and with the third-party callee).
    if metrics is None:
        metrics = [faithfulness, answer_correctness]

    gpt4_llm = ChatOpenAI(model_name="gpt-4-turbo-preview", temperature=0)

    # Format the example into datasets, which ragas takes as inputs
    row_dataset = Dataset.from_pandas(item.to_frame().T)

    # Ragas by default takes in a batch of items and aggregates metrics together
    # So in this example, we need to pass one by one to get individual results.
    # However, it does run faster when you pass all metrics at once.
    ragas_eval_results = ragas_evaluate(row_dataset, metrics=metrics, llm=gpt4_llm)
    return ragas_eval_results


@functools.lru_cache(maxsize=None)
def _toxicity_evaluator():
    """Load the 'toxicity' measurement once and reuse it on subsequent calls."""
    return evaluate.load('toxicity')


def toxicity(ground_truth, answer):
    """Return the toxicity result for answer.

    Args:
        ground_truth: Unused; kept so the signature matches the other
            (ground_truth, answer) metric functions.
        answer: Generated text; passed as the single prediction.

    Returns:
        The "toxicity" entry of the evaluator's result.
        NOTE(review): this is presumably a list with one score per
        prediction rather than a bare float -- confirm against callers.
    """
    # Reuse the cached evaluator instead of reloading it on every call.
    result = _toxicity_evaluator().compute(predictions=[answer])
    return result["toxicity"]
