In [14]:
from ragas.llms.prompt import Prompt
from ragas.metrics import (
    context_recall, context_precision, answer_correctness, answer_relevancy,
    AnswerCorrectness, AnswerRelevancy, ContextEntityRecall, Faithfulness
)

In [2]:
# context_recall.context_recall_prompt.dict()
# context_precision.context_precision_prompt.dict()

In [3]:
# print(context_recall.context_recall_prompt.format(
#     question="Foo?",
#     context="The answer to the question Foo is Bar.",
#     answer="Bar",
# ).prompt_str)
# print(context_precision.context_precision_prompt.format(
#     question="Foo?",
#     context="The answer to the question Foo is Bar.",
#     answer="Bar",
# ).prompt_str)

In [15]:
from datasets import Dataset

# Toy evaluation set, written one sample per row and then pivoted into the
# column-oriented mapping that `Dataset.from_dict` receives. Every sample
# shares the same single context string, so only the first one matches its
# question.
dummy_dataset_data = {
    column: list(values)
    for column, values in zip(
        ("question", "contexts", "ground_truth", "answer"),
        zip(
            # Each row is (question, contexts, ground_truth, answer).
            (
                "Foo?",
                ["The answer to the question Foo is Bar."],
                "Bar on Thursday, otherwise Baz.",
                "Bar.",
            ),
            (
                "What day is today?",
                ["The answer to the question Foo is Bar."],
                "Thursday.",
                "It is sunny today and it is Wednesday.",
            ),
            (
                "Who is the president of the United States?",
                ["The answer to the question Foo is Bar."],
                "Joe Biden.",
                "It is sunny today and it is Wednesday.",
            ),
        ),
    )
}

dataset = Dataset.from_dict(dummy_dataset_data)

In [16]:
from langchain_openai.chat_models import ChatOpenAI
from ragas.llms.base import LangchainLLMWrapper

import os, getpass
%store -r OPENAI_API_KEY
if not "OPENAI_API_KEY" in globals():
    OPENAI_API_KEY = getpass.getpass("Enter your OpenAI API key:")
    %store OPENAI_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

%store -r LANGCHAIN_API_KEY
if not "LANGCHAIN_API_KEY" in globals():
    LANGCHAIN_API_KEY = getpass.getpass("Enter your LangChain API key:")
    %store LANGCHAIN_API_KEY
os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "ragas-test"

In [8]:
from ragas import evaluate
from langchain_core.language_models import BaseLanguageModel

# Metric instances referenced by `eval_with_model` below.
# NOTE(review): `use_langchain_parser` is passed straight to the ragas metric
# constructors; what it toggles is not visible in this notebook — confirm
# against the installed ragas version.
# answer_correctness = AnswerCorrectness(weights=[1.0, 0.0])
# answer_relevancy = AnswerRelevancy(use_langchain_parser=False)
# `context_entity_recall` is constructed here but is not enabled by default in
# `eval_with_model`; `faithfulness` is the metric that function evaluates.
context_entity_recall = ContextEntityRecall(use_langchain_parser=True)
faithfulness = Faithfulness(use_langchain_parser=False)

def eval_with_model(llm: BaseLanguageModel, metrics=None):
    """Run `ragas.evaluate` on the notebook-level `dataset` with the given model.

    Args:
        llm: LangChain language model. It is wrapped in `LangchainLLMWrapper`
            and passed to `evaluate` as its `llm` argument.
        metrics: Optional iterable of ragas metric objects to compute. When
            omitted, only the notebook's `faithfulness` instance is evaluated
            (the previous hard-coded behaviour). Pass an explicit selection,
            e.g. `[context_entity_recall]`, instead of editing this function.

    Returns:
        The result object returned by `evaluate` for the selected metrics.
    """
    # NOTE(review): the outputs saved with the cells that call this function
    # report `context_entity_recall`, so they were produced with a different
    # metric selection than the current default — re-run before relying on them.
    if metrics is None:
        # Resolved at call time so a re-created `faithfulness` instance is picked up.
        metrics = [faithfulness]
    return evaluate(
        dataset,
        metrics=list(metrics),
        llm=LangchainLLMWrapper(llm),
    )

In [9]:
# Evaluate the dummy dataset with OpenAI's "gpt-4-turbo-preview" chat model.
# NOTE(review): temperature=1.0 presumably makes the model's output sampled,
# so scores may differ between runs — confirm this is intended.
gpt4_turbo = ChatOpenAI(model="gpt-4-turbo-preview", temperature=1.0)
eval_with_model(gpt4_turbo)

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

{'context_entity_recall': 0.1111}

In [10]:
# Evaluate the dummy dataset with OpenAI's "gpt-3.5-turbo" chat model.
# NOTE(review): temperature=1.0 presumably makes the model's output sampled,
# so scores may differ between runs — confirm this is intended.
gpt35_turbo = ChatOpenAI(model="gpt-3.5-turbo", temperature=1.0)
eval_with_model(gpt35_turbo)

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

{'context_entity_recall': 0.1667}

In [11]:
from langchain_community.llms.bedrock import Bedrock
from langchain_community.chat_models import BedrockChat

# Evaluate the dummy dataset with "anthropic.claude-v2" through the `Bedrock`
# LLM class (imported from langchain_community.llms), with streaming enabled.
claude_v2 = Bedrock(model_id="anthropic.claude-v2", streaming=True)
eval_with_model(claude_v2)

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

{'context_entity_recall': 0.1111}

In [12]:
from langchain_community.llms.bedrock import Bedrock
from langchain_community.chat_models import BedrockChat

# Evaluate the dummy dataset with "anthropic.claude-v2:1" through the `Bedrock`
# LLM class (imported from langchain_community.llms), with streaming enabled.
claude_v2_1 = Bedrock(model_id="anthropic.claude-v2:1", streaming=True)
eval_with_model(claude_v2_1)

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

{'context_entity_recall': 0.1111}

In [13]:
# Evaluate the dummy dataset with "anthropic.claude-instant-v1" through the
# `BedrockChat` chat-model class, with streaming enabled.
claude_instant = BedrockChat(model_id="anthropic.claude-instant-v1", streaming=True)
eval_with_model(claude_instant)

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

{'context_entity_recall': 0.1111}

In [None]:
# from ragas.llms.prompt2 import get_prompt
# from ragas.metrics._context_recall import ContextRecallClassificationAnswers
# from langchain_core.output_parsers import JsonOutputParser
# from ragas.metrics import context_recall, context_precision

# prompt = get_prompt(
#     instructions=context_recall.context_recall_prompt.instruction,
#     output_parser=JsonOutputParser(pydantic_object=ContextRecallClassificationAnswers),
#     examples=context_recall.context_recall_prompt.examples,
#     input_variables=context_recall.context_recall_prompt.input_keys,
#     output_key=context_recall.context_recall_prompt.output_key,
# )

# print(prompt.format(question="Foo?", answer="Bar!", context="The answer to the question Foo? is Bar!"))

In [None]:
# prompt.save("./test-prompt.yaml")