In [None]:
# Extend the import path so the project-local `api_key_config` module can be
# imported (presumably it lives in this directory — confirm).
import sys
sys.path.append('/opt/project/src/evaluate_llm/')
from api_key_config import settings
import os

# The environment-variable exports below are disabled (commented out), so this
# cell only performs the imports above.
# LangSmith tracing settings:
# os.environ['LANGCHAIN_TRACING_V2'] = settings.LANGCHAIN_TRACING_V2
# os.environ['LANGCHAIN_API_KEY'] = settings.LANGCHAIN_API_KEY

# OpenAI / Azure OpenAI settings:
# os.environ["OPENAI_API_VERSION"] = settings.OPENAI_API_VERSION
# os.environ["OPENAI_API_KEY"] = settings.OPENAI_API_KEY
# os.environ["AZURE_OPENAI_ENDPOINT"] = settings.AZURE_OPENAI_ENDPOINT

In [1]:
# Extend the import path so the project-local `api_key_config` module can be
# imported (presumably it lives in this directory — confirm).
import sys
sys.path.append('/opt/project/src/evaluate_llm/')
from api_key_config import settings
import os
# Export the API keys held in `settings` as environment variables for this process.
os.environ["OPENAI_API_KEY"] = settings.OPENAI_API_KEY
os.environ['LANGCHAIN_API_KEY'] = settings.LANGCHAIN_API_KEY 

# Azure OpenAI settings consumed by the AzureChatOpenAI / AzureOpenAIEmbeddings
# constructors in the next cell.
azure_configs = {
    "base_url": settings.AZURE_OPENAI_ENDPOINT,  # passed as `azure_endpoint`
    "model_deployment": None,  # passed as `azure_deployment` for the chat model
    "model_name": "gpt-35-turbo",  # passed as `model` for the chat model
    "embedding_deployment": None,  # passed as `azure_deployment` for the embeddings
    # NOTE(review): the embedding model name was marked "most likely" by the
    # author — confirm it matches the actual Azure deployment.
    "embedding_name": "text-embedding-ada-002",
}

In [2]:
from bs4 import BeautifulSoup as Soup
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain_openai.chat_models import AzureChatOpenAI
from langchain_openai.embeddings import AzureOpenAIEmbeddings
from ragas import evaluate

# Chat model handed to Ragas as the judge LLM (see `ragas_eval` below).
azure_model = AzureChatOpenAI(
    openai_api_version=settings.OPENAI_API_VERSION,
    azure_endpoint=azure_configs["base_url"],
    azure_deployment=azure_configs["model_deployment"],
    model=azure_configs["model_name"],
    validate_base_url=False,
)

# Embedding model: used both to build the vector store below and by Ragas
# (the original note lists answer_relevancy, answer_correctness and
# answer_similarity as the metrics that need embeddings).
azure_embeddings = AzureOpenAIEmbeddings(
    openai_api_version=settings.OPENAI_API_VERSION,
    azure_endpoint=azure_configs["base_url"],
    azure_deployment=azure_configs["embedding_deployment"],
    model=azure_configs["embedding_name"],
)

# Load: crawl the LCEL documentation starting at `url` (max_depth=20) and
# reduce each fetched page to its text via BeautifulSoup's "html.parser".
url = "https://python.langchain.com/docs/expression_language/"
loader = RecursiveUrlLoader(
    url=url, max_depth=20, extractor=lambda x: Soup(x, "html.parser").text
)
docs = loader.load()

# Split: chunk the loaded documents (chunk_size=1500, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed: build a Chroma vector store from the chunks using the Azure embeddings.
vectorstore = Chroma.from_documents(documents=splits, embedding=azure_embeddings)

# Retriever: expose the vector store through the retriever interface used by RagBot.
retriever = vectorstore.as_retriever()

In [3]:
### RAG
from langsmith import traceable
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage
from langchain_core.prompts import HumanMessagePromptTemplate

class RagBot:
    """Minimal retrieval-augmented generation bot for LCEL questions.

    The bot retrieves documents for a question, places them in the system
    prompt, and asks an Azure OpenAI chat model for an answer. Both steps are
    decorated with ``traceable`` from LangSmith.
    """

    def __init__(self, retriever):
        """Store the retriever and create the chat model used for generation.

        Args:
            retriever: Object exposing ``invoke(question)`` that returns the
                documents relevant to ``question``.
        """
        self._retriever = retriever
        # Generation model; temperature 0.0 keeps sampling randomness to a minimum.
        self._model_gen = AzureChatOpenAI(
            azure_deployment="gpt-35-turbo",
            openai_api_version=settings.OPENAI_API_VERSION,
            azure_endpoint=settings.AZURE_OPENAI_ENDPOINT,
            temperature=0.0,
        )

    @traceable()
    def retrieve_docs(self, question):
        """Return the documents the retriever yields for ``question``."""
        return self._retriever.invoke(question)

    @traceable()
    def get_answer(self, question: str):
        """Answer ``question`` using retrieved documents as context.

        Args:
            question: The user question.

        Returns:
            dict with:
              - ``"answer"``: the text content of the model's reply.
              - ``"contexts"``: ``str(doc)`` for every retrieved document.
        """
        similar = self.retrieve_docs(question)
        # The system message is a concrete message, not a template, so the
        # retrieved docs must be interpolated here. Adjacent string literals
        # are concatenated and only the last one is an f-string; the previous
        # single triple-quoted string sent the literal text "{similar}" (plus
        # stray quote characters) to the model instead of the documents.
        system_prompt = (
            "You are a helpful AI code assistant with expertise in LCEL."
            " Use the following docs to produce a concise code solution to the user question.\n\n"
            f"## Docs\n\n{similar}"
        )
        chat_template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(content=system_prompt),
                HumanMessagePromptTemplate.from_template("{text}"),
            ]
        )
        messages = chat_template.format_messages(text=question)
        # Evaluators will expect "answer" and "contexts"
        return {
            "answer": self._model_gen.invoke(messages).content,
            "contexts": [str(doc) for doc in similar],
        }


rag_bot = RagBot(retriever)

In [4]:
# RAG chain
def predict_rag_answer(example: dict):
    """Target for answer-only evaluation.

    Asks the module-level ``rag_bot`` to answer ``example["question"]`` and
    returns a dict holding just the ``"answer"`` entry of its response.
    """
    answer_text = rag_bot.get_answer(example["question"])["answer"]
    return {"answer": answer_text}

def predict_rag_answer_with_context(example: dict):
    """Target for evaluating retrieved documents and hallucinations.

    Asks the module-level ``rag_bot`` to answer ``example["question"]`` and
    returns both the ``"answer"`` and the ``"contexts"`` entries of its response.
    """
    bot_output = rag_bot.get_answer(example["question"])
    return {field: bot_output[field] for field in ("answer", "contexts")}

In [9]:
from ragas.metrics import (
    context_precision,
    answer_relevancy,
    faithfulness,
    context_recall,
)

from ragas.metrics.critique import harmfulness
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate
from datasets import Dataset, DatasetDict, Features, Value, Sequence
from ragas import evaluate as ragas_evaluate



def ragas_eval(run: Run, example: Example) -> dict:
    """Score a single run with Ragas and report the scores as evaluator results.

    The question and reference answer are read from ``example``; the generated
    answer and retrieved contexts are read from ``run.outputs``. They are packed
    into a one-row dataset and evaluated with faithfulness, answer relevancy,
    context recall, context precision and harmfulness, using the module-level
    ``azure_model`` and ``azure_embeddings``.

    Returns:
        dict with a ``"results"`` list containing one ``{"key", "score"}``
        entry per metric.
    """
    # One-row column data: every column is a list of length 1.
    row = {
        "question": [example.inputs.get("question")],
        "ground_truth": [example.outputs.get("answer")],
        "answer": [run.outputs.get("answer")],
        "contexts": [run.outputs.get("contexts")],
    }
    # Explicit schema: three string columns plus a sequence-of-strings column.
    schema = Features({
        'question': Value('string'),
        'ground_truth': Value('string'),
        'answer': Value('string'),
        'contexts': Sequence(Value('string')),
    })
    single_row = Dataset.from_dict(row, features=schema)
    eval_split = DatasetDict({"eval": single_row})["eval"]

    scores = ragas_evaluate(
        eval_split,
        metrics=[
            faithfulness,
            answer_relevancy,
            context_recall,
            context_precision,
            harmfulness,
        ],
        llm=azure_model,
        embeddings=azure_embeddings,
    )

    # Provide the key and score for each metric, in the same order as above.
    metric_keys = (
        "faithfulness",
        "answer_relevancy",
        "context_recall",
        "context_precision",
        "harmfulness",
    )
    return {
        "results": [{"key": metric_key, "score": scores[metric_key]} for metric_key in metric_keys]
    }

# Evaluators applied to every run; here only the Ragas-based evaluator.
# NOTE(review): the variable name is misspelled ("evalulator"); it is left
# unchanged because other cells may reference it.
qa_evalulator = [ragas_eval]
# Dataset identifier passed to `evaluate` — presumably the name of an existing
# LangSmith dataset; confirm it exists before running.
dataset_name = "RAG_test_LCEL" 

# Run the experiment. `evaluate` here is `langsmith.evaluation.evaluate`
# (imported at the top of this cell), not the Ragas function, which is
# imported as `ragas_evaluate`. The target returns both the answer and the
# retrieved contexts, which `ragas_eval` reads from the run outputs.
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="test-dbrx-qa-custom-eval-ragas",
    # Any experiment metadata can be specified here
    metadata={
        "variant": "stuff website context into gpt-3.5-turbo",
    },
)

View the evaluation results for experiment: 'test-dbrx-qa-custom-eval-ragas-537bf2a2' at:
https://smith.langchain.com/o/4da9684a-c78b-54bf-a119-2e143c6c11df/datasets/e9f3f26b-8d24-4a6d-b39d-433836c8b3f7/compare?selectedSessions=a986ebbb-41d6-48e2-8ae8-d3f37c420306




0it [00:00, ?it/s]

Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]

Warning: The Ragas library may occasionally raise errors, so I recommend running each metric individually with DeepEval instead.