How can I define my own custom evaluator?

![Evaluation flow diagram](evaluation_flow.png)

In [None]:
import os
import sys

# Put the project's evaluate_llm directory on the import path so that the
# api_key_config module imported below can be resolved.
# NOTE(review): absolute path — only works where the project lives at /opt/project.
sys.path.append('/opt/project/src/evaluate_llm/')

from api_key_config import settings

# Export the LangChain and Azure OpenAI settings as environment variables.
# Presumably the langchain / langsmith clients used in later cells pick their
# configuration up from the environment — confirm against their docs.
os.environ.update({
    'LANGCHAIN_TRACING_V2': settings.LANGCHAIN_TRACING_V2,
    'LANGCHAIN_API_KEY': settings.LANGCHAIN_API_KEY,
    "OPENAI_API_VERSION": settings.OPENAI_API_VERSION,
    "OPENAI_API_KEY": settings.OPENAI_API_KEY,
    "AZURE_OPENAI_ENDPOINT": settings.AZURE_OPENAI_ENDPOINT,
})

In [6]:
# Load blog post

import requests
from bs4 import BeautifulSoup

url = "https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm"

# Bound the request so a stalled connection cannot hang the notebook forever.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently using an error page as LLM context.
response.raise_for_status()

# Parse the HTML and keep only the text of the <p> elements, one per line.
soup = BeautifulSoup(response.content, "html.parser")
text = [p.text for p in soup.find_all("p")]
full_text = "\n".join(text)

In [7]:
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage
from langchain_core.prompts import HumanMessagePromptTemplate

# Two separate chat-model clients bound to the same Azure deployment.
# model_gen is the one invoked by answer_dbrx_question_llm; model_eval is not
# used in the cells shown here (presumably meant for LLM-graded evaluators — confirm).
model_gen = AzureChatOpenAI(
    azure_deployment="gpt-35-turbo",
)
model_eval = AzureChatOpenAI(
    azure_deployment="gpt-35-turbo",
)

def answer_dbrx_question_llm(inputs: dict) -> dict:
    """
    Answer a user question about the scraped page text with the Azure OpenAI chat model.

    The module-level ``full_text`` is embedded in the system message as context and the
    question is sent as the human message; the reply comes from ``model_gen``.

    Parameters:
    inputs (dict): Must contain the key 'question', holding the user's question as a string.

    Returns:
    dict: A dictionary with a single key 'answer', containing the text of the model's reply.
    """
    # The page text is interpolated straight into the system message by the
    # f-string; only the human turn goes through the "{text}" placeholder.
    system_message = SystemMessage(content=(f"Answer user questions in 2-3 sentences about this context: \n\n\n {full_text}"))
    question_template = HumanMessagePromptTemplate.from_template("{text}")
    prompt = ChatPromptTemplate.from_messages([system_message, question_template])

    # Fill in the question and send the conversation to the generation model.
    llm_reply = model_gen.invoke(prompt.format_messages(text=inputs["question"]))

    # Wrap the reply text under the 'answer' key.
    return {"answer": llm_reply.content}

In [17]:
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate

def is_answered(run: Run, example: Example) -> dict:
    """
    Custom evaluator: score whether the run produced a non-empty answer.

    Parameters:
    run (Run): The traced run; the generated text is read from its outputs under the key 'answer'.
    example (Example): The dataset example for this run. Not used here, but kept in the
        signature (presumably required by the evaluator calling convention — confirm).

    Returns:
    dict: {"key": "is_answered", "score": 1} when an answer is present,
        otherwise {"key": "is_answered", "score": 0}.
    """
    # Run.outputs is optional and may be None (no outputs recorded for the run);
    # treat that as "no answer" instead of raising AttributeError on .get().
    outputs = run.outputs or {}
    student_answer = outputs.get("answer")

    # A missing key (None) and an empty string are both falsy -> unanswered.
    return {"key": "is_answered", "score": 1 if student_answer else 0}


# Custom evaluators to apply to every run of the experiment
qa_evalulator = [is_answered]

# Dataset identifier handed to evaluate() via `data=`
dataset_name = "DBRX"

# Run the target function against the dataset and score each run with the
# evaluators above; the metadata dict is free-form experiment information.
experiment_results = evaluate(
    answer_dbrx_question_llm,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="test-dbrx-qa-custom-eval-is-answered",
    metadata={"variant": "stuff website context into gpt-3.5-turbo"},
)

View the evaluation results for experiment: 'test-dbrx-qa-custom-eval-is-answered-14b3c19c' at:
https://smith.langchain.com/o/4da9684a-c78b-54bf-a119-2e143c6c11df/datasets/3caf3425-c29e-4dfd-9054-b069fbc2a174/compare?selectedSessions=3f13b003-f0fb-42e6-9ffb-1fbc92b72839




0it [00:00, ?it/s]