In [1]:
from langsmith.evaluation import EvaluationResult, run_evaluator
from langsmith.schemas import Example, Run


@run_evaluator
def is_empty(run: Run, example: Example | None = None):
    model_outputs = run.outputs["output"]
    score = not model_outputs.strip()
    return EvaluationResult(key="is_empty", score=score)

In [2]:
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run

class BlocklistEvaluator(RunEvaluator):
    def __init__(self, blocklist: list[str]):
        self.blocklist = blocklist

    def evaluate_run(
        self, run: Run, example: Example | None = None
    ) -> EvaluationResult:
        model_outputs = run.outputs["output"]
        score = not any([word in model_outputs for word in self.blocklist])
        return EvaluationResult(key="blocklist", score=score)

In [4]:
!pip install evaluate

Collecting evaluate
  Obtaining dependency information for evaluate from https://files.pythonhosted.org/packages/70/63/7644a1eb7b0297e585a6adec98ed9e575309bb973c33b394dae66bc35c69/evaluate-0.4.1-py3-none-any.whl.metadata
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.05.0 (from evaluate)
  Obtaining dependency information for fsspec[http]>=2021.05.0 from https://files.pythonhosted.org/packages/e8/f6/3eccfb530aac90ad1301c582da228e4763f19e719ac8200752a4841b0b2d/fsspec-2023.10.0-py3-none-any.whl.metadata
  Using cached fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m667.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached fsspec-2023.10.0-py3-none-any.whl (166 kB)
Installing collected packages

In [5]:
from typing import Optional

from evaluate import load
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run


class PerplexityEvaluator(RunEvaluator):
    def __init__(self, prediction_key: Optional[str] = None, model_id: str = "gpt-2"):
        self.prediction_key = prediction_key
        self.model_id = model_id
        self.metric_fn = load("perplexity", module_type="metric")

    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        if run.outputs is None:
            raise ValueError("Run outputs cannot be None")
        prediction = run.outputs[self.prediction_key]
        results = self.metric_fn.compute(
            predictions=[prediction], model_id=self.model_id
        )
        ppl = results["perplexities"][0]
        return EvaluationResult(key="Perplexity", score=ppl)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from dotenv import load_dotenv
load_dotenv()

from langsmith import Client
from langchain.smith import RunEvalConfig, run_on_dataset

client = Client()
ds = client.create_dataset("My Dataset")
client.create_examples(
    inputs=[
        {"input": "Hello"},
        {"input": "How are you?"},
    ],
    outputs=[
        {"output": "I'm good, thanks!"},
        {"output": "I'm not doing so well."},
    ],
    dataset_id=ds.id,
)

evaluation_config = RunEvalConfig(
    custom_evaluators = [PerplexityEvaluator(), BlocklistEvaluator(blocklist=["bad", "words"]), is_empty],
)

def my_model(inputs):
    return "This is a bad model"

client.run_on_dataset(
    dataset_name="My Dataset",
    llm_or_chain_factory=my_model,
    evaluation=evaluation_config,
)

HTTPError: [Errno 409 Client Error: Conflict for url: https://api.smith.langchain.com/datasets] {"detail":"Dataset with this name already exists."}

In [12]:
client.run_on_dataset(
    dataset_name="My Dataset",
    llm_or_chain_factory=my_model,
    evaluation=evaluation_config,
)

  warn_deprecated(


View the evaluation results for project 'timely-steel-32' at:
https://smith.langchain.com/o/e6f98c64-2691-54b4-806f-a4aa7d048f66/datasets/ee88df77-014f-4336-b614-edb198eda765/compare?selectedSessions=81c6fc7d-5a05-4125-abfd-c310868849eb

View all tests for Dataset My Dataset at:
https://smith.langchain.com/o/e6f98c64-2691-54b4-806f-a4aa7d048f66/datasets/ee88df77-014f-4336-b614-edb198eda765


Error evaluating run 993c7b80-f74c-4763-a82a-e6ed8834c72d with PerplexityEvaluator: KeyError(None)
Traceback (most recent call last):
  File "/Users/gyliu/py311/lib/python3.11/site-packages/langchain_core/tracers/evaluation.py", line 126, in _evaluate_in_project
    evaluation_result = evaluator.evaluate_run(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/m3/cz7sy7cd4v7_mklwrfjs45wm0000gn/T/ipykernel_76209/2150780437.py", line 19, in evaluate_run
    prediction = run.outputs[self.prediction_key]
                 ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
KeyError: None
Error in EvaluatorCallbackHandler.on_chain_end callback: KeyError(None)
Error evaluating run 18a58940-6ec5-4c7d-9e4f-e1d56fe831d1 with PerplexityEvaluator: KeyError(None)
Traceback (most recent call last):
  File "/Users/gyliu/py311/lib/python3.11/site-packages/langchain_core/tracers/evaluation.py", line 126, in _evaluate_in_project
    evaluation_result = evaluator.evaluate_run(
                        ^^^^^

[------------------------------------------------->] 2/2

{'project_name': 'timely-steel-32',
 'results': {'4ad60554-649a-4c43-854a-5f9bdcf547bb': {'input': {'input': 'How are you?'},
   'feedback': [],
   'execution_time': 0.001823,
   'run_id': '993c7b80-f74c-4763-a82a-e6ed8834c72d',
   'output': 'This is a bad model',
   'reference': {'output': "I'm not doing so well."}},
  'f574a403-f3dd-4160-ae5c-2391ea0ccec5': {'input': {'input': 'Hello'},
   'feedback': [],
   'execution_time': 0.001213,
   'run_id': '18a58940-6ec5-4c7d-9e4f-e1d56fe831d1',
   'output': 'This is a bad model',
   'reference': {'output': "I'm good, thanks!"}}}}