# LangSmith Basics
This notebook describes how to use the LangSmith library to evaluate a target language task against a dataset using various evaluators.

Note: You must first set up a `New Experiment` at https://smith.langchain.com/ for this notebook to work.

In [4]:
# Optional: uncomment the lines below to enter the LangSmith API key
# interactively (getpass reads it without echoing it to the screen).
# NOTE(review): presumably LANGCHAIN_API_KEY must otherwise already be set in
# the environment for the later cells to reach LangSmith — confirm.
# import getpass
# import os

# os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()

In [5]:
import os

# LangSmith-related environment settings: tracing flag, API endpoint and
# project name. Existing values for these three keys are overwritten.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
        "LANGCHAIN_PROJECT": "project-1",
    }
)

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Target task: a two-message chat prompt sent to an OpenAI chat model, with the
# model reply parsed to a string. The user message is filled from the `Input`
# template variable.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Please review the user query below."),
        ("user", "{Input}"),
    ]
)
chat_model = ChatOpenAI()
output_parser = StrOutputParser()

# Same composition as `prompt | chat_model | output_parser`.
chain = prompt.pipe(chat_model, output_parser)

# Single source for the name used both as the dataset and as the experiment prefix.
DATASET_NAME = "ds-new-afoul-77"

# Name (or UUID) of the LangSmith dataset to evaluate on.
# An iterator of examples may be passed instead.
data = DATASET_NAME

# Prefix for the experiment name.
# When omitted, a random string is generated.
experiment_prefix = DATASET_NAME

# Criteria scored with the "labeled_criteria" evaluator, in evaluation order.
LABELED_CRITERIA = (
    "conciseness",
    "relevance",
    "coherence",
    "harmfulness",
    "maliciousness",
    "helpfulness",
    "controversiality",
    "depth",
    "creativity",
    "detail",
)

# Evaluators that score the target task's outputs: one "cot_qa" evaluator
# followed by one "labeled_criteria" evaluator per criterion above.
evaluators = [
    LangChainStringEvaluator("cot_qa"),
    *(
        LangChainStringEvaluator("labeled_criteria", config={"criteria": criterion})
        for criterion in LABELED_CRITERIA
    ),
]

# Run the evaluation: `chain.invoke` is the target, `data` selects the dataset,
# and every entry in `evaluators` is applied. The outcome is kept in `results`.
results = evaluate(
    chain.invoke,
    experiment_prefix=experiment_prefix,
    evaluators=evaluators,
    data=data,
)