In [4]:
pip install -U langsmith groq

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import getpass

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = getpass.getpass()
os.environ["GROQ_API_KEY"] = getpass.getpass()

 ········
 ········


In [52]:
from langsmith import Client
client = Client()

dataset_name = "QA Example Dataset"
dataset = client.create_dataset(dataset_name)
client.create_examples(
    dataset_id=dataset.id,
    examples=[
        {
            "inputs": {"question": "What is LangChain?"},
            "outputs": {"answer": "A framework for building LLM applications"},
        },
        {
            "inputs": {"question": "What is LangSmith?"},
            "outputs": {"answer": "A platform for observing and evaluating LLM applications"},
        },
        {
            "inputs": {"question": "What is OpenAI?"},
            "outputs": {"answer": "A company that creates Large Language Models"},
        },
        {
            "inputs": {"question": "What is Google?"},
            "outputs": {"answer": "A technology company known for search"},
        },
        {
            "inputs": {"question": "What is Mistral?"},
            "outputs": {"answer": "A company that creates Large Language Models"},
        }
    ]
)

LangSmithConflictError: Conflict for /datasets. HTTPError('409 Client Error: Conflict for url: https://api.smith.langchain.com/datasets', '{"detail":"Dataset with this name already exists."}')

In [58]:
from langchain_groq import ChatGroq
from langchain.schema import SystemMessage, HumanMessage

groq_client = ChatGroq(model="llama3-8b-8192",
                      temperature = 0,
                      )

eval_instructions = "You are an expert professor specialized in grading students' answers to questions."

def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    user_content = f"""You are grading the following question:
    {inputs['question']}
    Here is the real answer:
    {reference_outputs['answer']}
    You are grading the following predicted answer:
    {outputs['response']}
    Respond with CORRECT or INCORRECT:
    Grade:
    """
    response = groq_client.invoke([
        SystemMessage(content=eval_instructions),
        HumanMessage(content=user_content),
    ])
    return response.content.strip().upper() == "CORRECT"

In [59]:
def concision(outputs:dict, reference_outputs:dict) -> bool:
    return int(len(outputs["response"]) < 2 * len(reference_outputs["answer"]))

In [62]:
from langchain_groq import ChatGroq
from langchain.schema import SystemMessage, HumanMessage

default_instructions = "Respond to the user's question in a short, concise manner (one short sentence)."

def my_app(question:str, model:str="llama3-8b-8192", instructions: str = default_instructions) -> str:
    groq_client = ChatGroq(
        model = model,
        temperature=0,
    )

    response = groq_client.invoke([
    SystemMessage(content = instructions),
    HumanMessage(content=question),
    ])
    return response.content.strip()

In [63]:
def ls_target(inputs: str) -> dict:
    return {"response": my_app(inputs["question"])}

In [67]:
experiment_results = client.evaluate(
    ls_target,
    data = dataset_name,
    evaluators=[concision,correctness],
    experiment_prefix="llama3-8b-8192",
)

View the evaluation results for experiment: 'llama3-8b-8192-131412d6' at:
https://smith.langchain.com/o/2db5a418-b512-450f-be81-9e23d6cb54fc/datasets/5c106ec0-58e5-4f19-9e23-36e44a6e36b2/compare?selectedSessions=f3ebac25-d6bb-496b-8e3f-260f67f546a5




0it [00:00, ?it/s]

In [66]:
def test_length_score() -> None:
    """Test tht the length score is at least 80%."""
    experiment_results = evaluate(
    ls_target,
    data=dataset_name,
    evaluators=[concision,correctness]
    )
    feedback=client.list_feedback(
    run_ids=[r.id for r in client.list_runs(project_name=experiment_results.experiment_name)],
    feedback_key="concision"
    )
    scores= [f.score for f in feedback]
    assert sum(scores)/len(scores) >= 0.8