In [1]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into the process environment
# so the API keys read below do not have to be hard-coded in the notebook.
load_dotenv()

# LangSmith tracing settings: set unconditionally so every run in this notebook is traced.
os.environ['LANGCHAIN_TRACING_V2'] = "true"
os.environ['LANGCHAIN_ENDPOINT'] = "https://api.smith.langchain.com"

# The API keys are expected to already be in the environment (exported in the shell
# or loaded from .env). Writing os.getenv(...) back into os.environ is a no-op when
# the key is set and raises an opaque TypeError when it is not, so report anything
# missing explicitly instead.
_missing_keys = [
    key for key in ('OPENAI_API_KEY', 'LANGCHAIN_API_KEY') if not os.getenv(key)
]
if _missing_keys:
    import warnings

    warnings.warn(
        "Missing environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Add them to your .env file or export them before running the cells below."
    )

In [2]:
from langsmith import Client

client = Client()

# Define the dataset: these question/answer pairs are the test cases.
dataset_name = "QA Example Dataset"

qa_pairs = [
    ("What is LangChain?", "A framework for building LLM applications"),
    ("What is LangSmith?", "A platform for observing and evaluating LLM applications"),
    ("What is OpenAI?", "A company that creates Large Language Models"),
    ("What is Google?", "A technology company known for search"),
    ("What is Mistral?", "A company that creates Large Language Models"),
]

# Creating a dataset whose name already exists fails, which made this cell crash
# on every re-run (including Restart & Run All). Reuse the existing dataset when
# there is one, and upload the examples only when the dataset is newly created so
# re-runs do not duplicate them.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(dataset_name)
    client.create_examples(
        inputs=[{"question": question} for question, _ in qa_pairs],
        outputs=[{"answer": answer} for _, answer in qa_pairs],
        dataset_id=dataset.id,
    )

In [3]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts.prompt import PromptTemplate
from langsmith.evaluation import LangChainStringEvaluator

# Grading prompt for the LLM judge. It exposes three placeholders: {query} (the
# question), {answer} (the reference answer) and {result} (the predicted answer),
# and asks for a CORRECT / INCORRECT verdict.
_PROMPT_TEMPLATE = """You are an expert professor specialized in grading students' answers to questions.
You are grading the following question:
{query}
Here is the real answer:
{answer}
You are grading the following predicted answer:
{result}
Respond with CORRECT or INCORRECT:
Grade:
"""

PROMPT = PromptTemplate(
    template=_PROMPT_TEMPLATE,
    input_variables=["query", "answer", "result"],
)

# Judge model; temperature 0.0 is requested for the grading calls.
eval_llm = ChatOpenAI(temperature=0.0, model='gpt-4o')

# "qa" string evaluator, configured with the judge model and the grading prompt above.
qa_evaluator = LangChainStringEvaluator(
    "qa",
    config={"prompt": PROMPT, "llm": eval_llm},
)

In [4]:
from langsmith.schemas import Run, Example

def evaluate_length(run: "Run", example: "Example") -> dict:
    """Score whether the prediction is shorter than twice the reference answer.

    Args:
        run: The traced run; its ``outputs`` mapping is read for the ``"output"`` key.
        example: The dataset example; its ``outputs`` mapping is read for the
            ``"answer"`` key.

    Returns:
        ``{"key": "length", "score": s}`` where ``s`` is 1 if
        ``len(prediction) < 2 * len(reference)`` and 0 otherwise.
    """
    # The key must be "output" to match what the app wrapper returns; the previous
    # "ouput" typo made the prediction always empty, so the score never reflected
    # the actual answer length.
    # `outputs` may be None (e.g. a run that produced nothing), hence the `or {}`.
    prediction = (run.outputs or {}).get("output") or ""
    required = (example.outputs or {}).get("answer") or ""
    score = int(len(prediction) < 2 * len(required))
    return {"key": "length", "score": score}

In [5]:
import openai

openai_client = openai.Client()

def my_app(question):
    """Answer ``question`` with gpt-4o-mini and return the reply text.

    The request is a two-message chat (system instruction + the user's question)
    sent with temperature 0; the content of the first choice is returned.
    """
    system_message = {
        "role": "system",
        "content": "Respond to the users question in a short, concise manner (one short sentence)."
    }
    user_message = {"role": "user", "content": question}
    completion = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[system_message, user_message],
    )
    return completion.choices[0].message.content

In [6]:
def langsmith_app(inputs):
    """Adapt ``my_app`` to the dataset's input/output schema.

    Reads the ``'question'`` key from the example inputs, passes it to ``my_app``,
    and wraps the reply under the ``"output"`` key.
    """
    question = inputs['question']
    return {"output": my_app(question)}

In [7]:
from langsmith import evaluate

# Run the gpt-4o-mini app over the dataset and score every answer with both the
# length check and the LLM-graded QA evaluator.
experiment_results = evaluate(
    langsmith_app,
    experiment_prefix='chatbot',
    data=dataset_name,
    evaluators=[evaluate_length, qa_evaluator],
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'chatbot-752ea048' at:
https://smith.langchain.com/o/e0a07099-8389-43f1-bdb8-a03361a3989c/datasets/d6be29c2-eb80-4935-9351-404b63c7d9be/compare?selectedSessions=05dfb4ae-152d-44fa-b340-dd4a696dfc9c




5it [00:08,  1.63s/it]


In [8]:
import openai

openai_client = openai.Client()

def my_app_1(question):
    """Answer ``question`` with gpt-4-turbo and return the reply text.

    The request is a two-message chat (system instruction + the user's question)
    sent with temperature 0; the content of the first choice is returned.
    """
    system_message = {
        "role": "system",
        "content": "Respond to the users question in a short, concise manner (one short sentence)."
    }
    user_message = {"role": "user", "content": question}
    completion = openai_client.chat.completions.create(
        model="gpt-4-turbo",
        temperature=0,
        messages=[system_message, user_message],
    )
    return completion.choices[0].message.content


def langsmith_app_1(inputs):
    """Adapt ``my_app_1`` to the dataset schema: ``"question"`` in, ``"output"`` out."""
    question = inputs["question"]
    return {"output": my_app_1(question)}

from langsmith.evaluation import evaluate

# Score the gpt-4-turbo app (langsmith_app_1) over the dataset with the length and
# QA evaluators; experiments are named with the "openai-4" prefix.
# Note: this rebinds experiment_results, replacing any value assigned earlier.
experiment_results = evaluate(
    langsmith_app_1,
    experiment_prefix="openai-4",
    data=dataset_name,
    evaluators=[evaluate_length, qa_evaluator],
)

View the evaluation results for experiment: 'openai-4-639fe38d' at:
https://smith.langchain.com/o/e0a07099-8389-43f1-bdb8-a03361a3989c/datasets/d6be29c2-eb80-4935-9351-404b63c7d9be/compare?selectedSessions=c847dd9b-2c97-4b5c-a93d-218854ce2c7a




5it [00:06,  1.31s/it]


In [9]:
import openai

openai_client = openai.Client()

def my_app_2(question):
    """Answer ``question`` with gpt-4-turbo under a ten-word limit and return the text.

    Same two-message chat as the other variants, but the system instruction also
    tells the model not to use more than ten words. Sent with temperature 0; the
    content of the first choice is returned.
    """
    system_message = {
        "role": "system",
        "content": "Respond to the users question in a short, concise manner (one short sentence). Do NOT use more than ten words."
    }
    user_message = {"role": "user", "content": question}
    completion = openai_client.chat.completions.create(
        model="gpt-4-turbo",
        temperature=0,
        messages=[system_message, user_message],
    )
    return completion.choices[0].message.content


def langsmith_app_2(inputs):
    """Adapt ``my_app_2`` to the dataset schema: ``"question"`` in, ``"output"`` out."""
    question = inputs["question"]
    return {"output": my_app_2(question)}

from langsmith.evaluation import evaluate

# Score the word-limited gpt-4-turbo app (langsmith_app_2) over the dataset with the
# length and QA evaluators; experiments are named with the "strict-openai-4" prefix.
# Note: this rebinds experiment_results, replacing any value assigned earlier.
experiment_results = evaluate(
    langsmith_app_2,
    experiment_prefix="strict-openai-4",
    data=dataset_name,
    evaluators=[evaluate_length, qa_evaluator],
)

View the evaluation results for experiment: 'strict-openai-4-eca8b534' at:
https://smith.langchain.com/o/e0a07099-8389-43f1-bdb8-a03361a3989c/datasets/d6be29c2-eb80-4935-9351-404b63c7d9be/compare?selectedSessions=fbb5c914-b4fc-4bf2-81c5-8c1436b1a4ef




5it [00:06,  1.25s/it]
