# Setup

In [1]:
!pip install -U langsmith google-genai

Collecting langsmith
  Downloading langsmith-0.4.5-py3-none-any.whl.metadata (15 kB)
Collecting google-genai
  Downloading google_genai-1.25.0-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading langsmith-0.4.5-py3-none-any.whl (367 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m367.8/367.8 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_genai-1.25.0-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.8/226.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langsmith, google-genai
  Attempting uninstall: langsmith
    Found existing installation: langsmith 0.4.4
    Uninstalling langsmith-0.4.4:
      Successfully uninstalled langsmith-0.4.4
  Attempting uninstall: google-genai
    Found existing installation: google-genai 1.24.0
    Uninstalling google-gena

In [16]:
from google.colab import userdata
import os
os.environ["LANGSMITH_TRACING_V2"] = "true"
os.environ["LANGSMITH_API_KEY"] = userdata.get('Smith2')
os.environ["GEMINI_API_KEY"] = userdata.get('gemini')

# Create a dataset

In [None]:
# @title Create and upload datasets to LangSmith

from langsmith import Client

client = Client()

# Define dataset: these are your test cases
dataset_name = "QA Example Dataset1"
dataset = client.create_dataset(dataset_name)
client.create_examples(
    dataset_id=dataset.id,
    examples=[
        {
            "inputs": {"question": "What is LangChain?"},
            "outputs": {"answer": "A framework for building LLM applications"},
        },
        {
            "inputs": {"question": "What is LangSmith?"},
            "outputs": {"answer": "A platform for observing and evaluating LLM applications"},
        },
        {
            "inputs": {"question": "What is OpenAI?"},
            "outputs": {"answer": "A company that creates Large Language Models"},
        },
        {
            "inputs": {"question": "What is Google?"},
            "outputs": {"answer": "A technology company known for search"},
        },
        {
            "inputs": {"question": "What is Mistral?"},
            "outputs": {"answer": "A company that creates Large Language Models"},
        }
    ]
)

# Define metrics

Lets define 2 metrics - correctness & conciseness

In [10]:
import google.generativeai as genai
from langsmith import Client
from langsmith import traceable

# Initialize the Gemini API
gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')


eval_instructions = "You are an expert professor specialized in grading students' answers to questions."

#Define Correctness
@traceable
def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    user_content = f"""You are grading the following question:
{inputs['question']}
Here is the real answer:
{reference_outputs['answer']}
You are grading the following predicted answer:
{outputs['response']}
Respond with CORRECT or INCORRECT:
Grade:
"""
    response = gemini_model.generate_content(
        messages=[
            {"role": "system", "content": eval_instructions},
            {"role": "user", "content": user_content},
        ],
    ).text
    return response.strip() == "CORRECT"

#Define Concisenss
@traceable
def concision(outputs: dict, reference_outputs: dict) -> bool:
    return int(len(outputs["response"]) < 2 * len(reference_outputs["answer"]))

In [22]:
# @title What is this
import google.generativeai as genai
from langsmith import Client
from langsmith import traceable

# Initialize the Gemini API
#gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')
gem_client = wrappers.wrap_genai(genai.google())
#openai_client = wrappers.wrap_openai(openai.OpenAI())

eval_instructions = "You are an expert professor specialized in grading students' answers to questions."

#Define Correctness
@traceable
def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    user_content = f"""You are grading the following question:
{inputs['question']}
Here is the real answer:
{reference_outputs['answer']}
You are grading the following predicted answer:
{outputs['response']}
Respond with CORRECT or INCORRECT:
Grade:
"""
    response = gem_client.chat.completions.create(
        model="gemini-1.5-flash-latest",
        temperature=0,
        messages=[
            {"role": "system", "content": eval_instructions},
            {"role": "user", "content": user_content},
        ],
    ).text
    return response.strip() == "CORRECT"

#Define Concisenss
@traceable
def concision(outputs: dict, reference_outputs: dict) -> bool:
    return int(len(outputs["response"]) < 2 * len(reference_outputs["answer"]))

NameError: name 'wrappers' is not defined

# Run Evaluations

In [6]:
from google.genai import types

client = genai.Client()

default_instructions = "Respond to the users question in a short, concise manner (one short sentence)."
@traceable
def my_app(question: str, model: str = "gemini-1.5-flash-latest", instructions: str = default_instructions) -> str:
    response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
    config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=0) # Disables thinking
    ),
    ).choices[0].message.content

AttributeError: module 'google.generativeai' has no attribute 'Client'

In [11]:
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    config=types.GenerateContentConfig(
        system_instruction="You are a cat. Your name is Neko.respond in not more than 7 words"),
    contents="Hello there"
)

print(response.text)

Meow. You have my attention, human.


In [12]:
default_instructions = "Respond to the users question in a short, concise manner (one short sentence)."
@traceable
def my_app(question: str, model: str = "gemini-1.5-flash-latest", instructions: str = default_instructions) -> str:
    return gemini_model.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": instructions},
            {"role": "user", "content": question},
        ],
    ).choices[0].message.content

In [13]:
@traceable
def ls_target(inputs: str) -> dict:
    return {"response": my_app(inputs["question"])}

In [18]:
# Assuming 'client' from langsmith is already initialized in a previous cell
experiment_results = client.evaluate(
    ls_target, # Your AI system
    data=dataset_name, # The data to predict and grade over
    evaluators=[concision, correctness], # The evaluators to score the results
    experiment_prefix="openai-4o-mini1", # A prefix for your experiment names to easily identify them
)

View the evaluation results for experiment: 'openai-4o-mini1-7f61ffa0' at:
https://smith.langchain.com/o/ef9e87ab-1348-4c3e-9139-19b869acd75b/datasets/98bb74d6-34bf-458f-a283-0928614d3218/compare?selectedSessions=cb5d590d-627f-4946-889d-f5b505b6c19b




0it [00:00, ?it/s]

ERROR:langsmith.evaluation._runner:Error running target function: 'GenerativeModel' object has no attribute 'chat'
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/langsmith/evaluation/_runner.py", line 1907, in _forward
    fn(*args, langsmith_extra=langsmith_extra)
  File "/tmp/ipython-input-13-3477953080.py", line 3, in ls_target
    return {"response": my_app(inputs["question"])}
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-12-3414510915.py", line 4, in my_app
    return gemini_model.chat.completions.create(
           ^^^^^^^^^^^^^^^^^
AttributeError: 'GenerativeModel' object has no attribute 'chat'
ERROR:langsmith.evaluation._runner:Error running evaluator <DynamicRunEvaluator concision> on run 383d66bf-56fb-42d9-bc17-5426958e367d: KeyError('response')
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/langsmith/evaluation/_runner.py", line 1603, in _run_evaluators
    evaluator_res