In [1]:
from langsmith import evaluate, Client
from openai import OpenAI
from pydantic import BaseModel, Field

from src.agent import graph
from src.config import DEEPSEEK_API_KEY, DEEPSEEK_URL

### Evaluation Dataset

example_inputs = [
    ("What did Neo purchase at 7-11 according to Cypher's report in The Matrix?", "According to Cypher's report in The Matrix, Neo purchased a six-pack of beer and a box of Captain Crunch at a 7-11."),
    ("What specific action does Wayne take with scraps of bread before attempting to climb out of the underground prison in The Dark Knight Rises?", "In The Dark Knight Rises, before attempting to climb out of the underground prison, Bruce Wayne takes scraps of bread and puts them into a rough coat, which he folds into a makeshift shoulder pack. This serves as his meager supplies for the climb. The prisoners mock him for this, but Wayne proceeds with determination."),
    ("What specific words does Yuri say to the dashboard to start the Bentley in 2012, and what does he tell Jackson afterward?", "In the movie *2012*, Yuri says the following words to the dashboard to start the Bentley:> **Yuri:** Engine... start...After the Bentley starts, Yuri tells Jackson:> **Yuri:** Voice control... Sold me on it."),
    ("What specific item does John select from a small wooden case and what does he do with it in John Wick?", "In the movie *John Wick*, John selects a **silencer** from a small wooden case. He screws it onto a pistol, preparing it for use. After assembling the weapon, he places it alongside other firearms before taking it with him as he leaves the room."),
    ("What specific action does Reese take with the balloon when interacting with the little girl at the service station picnic area in Terminator?", "In the movie *Terminator*, Reese interacts with a little girl at the service station picnic area when she asks him to retrieve her balloon, which is stuck in a tree. Reese pulls the balloon down by its string and crouches to hand it to her. However, when he hears a sudden scream behind him, he reflexively tenses up and accidentally bursts the balloon in his hands. The little girl, upset, punches him on the shoulder and storms off."),
    ("What does Sarah do to her cheeks in the locker room in terminator?", "In the locker room scene in *Terminator*, Sarah Connor pinches her cheeks and smiles vacuously while looking at her reflection in the mirror. She then says, Hi, I'm Sarah and I'll be your waitress, followed by, I'm so wholesome, I could puke. This is part of her transformation into a disguise as a waitress."),
    ("What specific action does Grant take to clean a small piece of bone during the dig scene?", "In the dig scene from *Jurassic Park*, Dr. Alan Grant cleans a small piece of bone by first brushing it with a toothbrush. Then, for a quicker method, he licks the bone to clean it. This action is followed by his excited explanation to the students about the bone's significance."),
    ("Who is the villain in the dark knight?", "It seems that The Dark Knight is not available in the database I can access. Therefore, I can't provide information about the villain in that movie. Let me know if you'd like details about another movie!"),
    ("Do you have John wick 2 movie script?", "It seems I don't have the script for John Wick 2, but I do have the script for the first John Wick movie. Let me know if you'd like details from that or if you're looking for something else!"),
    ("Do you have the second movie from bourne series?", "Yes, I have information about The Bourne Supremacy, which is the second movie in the Bourne series. Let me know if you'd like details or specific scenes from the movie!")
]

# Prepare inputs and outputs for bulk creation
inputs = [{"question": input_prompt} for input_prompt, _ in example_inputs]
outputs = [{"output": output_prompt} for _, output_prompt in example_inputs]

In [2]:
client = Client()
dataset_name = "screenplay-agent-dataset"
dataset_id = "b7035f94-c110-4090-9d8d-89502b1524fc"

# Note: Either dataset_id or dataset_name can be used
client.create_examples(
    inputs=inputs,
    outputs=outputs,
    dataset_id=dataset_id
)

### Evaluation

In [3]:
def correct_label(inputs: dict, reference_outputs: dict, outputs: dict) -> dict:
    score = outputs.get("output") == reference_outputs.get("label")
    return {"score": int(score), "key": "correct_label"}

In [4]:
client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_URL)

#### Deepseek does not support structure response format

In [5]:
class SimilarityScore(BaseModel):
    similarity_score: int = Field(
        description="Semantic similarity score between 1 and 10, where 1 means unrelated and 10 means identical.")


def compare_semantic_similarity(inputs: dict, reference_outputs: dict, outputs: dict):
    input_question = inputs["question"]
    reference_response = reference_outputs["output"]
    run_response = outputs["output"]

    completion = client.beta.chat.completions.parse(
        model="deepseek-chat",
        messages=[
            {
                "role": "system",
                "content": "You are a semantic similarity evaluator. Compare the meanings of two responses to a Question, Reference Response and New Response, where Reference is the correct answer, and we are trying to judge if the New Response is identical. Provide a score between 1 and 10, where 1 means completely unrelated and 10 means identical in meaning. Output should be an integer."
            },
            {
                "role": "user",
                "content": f"Question: {input_question}\nReference Response: {reference_response}\nNew Response: {run_response}"
            }
        ],
    )

    similarity_score = completion.choices[0].message.content
    return {"score": similarity_score, "key": "similarity"}

In [48]:
inputs = {
    "question": "Hello, what is your name?"
}
reference_outputs = {
    "output": "My name is Batman."
}
outputs = {
    "output": "I am Superman."
}

similarity_score = compare_semantic_similarity(inputs, reference_outputs, outputs)

In [6]:
def is_concise_enough(reference_outputs: dict, outputs: dict) -> dict:
    score = len(outputs["output"]) < 1.5 * len(reference_outputs["output"])
    return {"key": "is_concise", "score": int(score)}


def target_function(inputs: dict):
    return graph.invoke(inputs["question"])


evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="all-MiniLM-L6-v2",
)

View the evaluation results for experiment: 'all-MiniLM-L6-v2-410180e4' at:
https://eu.smith.langchain.com/o/19d15796-7393-4e7e-8878-9e6f0666496c/datasets/b7035f94-c110-4090-9d8d-89502b1524fc/compare?selectedSessions=8dff9442-0e6f-4a54-978f-3ddb9e4ca145




0it [00:00, ?it/s]

Error running target function: Expected dict, got Who is the villain in the dark knight?
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_GRAPH_NODE_RETURN_VALUE
Traceback (most recent call last):
  File "C:\Users\vivek\anaconda3\envs\screenplay-agent\Lib\site-packages\langsmith\evaluation\_runner.py", line 1905, in _forward
    fn(*args, langsmith_extra=langsmith_extra)
  File "C:\Users\vivek\anaconda3\envs\screenplay-agent\Lib\site-packages\langsmith\run_helpers.py", line 647, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\vivek\AppData\Local\Temp\ipykernel_24100\2874891545.py", line 6, in target_function
    return graph.invoke(inputs["question"])
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\vivek\anaconda3\envs\screenplay-agent\Lib\site-packages\langgraph\pregel\__init__.py", line 2719, in invoke

Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.wrapper,execution_time,example_id,id
0,Who is the villain in the dark knight?,,"InvalidUpdateError('Expected dict, got Who is ...",It seems that The Dark Knight is not available...,,0.023505,3001f52d-783f-4c22-a8b3-c42214bcbc53,13237f2f-81d0-47be-a080-50fede434871
1,What does Sarah do to her cheeks in the locker...,,"InvalidUpdateError('Expected dict, got What do...","In the locker room scene in *Terminator*, Sara...",,0.00498,7f08fddf-370d-4232-904d-9aadb3f72d70,705e4fc8-634a-4de2-943b-f0e9d3601acd
2,Do you have the second movie from bourne series?,,"InvalidUpdateError('Expected dict, got Do you ...","Yes, I have information about The Bourne Supre...",,0.004991,f2d8ab5f-2710-4e7d-a943-afc35247b29e,b99f3d50-a111-40ae-a16b-d5f5dd719886
3,Do you have Batman Begins Script?,,"InvalidUpdateError('Expected dict, got Do you ...","It looks like I don't have the script for ""Bat...",,0.005002,05540c96-a0ed-4195-a824-f4e60e516ca0,dcde4772-c4bb-4ea4-b49b-bd4bb578159a
4,What specific action does Wayne take with scra...,,"InvalidUpdateError('Expected dict, got What sp...","In The Dark Knight Rises, before attempting to...",,0.003999,31609919-de30-4380-91f9-c7daa06a191f,d19b2fec-c84a-44b8-8203-f33e7e970564
5,What specific action does Reese take with the ...,,"InvalidUpdateError('Expected dict, got What sp...","In the movie *Terminator*, Reese interacts wit...",,0.006145,4b138e50-49a6-46dc-b51a-068a57bbbfcf,5d56eb55-178f-45f8-91b3-d5a5d18dd22f
6,What specific item does John select from a sma...,,"InvalidUpdateError('Expected dict, got What sp...","In the movie *John Wick*, John selects a **sil...",,0.007577,6e5f4d0d-9cdd-4bb5-8fa0-6c5d2db84c7a,f21788df-4c37-46fe-9c96-8d8ead6fe9be
7,What did Neo purchase at 7-11 according to Cyp...,,"InvalidUpdateError(""Expected dict, got What di...","According to Cypher's report in The Matrix, Ne...",,0.007002,7865fa1d-a44d-49c7-a9a1-d75d6b96bc8e,792f2d25-365b-41f0-af22-4926ccf2b0ad
8,What specific words does Yuri say to the dashb...,,"InvalidUpdateError('Expected dict, got What sp...","In the movie *2012*, Yuri says the following w...",,0.005212,ddb2a335-fa58-4296-a74b-a3064f98e977,c287511e-da50-429a-8c3e-8e5a619c7d76
9,What specific action does Grant take to clean ...,,"InvalidUpdateError('Expected dict, got What sp...","In the dig scene from *Jurassic Park*, Dr. Ala...",,0.006002,df5cf711-7e6b-4a4c-892d-e0e3700175c9,bd0c5768-4afd-44a1-813f-f127b8a7f809
