# Comparing model performance
In this notebook we show how to use EDSL to prompt a set of models to answer the same survey at once and compare their responses. We also demonstrate how to prompt models to evaluate the content they have generated.

In [1]:
from edsl import Model, ModelList, ScenarioList, QuestionFreeText, QuestionLinearScale, Survey

In [2]:
# Models under comparison, built from (model name, inference service) pairs.
m = ModelList([
    Model(model_name, service_name = service)
    for model_name, service in [
        ("claude-3-7-sonnet-20250219", "anthropic"),
        ("gemini-1.5-flash", "google"),
        ("gpt-4o", "openai"),
    ]
])

In [3]:
# One scenario per haiku topic; the question text refers to it as {{ topic }}.
s = ScenarioList.from_list(
    "topic",
    ["winter", "language models"],
)

In [4]:
# Drafting question: each model writes one haiku per scenario topic.
q1 = QuestionFreeText(
    question_name = "haiku",
    question_text = "Please draft a haiku about {{ topic }}."
)

# Rating question, run separately later in the notebook against a scenario
# list of the drafted haikus. It is deliberately left out of `survey`, and it
# reads the haiku from a scenario field (`{{ haiku }}`, same placeholder style
# as `{{ topic }}` above) rather than piping a prior answer.
q2 = QuestionLinearScale(
    question_name = "originality",
    question_text = "On a scale from 1 to 5, please rate the originality of this haiku: {{ haiku }}.",
    question_options = [1,2,3,4,5],
    option_labels = {1:"Totally unoriginal", 5:"Highly original"}
)

survey = Survey(questions = [q1])

In [5]:
# Administer the drafting survey with the topic scenarios and the model list.
results = (
    survey
    .by(s)
    .by(m)
    .run()
)

0,1
Job UUID,a82566c0-ee55-4fc3-92c3-72373d5d5c61
Progress Bar URL,https://www.expectedparrot.com/home/remote-job-progress/a82566c0-ee55-4fc3-92c3-72373d5d5c61
Exceptions Report URL,https://www.expectedparrot.com/home/remote-inference/error/6f8caf4d-697c-44b0-a739-68f1936ba5a9
Results UUID,bbeb73eb-9062-4b01-b163-c4e0c25328fd
Results URL,https://www.expectedparrot.com/content/bbeb73eb-9062-4b01-b163-c4e0c25328fd


In [26]:
# Show which model drafted which haiku for each topic.
results.select("model", "topic", "haiku")

Unnamed: 0,model.model,scenario.topic,answer.haiku,answer.originality
0,claude-3-7-sonnet-20250219,winter,,
1,gemini-1.5-flash,winter,"White breath in the air, Frozen ground crunches below, Silence blankets all.",2.0
2,gpt-4o,winter,,
3,claude-3-7-sonnet-20250219,language models,,
4,gemini-1.5-flash,language models,"Data flows like streams, Words bloom, a digital flower, Meaning takes its form.",2.0
5,gpt-4o,language models,,


### Next we prompt each model to rate every haiku

In [27]:
# Turn the drafted haikus into scenarios for the rating step. The "model"
# field is renamed to "drafting_model" so it stays distinct from the model
# that later does the rating.
haikus = (
    results
    .select("model", "topic", "haiku")
    .to_scenario_list()
    .rename({"model":"drafting_model"})
)
haikus

Unnamed: 0,drafting_model,topic,haiku
0,claude-3-7-sonnet-20250219,winter,
1,gemini-1.5-flash,winter,"White breath in the air, Frozen ground crunches below, Silence blankets all."
2,gpt-4o,winter,
3,claude-3-7-sonnet-20250219,language models,
4,gemini-1.5-flash,language models,"Data flows like streams, Words bloom, a digital flower, Meaning takes its form."
5,gpt-4o,language models,


In [28]:
# Run the rating question with the haiku scenarios and the same model list.
new_results = (
    q2
    .by(haikus)
    .by(m)
    .run()
)

0,1
Job UUID,c5649271-31ae-4b69-8c55-5b542b3dfe1b
Progress Bar URL,https://www.expectedparrot.com/home/remote-job-progress/c5649271-31ae-4b69-8c55-5b542b3dfe1b
Exceptions Report URL,
Results UUID,a6041518-18fc-477b-b7cc-22ca0d4f6c8b
Results URL,https://www.expectedparrot.com/content/a6041518-18fc-477b-b7cc-22ca0d4f6c8b


In [29]:
# Order rows by topic, then drafting model, then rating model, and show the
# rating next to the haiku it refers to.
new_results.sort_by("topic", "drafting_model", "model").select(
    "model", "drafting_model", "topic", "haiku", "originality"
)

Unnamed: 0,model.model,scenario.drafting_model,scenario.topic,scenario.haiku,answer.originality
0,claude-3-7-sonnet-20250219,claude-3-7-sonnet-20250219,language models,,
1,gemini-1.5-flash,claude-3-7-sonnet-20250219,language models,,
2,gpt-4o,claude-3-7-sonnet-20250219,language models,,
3,claude-3-7-sonnet-20250219,gemini-1.5-flash,language models,"Data flows like streams, Words bloom, a digital flower, Meaning takes its form.",
4,gemini-1.5-flash,gemini-1.5-flash,language models,"Data flows like streams, Words bloom, a digital flower, Meaning takes its form.",
5,gpt-4o,gemini-1.5-flash,language models,"Data flows like streams, Words bloom, a digital flower, Meaning takes its form.",
6,claude-3-7-sonnet-20250219,gpt-4o,language models,,
7,gemini-1.5-flash,gpt-4o,language models,,
8,gpt-4o,gpt-4o,language models,,
9,claude-3-7-sonnet-20250219,claude-3-7-sonnet-20250219,winter,,


### Posting this notebook to Coop

In [11]:
from edsl import Notebook

# Wrap the notebook file and push it with a description and public visibility;
# the returned metadata is displayed as the cell output.
n = Notebook("models_scoring_models.ipynb")
n.push(
    description = "Models scoring models",
    visibility = "public",
)

{'description': 'Models scoring models',
 'object_type': 'notebook',
 'url': 'https://www.expectedparrot.com/content/9669ab7f-590c-4b4c-918a-6416b5f5a175',
 'uuid': '9669ab7f-590c-4b4c-918a-6416b5f5a175',
 'version': '0.1.46.dev1',
 'visibility': 'public'}